In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from path import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sqlalchemy import create_engine

import psycopg2
# import the psycopg2 database adapter for PostgreSQL
from psycopg2 import connect, extensions, sql

In [2]:
import os,sys
parentdir = Path(os.path.abspath("../.."))
sys.path.insert(0,parentdir)

# Get the DB password
from config import db_password

In [3]:
# Create Engine for covid_property_pandemic DB
db_string = f"postgres://postgres:{db_password}@127.0.0.1:5432/covid_property_pandemic"
engine = create_engine(db_string)
conn = engine.connect()

In [5]:
## Get Data from Housing Table

sql_str = 'SELECT * FROM "FL_CA_Housing_Data"'

df = pd.read_sql(sql_str,conn)
df.drop(columns=['index'],axis=1,inplace=True)
print(df.shape)
df.sample(n=10)


(612, 4)


Unnamed: 0,RegionName,StateName,Date,Avg_Price
549,"Jacksonville, FL",FL,2020-07-31,10150
447,"Fort Myers, FL",FL,2020-01-31,10794
174,"Ventura, CA",CA,2018-10-31,2646
295,"Orlando, FL",FL,2019-05-31,14840
389,"Bakersfield, CA",CA,2019-10-31,2955
10,"Fresno, CA",CA,2018-01-31,1930
368,"San Jose, CA",CA,2019-09-30,3599
449,"Daytona Beach, FL",FL,2020-01-31,6356
562,"San Diego, CA",CA,2020-08-31,7998
46,"Fresno, CA",CA,2018-03-31,2014


In [6]:
FL_df = df.loc[df['StateName'] == 'FL']
print(FL_df.shape)
FL_df.sample(n=10)

(272, 4)


Unnamed: 0,RegionName,StateName,Date,Avg_Price
394,"Lakeland, FL",FL,2019-10-31,4379
297,"Jacksonville, FL",FL,2019-05-31,12204
437,"Tampa, FL",FL,2020-01-31,18856
340,"Lakeland, FL",FL,2019-07-31,4270
67,"North Port-Sarasota-Bradenton, FL",FL,2018-04-30,10935
279,"Jacksonville, FL",FL,2019-04-30,11780
339,"Fort Myers, FL",FL,2019-07-31,11512
45,"Jacksonville, FL",FL,2018-03-31,9905
583,"Orlando, FL",FL,2020-09-30,13008
193,"North Port-Sarasota-Bradenton, FL",FL,2018-11-30,8918


In [8]:
## Filter out the Dataset for prices between 2010 and  2019
FL_df.dtypes


RegionName    object
StateName     object
Date          object
Avg_Price      int64
dtype: object

In [9]:
# Date column is of type String. Convert it to datetime first
FL_df['Date'] = pd.to_datetime(FL_df['Date'])
FL_df.dtypes

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  FL_df['Date'] = pd.to_datetime(FL_df['Date'])


RegionName            object
StateName             object
Date          datetime64[ns]
Avg_Price              int64
dtype: object

In [10]:
start_date_train = '1/1/2010'
end_date_train = '12/31/2019'

In [11]:
mask = (FL_df['Date'] > start_date_train) & (FL_df['Date'] <= end_date_train)
FL_df_train = FL_df.loc[mask]
FL_df_train.head()

Unnamed: 0,RegionName,StateName,Date,Avg_Price
1,"Miami-Fort Lauderdale, FL",FL,2018-01-31,54091
5,"Tampa, FL",FL,2018-01-31,18385
7,"Orlando, FL",FL,2018-01-31,12341
9,"Jacksonville, FL",FL,2018-01-31,8932
13,"North Port-Sarasota-Bradenton, FL",FL,2018-01-31,9390


In [12]:
start_date_test = '1/1/2020'
end_date_test = '12/31/2020'

In [13]:
mask = (FL_df['Date'] > start_date_test) & (FL_df['Date'] <= end_date_test)
FL_df_test = FL_df.loc[mask]
FL_df_test

Unnamed: 0,RegionName,StateName,Date,Avg_Price
433,"Miami-Fort Lauderdale, FL",FL,2020-01-31,55650
437,"Tampa, FL",FL,2020-01-31,18856
439,"Orlando, FL",FL,2020-01-31,13267
441,"Jacksonville, FL",FL,2020-01-31,10435
445,"North Port-Sarasota-Bradenton, FL",FL,2020-01-31,9382
...,...,...,...,...
603,"Jacksonville, FL",FL,2020-10-31,8204
607,"North Port-Sarasota-Bradenton, FL",FL,2020-10-31,7112
609,"Fort Myers, FL",FL,2020-10-31,7886
610,"Lakeland, FL",FL,2020-10-31,3377


In [35]:
# Get the model ready
import numpy as np
X = np.array(FL_df_train['Date']).reshape(-1,1)
y = np.array(FL_df_train['Avg_Price']).reshape(-1,1)

In [36]:
model = LinearRegression()
model.fit(X, y)
y_pred = model.predict(X)

TypeError: invalid type promotion

In [7]:
CA_df = df.loc[df['StateName'] == 'CA']
print(CA_df.shape)
CA_df.sample(n=10)

(186240, 6)


Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
254705,Del Rey,CA,Fresno,Fresno County,2014-10-31,148486
281309,Stanton,CA,Los Angeles-Long Beach-Anaheim,Orange County,2016-10-31,498022
127466,Belvedere,CA,San Francisco-Oakland-Hayward,Marin County,2005-05-31,1957855
66024,Nipomo,CA,San Luis Obispo-Paso Robles-Arroyo Grande,San Luis Obispo County,2000-11-30,293151
32994,Colton,CA,Riverside-San Bernardino-Ontario,San Bernardino County,1998-06-30,94502
171859,Newark,CA,San Francisco-Oakland-Hayward,Alameda County,2008-09-30,476738
325821,Ross,CA,San Francisco-Oakland-Hayward,Marin County,2020-01-31,2902280
171606,Hayward,CA,San Francisco-Oakland-Hayward,Alameda County,2008-09-30,388390
39713,Lynwood,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1998-12-31,142807
261704,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,2015-05-31,1110157


In [None]:
#Target Variable is 2020 Price. We want to see the relationship between Year and price
plt.scatter(df.Date, df.price)
plt.xlabel("Months")
plt.ylabel("Housing price USD $")
plt.show()

In [None]:
#Declaring features and targets

X = df.drop("Target_price")
y=df.Target_price

In [None]:
#instantized and fit data to model and predictions
model = LinearRegression()

model.fit(X, y)

y_pred = model.predict(X)

In [None]:
#Print the coeffiecient and intercept of the nodel

print(model.coef_)
print(model.intercept_)

In [None]:
#Print regression line

plt.scatter(X,y)
plt.plot(X, y_pred, color="red")
plt.show()