Using a Linear Model that is encoded to see if performance improved

In [1]:
from sklearn.model_selection import train_test_split
import pandas as pd
from path import Path
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn import metrics
#from sqlalchemy import create_engine
#import psycopg2 
# import the psycopg2 database adapter for PostgreSQL
#from psycopg2 import connect, extensions, sql
import plotly.express as px
import datetime as dt
import numpy as np

In [30]:
#import the data

data_df= pd.read_csv("data/raw/Housing_cleaned.csv")

data_df

Unnamed: 0.1,Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
0,0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1996-01-31,192855.0
1,1,San Diego,CA,San Diego-Carlsbad,San Diego County,1996-01-31,214715.0
2,2,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,1996-01-31,232040.0
3,3,Jacksonville,FL,Jacksonville,Duval County,1996-01-31,85992.0
4,4,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,1996-01-31,299060.0
...,...,...,...,...,...,...,...
522922,522922,Goodland,FL,Naples-Immokalee-Marco Island,Collier County,2020-03-31,252414.0
522923,522923,McIntosh,FL,Ocala,Marion County,2020-03-31,206490.0
522924,522924,Camp Meeker,CA,Santa Rosa,Sonoma County,2020-03-31,396362.0
522925,522925,Bear Valley,CA,,Alpine County,2020-03-31,448221.0


In [31]:
data_df['Date'] = pd.to_datetime(data_df['Date'])
data_df["Avg_Price"] = data_df["Avg_Price"].astype('Int64')
data_df

Unnamed: 0.1,Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
0,0,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,1996-01-31,192855
1,1,San Diego,CA,San Diego-Carlsbad,San Diego County,1996-01-31,214715
2,2,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,1996-01-31,232040
3,3,Jacksonville,FL,Jacksonville,Duval County,1996-01-31,85992
4,4,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,1996-01-31,299060
...,...,...,...,...,...,...,...
522922,522922,Goodland,FL,Naples-Immokalee-Marco Island,Collier County,2020-03-31,252414
522923,522923,McIntosh,FL,Ocala,Marion County,2020-03-31,206490
522924,522924,Camp Meeker,CA,Santa Rosa,Sonoma County,2020-03-31,396362
522925,522925,Bear Valley,CA,,Alpine County,2020-03-31,448221


# Preprocess

In [32]:
#2010 and later dates and without 2020
start = "2010-1-1"
end = "2020-1-1"
base_df = data_df[(data_df["Date"] > start) & (data_df["Date"]<end)]
actual_2020_df = data_df[data_df["Date"] > end]
base_df.tail()

Unnamed: 0.1,Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
517531,517531,Goodland,FL,Naples-Immokalee-Marco Island,Collier County,2019-12-31,254930
517532,517532,McIntosh,FL,Ocala,Marion County,2019-12-31,204755
517533,517533,Camp Meeker,CA,Santa Rosa,Sonoma County,2019-12-31,362685
517534,517534,Bear Valley,CA,,Alpine County,2019-12-31,447779
517535,517535,North Palm Springs,CA,Riverside-San Bernardino-Ontario,Riverside County,2019-12-31,256209


In [33]:
actual_2020_df

Unnamed: 0.1,Unnamed: 0,RegionName,State,Metro,CountyName,Date,Avg_Price
517536,517536,Los Angeles,CA,Los Angeles-Long Beach-Anaheim,Los Angeles County,2020-01-31,770853
517537,517537,San Diego,CA,San Diego-Carlsbad,San Diego County,2020-01-31,733084
517538,517538,San Jose,CA,San Jose-Sunnyvale-Santa Clara,Santa Clara County,2020-01-31,1113006
517539,517539,Jacksonville,FL,Jacksonville,Duval County,2020-01-31,197045
517540,517540,San Francisco,CA,San Francisco-Oakland-Hayward,San Francisco County,2020-01-31,1504169
...,...,...,...,...,...,...,...
522922,522922,Goodland,FL,Naples-Immokalee-Marco Island,Collier County,2020-03-31,252414
522923,522923,McIntosh,FL,Ocala,Marion County,2020-03-31,206490
522924,522924,Camp Meeker,CA,Santa Rosa,Sonoma County,2020-03-31,396362
522925,522925,Bear Valley,CA,,Alpine County,2020-03-31,448221


In [6]:
ml_df = base_df[["State", "Date", "Avg_Price"]]

In [37]:
ml_df.isnull().sum()

State           0
Date            0
Avg_Price    6914
dtype: int64

In [39]:
ml_df = ml_df[ml_df["Avg_Price"].notna()]
ml_df.isnull().sum()

State        0
Date         0
Avg_Price    0
dtype: int64

In [40]:
dummies_df = pd.get_dummies(ml_df.State)

dummies_df

Unnamed: 0,CA,FL
301896,1,0
301897,1,0
301898,1,0
301899,0,1
301900,1,0
...,...,...
517531,0,1
517532,0,1
517533,1,0
517534,1,0


In [41]:
merged = pd.concat([ml_df, dummies_df], axis= 1)

In [42]:
merged.dtypes

State                object
Date         datetime64[ns]
Avg_Price             Int64
CA                    uint8
FL                    uint8
dtype: object

In [43]:
final_df = merged.drop(["State"], axis = 1)
final_df

Unnamed: 0,Date,Avg_Price,CA,FL
301896,2010-01-31,459638,1,0
301897,2010-01-31,467595,1,0
301898,2010-01-31,565040,1,0
301899,2010-01-31,146582,0,1
301900,2010-01-31,768010,1,0
...,...,...,...,...
517531,2019-12-31,254930,0,1
517532,2019-12-31,204755,0,1
517533,2019-12-31,362685,1,0
517534,2019-12-31,447779,1,0


In [44]:
#converting Date to oridinal
final_df['Date']= final_df['Date'].map(dt.datetime.toordinal)

final_df

Unnamed: 0,Date,Avg_Price,CA,FL
301896,733803,459638,1,0
301897,733803,467595,1,0
301898,733803,565040,1,0
301899,733803,146582,0,1
301900,733803,768010,1,0
...,...,...,...,...
517531,737424,254930,0,1
517532,737424,204755,0,1
517533,737424,362685,1,0
517534,737424,447779,1,0


In [75]:
ca_df = final_df[final_df["CA"] == True]

ca_df

Unnamed: 0,Date,Avg_Price,CA,FL
301896,733803,459638,1,0
301897,733803,467595,1,0
301898,733803,565040,1,0
301900,733803,768010,1,0
301902,733803,183859,1,0
...,...,...,...,...
517529,737424,70255,1,0
517530,737424,113799,1,0
517533,737424,362685,1,0
517534,737424,447779,1,0


In [76]:
fl_df = final_df[final_df["FL"] == True]

fl_df

Unnamed: 0,Date,Avg_Price,CA,FL
301899,733803,146582,0,1
301901,733803,155102,0,1
301904,733803,223207,0,1
301907,733803,151244,0,1
301912,733803,278643,0,1
...,...,...,...,...
517513,737424,124523,0,1
517523,737424,52243,0,1
517524,737424,126052,0,1
517531,737424,254930,0,1


# Linear model

## Florida 

In [45]:
from sklearn.linear_model import LinearRegression
model = LinearRegression()

In [78]:
X = fl_df.drop(columns=["Avg_Price","FL"])
y = fl_df.Avg_Price

In [79]:
#Train test split

X_train, X_test, y_train, y_test = train_test_split(X,y, test_size=0.2, random_state=1)

In [80]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [81]:
model.fit(X_train, y_train)

LinearRegression()

In [84]:
y_pred = model.predict(X_test)

In [85]:
#Print the coeffiecient and intercept of the nodel
print(f"Coefficient: {model.coef_}, Intercept: {model.intercept_}")
print('Mean Absolute Error:', metrics.mean_absolute_error(y_test, y_pred))
print('Mean Squared Error:', metrics.mean_squared_error(y_test, y_pred))
print('Root Mean Squared Error:', np.sqrt(metrics.mean_squared_error(y_test, y_pred)))

Coefficient: [50479.48839054     0.        ], Intercept: 284089.2623075028
Mean Absolute Error: 193040.68927152152
Mean Squared Error: 292656097234.3723
Root Mean Squared Error: 540976.984015376
