In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sqlalchemy import create_engine
import statsmodels.api as sm
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
from statsmodels.tools.eval_measures import mse, rmse
from sklearn.linear_model import LinearRegression, LassoCV, RidgeCV, ElasticNetCV

import warnings
warnings.filterwarnings('ignore')

postgres_user = 'dsbc_student'
postgres_pw = '7*.8G9QH21'
postgres_host = '142.93.121.174'
postgres_port = '5432'
postgres_db = 'houseprices'

engine = create_engine('postgresql://{}:{}@{}:{}/{}'.format(
    postgres_user, postgres_pw, postgres_host, postgres_port, postgres_db))
house_prices = pd.read_sql_query('select * from houseprices',con=engine)

engine.dispose()

In [2]:
clean_house_prices = house_prices.dropna(axis=1)

In [3]:
clean_house_prices['totalsf'] = clean_house_prices['totalbsmtsf'] + clean_house_prices['firstflrsf'] + clean_house_prices['secondflrsf']

clean_house_prices['int_over_sf'] = clean_house_prices['totalsf'] * clean_house_prices['overallqual']

In [4]:
clean_house_prices = pd.concat([clean_house_prices,pd.get_dummies(clean_house_prices.mszoning, prefix="mszoning", drop_first=True)], axis=1)
dummy_column_names = list(pd.get_dummies(clean_house_prices.mszoning, prefix="mszoning", drop_first=True).columns)

In [5]:
Y = np.log1p(clean_house_prices['saleprice'])
X = clean_house_prices[['overallqual','garagecars','totalsf','int_over_sf']+ dummy_column_names]
X = sm.add_constant(X)

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size = 0.2, random_state = 465)

results = sm.OLS(y_train, X_train).fit()

results.summary()

0,1,2,3
Dep. Variable:,saleprice,R-squared:,0.828
Model:,OLS,Adj. R-squared:,0.827
Method:,Least Squares,F-statistic:,697.0
Date:,"Wed, 01 Jan 2020",Prob (F-statistic):,0.0
Time:,23:16:12,Log-Likelihood:,449.52
No. Observations:,1168,AIC:,-881.0
Df Residuals:,1159,BIC:,-835.5
Df Model:,8,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,9.8926,0.076,129.682,0.000,9.743,10.042
overallqual,0.1887,0.009,20.263,0.000,0.170,0.207
garagecars,0.1045,0.009,12.253,0.000,0.088,0.121
totalsf,0.0003,2.32e-05,14.801,0.000,0.000,0.000
int_over_sf,-2.553e-05,2.99e-06,-8.533,0.000,-3.14e-05,-1.97e-05
mszoning_FV,0.3730,0.065,5.766,0.000,0.246,0.500
mszoning_RH,0.2602,0.074,3.518,0.000,0.115,0.405
mszoning_RL,0.3615,0.060,6.053,0.000,0.244,0.479
mszoning_RM,0.1975,0.061,3.262,0.001,0.079,0.316

0,1,2,3
Omnibus:,358.604,Durbin-Watson:,1.883
Prob(Omnibus):,0.0,Jarque-Bera (JB):,2851.236
Skew:,-1.19,Prob(JB):,0.0
Kurtosis:,10.275,Cond. No.,520000.0


In [6]:
alphas = [np.power(10.0,p) for p in np.arange(-10,40,1)]

In [10]:

lasso_cv = LassoCV(alphas=alphas, cv=5)

lasso_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = lasso_cv.predict(X_train)
y_preds_test = lasso_cv.predict(X_test)

print("Best alpha value is: {}".format(lasso_cv.alpha_))
print("R-squared of the model in training set is: {}".format(lasso_cv.score(X_train, y_train) * 100))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(lasso_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 0.0001
R-squared of the model in training set is: 82.77586544638419
-----Test set statistics-----
R-squared of the model in test set is: 0.8241379398924935
Mean absolute error of the prediction is: 0.1266300935894734
Mean squared error of the prediction is: 0.02932423331490239
Root mean squared error of the prediction is: 0.1712431993245349
Mean absolute percentage error of the prediction is: 1.058038531760228


In [11]:

ridge_cv = RidgeCV(alphas=alphas, cv=5)

ridge_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = ridge_cv.predict(X_train)
y_preds_test = ridge_cv.predict(X_test)

print("Best alpha value is: {}".format(ridge_cv.alpha_))
print("R-squared of the model in training set is: {}".format(ridge_cv.score(X_train, y_train)* 100))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(ridge_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 1.0
R-squared of the model in training set is: 82.74811319189159
-----Test set statistics-----
R-squared of the model in test set is: 0.8221287465272364
Mean absolute error of the prediction is: 0.12702317660161475
Mean squared error of the prediction is: 0.02965925756619079
Root mean squared error of the prediction is: 0.17221863304007146
Mean absolute percentage error of the prediction is: 1.0616021610757127


In [9]:
elasticnet_cv = ElasticNetCV(alphas=alphas, cv=5)

elasticnet_cv.fit(X_train, y_train)

# We are making predictions here
y_preds_train = elasticnet_cv.predict(X_train)
y_preds_test = elasticnet_cv.predict(X_test)

print("Best alpha value is: {}".format(elasticnet_cv.alpha_))
print("R-squared of the model in training set is: {}".format(elasticnet_cv.score(X_train, y_train)* 100))
print("-----Test set statistics-----")
print("R-squared of the model in test set is: {}".format(elasticnet_cv.score(X_test, y_test)))
print("Mean absolute error of the prediction is: {}".format(mean_absolute_error(y_test, y_preds_test)))
print("Mean squared error of the prediction is: {}".format(mse(y_test, y_preds_test)))
print("Root mean squared error of the prediction is: {}".format(rmse(y_test, y_preds_test)))
print("Mean absolute percentage error of the prediction is: {}".format(np.mean(np.abs((y_test - y_preds_test) / y_test)) * 100))

Best alpha value is: 0.0001
R-squared of the model in training set is: 0.8278553804267494
-----Test set statistics-----
R-squared of the model in test set is: 0.825242808212644
Mean absolute error of the prediction is: 0.12639409133726026
Mean squared error of the prediction is: 0.029140001330001677
Root mean squared error of the prediction is: 0.17070442680259254
Mean absolute percentage error of the prediction is: 1.0559048853858188
