<img src="http://imgur.com/1ZcRyrc.png" style="float: left; margin: 20px; height: 55px">

# Project 2 - Factors Affecting the Value of Your Home

# Modeling and Predictions

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.linear_model import LinearRegression, Lasso, Ridge
from sklearn.preprocessing import StandardScaler, OneHotEncoder, PolynomialFeatures
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.metrics import mean_squared_error
from sklearn.pipeline import Pipeline
from sklearn.compose import make_column_transformer

import statsmodels.api as sm

In [2]:
import warnings
from sklearn.exceptions import ConvergenceWarning
from scipy.linalg import LinAlgWarning

warnings.filterwarnings("ignore", category=ConvergenceWarning)
warnings.filterwarnings("ignore", category=LinAlgWarning)

In [3]:
# read in datasets
train_selected = pd.read_csv('../data/train_selected.csv')
test_cleaned = pd.read_csv('../data/test_cleaned.csv')

In [4]:
# declare numerical and categorical columns
numerical_columns = test_cleaned.select_dtypes(include=['int', 'float']).columns
categorical_columns = test_cleaned.select_dtypes(include=['object']).columns

In [5]:
# dummify dataset
test_cleaned = pd.get_dummies(test_cleaned, columns=categorical_columns, drop_first=True)

In [6]:
# declare X and y variables
X = train_selected.drop(columns=['SalePrice'])
y = train_selected['SalePrice']

In [7]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=999)

In [8]:
# establish baseline rmse
baseline_preds = np.full_like(y_test, fill_value=y_train.mean())
baseline_rmse = mean_squared_error(y_test, baseline_preds, squared=False)
baseline_rmse

76408.06363301

In [9]:
train_selected.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1538 entries, 0 to 1537
Data columns (total 21 columns):
 #   Column                Non-Null Count  Dtype  
---  ------                --------------  -----  
 0   SalePrice             1538 non-null   int64  
 1   Overall Qual          1538 non-null   int64  
 2   Gr Liv Area           1538 non-null   int64  
 3   Garage Cars           1538 non-null   float64
 4   Garage Area           1538 non-null   float64
 5   Total Bsmt SF         1538 non-null   float64
 6   1st Flr SF            1538 non-null   int64  
 7   Exter Qual_TA         1538 non-null   int64  
 8   Year Built            1538 non-null   int64  
 9   Year Remod/Add        1538 non-null   int64  
 10  Full Bath             1538 non-null   int64  
 11  Kitchen Qual_TA       1538 non-null   int64  
 12  Foundation_PConc      1538 non-null   int64  
 13  TotRms AbvGrd         1538 non-null   int64  
 14  Mas Vnr Area          1538 non-null   float64
 15  Garage Yr Blt        

In [10]:
# define and create transformer with scaling and polynomial features
scaler = StandardScaler()
poly = PolynomialFeatures(degree=2)

numerical_columns = train_selected.drop(columns=['SalePrice']).columns

transformer = make_column_transformer((scaler, numerical_columns), (poly, numerical_columns))

X_train_transform = transformer.fit_transform(X_train)
X_test_transform = transformer.transform(X_test)

transformer

In [11]:
# create and fit linear regression model
lr = LinearRegression()
lr.fit(X_train_transform, y_train)

In [12]:
# display lr train R-Squared score
lr.score(X_train_transform, y_train)

0.9206808869704901

In [13]:
# display lr test R-Squared score
lr.score(X_test_transform, y_test)

0.8944649351502988

In [14]:
# display cross val score
cross_val_score(lr, X_train_transform, y_train, cv=5).mean()

0.7575221322275937

In [15]:
# predict lr
y_train_pred = lr.predict(X_train_transform)
y_test_pred = lr.predict(X_test_transform)

In [16]:
# display lr train rmse
mean_squared_error(y_train, y_train_pred, squared=False)

22616.255591463254

In [17]:
# display lr test rmse
mean_squared_error(y_test, y_test_pred, squared=False)

24725.474494533413

In [18]:
# create pipeline with Lasso regression
lasso_pipe = Pipeline([('transformer', transformer), ('lasso', Lasso(alpha=0.1))])
lasso_pipe

In [19]:
# fit Lasso regression
lasso_pipe.fit(X_train, y_train)

In [20]:
# display Lasso train R-Squared score
lasso_pipe.score(X_train, y_train)

0.9148974189677032

In [21]:
# display Lasso test R-Squared score
lasso_pipe.score(X_test, y_test)

0.9072268668208475

In [22]:
# predict Lasso regression
lasso_train_preds = lasso_pipe.predict(X_train)
lasso_test_preds = lasso_pipe.predict(X_test)

In [23]:
# display Lasso train rmse
mean_squared_error(y_train, lasso_train_preds, squared=False)

23426.27002351576

In [24]:
# display Lasso test rmse
mean_squared_error(y_test, lasso_test_preds, squared=False)

23182.34453645812

In [25]:
# create pipeline with Ridge regression
ridge_pipe = Pipeline([('transformer', transformer), ('ridge', Ridge(alpha=0.1))])
ridge_pipe

In [26]:
# fit Ridge regression
ridge_pipe.fit(X_train, y_train)

In [27]:
# display Ridge train R-Squared score
ridge_pipe.score(X_train, y_train)

0.9195974079845656

In [28]:
# display Ridge test R-Squared score
ridge_pipe.score(X_test, y_test)

0.8972249932421588

In [29]:
# predict Ridge regression
ridge_train_preds = ridge_pipe.predict(X_train)
ridge_test_preds = ridge_pipe.predict(X_test)

In [30]:
# display Ridge train rmse
mean_squared_error(y_train, ridge_train_preds, squared=False)

22770.197831958743

In [31]:
# display Ridge test rmse
mean_squared_error(y_test, ridge_test_preds, squared=False)

24400.009813841192

## Select Columns, Transform, and Make Predictions on Test Data

In [32]:
selected_columns = train_selected.columns.drop('SalePrice')
test_selected = test_cleaned[selected_columns]
test_selected = test_selected.copy()
test_selected['Unnamed: 0'] = test_cleaned['Unnamed: 0']

In [33]:
test_transform = transformer.transform(test_selected)

In [34]:
preds = lr.predict(test_transform)

In [35]:
submission = pd.DataFrame(index=test_cleaned['Unnamed: 0'])
submission['SalePrice'] = preds
submission.reset_index(inplace=True)

In [36]:
# save submission dataset
submission.to_csv('../data/kaggle_submission.csv', index=False)

## OLS

In [37]:
X = train_selected.drop(columns=['SalePrice'])
y = train_selected['SalePrice']

In [38]:
X = sm.add_constant(X)

In [39]:
ols = sm.OLS(y, X).fit()

In [40]:
ols.summary()

0,1,2,3
Dep. Variable:,SalePrice,R-squared:,0.807
Model:,OLS,Adj. R-squared:,0.804
Method:,Least Squares,F-statistic:,316.3
Date:,"Tue, 02 Apr 2024",Prob (F-statistic):,0.0
Time:,11:50:58,Log-Likelihood:,-18274.0
No. Observations:,1538,AIC:,36590.0
Df Residuals:,1517,BIC:,36700.0
Df Model:,20,,
Covariance Type:,nonrobust,,

0,1,2,3,4,5,6
,coef,std err,t,P>|t|,[0.025,0.975]
const,-8.943e+05,1.56e+05,-5.751,0.000,-1.2e+06,-5.89e+05
Overall Qual,1.566e+04,1111.259,14.095,0.000,1.35e+04,1.78e+04
Gr Liv Area,32.8060,4.207,7.798,0.000,24.554,41.058
Garage Cars,7853.9138,2948.275,2.664,0.008,2070.787,1.36e+04
Garage Area,22.6179,10.060,2.248,0.025,2.885,42.351
Total Bsmt SF,11.4146,3.816,2.992,0.003,3.930,18.899
1st Flr SF,9.4287,4.343,2.171,0.030,0.909,17.948
Exter Qual_TA,-4.628e+04,5010.347,-9.236,0.000,-5.61e+04,-3.64e+04
Year Built,182.8049,58.435,3.128,0.002,68.182,297.427

0,1,2,3
Omnibus:,833.468,Durbin-Watson:,1.952
Prob(Omnibus):,0.0,Jarque-Bera (JB):,87666.127
Skew:,-1.576,Prob(JB):,0.0
Kurtosis:,39.852,Cond. No.,711000.0


Interpretation: p-value is less than alpha of 0.05 for overall quality, above ground living area, and garage car capacity when correlating with sale price of houses, which means data results are statistically significant.