## Model Development / Evaluation 

In [124]:
# Imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# sklearn <- our machine learning library. We'll need LinearRegression and the metrics module
from sklearn.linear_model import LinearRegression
from sklearn import metrics
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder
%matplotlib inline
from sklearn.linear_model import Ridge
from sklearn.linear_model import RidgeCV
from sklearn.linear_model import Lasso

In [57]:
#Read in datasets

In [58]:
# Load the pre-processed training frame and the Kaggle hold-out frame.
train, test = (
    pd.read_csv('../datasets/train_modified.csv'),
    pd.read_csv('../datasets/test_modified.csv'),
)

In [59]:
# Widen pandas' display limits so long outputs are not elided:
# - max_rows lets the long per-feature correlation Series print in full;
# - max_columns lets wide frames (e.g. .head()) show every column instead of "...".
pd.set_option('display.max_rows', 500)
pd.set_option('display.max_columns', 500)

In [60]:
# Preview the first five rows of the training frame.
train.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,SalePrice
0,109,533352170,60,RL,69.0552,13517,Pave,Grvl,IR1,Lvl,...,0,0,Gd,MnPrv,Shed,0,3,2010,WD,130500
1,544,531379050,60,RL,43.0,11492,Pave,Grvl,IR1,Lvl,...,0,0,Gd,MnPrv,Shed,0,4,2009,WD,220000
2,153,535304180,20,RL,68.0,7922,Pave,Grvl,Reg,Lvl,...,0,0,Gd,MnPrv,Shed,0,1,2010,WD,109000
3,318,916386060,60,RL,73.0,9802,Pave,Grvl,Reg,Lvl,...,0,0,Gd,MnPrv,Shed,0,4,2010,WD,174000
4,255,906425045,50,RL,82.0,14235,Pave,Grvl,IR1,Lvl,...,0,0,Gd,MnPrv,Shed,0,3,2010,WD,138500


In [61]:
# Preview the first five rows of the hold-out frame (it has no SalePrice column).
test.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,3Ssn Porch,Screen Porch,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type
0,2658,902301120,190,RM,69.0,9142,Pave,Grvl,Reg,Lvl,...,0,0,0,Ex,MnPrv,Shed,0,4,2006,WD
1,2718,905108090,90,RL,69.545961,9662,Pave,Grvl,IR1,Lvl,...,0,0,0,Ex,MnPrv,Shed,0,8,2006,WD
2,2414,528218130,60,RL,58.0,17104,Pave,Grvl,IR1,Lvl,...,0,0,0,Ex,MnPrv,Shed,0,9,2006,New
3,1989,902207150,30,RM,60.0,8520,Pave,Grvl,Reg,Lvl,...,0,0,0,Ex,MnPrv,Shed,0,7,2007,WD
4,625,535105100,20,RL,69.545961,9500,Pave,Grvl,IR1,Lvl,...,0,185,0,Ex,MnPrv,Shed,0,7,2009,WD


In [62]:
# Rank the numeric features by their correlation with the target.
# Non-numeric columns (e.g. 'MS Zoning') are dropped explicitly: older pandas
# skipped them silently, whereas pandas >= 2.0 raises on them instead.
corr_matrix = train.select_dtypes(include=['number', 'bool']).corr()
corr_matrix['SalePrice'].sort_values(ascending=False)

SalePrice          1.000000
Overall Qual       0.800207
Gr Liv Area        0.697038
Garage Area        0.650246
Garage Cars        0.648197
Total Bsmt SF      0.628668
1st Flr SF         0.618486
Year Built         0.571849
Year Remod/Add     0.550370
Full Bath          0.537969
Garage Yr Blt      0.516738
Mas Vnr Area       0.508602
TotRms AbvGrd      0.504014
Fireplaces         0.471093
BsmtFin SF 1       0.423346
Open Porch SF      0.333476
Wood Deck SF       0.326490
Lot Frontage       0.325850
Lot Area           0.296566
Bsmt Full Bath     0.283429
Half Bath          0.283001
2nd Flr SF         0.248452
Bsmt Unf SF        0.190132
Bedroom AbvGr      0.137067
Screen Porch       0.134581
3Ssn Porch         0.048732
Mo Sold            0.032735
Pool Area          0.023106
BsmtFin SF 2       0.016249
Misc Val          -0.007375
Yr Sold           -0.015203
Low Qual Fin SF   -0.041594
Bsmt Half Bath    -0.045290
Id                -0.051398
MS SubClass       -0.087335
Overall Cond      -0

In [63]:
# One-hot encode the categorical columns of the training frame; the numeric
# columns are carried over alongside the new indicator columns.
dummy_df = pd.get_dummies(train)

In [64]:
# Preview the first five rows of the one-hot encoded training frame.
dummy_df.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Misc Feature_TenC,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_WD
0,109,533352170,60,69.0552,13517,6,8,1976,2005,289.0,...,0,0,0,0,0,0,0,0,0,1
1,544,531379050,60,43.0,11492,7,5,1996,1997,132.0,...,0,0,0,0,0,0,0,0,0,1
2,153,535304180,20,68.0,7922,5,7,1953,2007,0.0,...,0,0,0,0,0,0,0,0,0,1
3,318,916386060,60,73.0,9802,5,5,2006,2007,0.0,...,0,0,0,0,0,0,0,0,0,1
4,255,906425045,50,82.0,14235,6,8,1900,1993,0.0,...,0,0,0,0,0,0,0,0,0,1


In [65]:
# Correlations with the target can only be computed on the training data,
# because it is the only frame that carries SalePrice.
corr_matrix = dummy_df.corr()
corr_matrix.loc[:, 'SalePrice'].sort_values(ascending=False)


SalePrice               1.000000
Overall Qual            0.800207
Gr Liv Area             0.697038
Garage Area             0.650246
Garage Cars             0.648197
Total Bsmt SF           0.628668
1st Flr SF              0.618486
Bsmt Qual_Ex            0.586497
Year Built              0.571849
Kitchen Qual_Ex         0.551284
Year Remod/Add          0.550370
Full Bath               0.537969
Foundation_PConc        0.529047
Garage Yr Blt           0.516738
Mas Vnr Area            0.508602
TotRms AbvGrd           0.504014
Exter Qual_Ex           0.493861
Fireplaces              0.471093
Heating QC_Ex           0.453255
Neighborhood_NridgHt    0.448647
Exter Qual_Gd           0.446685
BsmtFin SF 1            0.423346
Garage Finish_Fin       0.422936
BsmtFin Type 1_GLQ      0.398661
Bsmt Exposure_Gd        0.377032
Sale Type_New           0.358102
Exterior 1st_VinylSd    0.342146
Exterior 2nd_VinylSd    0.337563
Open Porch SF           0.333476
Wood Deck SF            0.326490
Lot Fronta

In [66]:
#train[train['Neighborhood'].str.contains("'")]

In [67]:
#train['Neighborhood'] = train['Neighborhood'].str.replace(' _ ' , "'")

In [68]:
# Dimensions (rows, columns) of the one-hot encoded training frame.
dummy_df.shape

(2051, 292)

## Model #1 (Predictions based on Train Dataset)

In [69]:
# Hand-picked predictors for Model 1: numeric features plus a few one-hot
# indicator columns. Each name appears exactly once -- listing a column twice
# (as 'TotRms AbvGrd' and 'Fireplaces' previously were) only adds a perfectly
# collinear copy to the design matrix.
xvars = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
    'Garage Cars',
    'Total Bsmt SF',
    '1st Flr SF',
    'Year Built',
    'Year Remod/Add',
    'Full Bath',
    'TotRms AbvGrd',
    'Mas Vnr Area',
    'Fireplaces',
    'BsmtFin SF 1',
    'Open Porch SF',
    'Wood Deck SF',
    'Lot Area',
    'Bsmt Full Bath',
    'Half Bath',
    'Garage Yr Blt',
    '2nd Flr SF',
    'Bsmt Unf SF',
    'Lot Frontage',
    'Bedroom AbvGr',
    'Screen Porch',
    'Neighborhood_NridgHt',
    'Bsmt Qual_TA',
    'Garage Finish_Unf',
    'Kitchen Qual_TA',
]

# Design matrix and target for Model 1, both taken from the encoded training frame.
X, y = dummy_df[xvars], dummy_df['SalePrice']

In [70]:
# Split into fitting and validation partitions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
)

In [71]:
# Fit an ordinary least-squares model on the training partition
# (fit() returns the estimator itself, so it can be bound directly).
lr = LinearRegression().fit(X_train, y_train)

In [72]:
# R^2 of Model 1 on the partition it was fit on.
lr.score(X=X_train, y=y_train)

0.8339705763573879

In [73]:
# R^2 of Model 1 on the held-out validation partition.
lr.score(X=X_test, y=y_test)

0.8161311159939342

In [74]:
# Predict SalePrice for the Kaggle hold-out set with Model 1; these predictions
# are what gets submitted.
# The hold-out frame is one-hot encoded here so that this cell does not depend on
# a `dummy_test` left over from an earlier kernel session (it was previously only
# created further down the notebook, which breaks Restart & Run All).
dummy_test = pd.get_dummies(data=test)
preds = lr.predict(dummy_test[xvars])

In [75]:
# Attach Model 1's predictions to the encoded hold-out frame as 'SalePrice'.
dummy_test = dummy_test.assign(SalePrice=preds)

In [76]:
# Keep only the two columns the submission file needs.
lr_submission = dummy_test.loc[:, ['Id', 'SalePrice']]

In [77]:
# Sanity-check the first five predicted prices.
lr_submission.head(n=5)

Unnamed: 0,Id,SalePrice
0,2658,150952.762836
1,2718,172858.854344
2,2414,218263.62249
3,1989,111134.885136
4,625,199313.205729


In [78]:
# Index by Id so the CSV is written without an extra positional index column.
lr_submission = lr_submission.set_index('Id')

In [79]:
# Write Model 1's submission file to the working directory.
lr_submission.to_csv(path_or_buf='v3_dummy_submission.csv')

## Model #2 (Test Dataset)
This model generates misleading results because it has shifted from predicting the true sale price to predicting what the earlier model will do, as per discussion with Tan.

In [80]:
# Apply the same one-hot encoding step to the hold-out (test.csv) frame that was
# applied to the training frame.
dummy_test = pd.get_dummies(test)


In [81]:
# Dimensions (rows, columns) of the one-hot encoded hold-out frame.
dummy_test.shape

(878, 272)

In [82]:
# Preview the first five rows of the one-hot encoded hold-out frame.
dummy_test.head(n=5)

Unnamed: 0,Id,PID,MS SubClass,Lot Frontage,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,...,Sale Type_COD,Sale Type_CWD,Sale Type_Con,Sale Type_ConLD,Sale Type_ConLI,Sale Type_ConLw,Sale Type_New,Sale Type_Oth,Sale Type_VWD,Sale Type_WD
0,2658,902301120,190,69.0,9142,6,8,1910,1950,0.0,...,0,0,0,0,0,0,0,0,0,1
1,2718,905108090,90,69.545961,9662,5,4,1977,1977,0.0,...,0,0,0,0,0,0,0,0,0,1
2,2414,528218130,60,58.0,17104,7,5,2006,2006,0.0,...,0,0,0,0,0,0,1,0,0,0
3,1989,902207150,30,60.0,8520,5,6,1923,2006,0.0,...,0,0,0,0,0,0,0,0,0,1
4,625,535105100,20,69.545961,9500,6,5,1963,1963,247.0,...,0,0,0,0,0,0,0,0,0,1


In [95]:
# Predictors for Model 2: numeric features plus selected one-hot indicator
# columns, including Sale Type indicators. Each name appears exactly once --
# listing a column twice (as 'TotRms AbvGrd' and 'Fireplaces' previously were)
# only adds a perfectly collinear copy to the design matrix.
xvars = [
    'Overall Qual',
    'Gr Liv Area',
    'Garage Area',
    'Garage Cars',
    'Total Bsmt SF',
    '1st Flr SF',
    'Year Built',
    'Year Remod/Add',
    'Full Bath',
    'TotRms AbvGrd',
    'Mas Vnr Area',
    'Fireplaces',
    'BsmtFin SF 1',
    'Open Porch SF',
    'Wood Deck SF',
    'Lot Area',
    'Bsmt Full Bath',
    'Half Bath',
    'Garage Yr Blt',
    '2nd Flr SF',
    'Bsmt Unf SF',
    'Lot Frontage',
    'Bedroom AbvGr',
    'Screen Porch',
    'Sale Type_New',
    'Bsmt Qual_TA',
    'Garage Finish_Unf',
    'Kitchen Qual_TA',
    'Sale Type_CWD',
    'Sale Type_Con',
    'Sale Type_ConLD',
    'Sale Type_ConLI',
    'Sale Type_ConLw',
    'Sale Type_Oth',
    'Sale Type_VWD',
]

X = dummy_test[xvars]
# Pseudo-labels: the hold-out set has no true SalePrice, so this model is fit to
# prices predicted by the previous model (hence the misleading scores).
# They are read from `lr_submission` because `dummy_test` is rebuilt from `test`
# earlier in this section and no longer carries a 'SalePrice' column, so
# `dummy_test['SalePrice']` would raise KeyError on a top-to-bottom run.
# to_numpy() drops the Id index so rows pair with X by position.
y = lr_submission['SalePrice'].to_numpy()

In [96]:
# Split the pseudo-labelled hold-out data; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
)

In [97]:
# Fit an ordinary least-squares model for Model 2 (this rebinds `lr`).
lr = LinearRegression().fit(X_train, y_train)

In [98]:
# R^2 of Model 2 on the partition it was fit on.
lr.score(X=X_train, y=y_train)

0.9840817290953393

In [99]:
# R^2 of Model 2 on its held-out partition.
lr.score(X=X_test, y=y_test)

0.9885515930879932

In [100]:
# Predict SalePrice for every hold-out row with Model 2; these predictions are
# what gets submitted.
preds = lr.predict(dummy_test.loc[:, xvars])

In [101]:
# Store Model 2's predictions on the encoded hold-out frame as 'SalePrice'.
dummy_test = dummy_test.assign(SalePrice=preds)

In [102]:
# Keep only the two columns the submission file needs.
lr_submission = dummy_test.loc[:, ['Id', 'SalePrice']]

In [103]:
# Sanity-check the first five predicted prices from Model 2.
lr_submission.head(n=5)

Unnamed: 0,Id,SalePrice
0,2658,150097.52844
1,2718,172642.303595
2,2414,227079.373093
3,1989,111811.92014
4,625,201299.928378


In [104]:
# Index by Id so the CSV is written without an extra positional index column.
lr_submission = lr_submission.set_index('Id')

In [105]:
# Write Model 2's submission file to the working directory.
lr_submission.to_csv(path_or_buf='v5_dummy_submission.csv')

## Model # 3 - All Highly Correlated Variables

In [133]:
# Display whichever feature list is currently bound to `xvars`.
# NOTE(review): this depends on which cell last assigned `xvars` -- confirm it is the intended list.
xvars

In [41]:
## Help from Katie S
# Collect every column of the encoded training frame whose correlation with
# SalePrice is stronger than 0.5 in absolute value. SalePrice itself qualifies
# (it correlates perfectly with itself).
col_list = dummy_df.columns
target = dummy_df['SalePrice']

high_corr = [col for col in col_list if abs(dummy_df[col].corr(target)) > 0.5]
print(high_corr)

['Overall Qual', 'Year Built', 'Year Remod/Add', 'Mas Vnr Area', 'Total Bsmt SF', '1st Flr SF', 'Gr Liv Area', 'Full Bath', 'TotRms AbvGrd', 'Garage Yr Blt', 'Garage Cars', 'Garage Area', 'SalePrice', 'Exter Qual_TA', 'Foundation_PConc', 'Bsmt Qual_Ex', 'Bsmt Qual_TA', 'Kitchen Qual_Ex', 'Kitchen Qual_TA', 'Garage Finish_Unf']


In [42]:
# Drop the target from the feature list. Filtering (rather than list.remove)
# keeps this cell safe to re-run: remove() raises ValueError once 'SalePrice'
# is no longer in the list.
high_corr = [col for col in high_corr if col != 'SalePrice']

In [43]:
# Model 3 uses the strongly correlated columns as its predictors.
xvars = high_corr

X, y = dummy_df[xvars], dummy_df['SalePrice']

In [44]:
# Split into fitting and validation partitions; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    random_state=2022,
)

In [45]:
# Fit an ordinary least-squares model for Model 3 (this rebinds `lr`).
lr = LinearRegression().fit(X_train, y_train)

In [46]:
# R^2 of Model 3 on the partition it was fit on.
lr.score(X=X_train, y=y_train)

0.8352267508294431

In [47]:
# R^2 of Model 3 on the held-out validation partition.
lr.score(X=X_test, y=y_test)

0.821123456849235

In [48]:
# Predict SalePrice for the Kaggle hold-out set with Model 3; these predictions
# are what gets submitted.
preds = lr.predict(dummy_test.loc[:, xvars])

In [49]:
# Store Model 3's predictions on the encoded hold-out frame as 'SalePrice'.
dummy_test = dummy_test.assign(SalePrice=preds)

In [50]:
# Keep only the two columns the submission file needs.
lr_submission = dummy_test.loc[:, ['Id', 'SalePrice']]

In [51]:
# Sanity-check the first five predicted prices from Model 3.
lr_submission.head(n=5)

Unnamed: 0,Id,SalePrice
0,2658,160376.746143
1,2718,196020.029726
2,2414,196645.20066
3,1989,129976.37791
4,625,178390.963741


In [52]:
# Index by Id so the CSV is written without an extra positional index column.
lr_submission = lr_submission.set_index('Id')

In [53]:
# Write Model 3's submission file to the working directory.
lr_submission.to_csv(path_or_buf='v4_dummy_submission.csv')

## Model 4 Lasso Regression

In [109]:
# Lasso model (Regularization Lesson): start from the strongly correlated columns.
xvars = high_corr
X, y = dummy_df[xvars], dummy_df['SalePrice']

# Degree-2 expansion: every original column, its square, and every pairwise
# product, with no constant bias column.
poly = PolynomialFeatures(degree=2, include_bias=False, interaction_only=False)

# Learn the expansion from X and apply it in one step.
X_overfit = poly.fit_transform(X)



In [111]:
# List the names of the expanded polynomial features.
# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2 in
# favour of get_feature_names_out(); use whichever this installation provides.
if hasattr(poly, 'get_feature_names_out'):
    poly_feature_names = list(poly.get_feature_names_out(X.columns))
else:
    poly_feature_names = poly.get_feature_names(X.columns)
poly_feature_names



['Overall Qual',
 'Year Built',
 'Year Remod/Add',
 'Mas Vnr Area',
 'Total Bsmt SF',
 '1st Flr SF',
 'Gr Liv Area',
 'Full Bath',
 'TotRms AbvGrd',
 'Garage Yr Blt',
 'Garage Cars',
 'Garage Area',
 'Exter Qual_TA',
 'Foundation_PConc',
 'Bsmt Qual_Ex',
 'Bsmt Qual_TA',
 'Kitchen Qual_Ex',
 'Kitchen Qual_TA',
 'Garage Finish_Unf',
 'Overall Qual^2',
 'Overall Qual Year Built',
 'Overall Qual Year Remod/Add',
 'Overall Qual Mas Vnr Area',
 'Overall Qual Total Bsmt SF',
 'Overall Qual 1st Flr SF',
 'Overall Qual Gr Liv Area',
 'Overall Qual Full Bath',
 'Overall Qual TotRms AbvGrd',
 'Overall Qual Garage Yr Blt',
 'Overall Qual Garage Cars',
 'Overall Qual Garage Area',
 'Overall Qual Exter Qual_TA',
 'Overall Qual Foundation_PConc',
 'Overall Qual Bsmt Qual_Ex',
 'Overall Qual Bsmt Qual_TA',
 'Overall Qual Kitchen Qual_Ex',
 'Overall Qual Kitchen Qual_TA',
 'Overall Qual Garage Finish_Unf',
 'Year Built^2',
 'Year Built Year Remod/Add',
 'Year Built Mas Vnr Area',
 'Year Built Total Bs

In [112]:
# Check dimensions (rows, columns) of the polynomial-expanded feature matrix X_overfit
X_overfit.shape

(2051, 209)

In [113]:
# Create the train/test split. test_size=0.70 holds out 70% of the rows, so the
# models below are fit on only the remaining 30%.
# NOTE(review): presumably deliberate for the regularization exercise -- confirm.
X_train, X_test, y_train, y_test = train_test_split(X_overfit, y, random_state=2022, test_size=0.70)

In [114]:
# Standardize the features. The scaler is fit on the training partition only and
# then applied to both partitions; the scaled matrices are relabelled Z_*.
sc = StandardScaler().fit(X_train)
Z_train = sc.transform(X_train)
Z_test = sc.transform(X_test)

In [115]:
# Confirm the scaled feature matrices and their targets have matching row counts.
print(f'Z_train shape is: {Z_train.shape}')
print(f'y_train shape is: {y_train.shape}')
print(f'Z_test shape is: {Z_test.shape}')
print(f'y_test shape is: {y_test.shape}')

Z_train shape is: (615, 209)
y_train shape is: (615,)
Z_test shape is: (1436, 209)
y_test shape is: (1436,)


In [128]:
#Received help from Tan
# Fit a Lasso regression on the already-standardized training features.
lasso = Lasso(max_iter=10000, alpha=100)
lasso.fit(X=Z_train, y=y_train)

Lasso(alpha=100, max_iter=10000)

In [130]:
# How does the Lasso model score on the training and test data?
# Prints the training score, then the test score; a negative score would
# indicate the model isn't doing well.
print(
    lasso.score(Z_train, y_train),
    lasso.score(Z_test, y_test),
    sep='\n',
)

0.9228432155252597
0.8575940709866966


## Model 5 Ridge Regression

In [122]:
# Required Ridge model: instantiate with alpha=10 and fit on the standardized
# training features (fit() returns the estimator itself).
ridge_model = Ridge(alpha=10).fit(Z_train, y_train)

# Evaluate with R2: training score first, then test score.
print(
    ridge_model.score(Z_train, y_train),
    ridge_model.score(Z_test, y_test),
    sep='\n',
)

0.9225866864637838
0.8612464628134358
