# Machine Learning Project

Group Name: Hidden Figures 

Group Member: Neha Chanu

<p><a name="sections"></a></p>


## Sections

- <a href="#preprocessing">Preprocessing</a><br>


- <a href="#models">Training Models</a><br>

    - <a href="#ENet">Elastic Net</a><br>
    - <a href="#gbm">Gradient Boosting Regression</a><br>
    - <a href="#XGboost">XGBoost</a><br>
    - <a href="#Lgbm">LightGBM</a><br>
    - <a href="#randomforest">Random Forest</a><br>
    
- <a href="#finetune">Fine-tuning our models</a><br>
    - <a href="#expertapproach">"Expert Approach"</a><br>

- <a href="#bayesop">Bayesian Optimization</a><br>
    
- <a href="#finetune">Stacking/Ensembling</a><br>

- <a href="#submission">Submission</a><br>


<p><a name="preprocessing"></a></p>
## Preprocessing

In [1]:
#data manipulation tools

import pandas as pd
import numpy as np

#data visualization tools

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [2]:
#set display option

pd.set_option('display.max_columns', 100)
pd.set_option('display.float_format', lambda x: '{:.3f}'.format(x))

In [3]:
#import train data | 1458 rows, 220 independent variables, 1 y variable

df_train = pd.read_csv('train_clean.csv')

In [4]:
df_train.shape

(1458, 222)

In [5]:
#import test data 

df_test = pd.read_csv('test_clean.csv')

In [6]:
df_test.shape

(1459, 221)

In [7]:
df_train.head()

Unnamed: 0,Id,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,EnclosedPorch,ExterCond,ExterQual,Fence,FireplaceQu,Fireplaces,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageYrBlt,GrLivArea,HalfBath,HeatingQC,KitchenAbvGr,KitchenQual,LandSlope,LotArea,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,...,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SalesPrice
0,0,11.693,11.686,0.0,0.73,1.541,1.82,1.541,11.17,0.0,1.194,2.26,0.73,0.0,1.194,7.483,0.73,0.0,1.82,1.194,1.82,1.541,0.0,1.194,2.26,10.506,1.194,2.056,1.194,2.056,14.188,13.699,0.73,0.0,0.73,1.194,0.0,19.212,5.831,1.541,0.0,2.886,8.059,0.0,1.82,5.715,1.82,2.44,1.194,0.0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.248
1,1,12.792,0.0,0.0,0.73,1.541,1.82,0.73,12.063,0.0,0.0,2.26,0.0,0.73,1.194,8.898,0.73,0.0,1.82,1.541,1.82,2.056,0.73,1.194,2.26,10.062,1.194,2.056,1.194,2.056,14.145,12.792,0.0,0.0,0.73,1.541,0.0,19.712,6.221,1.541,0.0,2.056,0.0,0.0,2.44,0.0,2.44,2.26,1.194,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.109
2,2,11.892,11.725,0.0,0.73,1.541,1.82,1.194,10.2,0.0,1.194,2.26,0.73,0.0,1.194,9.917,0.73,0.0,1.82,1.194,1.82,2.056,0.73,1.194,2.26,10.776,1.194,2.056,1.194,2.056,14.184,13.832,0.73,0.0,0.73,1.194,0.0,20.347,5.915,0.0,0.0,2.886,7.647,0.0,3.011,5.053,1.82,2.44,1.194,0.0,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.317
3,3,12.014,11.354,0.0,0.73,1.541,0.73,1.541,8.274,0.0,0.0,2.26,0.73,0.0,1.82,10.468,0.73,8.798,1.82,1.541,1.82,1.194,0.73,0.73,2.26,10.918,1.541,2.056,1.541,2.056,14.18,13.711,0.0,1.194,0.73,1.194,0.0,19.692,5.685,0.0,0.0,3.011,0.0,0.0,1.82,4.745,1.82,2.44,1.194,0.0,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,11.849
4,4,12.511,12.271,0.0,0.73,1.82,1.82,0.0,10.971,0.0,1.194,2.26,0.73,0.0,1.194,10.221,0.73,0.0,1.82,1.194,1.82,2.056,0.73,1.194,2.26,11.628,1.541,2.056,1.194,2.056,14.183,14.48,0.73,0.0,0.73,1.194,0.0,21.325,6.315,0.0,0.0,2.886,9.392,0.0,1.541,6.315,1.82,2.603,1.194,0.0,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.429


In [8]:
df_test.head()

Unnamed: 0,Id,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,EnclosedPorch,ExterCond,ExterQual,Fence,FireplaceQu,Fireplaces,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageYrBlt,GrLivArea,HalfBath,HeatingQC,KitchenAbvGr,KitchenQual,LandSlope,LotArea,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,...,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,1458,11.819,0.0,0.0,0.73,1.194,1.82,1.541,10.105,7.397,2.056,1.541,0.0,0.0,1.82,8.781,0.73,0.0,1.82,1.541,1.194,1.541,0.0,0.73,2.26,11.26,0.73,2.056,1.541,2.056,14.121,11.819,0.0,1.82,0.73,1.541,0.0,20.479,6.221,1.541,0.0,2.056,0.0,0.0,2.603,0.0,2.056,2.056,1.194,0.0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,1459,12.944,0.0,0.0,0.73,1.541,1.82,1.541,11.901,0.0,0.0,2.26,0.0,0.0,1.82,9.752,0.73,0.0,1.82,1.541,1.82,1.541,0.0,0.73,2.26,9.118,0.73,2.056,1.541,2.056,14.117,12.944,0.73,1.82,0.73,1.194,0.0,21.327,6.245,0.0,0.0,2.056,6.808,20.778,2.603,4.792,2.056,2.26,1.194,0.0,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,1460,11.916,11.151,0.0,0.73,1.541,1.82,1.541,11.477,0.0,1.194,2.26,0.0,0.0,1.194,7.294,0.73,0.0,1.82,1.541,1.194,2.056,0.73,1.194,2.26,10.179,1.194,2.056,0.0,2.056,14.178,13.551,0.73,1.194,0.73,1.541,0.0,21.197,6.073,0.0,0.0,2.886,0.0,0.0,2.056,4.697,1.82,2.056,1.194,0.0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,1461,11.91,11.063,0.0,0.73,1.541,1.82,1.541,10.75,0.0,1.194,2.26,0.0,0.0,1.82,9.208,0.73,0.0,1.82,1.541,1.82,1.194,0.73,1.194,2.26,10.116,1.194,2.056,0.0,2.056,14.18,13.504,0.73,0.0,0.73,1.194,0.0,19.865,6.173,0.0,0.0,2.886,3.859,0.0,2.603,4.792,2.056,2.26,1.194,0.0,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,1462,12.834,0.0,0.0,0.73,1.194,1.82,1.541,8.72,0.0,0.0,2.26,0.0,0.0,1.194,12.173,0.73,0.0,1.82,1.194,1.82,1.541,0.0,1.194,2.26,10.302,1.194,2.056,1.194,2.056,14.17,12.834,0.0,0.0,0.73,1.194,0.0,17.257,5.094,0.0,0.0,0.0,0.0,0.0,0.0,6.268,1.82,2.603,1.194,0.0,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [9]:
#Save the test_ID

train_ID = df_train['Id']

In [11]:


df_train = df_train.drop(['Id'], axis=1)

In [12]:
#drop 'Id' column in train data because it is not a response variable 

df_train.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,EnclosedPorch,ExterCond,ExterQual,Fence,FireplaceQu,Fireplaces,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageYrBlt,GrLivArea,HalfBath,HeatingQC,KitchenAbvGr,KitchenQual,LandSlope,LotArea,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,...,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD,SalesPrice
0,11.693,11.686,0.0,0.73,1.541,1.82,1.541,11.17,0.0,1.194,2.26,0.73,0.0,1.194,7.483,0.73,0.0,1.82,1.194,1.82,1.541,0.0,1.194,2.26,10.506,1.194,2.056,1.194,2.056,14.188,13.699,0.73,0.0,0.73,1.194,0.0,19.212,5.831,1.541,0.0,2.886,8.059,0.0,1.82,5.715,1.82,2.44,1.194,0.0,1.541,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.248
1,12.792,0.0,0.0,0.73,1.541,1.82,0.73,12.063,0.0,0.0,2.26,0.0,0.73,1.194,8.898,0.73,0.0,1.82,1.541,1.82,2.056,0.73,1.194,2.26,10.062,1.194,2.056,1.194,2.056,14.145,12.792,0.0,0.0,0.73,1.541,0.0,19.712,6.221,1.541,0.0,2.056,0.0,0.0,2.44,0.0,2.44,2.26,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.109
2,11.892,11.725,0.0,0.73,1.541,1.82,1.194,10.2,0.0,1.194,2.26,0.73,0.0,1.194,9.917,0.73,0.0,1.82,1.194,1.82,2.056,0.73,1.194,2.26,10.776,1.194,2.056,1.194,2.056,14.184,13.832,0.73,0.0,0.73,1.194,0.0,20.347,5.915,0.0,0.0,2.886,7.647,0.0,3.011,5.053,1.82,2.44,1.194,0.0,1.541,...,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.317
3,12.014,11.354,0.0,0.73,1.541,0.73,1.541,8.274,0.0,0.0,2.26,0.73,0.0,1.82,10.468,0.73,8.798,1.82,1.541,1.82,1.194,0.73,0.73,2.26,10.918,1.541,2.056,1.541,2.056,14.18,13.711,0.0,1.194,0.73,1.194,0.0,19.692,5.685,0.0,0.0,3.011,0.0,0.0,1.82,4.745,1.82,2.44,1.194,0.0,1.541,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1,11.849
4,12.511,12.271,0.0,0.73,1.82,1.82,0.0,10.971,0.0,1.194,2.26,0.73,0.0,1.194,10.221,0.73,0.0,1.82,1.194,1.82,2.056,0.73,1.194,2.26,11.628,1.541,2.056,1.194,2.056,14.183,14.48,0.73,0.0,0.73,1.194,0.0,21.325,6.315,0.0,0.0,2.886,9.392,0.0,1.541,6.315,1.82,2.603,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,12.429


In [13]:
#Save the test_ID

test_ID = df_test['Id']

In [14]:
#drop 'Id' column in test data because it is not a response variable 

df_test = df_test.drop(['Id'], axis=1)

In [15]:
df_test.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,EnclosedPorch,ExterCond,ExterQual,Fence,FireplaceQu,Fireplaces,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageYrBlt,GrLivArea,HalfBath,HeatingQC,KitchenAbvGr,KitchenQual,LandSlope,LotArea,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,...,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,11.819,0.0,0.0,0.73,1.194,1.82,1.541,10.105,7.397,2.056,1.541,0.0,0.0,1.82,8.781,0.73,0.0,1.82,1.541,1.194,1.541,0.0,0.73,2.26,11.26,0.73,2.056,1.541,2.056,14.121,11.819,0.0,1.82,0.73,1.541,0.0,20.479,6.221,1.541,0.0,2.056,0.0,0.0,2.603,0.0,2.056,2.056,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,12.944,0.0,0.0,0.73,1.541,1.82,1.541,11.901,0.0,0.0,2.26,0.0,0.0,1.82,9.752,0.73,0.0,1.82,1.541,1.82,1.541,0.0,0.73,2.26,9.118,0.73,2.056,1.541,2.056,14.117,12.944,0.73,1.82,0.73,1.194,0.0,21.327,6.245,0.0,0.0,2.056,6.808,20.778,2.603,4.792,2.056,2.26,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,11.916,11.151,0.0,0.73,1.541,1.82,1.541,11.477,0.0,1.194,2.26,0.0,0.0,1.194,7.294,0.73,0.0,1.82,1.541,1.194,2.056,0.73,1.194,2.26,10.179,1.194,2.056,0.0,2.056,14.178,13.551,0.73,1.194,0.73,1.541,0.0,21.197,6.073,0.0,0.0,2.886,0.0,0.0,2.056,4.697,1.82,2.056,1.194,0.0,1.541,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,11.91,11.063,0.0,0.73,1.541,1.82,1.541,10.75,0.0,1.194,2.26,0.0,0.0,1.82,9.208,0.73,0.0,1.82,1.541,1.82,1.194,0.73,1.194,2.26,10.116,1.194,2.056,0.0,2.056,14.18,13.504,0.73,0.0,0.73,1.194,0.0,19.865,6.173,0.0,0.0,2.886,3.859,0.0,2.603,4.792,2.056,2.26,1.194,0.0,1.541,...,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
4,12.834,0.0,0.0,0.73,1.194,1.82,1.541,8.72,0.0,0.0,2.26,0.0,0.0,1.194,12.173,0.73,0.0,1.82,1.194,1.82,1.541,0.0,1.194,2.26,10.302,1.194,2.056,1.194,2.056,14.17,12.834,0.0,0.0,0.73,1.194,0.0,17.257,5.094,0.0,0.0,0.0,0.0,0.0,0.0,6.268,1.82,2.603,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [16]:
#seperate response variables versus predictor variables

y = df_train['SalesPrice']                  #response variable

In [17]:
X = df_train.drop(['SalesPrice'], axis=1)     #predictor variables

In [18]:
X.head()

Unnamed: 0,1stFlrSF,2ndFlrSF,3SsnPorch,Alley,BedroomAbvGr,BsmtCond,BsmtExposure,BsmtFinSF1,BsmtFinSF2,BsmtFinType1,BsmtFinType2,BsmtFullBath,BsmtHalfBath,BsmtQual,BsmtUnfSF,CentralAir,EnclosedPorch,ExterCond,ExterQual,Fence,FireplaceQu,Fireplaces,FullBath,Functional,GarageArea,GarageCars,GarageCond,GarageFinish,GarageQual,GarageYrBlt,GrLivArea,HalfBath,HeatingQC,KitchenAbvGr,KitchenQual,LandSlope,LotArea,LotFrontage,LotShape,LowQualFinSF,MSSubClass,MasVnrArea,MiscVal,MoSold,OpenPorchSF,OverallCond,OverallQual,PavedDrive,PoolArea,PoolQC,...,Neighborhood_BrkSide,Neighborhood_ClearCr,Neighborhood_CollgCr,Neighborhood_Crawfor,Neighborhood_Edwards,Neighborhood_Gilbert,Neighborhood_IDOTRR,Neighborhood_MeadowV,Neighborhood_Mitchel,Neighborhood_NAmes,Neighborhood_NPkVill,Neighborhood_NWAmes,Neighborhood_NoRidge,Neighborhood_NridgHt,Neighborhood_OldTown,Neighborhood_SWISU,Neighborhood_Sawyer,Neighborhood_SawyerW,Neighborhood_Somerst,Neighborhood_StoneBr,Neighborhood_Timber,Neighborhood_Veenker,RoofMatl_CompShg,RoofMatl_Membran,RoofMatl_Metal,RoofMatl_Roll,RoofMatl_Tar&Grv,RoofMatl_WdShake,RoofMatl_WdShngl,RoofStyle_Flat,RoofStyle_Gable,RoofStyle_Gambrel,RoofStyle_Hip,RoofStyle_Mansard,RoofStyle_Shed,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SaleType_COD,SaleType_CWD,SaleType_Con,SaleType_ConLD,SaleType_ConLI,SaleType_ConLw,SaleType_New,SaleType_Oth,SaleType_WD
0,11.693,11.686,0.0,0.73,1.541,1.82,1.541,11.17,0.0,1.194,2.26,0.73,0.0,1.194,7.483,0.73,0.0,1.82,1.194,1.82,1.541,0.0,1.194,2.26,10.506,1.194,2.056,1.194,2.056,14.188,13.699,0.73,0.0,0.73,1.194,0.0,19.212,5.831,1.541,0.0,2.886,8.059,0.0,1.82,5.715,1.82,2.44,1.194,0.0,1.541,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
1,12.792,0.0,0.0,0.73,1.541,1.82,0.73,12.063,0.0,0.0,2.26,0.0,0.73,1.194,8.898,0.73,0.0,1.82,1.541,1.82,2.056,0.73,1.194,2.26,10.062,1.194,2.056,1.194,2.056,14.145,12.792,0.0,0.0,0.73,1.541,0.0,19.712,6.221,1.541,0.0,2.056,0.0,0.0,2.44,0.0,2.44,2.26,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
2,11.892,11.725,0.0,0.73,1.541,1.82,1.194,10.2,0.0,1.194,2.26,0.73,0.0,1.194,9.917,0.73,0.0,1.82,1.194,1.82,2.056,0.73,1.194,2.26,10.776,1.194,2.056,1.194,2.056,14.184,13.832,0.73,0.0,0.73,1.194,0.0,20.347,5.915,0.0,0.0,2.886,7.647,0.0,3.011,5.053,1.82,2.44,1.194,0.0,1.541,...,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1
3,12.014,11.354,0.0,0.73,1.541,0.73,1.541,8.274,0.0,0.0,2.26,0.73,0.0,1.82,10.468,0.73,8.798,1.82,1.541,1.82,1.194,0.73,0.73,2.26,10.918,1.541,2.056,1.541,2.056,14.18,13.711,0.0,1.194,0.73,1.194,0.0,19.692,5.685,0.0,0.0,3.011,0.0,0.0,1.82,4.745,1.82,2.44,1.194,0.0,1.541,...,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,1
4,12.511,12.271,0.0,0.73,1.82,1.82,0.0,10.971,0.0,1.194,2.26,0.73,0.0,1.194,10.221,0.73,0.0,1.82,1.194,1.82,2.056,0.73,1.194,2.26,11.628,1.541,2.056,1.194,2.056,14.183,14.48,0.73,0.0,0.73,1.194,0.0,21.325,6.315,0.0,0.0,2.886,9.392,0.0,1.541,6.315,1.82,2.603,1.194,0.0,1.541,...,0,0,0,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,1


In [19]:
def checkNull(df):
   if not isinstance(df, pd.DataFrame):
       return("No dataframe find!")
   if df.isnull().sum().sum() != 0:
       byCol = df.isnull().sum()
       return(byCol[byCol!=0])
   else:
       return ("The dataframe is NA free. Contains {0} rows with {1} attributes".format(df.shape[0], df.shape[1]))

In [20]:
checkNull(X)

'The dataframe is NA free. Contains 1458 rows with 220 attributes'

In [21]:
checkNull(y)

'No dataframe find!'

In [23]:
checkNull(df_test)

'The dataframe is NA free. Contains 1459 rows with 220 attributes'

<p><a name="models"></a></p>
## Training & Evaluating Models

### Set up

In [24]:
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import RobustScaler

from sklearn.linear_model import ElasticNet, LinearRegression, Lasso
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.model_selection import KFold, cross_val_score, train_test_split
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingClassifier

%load_ext autoreload
%autoreload 2

from stacking import stacking_regression
import xgboost as xgb
import lightgbm as lgb

### Cross Validation 

#### - Train-test split

In [25]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .20, random_state=0)

#### - CV 

In [26]:
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X_train.values)
    rmse= np.sqrt(-cross_val_score(model, X_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmse)

### Elastic Net Regression 

#### - Building model

In [27]:
ENet = make_pipeline(RobustScaler(), 
                     ElasticNet(alpha=0.0005, 
                                l1_ratio=.9, 
                                random_state=0))

In [28]:
# Fit the model to the training data
ENet_trainedmodel = ENet.fit(X_train, y_train)

##How did we pick the optimal alpha and l1_ratio???

#### - Evaluating model

In [29]:
score = rmsle_cv(ENet_trainedmodel)
print("ElasticNet score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

ElasticNet score: 0.1160 (0.0122)



### Gradient Boosting Regression 

#### - Building model: Conservative

In [30]:
GBoost_con = GradientBoostingRegressor(n_estimators=3000, 
                                   learning_rate=0.005,
                                   max_depth=4, 
                                   max_features='sqrt',
                                   min_samples_leaf=15, 
                                   min_samples_split=10, 
                                   loss='huber', 
                                   random_state =0)

#'Huber' loss makes the model robust to outliers

In [31]:
# Fit the model to the training data
GBoost_con_trainedmodel = GBoost_con.fit(X_train, y_train)

#### - Evaluating model

In [32]:
%%time

score = rmsle_cv(GBoost_con_trainedmodel)
print("Gradient Boosting Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting Regression score: 0.1188 (0.0157)

CPU times: user 40.6 s, sys: 200 ms, total: 40.8 s
Wall time: 41.2 s


#### - Building model: Aggressive

In [33]:
GBoost_agr = GradientBoostingRegressor(n_estimators=1000, 
                                   learning_rate=0.01,
                                   max_depth=9, 
                                   max_features='sqrt',
                                   min_samples_leaf=10, 
                                   min_samples_split=5, 
                                   loss='huber', 
                                   random_state =5)

In [34]:
# Fit the model to the training data
GBoost_agr_trainedmodel = GBoost_agr.fit(X_train, y_train)

#### - Evaluating model

In [35]:
%%time

score = rmsle_cv(GBoost_agr_trainedmodel)
print("Gradient Boosting Regression score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Gradient Boosting Regression score: 0.1231 (0.0143)

CPU times: user 36.9 s, sys: 147 ms, total: 37 s
Wall time: 37.2 s


### XGBoost

#### - Building model

In [36]:
model_xgb = xgb.XGBRegressor(colsample_bytree=0.4603, 
                             gamma=0.0468, 
                             learning_rate=0.05, 
                             max_depth=3, 
                             min_child_weight=1.7817, 
                             n_estimators=2200,
                             reg_alpha=0.4640, 
                             reg_lambda=0.8571,
                             subsample=0.5213, 
                             silent=1,
                             random_state =7, 
                             nthread = -1)

In [37]:
## What do all these arguments mean and why did we pick them???

In [38]:
# Fit the model to the training data
model_xgb_trainedmodel = model_xgb.fit(X_train, y_train)

#### - Evaluating model

In [39]:
%%time

score = rmsle_cv(model_xgb_trainedmodel)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

XGBoost score: 0.1227 (0.0160)

CPU times: user 53.5 s, sys: 10.4 s, total: 1min 3s
Wall time: 31.6 s


### LightGBM

#### - Building model

In [40]:
model_lgb = lgb.LGBMRegressor(objective='regression',
                              num_leaves=5,
                              learning_rate=0.05, 
                              n_estimators=720,
                              max_bin = 55, 
                              bagging_fraction = 0.8,
                              bagging_freq = 5, 
                              feature_fraction = 0.2319,
                              feature_fraction_seed=9, 
                              bagging_seed=9,
                              min_data_in_leaf =6, 
                              min_sum_hessian_in_leaf = 11)

In [41]:
## What do all these arguments mean and why did we pick them???

In [42]:
# Fit the model to the training data
model_lgb_trainedmodel = model_lgb.fit(X_train, y_train)

In [105]:
%%time

score = rmsle_cv(model_lgb_trainedmodel)
print("XGBoost score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

XGBoost score: 0.1219 (0.0145)

CPU times: user 4.49 s, sys: 4.34 s, total: 8.83 s
Wall time: 4.84 s


#### - Evaluating model

In [43]:
%%time

score = rmsle_cv(model_lgb_trainedmodel)
print("LightGBM score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

LightGBM score: 0.1219 (0.0145)

CPU times: user 4.52 s, sys: 4.41 s, total: 8.94 s
Wall time: 5.19 s


### Random Forest

#### - Conservative

In [44]:
rand_forest_con = RandomForestRegressor(random_state=0, 
                                        n_estimators=1000,
                                        max_depth=6,  
                                        max_features='sqrt')

In [45]:
rand_forest_con_trainedmodel = rand_forest_con.fit(X_train, y_train)

In [46]:
%%time

score = rmsle_cv(rand_forest_con_trainedmodel)
print("Random Forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Random Forest score: 0.1615 (0.0151)

CPU times: user 11 s, sys: 136 ms, total: 11.2 s
Wall time: 13 s


#### - Aggressive 

In [108]:
rand_forest_agr = RandomForestRegressor(random_state=0, 
                                        n_estimators=100,
                                        max_depth=15,
                                        max_features='auto')

In [109]:
rand_forest_agr_trainedmodel = rand_forest_agr.fit(X_train, y_train)

In [110]:
%%time

score = rmsle_cv(rand_forest_agr_trainedmodel)
print("Random Forest score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))

Random Forest score: 0.1441 (0.0154)

CPU times: user 10 s, sys: 42.2 ms, total: 10.1 s
Wall time: 10.1 s


## Lasso

In [50]:
lasso = make_pipeline(RobustScaler(), Lasso(alpha =0.0005, random_state=1))

In [51]:
def rmsle(y, y_pred):
    return np.sqrt(mean_squared_error(y), y_pred)

## Submission

In [69]:
from sklearn.base import BaseEstimator, RegressorMixin, TransformerMixin, clone

In [70]:
class AveragingModels(BaseEstimator, RegressorMixin, TransformerMixin):
    def __init__(self, models):
        self.models = models
        
    # we define clones of the original models to fit the data in
    def fit(self, X, y):
        self.models_ = [clone(x) for x in self.models]
        
        # Train cloned base models
        for model in self.models_:
            model.fit(X, y)

        return self
    
    #Now we do the predictions for cloned models and average them
    def predict(self, X):
        predictions = np.column_stack([
            model.predict(X) for model in self.models_
        ])
        return np.mean(predictions, axis=1) 

In [111]:
%%time

averaged_models = AveragingModels(models = (ENet, 
                                            GBoost_con, 
                                            GBoost_agr,  
                                            model_xgb,
                                            model_lgb,
                                            lasso))

score = rmsle_cv(averaged_models)
print(" Averaged base models score: {:.4f} ({:.4f})\n".format(score.mean(), score.std()))


 Averaged base models score: 0.1140 (0.0144)

CPU times: user 2min 14s, sys: 15.7 s, total: 2min 30s
Wall time: 1min 50s


In [112]:
test = averaged_models.fit(X_train, y_train)

In [113]:
test_pred = test.predict(df_test)

In [114]:
sub = pd.DataFrame()
sub['Id'] = test_ID
sub['SalePrice'] = np.expm1(test_pred)
sub.to_csv('submission.csv',index=False)