In [1]:
import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
from math import log
import warnings

warnings.filterwarnings("ignore")

# Understanding and preparing the data

In [2]:
# import the training file

df_train = pd.read_csv('train.csv')

df_train.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [3]:
# import the test file

df_test = pd.read_csv('test.csv')

df_test.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [4]:
# check the shape of each dataframe

print('Train data shape is: ', df_train.shape)
print('Test data shape is: ', df_test.shape)

Train data shape is:  (1460, 81)
Test data shape is:  (1459, 80)


In [5]:
# lets combine the training data and the test data for cleaning and feature engineering

df = pd.concat([df_train.iloc[:, :80], df_test])

print('The total shape is: ', df.shape) # (2919, 80) seems about right

The total shape is:  (2919, 80)


In [6]:
# see what type of data each column is holding

df.dtypes

Id                 int64
MSSubClass         int64
MSZoning          object
LotFrontage      float64
LotArea            int64
                  ...   
MiscVal            int64
MoSold             int64
YrSold             int64
SaleType          object
SaleCondition     object
Length: 80, dtype: object

In [7]:
# find any NaN values

df.isnull().sum()[df.isnull().sum()>0]

MSZoning           4
LotFrontage      486
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
MasVnrArea        23
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinSF1         1
BsmtFinType2      80
BsmtFinSF2         1
BsmtUnfSF          1
TotalBsmtSF        1
Electrical         1
BsmtFullBath       2
BsmtHalfBath       2
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageYrBlt      159
GarageFinish     159
GarageCars         1
GarageArea         1
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [8]:
# MSZoning NaN can be replaced with 'none'

df['MSZoning'] = df['MSZoning'].fillna('None')

# NaN implies no utilities

df['Utilities'] = df['Utilities'].fillna('None')

# NaN implies no exterior

df['Exterior1st'] = df['Exterior1st'].fillna('None')

df['Exterior2nd'] = df['Exterior2nd'].fillna('None')

# lot frontage NaN represents 0 feet of street connected and so can be replaced with 0

df['LotFrontage'] = df['LotFrontage'].fillna(0)

# alley NaN values represent no alley thus can be replaced with 'none'

df['Alley'] = df['Alley'].fillna('None')

# I will assume that NaN is meant to mean that there is no MasVnrType and so replace it with 'None'

df['MasVnrType'] = df['MasVnrType'].fillna('None')

# Similar with type and so will replace NaN with 0 for MasVnrArea

df['MasVnrArea'] = df['MasVnrArea'].fillna(0)

# for all the basement features, NaN means no beasment and so will be replace with 'None'

df['BsmtQual'] = df['BsmtQual'].fillna('None')
df['BsmtCond'] = df['BsmtCond'].fillna('None')
df['BsmtExposure'] = df['BsmtExposure'].fillna('None')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna('None')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('None')
df['BsmtFinSF1'] = df['BsmtFinSF1'].fillna(0)
df['BsmtFinSF2'] = df['BsmtFinSF2'].fillna(0)
df['BsmtUnfSF'] = df['BsmtUnfSF'].fillna(0)
df['TotalBsmtSF'] = df['TotalBsmtSF'].fillna(0)
df['BsmtFullBath'] = df['BsmtFullBath'].fillna(0)
df['BsmtHalfBath'] = df['BsmtHalfBath'].fillna(0)

# KitchenQual may mean no kitchen

df['KitchenQual'] = df['KitchenQual'].fillna('None')

# I will assume that NaN electrical means no electricity, there's only instance anyway

df['Electrical'] = df['Electrical'].fillna('None')

# NaN means no fireplace

df['FireplaceQu'] = df['FireplaceQu'].fillna('None')

# Garage is similar to basement

df['GarageType'] = df['GarageType'].fillna('None')
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
df['GarageFinish'] = df['GarageFinish'].fillna('None')
df['GarageQual'] = df['GarageQual'].fillna('None')
df['GarageCond'] = df['GarageCond'].fillna('None')
df['GarageCars'] = df['GarageCars'].fillna(0)
df['GarageArea'] = df['GarageArea'].fillna(0)

# NaN PoolQc means no pool

df['PoolQC'] = df['PoolQC'].fillna('None')

# NaN for functional could mean anything but there's only 2 astray

df['Functional'] = df['Functional'].fillna('None')

# NaN fence means no fence

df['Fence'] = df['Fence'].fillna('None')

# NaN means no MiscFeature

df['MiscFeature'] = df['MiscFeature'].fillna('None')

# Sale Type NaN could mean anything but there's only one

df['SaleType'] = df['SaleType'].fillna('None')

# find any NaN values

df.isnull().sum()[df.isnull().sum()>0]

Series([], dtype: int64)

In [9]:
# some continuous features will need to be made into categorical features for the decision tree models

categories = ['YrSold', 'MoSold', 'GarageYrBlt', 'YearBuilt', 'YearRemodAdd']

df[categories] = df[categories].astype(str)

In [10]:
# we must utilise one hot encoding so that we have numeric values for SVM or KNN models
# a new dataframe will be made as we don't need the extra features for decision tree based models

df_one_hot = df

# list the categorical features for one ot encoding

categorical_features = ['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature', 'SaleType', 'SaleCondition']

In [11]:
# for each category get it's dummy features and replace the original feature

# get the dummy features

dummies = pd.get_dummies(df_one_hot[categorical_features])

# add them on to the original dataframe

df_one_hot = pd.concat([df_one_hot, dummies], axis=1)

# remove the old features

df_one_hot = df_one_hot.drop(categorical_features, axis=1)

print('Original number of features: ', df.shape[1])
print('Number of features to be removed: ', len(categorical_features))
print('Number of dummy features to be added: ',dummies.shape[1])
print('New number of features: ',df_one_hot.shape[1])


Original number of features:  80
Number of features to be removed:  44
Number of dummy features to be added:  275
New number of features:  310


In [12]:
# finally we normalise the numeric dataframe

# import the module

from sklearn.preprocessing import MinMaxScaler

# fit the scaler to the numeric dataframe

norm = MinMaxScaler().fit(df_one_hot)

# transform the data

df_one_hot = norm.transform(df_one_hot)

In [13]:
# rename all of our final dataframes

X_train_og = df.iloc[0:df_train.shape[0], :]

X_train_numeric = df_one_hot[0:df_train.shape[0], :]

X_test_og = df.iloc[df_train.shape[0]:, :]

X_test_numeric = df_one_hot[df_train.shape[0]:, :]

# we take the log as the evaluation is RMSE of the log of y (please remember to change it back)

y_train = df_train.iloc[:,80].apply(lambda x: log(x))

# Base model parameter tuning and fitting

I'll just be using simple models that I have a good grasp of as the main intention of this notebook is to learn about stacking and parameter tuning.

In [14]:
# import the libraries

from sklearn import svm
from sklearn.model_selection import GridSearchCV
from sklearn.neighbors import KNeighborsRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.neural_network import MLPRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.model_selection import cross_validate

# SVM

In [15]:
# conduct a grid search with chosen parameters on a SVM regression model

svm_parameters = {'gamma':[10**(-1), 10**(0), 10**(1)], 'C':[10**(0), 10**(1), 20, 10**(2)]}

# define the model

svm_model = GridSearchCV(svm.SVR(), svm_parameters, scoring='neg_root_mean_squared_error')

# fit the model to the numeric training set

svm_model.fit(X_train_numeric, y_train)

GridSearchCV(estimator=SVR(),
             param_grid={'C': [1, 10, 20, 100], 'gamma': [0.1, 1, 10]},
             scoring='neg_root_mean_squared_error')

In [16]:
# check the scores for best parameters

print('The best parameters are: ', svm_model.best_params_)
print('The scores were: ', svm_model.best_score_)

The best parameters are:  {'C': 10, 'gamma': 0.1}
The scores were:  -0.18825010345261123


# KNN

In [17]:
# conduct a grid search with chosen parameters on a KNN regression model

knn_parameters = {'n_neighbors':[5, 7, 9, 11], 'weights':('uniform', 'distance')}

# define the model

knn_model = GridSearchCV(KNeighborsRegressor(), knn_parameters, scoring='neg_root_mean_squared_error')

# fit the model to the numeric training set

knn_model.fit(X_train_numeric, y_train)

GridSearchCV(estimator=KNeighborsRegressor(),
             param_grid={'n_neighbors': [5, 7, 9, 11],
                         'weights': ('uniform', 'distance')},
             scoring='neg_root_mean_squared_error')

In [18]:
# check the scores for best parameters

print('The best parameters are: ', knn_model.best_params_)
print('The scores were: ', knn_model.best_score_)

The best parameters are:  {'n_neighbors': 9, 'weights': 'distance'}
The scores were:  -0.19830458750144186


# Random Forest

In [19]:
# conduct a grid search with chosen parameters on a random forest regression model

rf_parameters = {'max_depth':[6, 8, 10, 12], 'n_estimators':[50, 100, 150, 200]}

# define the model

rf_model = GridSearchCV(RandomForestRegressor(), rf_parameters, scoring='neg_root_mean_squared_error')

# fit the model to the numerical set

rf_model.fit(X_train_numeric, y_train)

GridSearchCV(estimator=RandomForestRegressor(),
             param_grid={'max_depth': [6, 8, 10, 12],
                         'n_estimators': [50, 100, 150, 200]},
             scoring='neg_root_mean_squared_error')

In [20]:
# check the scores for best parameters

print('The best parameters are: ', rf_model.best_params_)
print('The scores were: ', rf_model.best_score_)

The best parameters are:  {'max_depth': 12, 'n_estimators': 100}
The scores were:  -0.14365330341455124


# Multilayer Perceptron

In [21]:
# conduct a grid search with chosen parameters on a random forest regression model

mlp_parameters = {'hidden_layer_sizes':[[20, 20], [40, 40], [60, 60]], 'alpha':[0.1, 0.5, 1, 1.5]}

# define the model

mlp_model = GridSearchCV(MLPRegressor(activation='relu', solver='adam', max_iter=300), mlp_parameters, scoring='neg_root_mean_squared_error')

# fit the model to the numerical set

mlp_model.fit(X_train_numeric, y_train)

GridSearchCV(estimator=MLPRegressor(max_iter=300),
             param_grid={'alpha': [0.1, 0.5, 1, 1.5],
                         'hidden_layer_sizes': [[20, 20], [40, 40], [60, 60]]},
             scoring='neg_root_mean_squared_error')

In [22]:
# check the scores for best parameters

print('The best parameters are: ', mlp_model.best_params_)
print('The scores were: ', mlp_model.best_score_)

The best parameters are:  {'alpha': 0.5, 'hidden_layer_sizes': [40, 40]}
The scores were:  -0.15027787899625294


# Stacking

In [23]:
# create a list of models with the ideal parameters

estimators = [('svm', svm.SVR(C=10, gamma=0.1)), ('knn', KNeighborsRegressor(n_neighbors=9, weights='distance')), ('rf', RandomForestRegressor(max_depth=12, n_estimators=150)), ('mlp', MLPRegressor(activation='relu', solver='adam', max_iter=300, alpha=0.5, hidden_layer_sizes=[60, 60]))]

# create a stacked model

reg = StackingRegressor(estimators=estimators, final_estimator=RandomForestRegressor(n_estimators=100))

# check how well the stacked model does

scores = cross_validate(reg, X_train_numeric, y_train, cv=5, scoring='neg_mean_squared_error', return_train_score=True)

In [24]:
print('The negative RMSE stacked score is: ',scores['test_score'].mean())

The negative RMSE stacked score is:  -0.018425585863616513


In [25]:
# fit the stacked model to the training data

reg.fit(X_train_numeric, y_train)

# predict the test data

submission_array = reg.predict(X_test_numeric)

In [27]:
# import the correct format submission

submission = pd.read_csv('sample_submission.csv')

# change the predictions

submission['SalePrice'] = submission_array

#reverse the log(x) function that was applied peviously

submission['SalePrice'] = submission['SalePrice'].apply(lambda x: np.exp(x))

submission.head()

Unnamed: 0,Id,SalePrice
0,1461,121606.906189
1,1462,150800.315449
2,1463,173138.808937
3,1464,185710.687571
4,1465,190002.889652


In [28]:
# save to a csv

submission.to_csv("submission.csv", index=False)