# Kaggle House Prices - Fit Boost Model

# Libraries

In [1]:
# Data
import pandas as pd
import numpy as np
import statistics
from IPython.display import display
pd.options.display.max_columns = None

# ML
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error, r2_score

# Plots
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.5, style="ticks")
fs = (14, 6) # make the figure wider than the default

# Load Data

## Load Training Data

In [2]:
houses = pd.read_csv('data/cleaned_data/clean_train2.csv')
houses.shape

(1460, 81)

## Clean The Training Data

In [3]:
def cleanme(houses):
    # Override spurious NAs
    # PoolQu
    houses.PoolQC.fillna("NA", inplace=True)

    #MiscFeature
    houses.MiscFeature.fillna("NA", inplace=True)

    #Alley
    houses.Alley.fillna("NA", inplace=True)

    # Fence
    houses.Fence.fillna("NA", inplace=True)

    # FireplaceQu
    houses.FireplaceQu.fillna("NA", inplace=True)

    # Garages
    houses.GarageType.fillna("NA",inplace=True)
    houses.GarageFinish.fillna("NA",inplace=True)
    houses.GarageQual.fillna("NA",inplace=True)
    houses.GarageCond.fillna("NA",inplace=True)
    houses.GarageCars.fillna(0, inplace=True)
    houses.GarageArea.fillna(0, inplace=True)

    # Basements
    houses.BsmtExposure.fillna("NA",inplace=True)
    houses.BsmtFinType1.fillna("NA",inplace=True)
    houses.BsmtFinType2.fillna("NA",inplace=True)
    houses.BsmtQual.fillna("NA",inplace=True)
    houses.BsmtCond.fillna("NA",inplace=True)
    houses.BsmtFullBath.fillna(0,inplace=True)
    houses.BsmtFinSF1.fillna(0,inplace=True)
    houses.BsmtFinSF2.fillna(0,inplace=True)
    houses.BsmtUnfSF.fillna(0,inplace=True)
    houses.BsmtHalfBath.fillna(0,inplace=True)
    houses.TotalBsmtSF.fillna(0,inplace=True)

    # Masonry
    houses.MasVnrType.fillna("NA",inplace=True)
    houses.MasVnrArea.fillna(0,inplace=True)

    # KitchenQual
    houses.KitchenQual.fillna("TA",inplace=True)

    # Utilities
    houses.Utilities.fillna("AllPub",inplace=True)

    #Functional
    houses.Functional.fillna("Typ",inplace=True)

    #Exterior Siding
    houses.Exterior1st.fillna("VinylSd", inplace=True)
    houses.Exterior2nd.fillna("VinylSd", inplace=True)

    # Zoning
    houses.MSZoning.fillna("RL",inplace=True)

    # Sale Type
    houses.SaleType.fillna("WD",inplace=True)

    # Electrical
    houses.Electrical.fillna("SRbkr", inplace=True)
    
    # Change some strings to ordinal values
    # ExterQual
    exterqual = {"Po" : 0, "Fa": 1, 'TA':2, 'Gd':3, 'Ex':4}
    houses.ExterQual = houses.ExterQual.apply(lambda v: exterqual[v])
    # ExterCond
    extercond = {"Po" : 0, "Fa": 1, 'TA':2, 'Gd':3, 'Ex':4}
    houses.ExterCond = houses.ExterCond.apply(lambda v: extercond[v])
    # BsmtQual
    basmtqual = {"NA":0, "Po" : 1, "Fa": 2, 'TA':3, 'Gd':4, 'Ex':5}
    houses.BsmtQual = houses.BsmtQual.apply(lambda v: basmtqual[v])
    # BsmtCond
    basmtcond = {"NA":0, "Po" : 1, "Fa": 2, 'TA':3, 'Gd':4, 'Ex':5}
    houses.BsmtCond = houses.BsmtCond.apply(lambda v: basmtcond[v])
    # BsmtExposure
    basmtexposure = {"NA" : 0, "No": 1, 'Mn':2, 'Av':3, 'Gd': 4}
    houses.BsmtExposure = houses.BsmtExposure.apply(lambda v: basmtexposure[v])
    # BsmtFinType_1
    bsmtfintype = {"NA":0, "Unf" : 1, "LwQ": 2, 'Rec':3, 'BLQ':4, 'ALQ':5, "GLQ": 6}
    houses.BsmtFinType1 = houses.BsmtFinType1.apply(lambda v: bsmtfintype[v])
    # KitchenQual
    kitqual = {'Fa':0,'TA':1,'Gd':2, 'Ex':3}
    houses.KitchenQual = houses.KitchenQual.apply(lambda v: kitqual[v])
    # Functional
    functional = {'Typ':7, 'Min1':6, 'Min2':5, 'Mod':4, 'Maj1':3, 'Maj2':2, 'Sev':1, 'Sal':0}
    houses.Functional = houses.Functional.apply(lambda v: functional[v])
    # FireplaceQu
    fireplacequ = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
    houses.FireplaceQu = houses.FireplaceQu.apply(lambda v: fireplacequ[v])
    # GarageFinish
    garagefinish = {'NA':0,'Unf':1,'RFn':2,'Fin':3}
    houses.GarageFinish = houses.GarageFinish.apply(lambda v: garagefinish[v])

In [4]:
cleanme(houses)
# What % data (if any) is missing for each column?
nulls = houses.isnull().sum()
nulls = pd.DataFrame(nulls)
nulls.rename(columns={0:"cnt_missing"},inplace=True)
nulls = nulls[nulls.cnt_missing>0]
nulls['pct_missing'] = round(nulls.cnt_missing/houses.shape[0] * 100,2)
nulls.sort_values(by="pct_missing",ascending=False)

Unnamed: 0,cnt_missing,pct_missing
GarageYrBlt,81,5.55


In [5]:
houses.sample(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,FirstFlrSF,SecondFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ThirdSsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
974,975,70,RL,60,11414,Pave,,IR1,Lvl,AllPub,Corner,Gtl,BrkSide,RRAn,Feedr,1Fam,2Story,7,8,1910,1993,Gable,CompShg,HdBoard,HdBoard,,0.0,2,3,BrkTil,4,3,1,1,0,Unf,0,728,728,GasA,TA,N,SBrkr,1136,883,0,2019,0,0,1,0,3,1,2,8,7,0,0,Detchd,1997.0,1,2,532,TA,TA,Y,509,135,0,0,0,0,,GdPrv,,0,10,2009,WD,Normal,167500
686,687,60,FV,84,10207,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Somerst,Norm,Norm,1Fam,2Story,7,6,2007,2007,Gable,CompShg,VinylSd,VinylSd,,0.0,3,2,PConc,4,3,1,1,0,Unf,0,874,874,GasA,Ex,Y,SBrkr,874,887,0,1761,0,0,3,0,3,1,2,7,7,0,0,Attchd,2007.0,3,2,578,TA,TA,Y,144,105,0,0,0,0,,,,0,8,2007,New,Partial,227875
604,605,20,RL,88,12803,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,1Story,7,5,2002,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,99.0,3,2,PConc,4,3,2,6,922,Unf,0,572,1494,GasA,Ex,Y,SBrkr,1494,0,0,1494,1,0,2,0,3,1,2,6,7,1,3,Attchd,2002.0,2,2,530,TA,TA,Y,192,36,0,0,0,0,,,,0,9,2008,WD,Normal,221000
500,501,160,RM,21,1890,Pave,,Reg,Lvl,AllPub,Inside,Gtl,BrDale,Norm,Norm,Twnhs,2Story,6,5,1973,1973,Gable,CompShg,HdBoard,HdBoard,BrkFace,285.0,2,2,CBlock,3,3,1,4,356,Unf,0,316,672,GasA,TA,Y,SBrkr,672,546,0,1218,0,0,1,1,3,1,1,7,7,0,0,Detchd,1973.0,1,1,264,TA,TA,Y,144,28,0,0,0,0,,,,0,5,2007,WD,Normal,113000
1009,1010,50,RL,60,6000,Pave,,Reg,Lvl,AllPub,Inside,Gtl,SWISU,Norm,Norm,1Fam,1.5Fin,5,5,1926,1950,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,2,1,BrkTil,3,3,1,1,0,Unf,0,1008,1008,GasA,Ex,Y,SBrkr,1008,0,514,1522,0,0,2,0,4,1,1,7,7,0,0,,,0,0,0,,,P,0,0,138,0,0,0,,,,0,6,2006,WD,Normal,102000


## Load Test Data

In [6]:
test_houses = pd.read_csv('data/cleaned_data/clean_test.csv')
test_houses.shape

(1459, 80)

## Clean Test Data

In [7]:
cleanme(test_houses)

# What % data (if any) is missing for each column?
nulls = test_houses.isnull().sum()
nulls = pd.DataFrame(nulls)
nulls.rename(columns={0:"cnt_missing"},inplace=True)
nulls = nulls[nulls.cnt_missing>0]
nulls['pct_missing'] = round(nulls.cnt_missing/houses.shape[0] * 100,2)
nulls.sort_values(by="pct_missing",ascending=False)

Unnamed: 0,cnt_missing,pct_missing
GarageYrBlt,78,5.34


# Train XG Boost

In [8]:
X = houses.drop('GarageYrBlt', axis=1).drop('SalePrice', axis=1).drop('Id',axis=1) # features (drop ID and GarageYrBlt)
X = pd.get_dummies(X, drop_first=True, dummy_na=True)                              # encoded features
y = houses.SalePrice                                                               # response variable

# 80% train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [9]:
# Train XGBoost
model = XGBRegressor(random_state=0)
model.fit(X_train, y_train)
# print(model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## Initial Predictions

In [10]:
def get_err(length,predicted,truth):
    # Mean absolute error
    abs_err = [0] * length
    for i in range(0,length):
            abs_err[i] = abs(predicted[i] - truth[i])

    mae = statistics.mean(abs_err)
    print("Mean Absolute Error:",round(mae,2))

    # Mean abs percent error
    pct_err = [0] * length
    for i in range(0,length):
        pct_err[i] = abs_err[i] / truth[i] * 100
    mpae = statistics.mean(pct_err)

    print("Mean Abs Pct Error:",round(mpae,2),"%")

    accuracy = 100 - mpae
    print('Accuracy:', round(accuracy, 2), '%.')

    # Mean Square Error
    err_sq = [0] * length
    for i in range(0,length):
        err_sq[i] = abs_err[i]**2

    print("Mean Square Error: ",statistics.mean(err_sq))

In [11]:
intl_predictions = model.predict(X_test)
get_err(len(intl_predictions), intl_predictions, [i for i in y_test])

Mean Absolute Error: 15877.03
Mean Abs Pct Error: 9.05 %
Accuracy: 90.95 %.
Mean Square Error:  670488684.753418


In [12]:
# Get feature importances
importances = list(model.feature_importances_)
var_list = list(X_train.columns)

var_importances = pd.DataFrame({"Predictor":var_list, "Importance":importances})
print("Importances > 0.03")
var_importances[var_importances.Importance>0.03].sort_values("Importance",ascending=False)

Importances > 0.03


Unnamed: 0,Predictor,Importance
21,GrLivArea,0.09589
2,LotArea,0.079148
14,BsmtFinSF1,0.079148
3,OverallQual,0.048706
17,TotalBsmtSF,0.048706
18,FirstFlrSF,0.038052
5,YearBuilt,0.03653
35,GarageArea,0.035008
4,OverallCond,0.030441
19,SecondFlrSF,0.030441


## Retrain with only the top predictors

In [33]:
top_predictors = var_importances[var_importances.Importance>0.03].sort_values("Importance",ascending=False).Predictor
top_predictors = [i for i in top_predictors]

X_red = houses[top_predictors]                             # features
X_red = pd.get_dummies(X, drop_first=True, dummy_na=True)  # encoded features

# 80% train-test-split
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X, y, test_size = 0.2)

# Train XGBoost
model_red = XGBRegressor(random_state=0)
model_red.fit(X_train, y_train)
print(model_red)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


## Check new results

In [34]:
red_predictions = model_red.predict(X_train_red)
get_err(len(red_predictions),red_predictions,[i for i in y_train_red])

Mean Absolute Error: 11368.49
Mean Abs Pct Error: 6.99 %
Accuracy: 93.01 %.
Mean Square Error:  290496865.3689836


In [35]:
red_predictions = model_red.predict(X_test_red)
get_err(len(red_predictions),red_predictions,[i for i in y_test_red])

Mean Absolute Error: 11304.22
Mean Abs Pct Error: 6.53 %
Accuracy: 93.47 %.
Mean Square Error:  276839538.8428562


~3% increase in accuracy

## Cross Validation to Tune Hyperparameters

In [38]:
# Let's use the X_red and y_red sets again
pipeline = make_pipeline(preprocessing.StandardScaler(), 
                         XGBRegressor(n_estimators=100, random_state=0))

In [41]:
# Declare hyperparameters to tune
hyperparameters = {
    'xgbregressor__max_depth':[3,4,5,6,7,8,9,10],
    'xgbregressor__min_child_weight': [1,2,3,4,5],
    'xgbregressor__subsample': np.arange(0.5,1,1),
    'xgbregressor__gamma':[0,0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]
}

In [42]:
clf = GridSearchCV(pipeline, hyperparameters, cv=5, n_jobs=-1)
 
# Fit and tune model
clf.fit(X_train_red, y_train_red)

GridSearchCV(cv=5, error_score='raise',
       estimator=Pipeline(memory=None,
     steps=[('standardscaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('xgbregressor', XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, ...       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1))]),
       fit_params=None, iid=True, n_jobs=-1,
       param_grid={'xgbregressor__max_depth': [3, 4, 5, 6, 7, 8, 9, 10], 'xgbregressor__min_child_weight': [1, 2, 3, 4, 5], 'xgbregressor__subsample': array([0.5]), 'xgbregressor__gamma': [0, 0.1, 0.2, 0.3, 0.4, 0.5, 0.6, 0.7, 0.8, 0.9]},
       pre_dispatch='2*n_jobs', refit=True, return_train_score='warn',
       scoring=None, verbose=0)

In [43]:
clf.get_params()

{'cv': 5,
 'error_score': 'raise',
 'estimator__memory': None,
 'estimator__steps': [('standardscaler',
   StandardScaler(copy=True, with_mean=True, with_std=True)),
  ('xgbregressor',
   XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
          colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
          max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
          n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
          reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
          silent=True, subsample=1))],
 'estimator__standardscaler': StandardScaler(copy=True, with_mean=True, with_std=True),
 'estimator__xgbregressor': XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
        colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
        max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
        n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
        r

In [44]:
# on the training data
ypred = clf.predict(X_train_red)
get_err(len(ypred),ypred,[i for i in y_train_red])

Mean Absolute Error: 7392.52
Mean Abs Pct Error: 4.74 %
Accuracy: 95.26 %.
Mean Square Error:  95498040.00629146


In [45]:
# on the test data
ypred = clf.predict(X_test_red)
get_err(len(ypred),ypred,[i for i in y_test_red])

Mean Absolute Error: 16683.84
Mean Abs Pct Error: 9.35 %
Accuracy: 90.65 %.
Mean Square Error:  1286505729.7361324


# Test on "Actual" Test Data