# Kaggle House Prices - Fit Boost Model

# Libraries

In [87]:
# Data
import pandas as pd
import numpy as np
import statistics
from IPython.display import display
pd.options.display.max_columns = None

# ML
from xgboost import XGBRegressor
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

# Plots
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
sns.set(font_scale=1.5, style="ticks")
fs = (14, 6) # make the figure wider than the default

# Load Data

## Load Training Data

In [210]:
houses = pd.read_csv('data/cleaned_data/clean_train2.csv')
houses.shape

(1460, 81)

## Clean The Training Data

In [213]:
def cleanme(houses):
    # Override spurious NAs
    # PoolQu
    houses.PoolQC.fillna("NA", inplace=True)

    #MiscFeature
    houses.MiscFeature.fillna("NA", inplace=True)

    #Alley
    houses.Alley.fillna("NA", inplace=True)

    # Fence
    houses.Fence.fillna("NA", inplace=True)

    # FireplaceQu
    houses.FireplaceQu.fillna("NA", inplace=True)

    # Garages
    houses.GarageType.fillna("NA",inplace=True)
    houses.GarageFinish.fillna("NA",inplace=True)
    houses.GarageQual.fillna("NA",inplace=True)
    houses.GarageCond.fillna("NA",inplace=True)
    houses.GarageCars.fillna(0, inplace=True)
    houses.GarageArea.fillna(0, inplace=True)

    # Basements
    houses.BsmtExposure.fillna("NA",inplace=True)
    houses.BsmtFinType1.fillna("NA",inplace=True)
    houses.BsmtFinType2.fillna("NA",inplace=True)
    houses.BsmtQual.fillna("NA",inplace=True)
    houses.BsmtCond.fillna("NA",inplace=True)
    houses.BsmtFullBath.fillna(0,inplace=True)
    houses.BsmtFinSF1.fillna(0,inplace=True)
    houses.BsmtFinSF2.fillna(0,inplace=True)
    houses.BsmtUnfSF.fillna(0,inplace=True)
    houses.BsmtHalfBath.fillna(0,inplace=True)
    houses.TotalBsmtSF.fillna(0,inplace=True)

    # Masonry
    houses.MasVnrType.fillna("NA",inplace=True)
    houses.MasVnrArea.fillna(0,inplace=True)

    # KitchenQual
    houses.KitchenQual.fillna("TA",inplace=True)

    # Utilities
    houses.Utilities.fillna("AllPub",inplace=True)

    #Functional
    houses.Functional.fillna("Typ",inplace=True)

    #Exterior Siding
    houses.Exterior1st.fillna("VinylSd", inplace=True)
    houses.Exterior2nd.fillna("VinylSd", inplace=True)

    # Zoning
    houses.MSZoning.fillna("RL",inplace=True)

    # Sale Type
    houses.SaleType.fillna("WD",inplace=True)

    # Electrical
    houses.Electrical.fillna("SRbkr", inplace=True)
    
    # Change some strings to ordinal values
    # ExterQual
    exterqual = {"Po" : 0, "Fa": 1, 'TA':2, 'Gd':3, 'Ex':4}
    houses.ExterQual = houses.ExterQual.apply(lambda v: exterqual[v])
    # ExterCond
    extercond = {"Po" : 0, "Fa": 1, 'TA':2, 'Gd':3, 'Ex':4}
    houses.ExterCond = houses.ExterCond.apply(lambda v: extercond[v])
    # BsmtQual
    basmtqual = {"NA":0, "Po" : 1, "Fa": 2, 'TA':3, 'Gd':4, 'Ex':5}
    houses.BsmtQual = houses.BsmtQual.apply(lambda v: basmtqual[v])
    # BsmtCond
    basmtcond = {"NA":0, "Po" : 1, "Fa": 2, 'TA':3, 'Gd':4, 'Ex':5}
    houses.BsmtCond = houses.BsmtCond.apply(lambda v: basmtcond[v])
    # BsmtExposure
    basmtexposure = {"NA" : 0, "No": 1, 'Mn':2, 'Av':3, 'Gd': 4}
    houses.BsmtExposure = houses.BsmtExposure.apply(lambda v: basmtexposure[v])
    # BsmtFinType_1
    bsmtfintype = {"NA":0, "Unf" : 1, "LwQ": 2, 'Rec':3, 'BLQ':4, 'ALQ':5, "GLQ": 6}
    houses.BsmtFinType1 = houses.BsmtFinType1.apply(lambda v: bsmtfintype[v])
    # KitchenQual
    kitqual = {'Fa':0,'TA':1,'Gd':2, 'Ex':3}
    houses.KitchenQual = houses.KitchenQual.apply(lambda v: kitqual[v])
    # Functional
    functional = {'Typ':7, 'Min1':6, 'Min2':5, 'Mod':4, 'Maj1':3, 'Maj2':2, 'Sev':1, 'Sal':0}
    houses.Functional = houses.Functional.apply(lambda v: functional[v])
    # FireplaceQu
    fireplacequ = {'NA':0,'Po':1,'Fa':2,'TA':3,'Gd':4,'Ex':5}
    houses.FireplaceQu = houses.FireplaceQu.apply(lambda v: fireplacequ[v])
    # GarageFinish
    garagefinish = {'NA':0,'Unf':1,'RFn':2,'Fin':3}
    houses.GarageFinish = houses.GarageFinish.apply(lambda v: garagefinish[v])

In [214]:
cleanme(houses)
# What % data (if any) is missing for each column?
nulls = houses.isnull().sum()
nulls = pd.DataFrame(nulls)
nulls.rename(columns={0:"cnt_missing"},inplace=True)
nulls = nulls[nulls.cnt_missing>0]
nulls['pct_missing'] = round(nulls.cnt_missing/houses.shape[0] * 100,2)
nulls.sort_values(by="pct_missing",ascending=False)

Unnamed: 0,cnt_missing,pct_missing
GarageYrBlt,81,5.55


In [4]:
houses.sample(5)

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,FirstFlrSF,SecondFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,ThirdSsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
867,868,20,RL,85,6970,Pave,,Reg,Lvl,AllPub,Corner,Gtl,Sawyer,Feedr,Norm,1Fam,1Story,4,5,1961,1961,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,CBlock,TA,TA,No,ALQ,932,Unf,0,108,1040,GasA,TA,Y,SBrkr,1120,0,0,1120,1,0,1,1,3,1,Fa,5,Typ,0,,Attchd,1961.0,RFn,2,544,TA,TA,Y,168,0,0,0,0,0,,,Shed,400,5,2007,WD,Normal,129000
1433,1434,60,RL,93,10261,Pave,,IR1,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,318.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,936,936,GasA,Ex,Y,SBrkr,962,830,0,1792,1,0,2,1,3,1,TA,8,Typ,1,TA,Attchd,2000.0,Fin,2,451,TA,TA,Y,0,0,0,0,0,0,,,,0,5,2008,WD,Normal,186500
1113,1114,20,RL,66,8923,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,7,1953,2006,Gable,CompShg,Wd Sdng,Wd Sdng,,0.0,TA,TA,CBlock,TA,TA,No,BLQ,643,Unf,0,365,1008,GasA,Gd,Y,SBrkr,1008,0,0,1008,1,0,1,0,2,1,Gd,6,Typ,0,,Attchd,1953.0,Unf,1,240,TA,TA,Y,0,18,0,0,0,0,,,,0,5,2007,WD,Normal,134500
443,444,120,RL,53,3922,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Blmngtn,Norm,Norm,TwnhsE,1Story,7,5,2006,2007,Gable,CompShg,WdShing,Wd Shng,BrkFace,72.0,Gd,TA,PConc,Ex,TA,Av,Unf,0,Unf,0,1258,1258,GasA,Ex,Y,SBrkr,1258,0,0,1258,0,0,2,0,2,1,Gd,6,Typ,1,Gd,Attchd,2007.0,Fin,3,648,TA,TA,Y,144,16,0,0,0,0,,,,0,6,2007,New,Partial,172500
786,787,50,RM,60,10800,Pave,,Reg,Lvl,AllPub,Inside,Gtl,OldTown,Artery,Norm,1Fam,1.5Fin,5,6,1915,1950,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,Gd,PConc,Fa,TA,No,LwQ,686,Unf,0,0,686,GasA,TA,Y,SBrkr,966,686,0,1652,1,0,2,0,4,1,TA,7,Typ,0,,Detchd,1961.0,Unf,1,416,TA,TA,Y,0,0,196,0,0,0,,,Shed,1200,6,2010,WD,Normal,139000


## Load Test Data

In [5]:
test_houses = pd.read_csv('data/cleaned_data/clean_test.csv')
test_houses.shape

(1459, 80)

## Clean Test Data

In [206]:
cleanme(test_houses)

# What % data (if any) is missing for each column?
nulls = test_houses.isnull().sum()
nulls = pd.DataFrame(nulls)
nulls.rename(columns={0:"cnt_missing"},inplace=True)
nulls = nulls[nulls.cnt_missing>0]
nulls['pct_missing'] = round(nulls.cnt_missing/houses.shape[0] * 100,2)
nulls.sort_values(by="pct_missing",ascending=False)

Unnamed: 0,cnt_missing,pct_missing
GarageYrBlt,78,5.34


# Train XG Boost

In [174]:
X = houses.drop('GarageYrBlt', axis=1).drop('SalePrice', axis=1).drop('Id',axis=1) # features (drop ID and GarageYrBlt)
X = pd.get_dummies(X, drop_first=True, dummy_na=True)                              # encoded features
y = houses.SalePrice                                                               # response variable

# 80% train-test-split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2)

In [175]:
# Train XGBoost
model = XGBRegressor()
model.fit(X_train, y_train)
# print(model)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)

## Initial Predictions

In [189]:
def get_err(length,predicted,truth):
    # Mean absolute error
    abs_err = [0] * length
    for i in range(0,length):
            abs_err[i] = abs(predicted[i] - truth[i])

    mae = statistics.mean(abs_err)
    print("Mean Absolute Error:",round(mae,2))

    # Mean abs percent error
    pct_err = [0] * length
    for i in range(0,length):
        pct_err[i] = abs_err[i] / truth[i] * 100
    mpae = statistics.mean(pct_err)

    print("Mean Abs Pct Error:",round(mpae,2),"%")

    accuracy = 100 - mpae
    print('Accuracy:', round(accuracy, 2), '%.')

    # Mean Square Error
    err_sq = [0] * length
    for i in range(0,length):
        err_sq[i] = abs_err[i]**2

    print("Mean Square Error: ",statistics.mean(err_sq))

In [190]:
intl_predictions = model.predict(X_test)
get_err(len(intl_predictions), intl_predictions, [i for i in y_test])

Mean Absolute Error: 16515.52
Mean Abs Pct Error: 9.78 %
Accuracy: 90.22 %.
Mean Square Error:  860101595.9955845


In [191]:
# Get feature importances
importances = list(model.feature_importances_)
var_list = list(X_train.columns)

var_importances = pd.DataFrame({"Predictor":var_list, "Importance":importances})
print("Importances > 0.03")
var_importances[var_importances.Importance>0.03].sort_values("Importance",ascending=False)

Importances > 0.03


Unnamed: 0,Predictor,Importance
15,GrLivArea,0.099225
2,LotArea,0.065116
8,BsmtFinSF1,0.060465
1,LotFrontage,0.055814
11,TotalBsmtSF,0.049612
3,OverallQual,0.046512
4,OverallCond,0.04186
25,GarageArea,0.037209
13,SecondFlrSF,0.035659
5,YearBuilt,0.034109


## Retrain with only the top predictors

In [202]:
top_predictors = var_importances[var_importances.Importance>0.03].sort_values("Importance",ascending=False).Predictor
top_predictors = [i for i in top_predictors]

X_red = houses[top_predictors]                             # features
X_red = pd.get_dummies(X, drop_first=True, dummy_na=True)  # encoded features

# 80% train-test-split
X_train_red, X_test_red, y_train_red, y_test_red = train_test_split(X, y, test_size = 0.2)

# Train XGBoost
model_red = XGBRegressor()
model_red.fit(X_train, y_train)
print(model_red)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1, gamma=0, learning_rate=0.1, max_delta_step=0,
       max_depth=3, min_child_weight=1, missing=None, n_estimators=100,
       n_jobs=1, nthread=None, objective='reg:linear', random_state=0,
       reg_alpha=0, reg_lambda=1, scale_pos_weight=1, seed=None,
       silent=True, subsample=1)


## Check new results

In [207]:
red_predictions = model_red.predict(X_test_red)
get_err(len(red_predictions),red_predictions,[i for i in y_test_red])

Mean Absolute Error: 12147.61
Mean Abs Pct Error: 7.61 %
Accuracy: 92.39 %.
Mean Square Error:  295572974.63597655


~3% increase in accuracy

## Cross Validation to Tune Hyperparameters

# Test on "Actual" Test Data