# Model building

## Load libraries

In [1]:
# Library loading
import numpy as np
import pandas as pd
import sklearn.ensemble 
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.4f}'.format

## Set up data and encoders to support model build

In [3]:
df_orig = pd.read_csv('../data/iowa_full.csv')
df = df_orig.copy()

In [4]:
# Drop the row ID column as this is not something that should impart any information.
df.drop('Id',axis=1,inplace=True)

# Capture all adjustments to deal with NaN values.
def denote_null_values(df):
    """Denotes whether or not there are null values or not"""
    empty_cols_query = df.isnull().sum() > 0
    empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()
    for col in empty_df_cols:
        col_name = f"{col}_missing"
        df[col_name] = pd.isnull(df[col])
    return df

df = denote_null_values(df)

# LotFrontage Functions to populate training, test and validation
def LotFrontage_na_calc(training_df):
    lotfrontage_neighborhood_mean = training_df.groupby(by=['Neighborhood'])[['LotFrontage']].mean().reset_index()
    lotfrontage_neighborhood_mean.columns = ['Neighborhood','LotFrontage_Neighborhood_Mean']
    return lotfrontage_neighborhood_mean

def LotFrontage_na_apply(training_df, testing_df, validation_df=None):
    # Calc mean based on training data
    lnm = LotFrontage_na_calc(training_df)
    
    # Apply mean to training data - for neighbourhood
    # Reset LotFrontage NaN in case they have been filled in a prior run
    training_df['LotFrontage'] = np.where(training_df['LotFrontage_missing']==True,np.nan,training_df['LotFrontage'])
    training_df = training_df.merge(lnm,how='left',left_on='Neighborhood',right_on='Neighborhood')
    training_df['LotFrontage'] = training_df['LotFrontage'].fillna(training_df.LotFrontage_Neighborhood_Mean)
    training_df.drop('LotFrontage_Neighborhood_Mean',axis=1,inplace=True)
    
    # Apply mean to testing data
    # Reset LotFrontage NaN in case they have been filled in a prior run
    testing_df['LotFrontage'] = np.where(testing_df['LotFrontage_missing']==True,np.nan,testing_df['LotFrontage'])
    testing_df = testing_df.merge(lnm,how='left',left_on='Neighborhood',right_on='Neighborhood')
    testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(testing_df.LotFrontage_Neighborhood_Mean)
    testing_df.drop('LotFrontage_Neighborhood_Mean',axis=1,inplace=True)
    # Fill the training sample mean if a specific neighborhood is missing from the training sample
    testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(training_df['LotFrontage'].mean())

    if validation_df is None:
        return training_df, testing_df
    else:
        # Apply mean to validation data set
        validation_df['LotFrontage'] = np.where(validation_df['LotFrontage_missing']==True,np.nan,validation_df['LotFrontage'])
        validation_df = validation_df.merge(lnm,how='left',left_on='Neighborhood',right_on='Neighborhood')
        validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(validation_df.LotFrontage_Neighborhood_Mean)
        validation_df.drop('LotFrontage_Neighborhood_Mean',axis=1,inplace=True)        
        validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(training_df['LotFrontage'].mean())
        return training_df, testing_df,validation_df


# Other fills don't rely on knowledge of full sample to update
df['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(),0,1)
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['BsmtQual'] = df['BsmtQual'].fillna('NA')
df['BsmtCond'] = df['BsmtCond'].fillna('NA')
df['BsmtExposure'] = df['BsmtExposure'].fillna('NA')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')
df['Electrical'] = df['Electrical'].fillna('SBrkr')
df['FireplaceQu'] = df['FireplaceQu'].fillna('NA')
df['GarageType'] = df['GarageType'].fillna('NA')
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
df['GarageFinish'] = df['GarageFinish'].fillna('NA')
df['GarageQual'] = df['GarageQual'].fillna('NA')
df['GarageCond'] = df['GarageCond'].fillna('NA')
df['PoolQC'] = df['PoolQC'].fillna('NA')
df['Fence'] = df['Fence'].fillna('NA')
df['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')


# Additional data features to tidy things up; potentially drop some others
df['Functional_Typical_flag']=np.where(df['Functional']=='Typ',1,0)
df['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF']+df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])
df['HasPorch_flag']=np.where(df['PorchSF_Total']>0,1,0)
df['HasPool_flag']=np.where(df['PoolQC']!='NA',1,0)

# df['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2'] 
# Removed this, since during modelling realised that TotalBsmtSF was already a variable, and this just duplicates it.


In [5]:
def ManualOneHotEncoding(df,column_list,ohc_prefix):
    # Identify values for new one hot encoded columns
    
    unique_col_vals = []
    
    for i,col in enumerate(column_list):
        if i == 0:
            unique_col_vals = df[col].unique().tolist()
        else:
            [unique_col_vals.append(j) for j in df[col].unique().tolist()]

    # Limit to unique values to generate columns
    unique_col_vals_set = set(unique_col_vals)
    new_cols = sorted(list(unique_col_vals_set))
    
    # Create and populate columns for data set
    for col in new_cols:
        new_col = ohc_prefix + '_' + col
        df[new_col] = 0 #Create new columns and set to 0
        onehot_target = col
        for i,target_cols in enumerate(column_list):
            if i == 0:
                where_conditions = (df[target_cols] == onehot_target) 
            else:
                where_conditions = where_conditions | (df[target_cols] == onehot_target) 
        # Populate with 0s & 1s
        df[new_col] = np.where(where_conditions,1,0)
        
    return df

# Populate OneHotEncoded Columns
df = ManualOneHotEncoding(df,['Condition1','Condition2'],'Conditions')
df = ManualOneHotEncoding(df,['Exterior1st','Exterior2nd'],'Exterior')
df = ManualOneHotEncoding(df,['BsmtFinType1','BsmtFinType2'],'BsmtFinType')

# Drop OneHotEncoded Columns
df.drop('Condition1',axis=1,inplace=True)
df.drop('Condition2',axis=1,inplace=True)
df.drop('Exterior1st',axis=1,inplace=True)
df.drop('Exterior2nd',axis=1,inplace=True)
df.drop('BsmtFinType1',axis=1,inplace=True)
df.drop('BsmtFinType2',axis=1,inplace=True)

***
**Important Step: Set up training, validation, and test data sets:**

In [6]:
# Train/test sets
train = df.sample(frac=0.9,random_state=743)
test = df.drop(train.index)
train,val = train.iloc[:-100],train.iloc[-100:]

train,test,val = LotFrontage_na_apply(train, test, val)

X_train, y_train = train.drop('SalePrice',axis=1), train['SalePrice']
X_val, y_val = val.drop('SalePrice',axis=1), val['SalePrice']
X_test, y_test  = test.drop('SalePrice',axis=1), test['SalePrice']


***

In [7]:
# Set up encoders

targ_enc_cols = [
    'MSSubClass',
    'MSZoning',
    'LandContour',
    'Neighborhood',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    'Foundation',
    'Heating',
    'Electrical',
    'Functional',
    'GarageType',
    'Fence',
    'SaleType',
    'SaleCondition',
]
target_enc = ce.TargetEncoder(verbose=1,cols=targ_enc_cols,min_samples_leaf=5,smoothing=0.1)

ordenc_cols = [
'LotShape',
'Utilities',
'LotConfig',
'LandSlope',
'ExterQual',
'ExterCond',
'BsmtQual',
'BsmtCond',
'BsmtExposure',
'HeatingQC',
'KitchenQual',
'FireplaceQu',
'GarageFinish',
'GarageQual',
'GarageCond',
'PavedDrive',
'PoolQC',
]

ordenc_maps = [
{'col':'LotShape', 'mapping':{"Reg":0,"IR1":1,"IR2":2,"IR3":3}},
{'col':'Utilities', 'mapping':{"AllPub":0,"NoSwer":1,"NoSeWa":2,"ELO":3}},
{'col':'LotConfig', 'mapping':{'Gtl':1,'Mod':2,'Sev':3,}},
{'col':'LandSlope', 'mapping':{'Gtl':1,'Mod':2,'Sev':3,}},
{'col':'ExterQual', 'mapping':{'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'ExterCond', 'mapping':{'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'BsmtQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'BsmtCond', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'BsmtExposure', 'mapping':{'Gd':1,'Av':2,'Mn':3,'No':4,'NA':5,}},
{'col':'HeatingQC', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'KitchenQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'FireplaceQu', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'GarageFinish', 'mapping':{'Fin':1,'RFn':2,'Unf':3,'NA':4,}},
{'col':'GarageQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'GarageCond', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'PavedDrive', 'mapping':{'Y':1,'P':2,'N':3}},
{'col':'PoolQC', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
]

ordinal_enc = ce.OrdinalEncoder(cols=ordenc_cols,mapping=ordenc_maps,verbose=1)

onehot_enc = ce.OneHotEncoder(verbose=1,cols=['Street','Alley','CentralAir','MiscFeature'],use_cat_names=True)


## First round of modelling


In [8]:
gbm = GradientBoostingRegressor(max_depth=5,min_samples_leaf=5,n_estimators=400)
gbm.get_params()


{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [9]:
s1 = target_enc.fit_transform(X_train, y_train,return_df=True,)
s2 = ordinal_enc.fit_transform(s1, y_train,return_df=True)
s3 = onehot_enc.fit_transform(s2, y_train,return_df=True)
feature_names = s3.columns.to_list()

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [10]:
len(feature_names)

140

In [11]:
pipe1 = make_pipeline(target_enc,ordinal_enc,onehot_enc,gbm)

In [12]:
pipe1.fit(X_train,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['MSSubClass', 'MSZoning', 'LandContour',
                                     'Neighborhood', 'BldgType', 'HouseStyle',
                                     'RoofStyle', 'RoofMatl', 'MasVnrType',
                                     'Foundation', 'Heating', 'Electrical',
                                     'Functional', 'GarageType', 'Fence',
                                     'SaleType', 'SaleCondition'],
                               min_samples_leaf=5, smoothing=0.1, verbose=1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['L...
                                         {'col': 'PavedDrive',
                                          'mapping': {'N': 3, 'P': 2, 'Y': 1}},
                                         {'col': 'PoolQC',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
   

In [13]:
pipe1.score(X_train,y_train)

0.9994150681853274

In [14]:
pipe1.score(X_val,y_val)

0.9068570153037336

In [15]:
pipe1.score(X_test,y_test)

0.9009609809703963

In [16]:
# let's create our feature importance dataframe
feats = pd.DataFrame({
    'Columns': feature_names,
    'Importance': pipe1[3].feature_importances_
}).sort_values(by='Importance', ascending=False)
# and here we go
feats[(feats['Importance']>0.01)]

Unnamed: 0,Columns,Importance
17,OverallQual,0.5076
14,Neighborhood,0.1117
43,GrLivArea,0.1099
58,GarageCars,0.056
31,BsmtFinSF1,0.0354
40,1stFlrSF,0.028
34,TotalBsmtSF,0.0251
3,LotArea,0.013
102,PorchSF_Total,0.0126
50,KitchenQual,0.0104


In [17]:
# Features that contribute betwen 0.1% and 1%
feats[((feats['Importance']<=0.01) & (feats['Importance']>0.001))]

Unnamed: 0,Columns,Importance
59,GarageArea,0.0068
51,TotRmsAbvGrd,0.0064
20,YearRemodAdd,0.0057
56,GarageYrBlt,0.0057
18,OverallCond,0.0049
19,YearBuilt,0.0047
2,LotFrontage,0.0041
41,2ndFlrSF,0.0039
25,ExterQual,0.0037
33,BsmtUnfSF,0.0035


## 2nd round of modelling 
## - test reduced feature set to those with at least 1% impact

In [18]:
over_1perc_feats = feats[(feats['Importance']>0.01)]['Columns'].to_list()

In [19]:
X_train2 = X_train[over_1perc_feats]
X_val2 = X_val[over_1perc_feats]
X_test2 = X_test[over_1perc_feats]

In [20]:
X_train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1214 entries, 0 to 1213
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OverallQual    1214 non-null   int64 
 1   Neighborhood   1214 non-null   object
 2   GrLivArea      1214 non-null   int64 
 3   GarageCars     1214 non-null   int64 
 4   BsmtFinSF1     1214 non-null   int64 
 5   1stFlrSF       1214 non-null   int64 
 6   TotalBsmtSF    1214 non-null   int64 
 7   LotArea        1214 non-null   int64 
 8   PorchSF_Total  1214 non-null   int64 
 9   KitchenQual    1214 non-null   object
dtypes: int64(8), object(2)
memory usage: 144.3+ KB


In [21]:
X_train2.head(5)

Unnamed: 0,OverallQual,Neighborhood,GrLivArea,GarageCars,BsmtFinSF1,1stFlrSF,TotalBsmtSF,LotArea,PorchSF_Total,KitchenQual
0,5,BrkSide,1167,2,645,1167,915,8731,342,TA
1,7,Somerst,1478,2,578,1478,1470,4403,144,Gd
2,3,OldTown,1699,2,440,1014,978,10615,74,TA
3,5,ClearCr,3086,0,152,1636,1598,18030,122,Ex
4,5,Edwards,1144,1,739,1144,1144,9571,44,TA


In [22]:
# Set up encoders and GBM

targ_enc_cols = [
    'Neighborhood',
]
target_enc2 = ce.TargetEncoder(verbose=1,cols=targ_enc_cols,min_samples_leaf=5,smoothing=0.1)

ordenc_cols = [
'KitchenQual',
]

ordenc_maps = [
{'col':'KitchenQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
]

ordinal_enc2 = ce.OrdinalEncoder(cols=ordenc_cols,mapping=ordenc_maps,verbose=1)

gbm2 = GradientBoostingRegressor(max_depth=5,min_samples_leaf=5,n_estimators=400)

In [23]:
pipe2 = make_pipeline(target_enc2,ordinal_enc2,gbm2)

In [24]:
pipe2.fit(X_train2,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['Neighborhood'], min_samples_leaf=5,
                               smoothing=0.1, verbose=1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['KitchenQual'],
                                mapping=[{'col': 'KitchenQual',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
                                                      'TA': 3}}],
                                verbose=1)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=5, min_samples_leaf=5,
                                           n_estimators=400))])

In [26]:
pipe2.score(X_train2,y_train)

0.9975146453869387

In [27]:
pipe2.score(X_val2,y_val)

0.8956636175040908

In [28]:
pipe2.score(X_test2,y_test)

0.8795846369000951

In [29]:
feature2_names = X_train2.columns.to_list()

In [30]:
# let's create our feature importance dataframe
feats = pd.DataFrame({
    'Columns': feature2_names,
    'Importance': pipe2[2].feature_importances_
}).sort_values(by='Importance', ascending=False)
# and here we go
feats[(feats['Importance']>0.01)]

Unnamed: 0,Columns,Importance
0,OverallQual,0.5225
2,GrLivArea,0.1251
1,Neighborhood,0.1226
3,GarageCars,0.0548
4,BsmtFinSF1,0.0521
5,1stFlrSF,0.0334
6,TotalBsmtSF,0.0328
7,LotArea,0.0223
8,PorchSF_Total,0.0207
9,KitchenQual,0.0138


## Key features driving SalesPrice:

**Top 5 are:**

OverallQual: Rates the overall material and finish of the house

       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average
       5	Average
       4	Below Average
       3	Fair
       2	Poor
       1	Very Poor
       
GrLivArea: Above grade (ground) living area square feet

Neighborhood: Physical locations within Ames city limits

GarageCars: Size of garage in car capacity

1stFlrSF: First Floor square feet

BsmtFinSF1: Type 1 finished square feet


## Modelling Round 3
* Given similarity betwent BsmtFinSF1 and TotalBsmtSF, which would be highly correlated, test model without BsmtFinSF1 (as TotalBsmtSF will have all info of BsmtFinSF1 + any info on additional basement finishes)
* Can use same encoders as round 2 given that the variables are largely the same.

In [31]:
X_train3 = X_train[over_1perc_feats].copy()
X_val3 = X_val[over_1perc_feats].copy()
X_test3 = X_test[over_1perc_feats].copy()

X_train3.drop('BsmtFinSF1',1,inplace=True)
X_val3.drop('BsmtFinSF1',1,inplace=True)
X_test3.drop('BsmtFinSF1',1,inplace=True)

In [32]:
gbm3 = GradientBoostingRegressor(max_depth=5,min_samples_leaf=5,n_estimators=400)
pipe3 = make_pipeline(target_enc2,ordinal_enc2,gbm3)

In [33]:
pipe3.fit(X_train3,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['Neighborhood'], min_samples_leaf=5,
                               smoothing=0.1, verbose=1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['KitchenQual'],
                                mapping=[{'col': 'KitchenQual',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
                                                      'TA': 3}}],
                                verbose=1)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=5, min_samples_leaf=5,
                                           n_estimators=400))])

In [34]:
pipe3.score(X_train3,y_train)

0.9958617102757572

In [35]:
pipe3.score(X_val3,y_val)

0.9013083425444236

In [36]:
pipe3.score(X_test3,y_test) # Still close to validation set results.

0.8561492550024187

In [37]:
feature3_names = X_train3.columns.to_list()
# let's create our feature importance dataframe
feats3 = pd.DataFrame({
    'Columns': feature3_names,
    'Importance': pipe3[2].feature_importances_
}).sort_values(by='Importance', ascending=False)
# and here we go
feats3

Unnamed: 0,Columns,Importance
0,OverallQual,0.5361
2,GrLivArea,0.1254
1,Neighborhood,0.1167
5,TotalBsmtSF,0.0523
3,GarageCars,0.0509
4,1stFlrSF,0.0406
6,LotArea,0.0321
7,PorchSF_Total,0.0263
8,KitchenQual,0.0196
