# Model building

## Load libraries

In [161]:
# Library loading

# To hide future warnings from sklearn.ensemble
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

import numpy as np
import pandas as pd
import category_encoders as ce
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.pipeline import make_pipeline
import plotly.express as px
import plotly.graph_objects as go
from sklearn.model_selection import cross_val_score, train_test_split

In [2]:
pd.set_option('display.max_rows', None)
pd.set_option('display.max_columns', None)
pd.options.display.float_format = '{:,.4f}'.format

## Set up data and encoders to support model build

In [3]:
df_orig = pd.read_csv('../data/iowa_full.csv')
df = df_orig.copy()

In [4]:
# Drop the row ID column as this is not something that should impart any information.
df.drop('Id',axis=1,inplace=True)

# Capture all adjustments to deal with NaN values.
def denote_null_values(df):
    """Denotes whether or not there are null values or not"""
    empty_cols_query = df.isnull().sum() > 0
    empty_df_cols = df.loc[:, empty_cols_query].columns.tolist()
    for col in empty_df_cols:
        col_name = f"{col}_missing"
        df[col_name] = pd.isnull(df[col])
    return df

df = denote_null_values(df)

# LotFrontage Functions to populate training, test and validation
def LotFrontage_na_calc(training_df):
    lotfrontage_neighborhood_mean = training_df.groupby(by=['Neighborhood'])[['LotFrontage']].mean().reset_index()
    lotfrontage_neighborhood_mean.columns = ['Neighborhood','LotFrontage_Neighborhood_Mean']
    return lotfrontage_neighborhood_mean

def LotFrontage_na_apply(training_df, testing_df, validation_df=None):
    # Calc mean based on training data
    lnm = LotFrontage_na_calc(training_df)
    
    # Apply mean to training data - for neighbourhood
    # Reset LotFrontage NaN in case they have been filled in a prior run
    training_df['LotFrontage'] = np.where(training_df['LotFrontage_missing']==True,np.nan,training_df['LotFrontage'])
    training_df = training_df.merge(lnm,how='left',left_on='Neighborhood',right_on='Neighborhood')
    training_df['LotFrontage'] = training_df['LotFrontage'].fillna(training_df.LotFrontage_Neighborhood_Mean)
    training_df.drop('LotFrontage_Neighborhood_Mean',axis=1,inplace=True)
    
    # Apply mean to testing data
    # Reset LotFrontage NaN in case they have been filled in a prior run
    testing_df['LotFrontage'] = np.where(testing_df['LotFrontage_missing']==True,np.nan,testing_df['LotFrontage'])
    testing_df = testing_df.merge(lnm,how='left',left_on='Neighborhood',right_on='Neighborhood')
    testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(testing_df.LotFrontage_Neighborhood_Mean)
    testing_df.drop('LotFrontage_Neighborhood_Mean',axis=1,inplace=True)
    # Fill the training sample mean if a specific neighborhood is missing from the training sample
    testing_df['LotFrontage'] = testing_df['LotFrontage'].fillna(training_df['LotFrontage'].mean())

    if validation_df is None:
        return training_df, testing_df
    else:
        # Apply mean to validation data set
        validation_df['LotFrontage'] = np.where(validation_df['LotFrontage_missing']==True,np.nan,validation_df['LotFrontage'])
        validation_df = validation_df.merge(lnm,how='left',left_on='Neighborhood',right_on='Neighborhood')
        validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(validation_df.LotFrontage_Neighborhood_Mean)
        validation_df.drop('LotFrontage_Neighborhood_Mean',axis=1,inplace=True)        
        validation_df['LotFrontage'] = validation_df['LotFrontage'].fillna(training_df['LotFrontage'].mean())
        return training_df, testing_df,validation_df


# Other fills don't rely on knowledge of full sample to update
df['AlleyAccess_Flag'] = np.where(df['Alley'].isnull(),0,1)
df['MasVnrType'] = df['MasVnrType'].fillna('None')
df['MasVnrArea'] = df['MasVnrArea'].fillna(0)
df['BsmtQual'] = df['BsmtQual'].fillna('NA')
df['BsmtCond'] = df['BsmtCond'].fillna('NA')
df['BsmtExposure'] = df['BsmtExposure'].fillna('NA')
df['BsmtFinType1'] = df['BsmtFinType1'].fillna('NA')
df['BsmtFinType2'] = df['BsmtFinType2'].fillna('NA')
df['Electrical'] = df['Electrical'].fillna('SBrkr')
df['FireplaceQu'] = df['FireplaceQu'].fillna('NA')
df['GarageType'] = df['GarageType'].fillna('NA')
df['GarageYrBlt'] = df['GarageYrBlt'].fillna(0)
df['GarageFinish'] = df['GarageFinish'].fillna('NA')
df['GarageQual'] = df['GarageQual'].fillna('NA')
df['GarageCond'] = df['GarageCond'].fillna('NA')
df['PoolQC'] = df['PoolQC'].fillna('NA')
df['Fence'] = df['Fence'].fillna('NA')
df['MiscFeature'] = df['MiscFeature'].fillna('no_misc_feature_recorded')


# Additional data features to tidy things up; potentially drop some others
df['Functional_Typical_flag']=np.where(df['Functional']=='Typ',1,0)
df['PorchSF_Total'] = (df['WoodDeckSF']+df['OpenPorchSF']+df['EnclosedPorch']+df['3SsnPorch']+df['ScreenPorch'])
df['HasPorch_flag']=np.where(df['PorchSF_Total']>0,1,0)
df['HasPool_flag']=np.where(df['PoolQC']!='NA',1,0)

# df['BsmtFinSF_Total'] = df['BsmtFinSF1']+df['BsmtFinSF2'] 
# Removed this, since during modelling realised that TotalBsmtSF was already a variable, and this just duplicates it.


In [5]:
def ManualOneHotEncoding(df,column_list,ohc_prefix):
    # Identify values for new one hot encoded columns
    
    unique_col_vals = []
    
    for i,col in enumerate(column_list):
        if i == 0:
            unique_col_vals = df[col].unique().tolist()
        else:
            [unique_col_vals.append(j) for j in df[col].unique().tolist()]

    # Limit to unique values to generate columns
    unique_col_vals_set = set(unique_col_vals)
    new_cols = sorted(list(unique_col_vals_set))
    
    # Create and populate columns for data set
    for col in new_cols:
        new_col = ohc_prefix + '_' + col
        df[new_col] = 0 #Create new columns and set to 0
        onehot_target = col
        for i,target_cols in enumerate(column_list):
            if i == 0:
                where_conditions = (df[target_cols] == onehot_target) 
            else:
                where_conditions = where_conditions | (df[target_cols] == onehot_target) 
        # Populate with 0s & 1s
        df[new_col] = np.where(where_conditions,1,0)
        
    return df

# Populate OneHotEncoded Columns
df = ManualOneHotEncoding(df,['Condition1','Condition2'],'Conditions')
df = ManualOneHotEncoding(df,['Exterior1st','Exterior2nd'],'Exterior')
df = ManualOneHotEncoding(df,['BsmtFinType1','BsmtFinType2'],'BsmtFinType')

# Drop OneHotEncoded Columns
df.drop('Condition1',axis=1,inplace=True)
df.drop('Condition2',axis=1,inplace=True)
df.drop('Exterior1st',axis=1,inplace=True)
df.drop('Exterior2nd',axis=1,inplace=True)
df.drop('BsmtFinType1',axis=1,inplace=True)
df.drop('BsmtFinType2',axis=1,inplace=True)

***
**Important Step: Set up training, validation, and test data sets:**

In [6]:
# Train/test sets
train = df.sample(frac=0.9,random_state=743)
test = df.drop(train.index)
train,val = train.iloc[:-100],train.iloc[-100:]

train,test,val = LotFrontage_na_apply(train, test, val)

X_train, y_train = train.drop('SalePrice',axis=1), train['SalePrice']
X_val, y_val = val.drop('SalePrice',axis=1), val['SalePrice']
X_test, y_test  = test.drop('SalePrice',axis=1), test['SalePrice']


***

In [7]:
# Set up encoders

targ_enc_cols = [
    'MSSubClass',
    'MSZoning',
    'LandContour',
    'Neighborhood',
    'BldgType',
    'HouseStyle',
    'RoofStyle',
    'RoofMatl',
    'MasVnrType',
    'Foundation',
    'Heating',
    'Electrical',
    'Functional',
    'GarageType',
    'Fence',
    'SaleType',
    'SaleCondition',
]
target_enc = ce.TargetEncoder(verbose=1,cols=targ_enc_cols,min_samples_leaf=5,smoothing=0.1)

ordenc_cols = [
'LotShape',
'Utilities',
'LotConfig',
'LandSlope',
'ExterQual',
'ExterCond',
'BsmtQual',
'BsmtCond',
'BsmtExposure',
'HeatingQC',
'KitchenQual',
'FireplaceQu',
'GarageFinish',
'GarageQual',
'GarageCond',
'PavedDrive',
'PoolQC',
]

ordenc_maps = [
{'col':'LotShape', 'mapping':{"Reg":0,"IR1":1,"IR2":2,"IR3":3}},
{'col':'Utilities', 'mapping':{"AllPub":0,"NoSwer":1,"NoSeWa":2,"ELO":3}},
{'col':'LotConfig', 'mapping':{'Gtl':1,'Mod':2,'Sev':3,}},
{'col':'LandSlope', 'mapping':{'Gtl':1,'Mod':2,'Sev':3,}},
{'col':'ExterQual', 'mapping':{'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'ExterCond', 'mapping':{'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'BsmtQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'BsmtCond', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'BsmtExposure', 'mapping':{'Gd':1,'Av':2,'Mn':3,'No':4,'NA':5,}},
{'col':'HeatingQC', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'KitchenQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'FireplaceQu', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'GarageFinish', 'mapping':{'Fin':1,'RFn':2,'Unf':3,'NA':4,}},
{'col':'GarageQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'GarageCond', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
{'col':'PavedDrive', 'mapping':{'Y':1,'P':2,'N':3}},
{'col':'PoolQC', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
]

ordinal_enc = ce.OrdinalEncoder(cols=ordenc_cols,mapping=ordenc_maps,verbose=1)

onehot_enc = ce.OneHotEncoder(verbose=1,cols=['Street','Alley','CentralAir','MiscFeature'],use_cat_names=True)


## First round of modelling


In [8]:
gbm = GradientBoostingRegressor(max_depth=5,min_samples_leaf=5,n_estimators=400)
gbm.get_params()


{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [9]:
s1 = target_enc.fit_transform(X_train, y_train,return_df=True,)
s2 = ordinal_enc.fit_transform(s1, y_train,return_df=True)
s3 = onehot_enc.fit_transform(s2, y_train,return_df=True)
feature_names = s3.columns.to_list()

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


In [10]:
len(feature_names)

140

In [11]:
pipe1 = make_pipeline(target_enc,ordinal_enc,onehot_enc,gbm)

In [12]:
pipe1.fit(X_train,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['MSSubClass', 'MSZoning', 'LandContour',
                                     'Neighborhood', 'BldgType', 'HouseStyle',
                                     'RoofStyle', 'RoofMatl', 'MasVnrType',
                                     'Foundation', 'Heating', 'Electrical',
                                     'Functional', 'GarageType', 'Fence',
                                     'SaleType', 'SaleCondition'],
                               min_samples_leaf=5, smoothing=0.1, verbose=1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['L...
                                         {'col': 'PavedDrive',
                                          'mapping': {'N': 3, 'P': 2, 'Y': 1}},
                                         {'col': 'PoolQC',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
   

In [13]:
pipe1.score(X_train,y_train)

0.9994150681853274

In [14]:
pipe1.score(X_val,y_val)
# Given extremely high initial outputs - potential over fitting is likely.

0.9069245521110199

In [15]:
pipe1.score(X_test,y_test)

0.9011118042880839

In [17]:
# let's create our feature importance dataframe
feats = pd.DataFrame({
    'Columns': feature_names,
    'Importance': pipe1[3].feature_importances_
}).sort_values(by='Importance', ascending=False)
# and here we go
feats[(feats['Importance']>0.01)]

Unnamed: 0,Columns,Importance
17,OverallQual,0.5076
14,Neighborhood,0.1117
43,GrLivArea,0.1099
58,GarageCars,0.056
31,BsmtFinSF1,0.0354
40,1stFlrSF,0.0282
34,TotalBsmtSF,0.0248
3,LotArea,0.013
102,PorchSF_Total,0.0126
50,KitchenQual,0.0104


In [18]:
# Features that contribute betwen 0.1% and 1%
feats[((feats['Importance']<=0.01) & (feats['Importance']>0.001))]

Unnamed: 0,Columns,Importance
59,GarageArea,0.0068
51,TotRmsAbvGrd,0.0064
20,YearRemodAdd,0.0056
19,YearBuilt,0.0054
56,GarageYrBlt,0.005
18,OverallCond,0.0049
2,LotFrontage,0.0041
41,2ndFlrSF,0.0039
25,ExterQual,0.0037
33,BsmtUnfSF,0.0035


## 2nd round of modelling 
## - test reduced feature set to those with at least 1% impact
* The goal in reducing the feature set is to see if the model can be simplified with minimal impact on score.
* Because a high scoring model with many columns can be more difficult to interpret and assess it makes sense to limit variables to key drivers if this can be achieved with minimal reduction in model score.

In [19]:
over_1perc_feats = feats[(feats['Importance']>0.01)]['Columns'].to_list()

In [20]:
X_train2 = X_train[over_1perc_feats]
X_val2 = X_val[over_1perc_feats]
X_test2 = X_test[over_1perc_feats]

In [21]:
X_train2.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1214 entries, 0 to 1213
Data columns (total 10 columns):
 #   Column         Non-Null Count  Dtype 
---  ------         --------------  ----- 
 0   OverallQual    1214 non-null   int64 
 1   Neighborhood   1214 non-null   object
 2   GrLivArea      1214 non-null   int64 
 3   GarageCars     1214 non-null   int64 
 4   BsmtFinSF1     1214 non-null   int64 
 5   1stFlrSF       1214 non-null   int64 
 6   TotalBsmtSF    1214 non-null   int64 
 7   LotArea        1214 non-null   int64 
 8   PorchSF_Total  1214 non-null   int64 
 9   KitchenQual    1214 non-null   object
dtypes: int64(8), object(2)
memory usage: 144.3+ KB


In [22]:
X_train2.head(5)

Unnamed: 0,OverallQual,Neighborhood,GrLivArea,GarageCars,BsmtFinSF1,1stFlrSF,TotalBsmtSF,LotArea,PorchSF_Total,KitchenQual
0,5,BrkSide,1167,2,645,1167,915,8731,342,TA
1,7,Somerst,1478,2,578,1478,1470,4403,144,Gd
2,3,OldTown,1699,2,440,1014,978,10615,74,TA
3,5,ClearCr,3086,0,152,1636,1598,18030,122,Ex
4,5,Edwards,1144,1,739,1144,1144,9571,44,TA


In [23]:
# Set up encoders and GBM

targ_enc_cols = [
    'Neighborhood',
]
target_enc2 = ce.TargetEncoder(verbose=1,cols=targ_enc_cols,min_samples_leaf=5,smoothing=0.1)

ordenc_cols = [
'KitchenQual',
]

ordenc_maps = [
{'col':'KitchenQual', 'mapping':{'NA':0,'Ex':1,'Gd':2,'TA':3,'Fa':4,'Po':5,}},
]

ordinal_enc2 = ce.OrdinalEncoder(cols=ordenc_cols,mapping=ordenc_maps,verbose=1)

gbm2 = GradientBoostingRegressor(max_depth=5,min_samples_leaf=5,n_estimators=400)

In [24]:
pipe2 = make_pipeline(target_enc2,ordinal_enc2,gbm2)

In [25]:
pipe2.fit(X_train2,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['Neighborhood'], min_samples_leaf=5,
                               smoothing=0.1, verbose=1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['KitchenQual'],
                                mapping=[{'col': 'KitchenQual',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
                                                      'TA': 3}}],
                                verbose=1)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=5, min_samples_leaf=5,
                                           n_estimators=400))])

In [26]:
pipe2.score(X_train2,y_train)

0.9975146453869387

In [27]:
pipe2.score(X_val2,y_val)

0.8954370884096682

In [28]:
pipe2.score(X_test2,y_test)

0.8797100386378207

In [29]:
feature2_names = X_train2.columns.to_list()

In [30]:
# let's create our feature importance dataframe
feats = pd.DataFrame({
    'Columns': feature2_names,
    'Importance': pipe2[2].feature_importances_
}).sort_values(by='Importance', ascending=False)
# and here we go
feats[(feats['Importance']>0.01)]

Unnamed: 0,Columns,Importance
0,OverallQual,0.5225
2,GrLivArea,0.1248
1,Neighborhood,0.1227
3,GarageCars,0.0548
4,BsmtFinSF1,0.0523
5,1stFlrSF,0.0332
6,TotalBsmtSF,0.033
7,LotArea,0.0223
8,PorchSF_Total,0.0207
9,KitchenQual,0.0138


## Key features driving SalesPrice:

**Top 5 are:**

OverallQual: Rates the overall material and finish of the house

       10	Very Excellent
       9	Excellent
       8	Very Good
       7	Good
       6	Above Average
       5	Average
       4	Below Average
       3	Fair
       2	Poor
       1	Very Poor
       
GrLivArea: Above grade (ground) living area square feet

Neighborhood: Physical locations within Ames city limits

GarageCars: Size of garage in car capacity

1stFlrSF: First Floor square feet

BsmtFinSF1: Type 1 finished square feet


Reducing down to 10 variables had little impact on score (reducing by only 2% in the validation set) with validation and test scores remaining consistent, which is encouraging as it suggests that although we have excluded a lot of columns.

Given that the validation score remains close to the test score this suggests that we aren't overemphasising extreme samples.

## Modelling Round 3
* Given similarity betwent BsmtFinSF1 and TotalBsmtSF, which would be highly correlated, test model without BsmtFinSF1 (as TotalBsmtSF will have all info of BsmtFinSF1 + any info on additional basement finishes)
* Can use same encoders as round 2 given that the variables are largely the same.

In [31]:
X_train3 = X_train[over_1perc_feats].copy()
X_val3 = X_val[over_1perc_feats].copy()
X_test3 = X_test[over_1perc_feats].copy()

X_train3.drop('BsmtFinSF1',1,inplace=True)
X_val3.drop('BsmtFinSF1',1,inplace=True)
X_test3.drop('BsmtFinSF1',1,inplace=True)

In [32]:
gbm3 = GradientBoostingRegressor(max_depth=5,min_samples_leaf=5,n_estimators=400)
pipe3 = make_pipeline(target_enc2,ordinal_enc2,gbm3)

In [33]:
pipe3.fit(X_train3,y_train)

  elif pd.api.types.is_categorical(cols):
  elif pd.api.types.is_categorical(cols):


Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['Neighborhood'], min_samples_leaf=5,
                               smoothing=0.1, verbose=1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['KitchenQual'],
                                mapping=[{'col': 'KitchenQual',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
                                                      'TA': 3}}],
                                verbose=1)),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=5, min_samples_leaf=5,
                                           n_estimators=400))])

In [34]:
pipe3.score(X_train3,y_train)

0.9958617102757572

In [35]:
pipe3.score(X_val3,y_val)

0.9014352394590235

In [36]:
pipe3.score(X_test3,y_test) # Still close to validation set results.

0.8545353952893865

In [37]:
feature3_names = X_train3.columns.to_list()
# let's create our feature importance dataframe
feats3 = pd.DataFrame({
    'Columns': feature3_names,
    'Importance': pipe3[2].feature_importances_
}).sort_values(by='Importance', ascending=False)
# and here we go
feats3

Unnamed: 0,Columns,Importance
0,OverallQual,0.5361
2,GrLivArea,0.1263
1,Neighborhood,0.1167
3,GarageCars,0.0508
5,TotalBsmtSF,0.0503
4,1stFlrSF,0.0418
6,LotArea,0.0321
7,PorchSF_Total,0.0263
8,KitchenQual,0.0197


* Again validation score don't drop, however there is sum reduction in power for the test set. This seems ok, as we will focus next on seeing if we can improve the validation score by uplifting model parameters.

# Model Parameter adjustments
* Now we have a tightly confined set of relevant data that is known to strongly drive outcomes, the next step will be to look at how adjustments to our model parameters to see if we can improve the model score.
* As usual the key items we want to test are:
    * n_estimators: at 200, 400, 600
    * learning_rate: at 0.05, 0.1
    * max_depth: at 3,4,5,8
    * max_leaf_nodes: at 1,5,8,12,20
    * min_samples_leaf: at 1,5,8,12,20
    
 Noting that 20 is around 2% of our training sample

* This would mean we need 3*2*4*5*5 = 600 model generations to collect the scores.

In [38]:
gbm3.get_params()

{'alpha': 0.9,
 'ccp_alpha': 0.0,
 'criterion': 'friedman_mse',
 'init': None,
 'learning_rate': 0.1,
 'loss': 'ls',
 'max_depth': 5,
 'max_features': None,
 'max_leaf_nodes': None,
 'min_impurity_decrease': 0.0,
 'min_impurity_split': None,
 'min_samples_leaf': 5,
 'min_samples_split': 2,
 'min_weight_fraction_leaf': 0.0,
 'n_estimators': 400,
 'n_iter_no_change': None,
 'presort': 'deprecated',
 'random_state': None,
 'subsample': 1.0,
 'tol': 0.0001,
 'validation_fraction': 0.1,
 'verbose': 0,
 'warm_start': False}

In [63]:
n_estimators = [200,400,600]
learning_rate = [0.05, 0.1]
max_depth = [3,4,5,8]
max_leaf_nodes = [None,5,8,12,20] 
min_samples_leaf = [1,5,8,12,20]
iter_score = 0
iter_count = 0
best_iter_score = -1
save_iter_vars=[]
best_iter_vars = {
    'iter_count' : 0,
    'n_estimators' : 0,
    'learning_rate' : 0,
    'max_depth' : 0,
    'max_leaf_nodes' : 0,
    'min_samples_leaf' : 0,
    'score': 0,
}

target_enc2.set_params(verbose=0)
ordinal_enc2.set_params(verbose=0)
gbm3.set_params(verbose=0)
    
for est in n_estimators:
    for lr in learning_rate:
        for md in max_depth:
            for mln in max_leaf_nodes:
                for msl in min_samples_leaf:
                    # Begin new iteration
                    iter_count += 1
                    
                    # Set parameters,fit, score
                    gbm3 = GradientBoostingRegressor(
                        n_estimators=est,
                        learning_rate = lr,
                        max_depth=md,
                        max_leaf_nodes = mln,
                        min_samples_leaf=msl,
                    )
                    pipe3 = make_pipeline(target_enc2,ordinal_enc2,gbm3)
                    pipe3.fit(X_train3,y_train)
                    iter_score = pipe3.score(X_val3,y_val)
                    
                    # Record scores for future use
                    save_iter_vars.append(
                        {
                            'iter_count' : iter_count,
                            'n_estimators' : est,
                            'learning_rate' : lr,
                            'max_depth' : md,
                            'max_leaf_nodes' : mln,
                            'min_samples_leaf' : msl,
                            'score': iter_score,
                        }
                    )
                    
                    # Check progress
                    if iter_count % 10 == 0:
                        print (f"Just completed iteration: {iter_count}")
                    
                    # Check if score has improved
                    if iter_score > best_iter_score:
                        best_iter_score = iter_score
                        best_iter_vars['iter_count'] = iter_count
                        best_iter_vars['n_estimators'] = est
                        best_iter_vars['learning_rate'] = lr
                        best_iter_vars['max_depth'] = md
                        best_iter_vars['max_leaf_nodes'] = mln
                        best_iter_vars['min_samples_leaf'] = msl
                        best_iter_vars['score'] = iter_score
                        print (f"{best_iter_vars}")
print("Finished fitting models.")


{'iter_count': 1, 'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 3, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'score': 0.9067001085429429}
Just completed iteration: 10
{'iter_count': 11, 'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 3, 'max_leaf_nodes': 8, 'min_samples_leaf': 1, 'score': 0.9076736618056456}
Just completed iteration: 20
{'iter_count': 21, 'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 3, 'max_leaf_nodes': 20, 'min_samples_leaf': 1, 'score': 0.9077374274888556}
Just completed iteration: 30
Just completed iteration: 40
Just completed iteration: 50
{'iter_count': 51, 'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 5, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'score': 0.915215420211138}
Just completed iteration: 60
Just completed iteration: 70
{'iter_count': 76, 'n_estimators': 200, 'learning_rate': 0.05, 'max_depth': 8, 'max_leaf_nodes': None, 'min_samples_leaf': 1, 'score': 0.919337512871712}
Just completed iteration: 80
J

In [64]:
save_iter_vars

[{'iter_count': 1,
  'n_estimators': 200,
  'learning_rate': 0.05,
  'max_depth': 3,
  'max_leaf_nodes': None,
  'min_samples_leaf': 1,
  'score': 0.9067001085429429},
 {'iter_count': 2,
  'n_estimators': 200,
  'learning_rate': 0.05,
  'max_depth': 3,
  'max_leaf_nodes': None,
  'min_samples_leaf': 5,
  'score': 0.9031157003751228},
 {'iter_count': 3,
  'n_estimators': 200,
  'learning_rate': 0.05,
  'max_depth': 3,
  'max_leaf_nodes': None,
  'min_samples_leaf': 8,
  'score': 0.8697533069087307},
 {'iter_count': 4,
  'n_estimators': 200,
  'learning_rate': 0.05,
  'max_depth': 3,
  'max_leaf_nodes': None,
  'min_samples_leaf': 12,
  'score': 0.8670403978802954},
 {'iter_count': 5,
  'n_estimators': 200,
  'learning_rate': 0.05,
  'max_depth': 3,
  'max_leaf_nodes': None,
  'min_samples_leaf': 20,
  'score': 0.8588503442336559},
 {'iter_count': 6,
  'n_estimators': 200,
  'learning_rate': 0.05,
  'max_depth': 3,
  'max_leaf_nodes': 5,
  'min_samples_leaf': 1,
  'score': 0.903568641685

In [None]:
!pwd

In [66]:
with open("model_param_sweep_scores.txt","w") as output:
    output.write(str(save_iter_vars))

In [88]:
col_names = list(save_iter_vars[0].keys())
col_vals = []
for i in range(0,600):
    col_vals.append(list(save_iter_vars[i].values()))


In [89]:
col_vals[0]

[1, 200, 0.05, 3, None, 1, 0.9067001085429429]

In [113]:
param_sweep_df = pd.DataFrame(col_vals,columns=col_names)
param_sweep_df.head()

Unnamed: 0,iter_count,n_estimators,learning_rate,max_depth,max_leaf_nodes,min_samples_leaf,score
0,1,200,0.05,3,,1,0.9067
1,2,200,0.05,3,,5,0.9031
2,3,200,0.05,3,,8,0.8698
3,4,200,0.05,3,,12,0.867
4,5,200,0.05,3,,20,0.8589


In [114]:
param_sweep_df['max_leaf_nodes'] = param_sweep_df['max_leaf_nodes'].fillna(0)
param_sweep_df.head()

Unnamed: 0,iter_count,n_estimators,learning_rate,max_depth,max_leaf_nodes,min_samples_leaf,score
0,1,200,0.05,3,0.0,1,0.9067
1,2,200,0.05,3,0.0,5,0.9031
2,3,200,0.05,3,0.0,8,0.8698
3,4,200,0.05,3,0.0,12,0.867
4,5,200,0.05,3,0.0,20,0.8589


In [123]:
# Oddly scores got worse with more iterations.
# This suggets that given the limited sample size (i.e. only 800 rows) more iterations simply increased
# the incidence of overfitting.
param_sweep_df.groupby('n_estimators').mean()

Unnamed: 0_level_0,iter_count,learning_rate,max_depth,max_leaf_nodes,min_samples_leaf,score
n_estimators,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200,100.5,0.075,5.0,9.0,9.2,0.8789
400,300.5,0.075,5.0,9.0,9.2,0.8768
600,500.5,0.075,5.0,9.0,9.2,0.8755


In [120]:
# Model scores generally improved with lower learning rates
param_sweep_df.groupby(['n_estimators','learning_rate']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,iter_count,max_depth,max_leaf_nodes,min_samples_leaf,score
n_estimators,learning_rate,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200,0.05,50.5,5.0,9.0,9.2,0.8801
200,0.1,150.5,5.0,9.0,9.2,0.8778
400,0.05,250.5,5.0,9.0,9.2,0.8788
400,0.1,350.5,5.0,9.0,9.2,0.8749
600,0.05,450.5,5.0,9.0,9.2,0.8779
600,0.1,550.5,5.0,9.0,9.2,0.873


In [121]:
# Models generally improved with greater depth
param_sweep_df.groupby(['n_estimators','max_depth']).mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,iter_count,learning_rate,max_leaf_nodes,min_samples_leaf,score
n_estimators,max_depth,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
200,3,63.0,0.075,9.0,9.2,0.8781
200,4,88.0,0.075,9.0,9.2,0.8765
200,5,113.0,0.075,9.0,9.2,0.8795
200,8,138.0,0.075,9.0,9.2,0.8817
400,3,263.0,0.075,9.0,9.2,0.8734
400,4,288.0,0.075,9.0,9.2,0.8738
400,5,313.0,0.075,9.0,9.2,0.8783
400,8,338.0,0.075,9.0,9.2,0.8818
600,3,463.0,0.075,9.0,9.2,0.8716
600,4,488.0,0.075,9.0,9.2,0.8715


In [135]:
# Models generally improved with greater depth, 
# although 0 max_leaf_nodes consistently showed the best performance on average

df_plot = param_sweep_df.groupby(['n_estimators','max_leaf_nodes']).mean().reset_index()
df_plot[['n_estimators','max_leaf_nodes','score']]

Unnamed: 0,n_estimators,max_leaf_nodes,score
0,200,0.0,0.8846
1,200,5.0,0.8704
2,200,8.0,0.878
3,200,12.0,0.88
4,200,20.0,0.8817
5,400,0.0,0.8831
6,400,5.0,0.8688
7,400,8.0,0.8753
8,400,12.0,0.8772
9,400,20.0,0.8799


In [136]:

fig = px.scatter(df_plot, x='max_leaf_nodes', y='score',facet_col='n_estimators',labels={"score":"average score"})
fig.show()

In [100]:
param_sweep_df.score.idxmax()

175

In [103]:
param_sweep_df.iloc[175]

iter_count         176.0000
n_estimators       200.0000
learning_rate        0.1000
max_depth            8.0000
max_leaf_nodes          nan
min_samples_leaf     1.0000
score                0.9222
Name: 175, dtype: float64

In [115]:
# Histogram of scores

fig = px.histogram(param_sweep_df['score'], x="score")
fig.show()

In [116]:

fig = px.scatter(param_sweep_df, x='max_leaf_nodes', y='score',facet_col='n_estimators')
fig.show()

**Thoughts on results**
The above show that a wide range of outcomes can be achieved. Fewer max_leaf_nodes are more consistent with higher scores, as expected, as are higher estimators.

Overall lower n_estimators (number of trees generated) produced better results. This suggests that given the relatively low sample size of ~1200 rows in the training data set, overfitting was occuring when more trees were run.

# Final model building
* incorporating validation data into model build
* using highest performing parameter set
    * iter_count: 176
    * n_estimators: 200
    * learning_rate: 0.1
    * max_depth: 8
    * max_leaf_nodes:None
    * min_samples_leaf: 1
    * score: 0.922201370139053
* Then performing k-fold validation to check for issues in final model form.

## Set up training and testing data sets

In [166]:
# Train/test sets
train_final = df.sample(frac=0.9,random_state=743)
test_final = df.drop(train.index)

train_final,test_final = LotFrontage_na_apply(train_final, test_final)

X_train_final, y_train_final = train_final.drop('SalePrice',axis=1), train_final['SalePrice']
X_test_final, y_test_final = test_final.drop('SalePrice',axis=1), test_final['SalePrice']

In [167]:
over_1perc_feats = ['OverallQual',
 'Neighborhood',
 'GrLivArea',
 'GarageCars',
# 'BsmtFinSF1',
 '1stFlrSF',
 'TotalBsmtSF',
 'LotArea',
 'PorchSF_Total',
 'KitchenQual']

X_train_final = X_train_final[over_1perc_feats].copy()
X_test_final = X_test_final[over_1perc_feats].copy()


In [168]:
gbm_final = GradientBoostingRegressor(
    n_estimators=200,
    learning_rate=0.1,
    max_leaf_nodes=None,
    max_depth=8,
    min_samples_leaf=1)
pipe_final = make_pipeline(target_enc2,ordinal_enc2,gbm_final)


In [169]:
pipe_final.fit(X_train_final,y_train_final)

Pipeline(steps=[('targetencoder',
                 TargetEncoder(cols=['Neighborhood'], min_samples_leaf=5,
                               smoothing=0.1)),
                ('ordinalencoder',
                 OrdinalEncoder(cols=['KitchenQual'],
                                mapping=[{'col': 'KitchenQual',
                                          'mapping': {'Ex': 1, 'Fa': 4, 'Gd': 2,
                                                      'NA': 0, 'Po': 5,
                                                      'TA': 3}}])),
                ('gradientboostingregressor',
                 GradientBoostingRegressor(max_depth=8, n_estimators=200))])

In [170]:
pipe_final.score(X_test_final,y_test_final)
# This looks too high using the full data set, likely due to the high max_depth / low overall size

0.9895211506922589

In [174]:
cv_scores = cross_val_score(estimator=pipe_final, X=X_train_final, y=y_train_final, cv=8)
cv_scores
# However the k-fold cross val scores look ok, although one is clearly impacted by an outlier?

array([0.89440415, 0.85543351, 0.88519159, 0.83700597, 0.822409  ,
       0.75285398, 0.64087577, 0.88321139])

In [175]:
?cv_scores

Given this combination of outcomes (i.e. high score on test data, mix of lower scores out of k-fold) not sure what this implies apart from there lkely being a high degree of overfitting.

In [172]:
feature_final_names = X_test_final.columns.to_list()

features_final = pd.DataFrame({
    'Columns': feature_final_names,
    'Importance': pipe_final[2].feature_importances_
}).sort_values(by='Importance', ascending=False)

features_final

Unnamed: 0,Columns,Importance
0,OverallQual,0.5681
1,Neighborhood,0.1355
2,GrLivArea,0.1236
5,TotalBsmtSF,0.0466
7,PorchSF_Total,0.0333
4,1stFlrSF,0.0308
6,LotArea,0.0262
3,GarageCars,0.0251
8,KitchenQual,0.0108
