### Notebook Settings

In [219]:
# Increase width of cells
from IPython.display import display, HTML
display(HTML("<style>.container { width:70% !important; }</style>"))


# set working directory
import os
curr_dir = os.getcwd()
os.chdir(curr_dir)

### Package Imports


In [282]:
# core
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
pd.set_option('display.max_columns', 100)

# feature selection

# data preprocessing
from sklearn.model_selection import train_test_split

# modeling

### Load Data

In [221]:
# load data
df = pd.read_pickle('../data/aimes_train_data.pkl')
df.shape

(2197, 82)

In [222]:
# head
df.head()

Unnamed: 0,order,pid,ms_subclass,ms_zoning,lot_frontage,lot_area,street,alley,lot_shape,land_contour,...,pool_area,pool_qc,fence,misc_feature,misc_val,mo_sold,yr_sold,sale_type,sale_condition,saleprice
0,534,531363010,20,RL,80.0,9605,Pave,,Reg,Lvl,...,0,,,,0,4,2009,WD,Normal,159000
1,803,906203120,20,RL,90.0,14684,Pave,,IR1,Lvl,...,0,,,,0,6,2009,WD,Normal,271900
2,956,916176030,20,RL,,14375,Pave,,IR1,Lvl,...,0,,,,0,1,2009,COD,Abnorml,137500
3,460,528180130,120,RL,48.0,6472,Pave,,Reg,Lvl,...,0,,,,0,4,2009,WD,Normal,248500
4,487,528290030,80,RL,61.0,9734,Pave,,IR1,Lvl,...,0,,,,0,5,2009,WD,Normal,167000


__Observation:__ At first glance, we can see that *order* and *pid* are not really useful for predicting house prices so we can go ahead and drop those features.

## Data Inspection

In [223]:
# check for null values
nulls = []
for feature in df.columns:
    if df[feature].isnull().sum() > 0:
        nulls.append(feature)

features_with_nulls_df = df[nulls].isnull().sum() / len(df)
features_with_nulls_df

lot_frontage      0.164770
alley             0.934911
mas_vnr_type      0.010014
mas_vnr_area      0.010014
bsmt_qual         0.030496
bsmt_cond         0.030496
bsmt_exposure     0.031406
bsmtfin_type_1    0.030496
bsmtfin_sf_1      0.000455
bsmtfin_type_2    0.030951
bsmtfin_sf_2      0.000455
bsmt_unf_sf       0.000455
total_bsmt_sf     0.000455
electrical        0.000455
bsmt_full_bath    0.000455
bsmt_half_bath    0.000455
fireplace_qu      0.485207
garage_type       0.054620
garage_yr_blt     0.055530
garage_finish     0.055530
garage_cars       0.000455
garage_area       0.000455
garage_qual       0.055530
garage_cond       0.055530
pool_qc           0.994538
fence             0.809285
misc_feature      0.963587
dtype: float64

In [14]:
# get list of features to drop
# 1: features with null proportion of null values >= 0.2
# 2: order and pid can also be dropped

features_to_drop_names = features_with_nulls_df[features_with_nulls_df >= 0.2].index.tolist()
features_to_drop_names.extend(['order', 'pid'])
features_to_drop_names

['alley', 'fireplace_qu', 'pool_qc', 'fence', 'misc_feature', 'order', 'pid']

In [224]:
# drop unwanted features
df.drop(features_to_drop_names, axis=1, inplace=True)
df.shape

(2197, 75)

## Train Test Split

In [225]:
# get x and y features
X = df.drop('saleprice', axis=1)
y = df['saleprice']

In [226]:
# train test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=100)
X_train.shape, X_test.shape

((1537, 74), (660, 74))

In [227]:
# make copy of X_train and X_test
X_train_copy = X_train.copy()
X_test_copy = X_test.copy()

In [228]:
X_test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 660 entries, 1663 to 1131
Data columns (total 74 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   ms_subclass      660 non-null    int64  
 1   ms_zoning        660 non-null    object 
 2   lot_frontage     540 non-null    float64
 3   lot_area         660 non-null    int64  
 4   street           660 non-null    object 
 5   lot_shape        660 non-null    object 
 6   land_contour     660 non-null    object 
 7   utilities        660 non-null    object 
 8   lot_config       660 non-null    object 
 9   land_slope       660 non-null    object 
 10  neighborhood     660 non-null    object 
 11  condition_1      660 non-null    object 
 12  condition_2      660 non-null    object 
 13  bldg_type        660 non-null    object 
 14  house_style      660 non-null    object 
 15  overall_qual     660 non-null    int64  
 16  overall_cond     660 non-null    int64  
 17  year_built  

## Data Cleaning

### Missing Value Imputation

In [229]:
# import 
from feature_engine.imputation import MeanMedianImputer, CategoricalImputer
from sklearn.pipeline import Pipeline

In [230]:
# separate feature names
num_feature_names = X_train.select_dtypes(exclude='object').columns.tolist()
cat_feature_names = X_train.select_dtypes(include='object').columns.tolist()

In [231]:
# create imputers
num_imputer = MeanMedianImputer(imputation_method='median', variables=num_feature_names)
cat_imputer = CategoricalImputer(imputation_method='frequent', variables=cat_feature_names)

In [232]:
# imputer pipeline
imputer_pipe = Pipeline([
    ('num_imputer', num_imputer),
    ('cat_imputer', cat_imputer)
])

In [233]:
# fit imputer pipeline to train data
imputer_pipe.fit(X_train)

# transform test data
X_train = imputer_pipe.transform(X_train)
X_test = imputer_pipe.transform(X_test)

X_train.shape, X_test.shape

((1537, 74), (660, 74))

## Feature Selection

### Feature Selection Data Prep

#### Categorical Feature Encoding

In [239]:
# imports
from feature_engine.encoding import OrdinalEncoder, RareLabelEncoder

In [301]:
# encode categorical featues
cat_encode_pipe = Pipeline([
    ('rare_label_enc', RareLabelEncoder(tol=0.05, n_categories=4, variables=cat_feature_names)),
    ('ordinal_enc', OrdinalEncoder(encoding_method='arbitrary'))
])

In [302]:
# fit categorical encoder to train data
cat_encode_pipe.fit(X_train)

# transform train and test data
X_train_enc = cat_encode_pipe.transform(X_train)
X_test_enc = cat_encode_pipe.transform(X_test)

X_train_enc.shape, X_test_enc.shape



((1537, 74), (660, 74))

#### Remove Constants & Qusi-Contants

In [303]:
# imports
from feature_engine.selection import DropConstantFeatures

In [304]:
# create selector
constants_selector = DropConstantFeatures(tol=0.99, variables=None, missing_values='raise')

# fit selector
constants_selector.fit(X_train_enc)

DropConstantFeatures(tol=0.99)

In [305]:
# view constants
constants_selector.features_to_drop_

['street', 'utilities', 'condition_2', 'pool_area']

In [306]:
# transform train and test data
X_train_qc = constants_selector.transform(X_train_enc)
X_test_qc = constants_selector.transform(X_test_enc)

X_train_qc.shape, X_test_qc.shape

((1537, 70), (660, 70))

#### Remove Duplicated Features

In [307]:
# imports
from feature_engine.selection import DropDuplicateFeatures

In [308]:
# create duplicate features selector
dup_features = DropConstantFeatures(variables=None, missing_values='raise')

# fit duplicate features selector to train data
dup_features.fit(X_train_qc)

# view duplicated features
dup_features.features_to_drop_

[]

__Observation:__ There are no duplicated features to drop.

#### Drop Correlated Numeric Features


In [309]:
# import
from feature_engine.selection import SmartCorrelatedSelection
from sklearn.ensemble import RandomForestRegressor

In [310]:
# random forest model
rf = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=4)

# correlation selector
corr_selector = SmartCorrelatedSelection(
    variables=X_train_qc.select_dtypes(exclude='object').columns.tolist(),
    method='pearson',
    threshold=0.8,
    missing_values='raise',
    selection_method='model_performance',
    estimator=rf,
    scoring='neg_mean_absolute_error',
    cv=5

)

# fit
corr_selector.fit(X_train_qc, y_train)

SmartCorrelatedSelection(cv=5,
                         estimator=RandomForestRegressor(max_depth=4,
                                                         n_estimators=200,
                                                         random_state=42),
                         missing_values='raise',
                         scoring='neg_mean_absolute_error',
                         selection_method='model_performance',
                         variables=['ms_subclass', 'ms_zoning', 'lot_frontage',
                                    'lot_area', 'lot_shape', 'land_contour',
                                    'lot_config', 'land_slope', 'neighborhood',
                                    'condition_1', 'bldg_type', 'house_style',
                                    'overall_qual', 'overall_cond',
                                    'year_built', 'year_remod/add',
                                    'roof_style', 'roof_matl', 'exterior_1st',
                                    'exterior_

In [311]:
# view correlated feature sets
corr_selector.correlated_feature_sets_

[{'2nd_flr_sf', 'house_style'},
 {'exterior_1st', 'exterior_2nd'},
 {'1st_flr_sf', 'total_bsmt_sf'},
 {'gr_liv_area', 'totrms_abvgrd'},
 {'garage_area', 'garage_cars'}]

In [312]:
# view dropped correlated features
corr_selector.features_to_drop_

['house_style', 'exterior_2nd', '1st_flr_sf', 'totrms_abvgrd', 'garage_area']

In [313]:
# fit correlation selector to train and test sets
X_train_corr = corr_selector.transform(X_train_qc)
X_test_corr = corr_selector.transform(X_test_qc)

X_train_corr.shape, X_test_corr.shape

((1537, 65), (660, 65))

### Feature Selection Embedded Methods

#### Tree Importance

In [314]:
# import
from sklearn.feature_selection import SelectFromModel, SelectKBest

In [315]:
# create selector
ti_selector = SelectFromModel(RandomForestRegressor(n_estimators=100, max_depth=4, random_state=42))

# fit selector
ti_selector.fit(X_train_corr, y_train)

# view selected features
ti_selected_features = X_train_corr.columns[(ti_selector.get_support())]
ti_selected_features

Index(['overall_qual', 'year_built', 'total_bsmt_sf', 'gr_liv_area',
       'screen_porch'],
      dtype='object')

In [316]:
# transform train and test data
X_train_ti = X_train_corr[ti_selected_features.tolist()]
X_test_ti = X_test_corr[ti_selected_features.tolist()]

X_train_ti.shape, X_test_ti.shape

((1537, 5), (660, 5))

#### Recursive Feature Selection

In [317]:
from sklearn.feature_selection import RFE

In [318]:
# create selector
rs_selector = RFE(RandomForestRegressor(n_estimators=100, max_depth=4, random_state=42))

# fit selector
rs_selector.fit(X_train_corr, y_train)

# view selected features
rs_selected_features = X_train_corr.columns[(rs_selector.get_support())]
rs_selected_features

Index(['ms_zoning', 'lot_frontage', 'lot_area', 'land_slope', 'neighborhood',
       'overall_qual', 'year_built', 'year_remod/add', 'roof_matl',
       'exterior_1st', 'mas_vnr_area', 'exter_qual', 'bsmt_qual',
       'bsmt_exposure', 'bsmtfin_sf_1', 'bsmt_unf_sf', 'total_bsmt_sf',
       'central_air', '2nd_flr_sf', 'gr_liv_area', 'bsmt_full_bath',
       'full_bath', 'bedroom_abvgr', 'kitchen_qual', 'fireplaces',
       'garage_type', 'garage_finish', 'garage_cars', 'wood_deck_sf',
       'open_porch_sf', 'screen_porch', 'mo_sold'],
      dtype='object')

In [320]:
# get train and test sets
X_train_rs = X_train_corr[rs_selected_features.tolist()]
X_test_rs = X_test_corr[rs_selected_features.tolist()]

X_train_rs.shape, X_test_rs.shape

((1537, 32), (660, 32))

#### Lasso Feature Selection

In [321]:
# imports
from sklearn.linear_model import Lasso, LinearRegression
from sklearn.preprocessing import StandardScaler

In [322]:
# standard scaler
sc = StandardScaler()
sc.fit(X_train_corr)

StandardScaler()

In [323]:
# get selector
lasso_selector = SelectFromModel(Lasso(alpha=100))

# fit selector
lasso_selector.fit(sc.transform(X_train_corr), y_train)

# view selected features
lasso_selected_features = X_train_corr.columns[(lasso_selector.get_support())]
lasso_selected_features

Index(['ms_subclass', 'ms_zoning', 'lot_frontage', 'lot_area', 'lot_shape',
       'land_contour', 'lot_config', 'land_slope', 'neighborhood',
       'condition_1', 'bldg_type', 'overall_qual', 'overall_cond',
       'year_built', 'year_remod/add', 'roof_style', 'roof_matl',
       'exterior_1st', 'mas_vnr_type', 'mas_vnr_area', 'exter_qual',
       'exter_cond', 'foundation', 'bsmt_qual', 'bsmt_exposure',
       'bsmtfin_type_1', 'bsmtfin_sf_1', 'bsmtfin_type_2', 'bsmtfin_sf_2',
       'total_bsmt_sf', 'heating_qc', 'central_air', '2nd_flr_sf',
       'low_qual_fin_sf', 'gr_liv_area', 'bsmt_full_bath', 'bsmt_half_bath',
       'full_bath', 'half_bath', 'bedroom_abvgr', 'kitchen_abvgr',
       'kitchen_qual', 'functional', 'fireplaces', 'garage_type',
       'garage_yr_blt', 'garage_finish', 'garage_cars', 'garage_qual',
       'garage_cond', 'paved_drive', 'wood_deck_sf', 'open_porch_sf',
       'enclosed_porch', '3ssn_porch', 'screen_porch', 'misc_val', 'mo_sold',
       'yr_sold', '

In [324]:
# count of selected features
len(lasso_selected_features)

61

In [325]:
# get train and test sets
X_train_lasso = X_train_corr[lasso_selected_features.tolist()]
X_test_lasso = X_test_corr[lasso_selected_features.tolist()]

X_train_lasso.shape, X_test_lasso.shape

((1537, 61), (660, 61))

### Compare Model Performance

In [326]:
# import
from sklearn.model_selection import cross_val_score

In [327]:
# function to build random forest model and compare performance in train and test data

def get_random_forest_comparison(x_train, y_train, x_test, y_test, method):
    rf = RandomForestRegressor(n_estimators=200, random_state=42, max_depth=4)
    train_score = cross_val_score(rf, x_train, y_train, cv=5,  scoring='neg_mean_squared_error')
    train_score = np.sqrt(abs(train_score.mean()))
        
    test_score = cross_val_score(rf, x_test, y_test, cv=5, scoring='neg_mean_squared_error')
    test_score = np.sqrt(abs(test_score.mean()))
    
    print('Feature Selection Method: ', method)
    print('------------------------------------------------')
    print('Train Set RMSE: ', train_score)
    print('Test Set: RMSE', test_score) 

In [328]:
# correlated features selection
get_random_forest_comparison(X_train_corr, y_train, X_test_corr, y_test, method='Correlated Feature Selection')

Feature Selection Method:  Correlated Feature Selection
------------------------------------------------
Train Set RMSE:  31877.45755874098
Test Set: RMSE 33385.724257097805


In [329]:
# tree importance
get_random_forest_comparison(X_train_ti, y_train, X_test_ti.fillna(0), y_test, method='Tree Importance')

Feature Selection Method:  Tree Importance
------------------------------------------------
Train Set RMSE:  31832.601335170697
Test Set: RMSE 33387.93711807962


In [201]:
# recursive feature selection
get_random_forest_comparison(X_train_rs, y_train, X_test_rs.fillna(0), y_test, method='Recursive Feature Selection')

Feature Selection Method:  Recursive Feature Selection
------------------------------------------------
Train Set RMSE:  31795.67586603215
Test Set: RMSE 33313.12216390058


In [330]:
# lasso
get_random_forest_comparison(X_train_lasso, y_train, X_test_lasso.fillna(0), y_test, method='Lasso Feature Selection')

Feature Selection Method:  Lasso Feature Selection
------------------------------------------------
Train Set RMSE:  31858.423447196503
Test Set: RMSE 33221.59665143214


## Save Training & Test Feature Lists


In [337]:
# save recursive feature elimination dataset
X_train_rs.columns

Index(['ms_zoning', 'lot_frontage', 'lot_area', 'land_slope', 'neighborhood',
       'overall_qual', 'year_built', 'year_remod/add', 'roof_matl',
       'exterior_1st', 'mas_vnr_area', 'exter_qual', 'bsmt_qual',
       'bsmt_exposure', 'bsmtfin_sf_1', 'bsmt_unf_sf', 'total_bsmt_sf',
       'central_air', '2nd_flr_sf', 'gr_liv_area', 'bsmt_full_bath',
       'full_bath', 'bedroom_abvgr', 'kitchen_qual', 'fireplaces',
       'garage_type', 'garage_finish', 'garage_cars', 'wood_deck_sf',
       'open_porch_sf', 'screen_porch', 'mo_sold'],
      dtype='object')