## This is Dev notebook for house prices project from Kaggle

#### 1. Load environment and data

In [1]:
import numpy as np
import pandas as pd
import os, time, warnings, random
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from scipy.special import inv_boxcox
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """
    This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
    """
    # set df_pred to None if it does not exist
    if (cat_fill=='mode'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
            
    if (cat_fill=='missing'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value='missing')
        df_test[cat_features] = df_test[cat_features].fillna(value='missing')
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value='missing')
        
    if (num_fill=='median'):
        df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
        df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
        if (df_pred is not None):
            df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())    
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
    
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """
    This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])
    """
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """
    This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)
    """
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], 
                                       labels=False, 
                                       duplicates = 'drop', 
                                       bins = bin , 
                                       include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], 
                                      labels=False, 
                                      duplicates = 'drop', 
                                      bins = bin , 
                                      include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], 
                                          labels=False, 
                                          duplicates = 'drop', 
                                          bins = bin , 
                                          include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, feature_subset=False, min_skew=3, df_pred=None):
    """
    This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)
    """
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[abs(df_train.skew())>min_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if df_pred:
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
    
    
def add_dummyfeatures(df_train, df_test, feature_dict, df_pred=None):
    """
    This function adds dummy feature when some feature is equal to value, specified in a dictionary.
    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})
    """
    if df_pred:
        input_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    else:
        input_dimensions = np.array([df_train.shape[1], df_test.shape[1]])    
    for i in range(len(list(feature_dict.items()))):
        feature,value = list(feature_dict.keys())[i], list(feature_dict.values())[i]
        df_train.loc[df_train[feature]==value,(str(feature)+str(value))]=1
        df_train.loc[df_train[feature]!=value,(str(feature)+str(value))]=0
        df_test.loc[df_test[feature]==value,(str(feature)+str(value))]=1
        df_test.loc[df_test[feature]!=value,(str(feature)+str(value))]=0
        if df_pred:
            df_pred.loc[df_pred[feature]==value,(str(feature)+str(value))]=1
            df_pred.loc[df_pred[feature]!=value,(str(feature)+str(value))]=0
    if df_pred:
        output_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    else:
        output_dimensions = np.array([df_train.shape[1], df_test.shape[1]])
    print(output_dimensions-input_dimensions, ' variables created') 
    

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded    

In [2]:
time0 = time.time()

os.chdir('/home/jupyter/projects_data/house_price')
df = pd.read_csv('train.csv') 
# df.drop(columns = ['Id'], inplace=True)
pred=pd.read_csv('test.csv')
pred0 = pred.copy()

print(df.shape, pred.shape)
df

(1460, 81) (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


#### 2. Data cleaning and EDA

In [3]:
# 2. pEDA #

cols_tokeep = ['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond', 
               'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea',  
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr', 'FullBath', 
               'HalfBath', 'MiscVal', 'LotFrontage', 
               'ExterQual', 'MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
               'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
               'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive',
               'SaleType', 'SaleCondition', 'BsmtQual', 'BsmtCond', 
               'BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
df = df[cols_tokeep]

# preliminary feature engineering:
df['GrLivArea_log'] = np.log1p(df['GrLivArea'])
pred['GrLivArea_log'] = np.log1p(pred['GrLivArea'])
# w/o logtransform, scatterplot looks better. not sure whether log tranform helps.

df['MisGarage'] = df.GarageType.isnull().astype(int)
df['MisBsmt'] = df.BsmtCond.isnull().astype(int)
df

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,208500,8450,7,5,2003,TA,706,0,856,Ex,856,854,1710,Gd,548,2,8,3,2,1,0,65.0,Gd,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,No,GLQ,Attchd,RFn,TA,TA,7.444833,0,0
1,181500,9600,6,8,1976,TA,978,0,1262,Ex,1262,0,1262,TA,460,2,6,3,2,0,0,80.0,TA,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Gd,ALQ,Attchd,RFn,TA,TA,7.141245,0,0
2,223500,11250,7,5,2001,TA,486,0,920,Ex,920,866,1786,Gd,608,2,6,3,2,1,0,68.0,Gd,60,RL,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Mn,GLQ,Attchd,RFn,TA,TA,7.488294,0,0
3,140000,9550,7,5,1915,TA,216,0,756,Gd,961,756,1717,Gd,642,3,7,3,1,0,0,60.0,TA,70,RL,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,Gable,Wd Sdng,Wd Shng,BrkTil,GasA,Y,SBrkr,Typ,Y,WD,Abnorml,TA,Gd,No,ALQ,Detchd,Unf,TA,TA,7.448916,0,0
4,250000,14260,8,5,2000,TA,655,0,1145,Ex,1145,1053,2198,Gd,836,3,9,4,2,1,0,84.0,Gd,60,RL,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Av,GLQ,Attchd,RFn,TA,TA,7.695758,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,175000,7917,6,5,1999,TA,0,0,953,Ex,953,694,1647,TA,460,2,7,3,2,1,0,62.0,TA,60,RL,Reg,Lvl,Inside,Gilbert,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,No,Unf,Attchd,RFn,TA,TA,7.407318,0,0
1456,210000,13175,6,6,1978,TA,790,163,1542,TA,2073,0,2073,TA,500,2,7,3,2,0,0,85.0,TA,20,RL,Reg,Lvl,Inside,NWAmes,Norm,1Fam,1Story,Gable,Plywood,Plywood,CBlock,GasA,Y,SBrkr,Min1,Y,WD,Normal,Gd,TA,No,ALQ,Attchd,Unf,TA,TA,7.637234,0,0
1457,266500,9042,7,9,1941,Gd,275,0,1152,Ex,1188,1152,2340,Gd,252,1,9,4,2,0,2500,66.0,Ex,70,RL,Reg,Lvl,Inside,Crawfor,Norm,1Fam,2Story,Gable,CemntBd,CmentBd,Stone,GasA,Y,SBrkr,Typ,Y,WD,Normal,TA,Gd,No,GLQ,Attchd,RFn,TA,TA,7.758333,0,0
1458,142125,9717,5,6,1950,TA,49,1029,1078,Gd,1078,0,1078,Gd,240,1,5,2,1,0,0,68.0,TA,20,RL,Reg,Lvl,Inside,NAmes,Norm,1Fam,1Story,Hip,MetalSd,MetalSd,CBlock,GasA,Y,FuseA,Typ,Y,WD,Normal,TA,TA,Mn,GLQ,Attchd,Unf,TA,TA,6.983790,0,0


In [4]:
ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
num_cols = ['LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
            'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 
           'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log']
cat_cols = list(set(df.columns)-set(num_cols)-set(ord_cols)-set(['SalePrice']))
print("Numerical features ", num_cols, "\n",
      'Ordinal features', ord_cols, '\n',
      "Categorical features ", cat_cols)

df[ord_cols] = df[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])
pred[ord_cols] = pred[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])

Numerical features  ['LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log'] 
 Ordinal features ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond'] 
 Categorical features  ['GarageFinish', 'Heating', 'Condition1', 'CentralAir', 'Exterior2nd', 'BldgType', 'MSZoning', 'Electrical', 'HouseStyle', 'Foundation', 'SaleType', 'LandContour', 'BsmtFinType1', 'MisGarage', 'MisBsmt', 'BsmtExposure', 'GarageType', 'FullBath', 'Functional', 'MSSubClass', 'RoofStyle', 'Exterior1st', 'LotShape', 'HalfBath', 'Neighborhood', 'SaleCondition', 'LotConfig', 'PavedDrive']


In [5]:
df[cat_cols].nunique()

GarageFinish      3
Heating           6
Condition1        9
CentralAir        2
Exterior2nd      16
BldgType          5
MSZoning          5
Electrical        5
HouseStyle        8
Foundation        6
SaleType          9
LandContour       4
BsmtFinType1      6
MisGarage         2
MisBsmt           2
BsmtExposure      4
GarageType        6
FullBath          4
Functional        7
MSSubClass       15
RoofStyle         6
Exterior1st      15
LotShape          4
HalfBath          3
Neighborhood     25
SaleCondition     6
LotConfig         5
PavedDrive        3
dtype: int64

According to feature importances, only Neighboorhood and possibly Exterior1 categorical features are really useful.
And they have too many unique values to use OHC. All other categorical features are not important enough to bother with ohe.
So use target encoding for all of them.

In [6]:
test_size = 0.2
df.reset_index(inplace=True, drop=True)
#random.seed(2)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
train = df.iloc[list(set(df.index)-set(test_index))]
test = df.iloc[test_index]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train.drop(columns=['id'],inplace=True, errors='ignore')
test.drop(columns=['id'],inplace=True, errors='ignore')
display(train.shape, test.shape, train.head(3), test.head(3))

(1168, 54)

(292, 54)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,2,1,0,65.0,4,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,GLQ,Attchd,RFn,3.0,3.0,7.444833,0,0
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,2,0,0,80.0,3,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Gd,ALQ,Attchd,RFn,3.0,3.0,7.141245,0,0
2,140000,9550,7,5,1915,3,216,0,756,4,961,756,1717,4,642,3,7,3,1,0,0,60.0,3,70,RL,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,Gable,Wd Sdng,Wd Shng,BrkTil,GasA,Y,SBrkr,Typ,Y,WD,Abnorml,3.0,4.0,No,ALQ,Detchd,Unf,3.0,3.0,7.448916,0,0


Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,102000,8877,4,5,1951,3,836,0,836,3,1220,0,1220,3,396,2,6,2,1,0,0,67.0,3,20,RL,Reg,Lvl,Inside,Edwards,Norm,1Fam,1Story,Gable,Wd Sdng,Wd Sdng,CBlock,GasA,Y,FuseF,Typ,Y,COD,Normal,2.0,2.0,No,LwQ,Detchd,Unf,3.0,3.0,7.107425,0,0
1,131400,10000,5,6,1961,3,594,0,864,5,1144,0,1144,3,264,1,6,3,1,0,400,80.0,3,20,RL,Reg,Lvl,Corner,Sawyer,Feedr,1Fam,1Story,Hip,HdBoard,HdBoard,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,3.0,3.0,No,BLQ,Attchd,RFn,3.0,3.0,7.04316,0,0
2,134800,7200,5,7,1951,3,234,486,900,3,900,0,900,4,576,2,5,3,1,0,0,60.0,3,20,RL,Reg,Lvl,Corner,NAmes,Norm,1Fam,1Story,Gable,Wd Sdng,Wd Sdng,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,3.0,3.0,Mn,BLQ,Detchd,Unf,3.0,3.0,6.803505,0,0


In [8]:
# fill missing values
display(train.info())

mis_col_mode = ['LotFrontage', 'Electrical']
mis_cat_cols = ['BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish']
mis_num_cols = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']

for col in mis_col_mode:
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(train[col].mode()[0])

for col in mis_cat_cols:
    train[col] = train[col].fillna(value='missing')
    test[col] = test[col].fillna(value='missing')
    
for col in mis_num_cols:
    train[col] = train[col].fillna(value=-1)
    test[col] = test[col].fillna(value=-1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 54 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SalePrice      1168 non-null   int64  
 1   LotArea        1168 non-null   int64  
 2   OverallQual    1168 non-null   int64  
 3   OverallCond    1168 non-null   int64  
 4   YearBuilt      1168 non-null   int64  
 5   ExterCond      1168 non-null   int64  
 6   BsmtFinSF1     1168 non-null   int64  
 7   BsmtFinSF2     1168 non-null   int64  
 8   TotalBsmtSF    1168 non-null   int64  
 9   HeatingQC      1168 non-null   int64  
 10  1stFlrSF       1168 non-null   int64  
 11  2ndFlrSF       1168 non-null   int64  
 12  GrLivArea      1168 non-null   int64  
 13  KitchenQual    1168 non-null   int64  
 14  GarageArea     1168 non-null   int64  
 15  GarageCars     1168 non-null   int64  
 16  TotRmsAbvGrd   1168 non-null   int64  
 17  BedroomAbvGr   1168 non-null   int64  
 18  FullBath

None

#### 3. Feature engineering

In [9]:
# do target encoding #

encoder = CrossFoldEncoder(MEstimateEncoder, m=10)
train_encoded = encoder.fit_transform(train, train.SalePrice, cols=cat_cols)
test_encoded = encoder.transform(test)

train.drop(columns=cat_cols, inplace=True)
test.drop(columns=cat_cols,  inplace=True)
train = pd.concat([train, train_encoded], axis = 1)
test = pd.concat([test, test_encoded], axis = 1)

display(train.shape, train.head(), train.count())
train0 = train.copy()
test0 = test.copy()

(1168, 54)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,GrLivArea_log,GarageFinish_encoded,Heating_encoded,Condition1_encoded,CentralAir_encoded,Exterior2nd_encoded,BldgType_encoded,MSZoning_encoded,Electrical_encoded,HouseStyle_encoded,Foundation_encoded,SaleType_encoded,LandContour_encoded,BsmtFinType1_encoded,MisGarage_encoded,MisBsmt_encoded,BsmtExposure_encoded,GarageType_encoded,FullBath_encoded,Functional_encoded,MSSubClass_encoded,RoofStyle_encoded,Exterior1st_encoded,LotShape_encoded,HalfBath_encoded,Neighborhood_encoded,SaleCondition_encoded,LotConfig_encoded,PavedDrive_encoded
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,0,65.0,4,4.0,3.0,3.0,3.0,7.444833,198327.943537,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,202102.82289,219642.054443,169400.241795,177192.142422,229521.866186,181710.825079,178941.16795,164866.852132,199020.035872,210867.030175,180146.088199,228713.521652,167218.778339,210723.683045,161051.005088,206468.786854,190459.967454,172103.059889,172228.592392,182538.783342
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,0,80.0,3,4.0,3.0,3.0,3.0,7.141245,198327.943537,178388.495302,148920.07601,181899.881998,149120.815182,181649.658887,186137.527733,182132.892348,174451.720587,145798.211886,169400.241795,177192.142422,158041.948472,181710.825079,178941.16795,237612.528713,199020.035872,210867.030175,180146.088199,183940.24677,167218.778339,148325.737857,161051.005088,160170.049365,184667.638617,172103.059889,178578.56258,182538.783342
2,140000,9550,7,5,1915,3,216,0,756,4,961,756,1717,4,642,3,7,3,0,60.0,3,3.0,4.0,3.0,3.0,7.448916,140853.708506,178388.495302,179965.854824,181899.881998,149973.231355,181649.658887,186137.527733,182132.892348,202102.82289,134386.888289,169400.241795,177192.142422,158041.948472,181710.825079,178941.16795,164866.852132,133926.570043,133315.58623,180146.088199,175308.774299,167218.778339,143635.770005,204667.96495,160170.049365,205994.673038,141982.947957,179998.777366,182538.783342
3,250000,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,0,84.0,4,4.0,3.0,3.0,3.0,7.695758,198327.943537,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,202102.82289,219642.054443,169400.241795,177192.142422,229521.866186,181710.825079,178941.16795,199551.445545,199020.035872,210867.030175,180146.088199,228713.521652,167218.778339,210723.683045,204667.96495,206468.786854,282088.588222,172103.059889,178578.56258,182538.783342
4,143000,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,700,85.0,3,4.0,3.0,3.0,3.0,7.217443,140853.708506,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,143768.999424,181987.245053,169400.241795,177192.142422,229521.866186,181710.825079,178941.16795,164866.852132,199020.035872,133315.58623,180146.088199,144878.422068,167218.778339,210723.683045,204667.96495,206468.786854,156677.423516,172103.059889,172228.592392,182538.783342


SalePrice                1168
LotArea                  1168
OverallQual              1168
OverallCond              1168
YearBuilt                1168
ExterCond                1168
BsmtFinSF1               1168
BsmtFinSF2               1168
TotalBsmtSF              1168
HeatingQC                1168
1stFlrSF                 1168
2ndFlrSF                 1168
GrLivArea                1168
KitchenQual              1168
GarageArea               1168
GarageCars               1168
TotRmsAbvGrd             1168
BedroomAbvGr             1168
MiscVal                  1168
LotFrontage              1168
ExterQual                1168
BsmtQual                 1168
BsmtCond                 1168
GarageQual               1168
GarageCond               1168
GrLivArea_log            1168
GarageFinish_encoded     1168
Heating_encoded          1168
Condition1_encoded       1168
CentralAir_encoded       1168
Exterior2nd_encoded      1168
BldgType_encoded         1168
MSZoning_encoded         1168
Electrical

In [10]:
add_dummyfeatures(train, test, {'OverallQual':1})
add_dummyfeatures(train, test, {'OverallQual':8})
add_dummyfeatures(train, test, {'OverallQual':9})
add_dummyfeatures(train, test, {'OverallQual':10})

log_transformer_mp_i1(train, test, feature_subset=num_cols)

train

[1 1]  variables created
[1 1]  variables created
[1 1]  variables created
[1 1]  variables created
Skewed columns log-transformed:  ['LotArea', 'BsmtFinSF2', 'MiscVal']


Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,GrLivArea_log,GarageFinish_encoded,Heating_encoded,Condition1_encoded,CentralAir_encoded,Exterior2nd_encoded,BldgType_encoded,MSZoning_encoded,Electrical_encoded,HouseStyle_encoded,Foundation_encoded,SaleType_encoded,LandContour_encoded,BsmtFinType1_encoded,MisGarage_encoded,MisBsmt_encoded,BsmtExposure_encoded,GarageType_encoded,FullBath_encoded,Functional_encoded,MSSubClass_encoded,RoofStyle_encoded,Exterior1st_encoded,LotShape_encoded,HalfBath_encoded,Neighborhood_encoded,SaleCondition_encoded,LotConfig_encoded,PavedDrive_encoded,OverallQual1,OverallQual8,OverallQual9,OverallQual10
0,208500,9.042040,7,5,2003,3,706,0.000000,856,5,856,854,1710,4,548,2,8,3,0.000000,65.0,4,4.0,3.0,3.0,3.0,7.444833,198327.943537,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,202102.822890,219642.054443,169400.241795,177192.142422,229521.866186,181710.825079,178941.167950,164866.852132,199020.035872,210867.030175,180146.088199,228713.521652,167218.778339,210723.683045,161051.005088,206468.786854,190459.967454,172103.059889,172228.592392,182538.783342,0.0,0.0,0.0,0.0
1,181500,9.169623,6,8,1976,3,978,0.000000,1262,5,1262,0,1262,3,460,2,6,3,0.000000,80.0,3,4.0,3.0,3.0,3.0,7.141245,198327.943537,178388.495302,148920.076010,181899.881998,149120.815182,181649.658887,186137.527733,182132.892348,174451.720587,145798.211886,169400.241795,177192.142422,158041.948472,181710.825079,178941.167950,237612.528713,199020.035872,210867.030175,180146.088199,183940.246770,167218.778339,148325.737857,161051.005088,160170.049365,184667.638617,172103.059889,178578.562580,182538.783342,0.0,0.0,0.0,0.0
2,140000,9.164401,7,5,1915,3,216,0.000000,756,4,961,756,1717,4,642,3,7,3,0.000000,60.0,3,3.0,4.0,3.0,3.0,7.448916,140853.708506,178388.495302,179965.854824,181899.881998,149973.231355,181649.658887,186137.527733,182132.892348,202102.822890,134386.888289,169400.241795,177192.142422,158041.948472,181710.825079,178941.167950,164866.852132,133926.570043,133315.586230,180146.088199,175308.774299,167218.778339,143635.770005,204667.964950,160170.049365,205994.673038,141982.947957,179998.777366,182538.783342,0.0,0.0,0.0,0.0
3,250000,9.565284,8,5,2000,3,655,0.000000,1145,5,1145,1053,2198,4,836,3,9,4,0.000000,84.0,4,4.0,3.0,3.0,3.0,7.695758,198327.943537,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,202102.822890,219642.054443,169400.241795,177192.142422,229521.866186,181710.825079,178941.167950,199551.445545,199020.035872,210867.030175,180146.088199,228713.521652,167218.778339,210723.683045,204667.964950,206468.786854,282088.588222,172103.059889,178578.562580,182538.783342,0.0,1.0,0.0,0.0
4,143000,9.555064,5,5,1993,3,732,0.000000,796,5,796,566,1362,3,480,2,5,1,6.552508,85.0,3,4.0,3.0,3.0,3.0,7.217443,140853.708506,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,143768.999424,181987.245053,169400.241795,177192.142422,229521.866186,181710.825079,178941.167950,164866.852132,199020.035872,133315.586230,180146.088199,144878.422068,167218.778339,210723.683045,204667.964950,206468.786854,156677.423516,172103.059889,172228.592392,182538.783342,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,185000,8.922792,7,5,2004,3,410,0.000000,1221,5,1221,0,1221,4,400,2,6,2,0.000000,62.0,4,4.0,3.0,3.0,3.0,7.108244,201572.510157,181428.186367,184612.753699,185814.588865,214939.954796,186000.188801,209049.972691,185904.814663,175686.850178,222676.267268,172595.028246,180013.707860,229749.856131,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,183819.375097,185151.868424,172326.363040,214590.734908,165052.326655,162173.190577,218196.815805,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0
1164,175000,8.976894,6,5,1999,3,0,0.000000,953,5,953,694,1647,3,460,2,7,3,0.000000,62.0,3,4.0,3.0,3.0,3.0,7.407318,201572.510157,181428.186367,184612.753699,185814.588865,214939.954796,186000.188801,191636.692145,185904.814663,208435.345894,222676.267268,172595.028246,180013.707860,173212.387940,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,183819.375097,237308.487395,172326.363040,214590.734908,165052.326655,210927.111183,191544.433558,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0
1165,210000,9.486152,6,6,1978,3,790,5.099866,1542,3,2073,0,2073,3,500,2,7,3,0.000000,85.0,3,4.0,3.0,3.0,3.0,7.637234,142015.947795,181428.186367,184612.753699,185814.588865,169651.280195,186000.188801,191636.692145,185904.814663,175686.850178,148659.475340,172595.028246,180013.707860,157177.463555,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,151921.085997,185151.868424,172326.363040,173115.809629,165052.326655,162173.190577,184842.362591,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0
1166,266500,9.109746,7,9,1941,4,275,0.000000,1152,5,1188,1152,2340,4,252,1,9,4,7.824446,66.0,5,3.0,4.0,3.0,3.0,7.758333,201572.510157,181428.186367,184612.753699,185814.588865,220572.991427,186000.188801,191636.692145,185904.814663,208435.345894,177833.659993,172595.028246,180013.707860,229749.856131,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,183819.375097,172280.013486,172326.363040,219521.531598,165052.326655,162173.190577,196491.849442,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0


In [11]:
X_train = train.copy()
y_train = X_train.pop('SalePrice')
X_test = test.copy()
y_test = X_test.pop('SalePrice')
print(X_train.shape, X_test.shape, y_train.shape)


(1168, 57) (292, 57) (1168,)


In [12]:
feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols+ord_cols),
    ], remainder = "passthrough")

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), 
                       columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), 
                      columns=feature_transformer.get_feature_names_out())

# there are many dummies... may wish to use pca here later.

print(X_train.shape, X_test.shape, y_train.shape)
display(X_train)

(1168, 57) (292, 57) (1168,)


Unnamed: 0,num__LotArea,num__YearBuilt,num__BsmtFinSF1,num__BsmtFinSF2,num__TotalBsmtSF,num__1stFlrSF,num__2ndFlrSF,num__GrLivArea,num__GarageArea,num__MiscVal,num__LotFrontage,num__TotRmsAbvGrd,num__GarageCars,num__BedroomAbvGr,num__OverallCond,num__OverallQual,num__GrLivArea_log,num__ExterCond,num__HeatingQC,num__KitchenQual,num__ExterQual,num__BsmtQual,num__BsmtCond,num__GarageQual,num__GarageCond,remainder__GarageFinish_encoded,remainder__Heating_encoded,remainder__Condition1_encoded,remainder__CentralAir_encoded,remainder__Exterior2nd_encoded,remainder__BldgType_encoded,remainder__MSZoning_encoded,remainder__Electrical_encoded,remainder__HouseStyle_encoded,remainder__Foundation_encoded,remainder__SaleType_encoded,remainder__LandContour_encoded,remainder__BsmtFinType1_encoded,remainder__MisGarage_encoded,remainder__MisBsmt_encoded,remainder__BsmtExposure_encoded,remainder__GarageType_encoded,remainder__FullBath_encoded,remainder__Functional_encoded,remainder__MSSubClass_encoded,remainder__RoofStyle_encoded,remainder__Exterior1st_encoded,remainder__LotShape_encoded,remainder__HalfBath_encoded,remainder__Neighborhood_encoded,remainder__SaleCondition_encoded,remainder__LotConfig_encoded,remainder__PavedDrive_encoded,remainder__OverallQual1,remainder__OverallQual8,remainder__OverallQual9,remainder__OverallQual10
0,-0.109730,1.039570,0.581730,-0.348404,-0.450863,-0.765013,1.155189,0.388488,0.356780,-0.195929,-0.143232,0.929996,0.311197,0.155509,-0.510312,0.648713,0.544840,-0.246084,0.894349,0.744551,1.056852,0.534575,0.124583,0.264388,0.267084,198327.943537,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,202102.822890,219642.054443,169400.241795,177192.142422,229521.866186,181710.825079,178941.167950,164866.852132,199020.035872,210867.030175,180146.088199,228713.521652,167218.778339,210723.683045,161051.005088,206468.786854,190459.967454,172103.059889,172228.592392,182538.783342,0.0,0.0,0.0,0.0
1,0.135406,0.147467,1.174373,-0.348404,0.471978,0.283710,-0.800112,-0.469662,-0.053845,-0.195929,0.511971,-0.309291,0.311197,0.155509,2.190797,-0.072216,-0.364534,-0.246084,0.894349,-0.760009,-0.681744,0.534575,0.124583,0.264388,0.267084,198327.943537,178388.495302,148920.076010,181899.881998,149120.815182,181649.658887,186137.527733,182132.892348,174451.720587,145798.211886,169400.241795,177192.142422,158041.948472,181710.825079,178941.167950,237612.528713,199020.035872,210867.030175,180146.088199,183940.246770,167218.778339,148325.737857,161051.005088,160170.049365,184667.638617,172103.059889,178578.562580,182538.783342,0.0,0.0,0.0,0.0
2,0.125374,-1.868023,-0.485899,-0.348404,-0.678163,-0.493792,0.930810,0.401897,0.795402,-0.195929,-0.361634,0.310352,1.637761,0.155509,-0.510312,0.648713,0.557070,-0.246084,-0.146089,0.744551,-0.681744,-0.492372,1.609406,0.264388,0.267084,140853.708506,178388.495302,179965.854824,181899.881998,149973.231355,181649.658887,186137.527733,182132.892348,202102.822890,134386.888289,169400.241795,177192.142422,158041.948472,181710.825079,178941.167950,164866.852132,133926.570043,133315.586230,180146.088199,175308.774299,167218.778339,143635.770005,204667.964950,160170.049365,205994.673038,141982.947957,179998.777366,182538.783342,0.0,0.0,0.0,0.0
3,0.895628,0.940447,0.470610,-0.348404,0.206036,-0.018508,1.610815,1.323259,1.700643,-0.195929,0.686692,1.549639,1.637761,1.374533,-0.510312,1.369643,1.296465,-0.246084,0.894349,0.744551,1.056852,0.534575,0.124583,0.264388,0.267084,198327.943537,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,202102.822890,219642.054443,169400.241795,177192.142422,229521.866186,181710.825079,178941.167950,199551.445545,199020.035872,210867.030175,180146.088199,228713.521652,167218.778339,210723.683045,204667.964950,206468.786854,282088.588222,172103.059889,178578.562580,182538.783342,0.0,1.0,0.0,0.0
4,0.875992,0.709161,0.638380,-0.348404,-0.587243,-0.919997,0.495790,-0.278110,0.039479,4.981083,0.730373,-0.928935,0.311197,-2.282539,-0.510312,-0.793146,-0.136288,-0.246084,0.894349,-0.760009,-0.681744,0.534575,0.124583,0.264388,0.267084,140853.708506,178388.495302,179965.854824,181899.881998,211351.248219,181649.658887,186137.527733,182132.892348,143768.999424,181987.245053,169400.241795,177192.142422,229521.866186,181710.825079,178941.167950,164866.852132,199020.035872,133315.586230,180146.088199,144878.422068,167218.778339,210723.683045,204667.964950,206468.786854,156677.423516,172103.059889,172228.592392,182538.783342,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,-0.338854,1.072611,-0.063205,-0.348404,0.378784,0.177804,-0.800112,-0.548198,-0.333816,-0.195929,-0.274273,-0.309291,0.311197,-1.063515,-0.510312,0.648713,-0.463385,-0.246084,0.894349,0.744551,1.056852,0.534575,0.124583,0.264388,0.267084,201572.510157,181428.186367,184612.753699,185814.588865,214939.954796,186000.188801,209049.972691,185904.814663,175686.850178,222676.267268,172595.028246,180013.707860,229749.856131,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,183819.375097,185151.868424,172326.363040,214590.734908,165052.326655,162173.190577,218196.815805,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0
1164,-0.234902,0.907406,-0.956527,-0.348404,-0.230381,-0.514456,0.788856,0.267811,-0.053845,-0.195929,-0.274273,0.310352,0.311197,0.155509,-0.510312,-0.072216,0.432465,-0.246084,0.894349,-0.760009,-0.681744,0.534575,0.124583,0.264388,0.267084,201572.510157,181428.186367,184612.753699,185814.588865,214939.954796,186000.188801,191636.692145,185904.814663,208435.345894,222676.267268,172595.028246,180013.707860,173212.387940,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,183819.375097,237308.487395,172326.363040,214590.734908,165052.326655,210927.111183,191544.433558,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0
1165,0.743585,0.213549,0.764752,2.471137,1.108419,2.378573,-0.800112,1.083820,0.132803,-0.195929,0.730373,0.310352,0.311197,0.155509,0.390057,-0.072216,1.121162,-0.246084,-1.186527,-0.760009,-0.681744,0.534575,0.124583,0.264388,0.267084,142015.947795,181428.186367,184612.753699,185814.588865,169651.280195,186000.188801,191636.692145,185904.814663,175686.850178,148659.475340,172595.028246,180013.707860,157177.463555,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,151921.085997,185151.868424,172326.363040,173115.809629,165052.326655,162173.190577,184842.362591,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0
1166,0.020360,-1.008962,-0.357348,-0.348404,0.221947,0.092563,1.837484,1.595262,-1.024413,5.986018,-0.099552,1.549639,-1.015366,1.374533,3.091167,0.648713,1.483905,2.571820,0.894349,0.744551,2.795448,-0.492372,1.609406,0.264388,0.267084,201572.510157,181428.186367,184612.753699,185814.588865,220572.991427,186000.188801,191636.692145,185904.814663,208435.345894,177833.659993,172595.028246,180013.707860,229749.856131,184898.053135,182508.159421,167914.267993,200635.511322,214052.274743,183819.375097,172280.013486,172326.363040,219521.531598,165052.326655,162173.190577,196491.849442,175443.899972,178118.933302,185957.343809,0.0,0.0,0.0,0.0


#### 4. Modeling

In [13]:
lr = LinearRegression()
lr.fit(X_train, y_train)
print('OLS ', mean_squared_error(y_train, lr.predict(X_train))**0.5)

OLS  27549.000398377404


In [14]:
xgbb = XGBRegressor(n_estimators=200,
                   max_depth=5,
                   eta=0.06,
                   subsample=0.8,
                   colsample_bytree=0.6)
xgbb.fit(X_train, y_train)
print('xgb ', mean_squared_error(y_train, xgbb.predict(X_train))**0.5)

xgb  6771.340425103643


In [15]:
xgbgs = XGBRegressor()
grid_param = {'n_estimators':[200], 
              'max_depth':[2, 3, 4, 5], 
              'eta':[0.04, 0.06, 0.08, 0.1],
              'subsample':[0.7], 
              'colsample_bytree':[0.5]}
xgbgs = GridSearchCV(xgbgs, grid_param, cv=2, scoring='neg_root_mean_squared_error')
xgbgs.fit(X_train, y_train)

print('xgbgs ',       
      xgbgs.best_params_, 
      xgbgs.best_score_, 
      mean_squared_error(y_train, xgbgs.predict(X_train))**0.5)

xgbgs  {'colsample_bytree': 0.5, 'eta': 0.06, 'max_depth': 3, 'n_estimators': 200, 'subsample': 0.7} -26639.726739032398 13858.404881810424


In [16]:
# performance evaluation
print('OLS', np.sqrt(mean_squared_error(y_test, lr.predict(X_test))))
print('XGB', np.sqrt(mean_squared_error(y_test, xgbb.predict(X_test))))
print('XGBgs', np.sqrt(mean_squared_error(y_test, xgbgs.predict(X_test))))

OLS 36682.6547486498
XGB 29692.787384485582
XGBgs 28496.658119880918


In [17]:
# SVR sucks, so I omit it.
# as usual, XGB GS fails to clearly beat XGB baseline.

In [18]:
print('Total time for this Dev script: ', time.time() - time0)

Total time for this Dev script:  21.578758478164673
