### This is Dev notebook for house prices project from Kaggle

In [99]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings, random
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from scipy.special import inv_boxcox
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 100)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """
    This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
    """
    # set df_pred to None if it does not exist
    if (cat_fill=='mode'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
            
    if (cat_fill=='missing'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value='missing')
        df_test[cat_features] = df_test[cat_features].fillna(value='missing')
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value='missing')
        
    if (num_fill=='median'):
        df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
        df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
        if (df_pred is not None):
            df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())    
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
    
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """
    This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])
    """
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """
    This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)
    """
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], 
                                       labels=False, 
                                       duplicates = 'drop', 
                                       bins = bin , 
                                       include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], 
                                      labels=False, 
                                      duplicates = 'drop', 
                                      bins = bin , 
                                      include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], 
                                          labels=False, 
                                          duplicates = 'drop', 
                                          bins = bin , 
                                          include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, feature_subset=False, min_skew=3, df_pred=None):
    """
    This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)
    """
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[abs(df_train.skew())>min_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if df_pred:
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
    
    
def add_dummyfeatures(df_train, df_test, feature_dict, df_pred=None):
    """
    This function adds dummy feature when some feature is equal to value, specified in a dictionary.
    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})
    """
    if df_pred:
        input_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    else:
        input_dimensions = np.array([df_train.shape[1], df_test.shape[1]])    
    for i in range(len(list(feature_dict.items()))):
        feature,value = list(feature_dict.keys())[i], list(feature_dict.values())[i]
        df_train.loc[df_train[feature]==value,(str(feature)+str(value))]=1
        df_train.loc[df_train[feature]!=value,(str(feature)+str(value))]=0
        df_test.loc[df_test[feature]==value,(str(feature)+str(value))]=1
        df_test.loc[df_test[feature]!=value,(str(feature)+str(value))]=0
        if df_pred:
            df_pred.loc[df_pred[feature]==value,(str(feature)+str(value))]=1
            df_pred.loc[df_pred[feature]!=value,(str(feature)+str(value))]=0
    if df_pred:
        output_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    else:
        output_dimensions = np.array([df_train.shape[1], df_test.shape[1]])
    print(output_dimensions-input_dimensions, ' variables created') 
    

In [100]:
# add_dummyfeatures(train, test, 
#                   {'OverallQual':1, 'OverallQual':8, 'OverallQual':9, 'OverallQual':10})

# feature_dict = {'OverallQual': 1, 'OverallQual': 8, 'OverallQual': 9, 'OverallQual': 10}
# feature_dict.items()

In [101]:
    
### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    def __init__(self, encoder, **kwargs):
        self.encoder_ = encoder
        self.kwargs_ = kwargs  # keyword arguments for the encoder
        self.cv_ = KFold(n_splits=4)

    # Fit an encoder on one split and transform the feature on the
    # other. Iterating over the splits in all folds gives a complete
    # transformation. We also now have one trained encoder on each
    # fold.
    def fit_transform(self, X, y, cols):
        self.fitted_encoders_ = []
        self.cols_ = cols
        X_encoded = []
        for idx_encode, idx_train in self.cv_.split(X):
            fitted_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fitted_encoder.fit(
                X.iloc[idx_encode, :], y.iloc[idx_encode],
            )
            X_encoded.append(fitted_encoder.transform(X.iloc[idx_train, :])[cols])
            self.fitted_encoders_.append(fitted_encoder)
        X_encoded = pd.concat(X_encoded)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    # To transform the test data, average the encodings learned from
    # each fold.
    def transform(self, X):
        from functools import reduce

        X_encoded_list = []
        for fitted_encoder in self.fitted_encoders_:
            X_encoded = fitted_encoder.transform(X)
            X_encoded_list.append(X_encoded[self.cols_])
        X_encoded = reduce(
            lambda x, y: x.add(y, fill_value=0), X_encoded_list
        ) / len(X_encoded_list)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded    

In [102]:
# 1. Load data #

time0 = time.time()

os.chdir('/home/jupyter/projects_data/house_price')
df = pd.read_csv('train.csv') 
# df.drop(columns = ['Id'], inplace=True)
pred=pd.read_csv('test.csv')
pred0 = pred.copy()

print(df.shape, pred.shape)
df

(1460, 81) (1459, 80)


Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,LotConfig,LandSlope,Neighborhood,Condition1,Condition2,BldgType,HouseStyle,OverallQual,OverallCond,YearBuilt,YearRemodAdd,RoofStyle,RoofMatl,Exterior1st,Exterior2nd,MasVnrType,MasVnrArea,ExterQual,ExterCond,Foundation,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,BsmtFinSF1,BsmtFinType2,BsmtFinSF2,BsmtUnfSF,TotalBsmtSF,Heating,HeatingQC,CentralAir,Electrical,1stFlrSF,2ndFlrSF,LowQualFinSF,GrLivArea,BsmtFullBath,BsmtHalfBath,FullBath,HalfBath,BedroomAbvGr,KitchenAbvGr,KitchenQual,TotRmsAbvGrd,Functional,Fireplaces,FireplaceQu,GarageType,GarageYrBlt,GarageFinish,GarageCars,GarageArea,GarageQual,GarageCond,PavedDrive,WoodDeckSF,OpenPorchSF,EnclosedPorch,3SsnPorch,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2003,2003,Gable,CompShg,VinylSd,VinylSd,BrkFace,196.0,Gd,TA,PConc,Gd,TA,No,GLQ,706,Unf,0,150,856,GasA,Ex,Y,SBrkr,856,854,0,1710,1,0,2,1,3,1,Gd,8,Typ,0,,Attchd,2003.0,RFn,2,548,TA,TA,Y,0,61,0,0,0,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,FR2,Gtl,Veenker,Feedr,Norm,1Fam,1Story,6,8,1976,1976,Gable,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,Gd,TA,Gd,ALQ,978,Unf,0,284,1262,GasA,Ex,Y,SBrkr,1262,0,0,1262,0,1,2,0,3,1,TA,6,Typ,1,TA,Attchd,1976.0,RFn,2,460,TA,TA,Y,298,0,0,0,0,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,Inside,Gtl,CollgCr,Norm,Norm,1Fam,2Story,7,5,2001,2002,Gable,CompShg,VinylSd,VinylSd,BrkFace,162.0,Gd,TA,PConc,Gd,TA,Mn,GLQ,486,Unf,0,434,920,GasA,Ex,Y,SBrkr,920,866,0,1786,1,0,2,1,3,1,Gd,6,Typ,1,TA,Attchd,2001.0,RFn,2,608,TA,TA,Y,0,42,0,0,0,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,Corner,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,5,1915,1970,Gable,CompShg,Wd Sdng,Wd Shng,,0.0,TA,TA,BrkTil,TA,Gd,No,ALQ,216,Unf,0,540,756,GasA,Gd,Y,SBrkr,961,756,0,1717,1,0,1,0,3,1,Gd,7,Typ,1,Gd,Detchd,1998.0,Unf,3,642,TA,TA,Y,0,35,272,0,0,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,FR2,Gtl,NoRidge,Norm,Norm,1Fam,2Story,8,5,2000,2000,Gable,CompShg,VinylSd,VinylSd,BrkFace,350.0,Gd,TA,PConc,Gd,TA,Av,GLQ,655,Unf,0,490,1145,GasA,Ex,Y,SBrkr,1145,1053,0,2198,1,0,2,1,4,1,Gd,9,Typ,1,TA,Attchd,2000.0,RFn,3,836,TA,TA,Y,192,84,0,0,0,0,,,,0,12,2008,WD,Normal,250000
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,1456,60,RL,62.0,7917,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Gilbert,Norm,Norm,1Fam,2Story,6,5,1999,2000,Gable,CompShg,VinylSd,VinylSd,,0.0,TA,TA,PConc,Gd,TA,No,Unf,0,Unf,0,953,953,GasA,Ex,Y,SBrkr,953,694,0,1647,0,0,2,1,3,1,TA,7,Typ,1,TA,Attchd,1999.0,RFn,2,460,TA,TA,Y,0,40,0,0,0,0,,,,0,8,2007,WD,Normal,175000
1456,1457,20,RL,85.0,13175,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NWAmes,Norm,Norm,1Fam,1Story,6,6,1978,1988,Gable,CompShg,Plywood,Plywood,Stone,119.0,TA,TA,CBlock,Gd,TA,No,ALQ,790,Rec,163,589,1542,GasA,TA,Y,SBrkr,2073,0,0,2073,1,0,2,0,3,1,TA,7,Min1,2,TA,Attchd,1978.0,Unf,2,500,TA,TA,Y,349,0,0,0,0,0,,MnPrv,,0,2,2010,WD,Normal,210000
1457,1458,70,RL,66.0,9042,Pave,,Reg,Lvl,AllPub,Inside,Gtl,Crawfor,Norm,Norm,1Fam,2Story,7,9,1941,2006,Gable,CompShg,CemntBd,CmentBd,,0.0,Ex,Gd,Stone,TA,Gd,No,GLQ,275,Unf,0,877,1152,GasA,Ex,Y,SBrkr,1188,1152,0,2340,0,0,2,0,4,1,Gd,9,Typ,2,Gd,Attchd,1941.0,RFn,1,252,TA,TA,Y,0,60,0,0,0,0,,GdPrv,Shed,2500,5,2010,WD,Normal,266500
1458,1459,20,RL,68.0,9717,Pave,,Reg,Lvl,AllPub,Inside,Gtl,NAmes,Norm,Norm,1Fam,1Story,5,6,1950,1996,Hip,CompShg,MetalSd,MetalSd,,0.0,TA,TA,CBlock,TA,TA,Mn,GLQ,49,Rec,1029,0,1078,GasA,Gd,Y,FuseA,1078,0,0,1078,1,0,1,0,2,1,Gd,5,Typ,0,,Attchd,1950.0,Unf,1,240,TA,TA,Y,366,0,112,0,0,0,,,,0,4,2010,WD,Normal,142125


In [103]:
# 2. pEDA #

cols_tokeep = ['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond', 
               'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea',  
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr', 'FullBath', 
               'HalfBath', 'MiscVal', 'LotFrontage', 
               'ExterQual', 'MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
               'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
               'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive',
               'SaleType', 'SaleCondition', 'BsmtQual', 'BsmtCond', 
               'BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
df = df[cols_tokeep]

# preliminary feature engineering:
df['GrLivArea_log'] = np.log1p(df['GrLivArea'])
pred['GrLivArea_log'] = np.log1p(pred['GrLivArea'])
# w/o logtransform, scatterplot looks better. not sure whether log tranform helps.

df['MisGarage'] = df.GarageType.isnull().astype(int)
df['MisBsmt'] = df.BsmtCond.isnull().astype(int)

In [104]:
df

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,208500,8450,7,5,2003,TA,706,0,856,Ex,856,854,1710,Gd,548,2,8,3,2,1,0,65.0,Gd,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,No,GLQ,Attchd,RFn,TA,TA,7.444833,0,0
1,181500,9600,6,8,1976,TA,978,0,1262,Ex,1262,0,1262,TA,460,2,6,3,2,0,0,80.0,TA,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Gd,ALQ,Attchd,RFn,TA,TA,7.141245,0,0
2,223500,11250,7,5,2001,TA,486,0,920,Ex,920,866,1786,Gd,608,2,6,3,2,1,0,68.0,Gd,60,RL,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Mn,GLQ,Attchd,RFn,TA,TA,7.488294,0,0
3,140000,9550,7,5,1915,TA,216,0,756,Gd,961,756,1717,Gd,642,3,7,3,1,0,0,60.0,TA,70,RL,IR1,Lvl,Corner,Crawfor,Norm,1Fam,2Story,Gable,Wd Sdng,Wd Shng,BrkTil,GasA,Y,SBrkr,Typ,Y,WD,Abnorml,TA,Gd,No,ALQ,Detchd,Unf,TA,TA,7.448916,0,0
4,250000,14260,8,5,2000,TA,655,0,1145,Ex,1145,1053,2198,Gd,836,3,9,4,2,1,0,84.0,Gd,60,RL,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,Av,GLQ,Attchd,RFn,TA,TA,7.695758,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,175000,7917,6,5,1999,TA,0,0,953,Ex,953,694,1647,TA,460,2,7,3,2,1,0,62.0,TA,60,RL,Reg,Lvl,Inside,Gilbert,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,Gd,TA,No,Unf,Attchd,RFn,TA,TA,7.407318,0,0
1456,210000,13175,6,6,1978,TA,790,163,1542,TA,2073,0,2073,TA,500,2,7,3,2,0,0,85.0,TA,20,RL,Reg,Lvl,Inside,NWAmes,Norm,1Fam,1Story,Gable,Plywood,Plywood,CBlock,GasA,Y,SBrkr,Min1,Y,WD,Normal,Gd,TA,No,ALQ,Attchd,Unf,TA,TA,7.637234,0,0
1457,266500,9042,7,9,1941,Gd,275,0,1152,Ex,1188,1152,2340,Gd,252,1,9,4,2,0,2500,66.0,Ex,70,RL,Reg,Lvl,Inside,Crawfor,Norm,1Fam,2Story,Gable,CemntBd,CmentBd,Stone,GasA,Y,SBrkr,Typ,Y,WD,Normal,TA,Gd,No,GLQ,Attchd,RFn,TA,TA,7.758333,0,0
1458,142125,9717,5,6,1950,TA,49,1029,1078,Gd,1078,0,1078,Gd,240,1,5,2,1,0,0,68.0,TA,20,RL,Reg,Lvl,Inside,NAmes,Norm,1Fam,1Story,Hip,MetalSd,MetalSd,CBlock,GasA,Y,FuseA,Typ,Y,WD,Normal,TA,TA,Mn,GLQ,Attchd,Unf,TA,TA,6.983790,0,0


In [105]:
# 3. train-test split #

ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
num_cols = ['LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
            'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 
           'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log']
cat_cols = list(set(df.columns)-set(num_cols)-set(ord_cols)-set(['SalePrice']))
# for now, view ordinal features as categorical features
print("Numerical features ", num_cols, "\n",
      'Ordinal features', ord_cols, '\n',
      "Categorical features ", cat_cols)

df[ord_cols] = df[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])
pred[ord_cols] = pred[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])

Numerical features  ['LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log'] 
 Ordinal features ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond'] 
 Categorical features  ['Condition1', 'GarageType', 'GarageFinish', 'Neighborhood', 'MSSubClass', 'Exterior1st', 'BsmtExposure', 'Exterior2nd', 'HouseStyle', 'LotShape', 'FullBath', 'CentralAir', 'LotConfig', 'MSZoning', 'BldgType', 'MisBsmt', 'LandContour', 'BsmtFinType1', 'Functional', 'MisGarage', 'SaleCondition', 'PavedDrive', 'HalfBath', 'RoofStyle', 'SaleType', 'Foundation', 'Heating', 'Electrical']


In [106]:
df[cat_cols].nunique()

Condition1        9
GarageType        6
GarageFinish      3
Neighborhood     25
MSSubClass       15
Exterior1st      15
BsmtExposure      4
Exterior2nd      16
HouseStyle        8
LotShape          4
FullBath          4
CentralAir        2
LotConfig         5
MSZoning          5
BldgType          5
MisBsmt           2
LandContour       4
BsmtFinType1      6
Functional        7
MisGarage         2
SaleCondition     6
PavedDrive        3
HalfBath          3
RoofStyle         6
SaleType          9
Foundation        6
Heating           6
Electrical        5
dtype: int64

According to feature importances, only Neighboorhood and possibly Exterior1 categorical features are really useful.
And they have too many unique values to use OHC. All other categorical features are not important enough to bother with ohe.
So use target encoding for all of them.

In [107]:
test_size = 0.2
df.reset_index(inplace=True, drop=True)
random.seed(2)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
train = df.iloc[list(set(df.index)-set(test_index))]
test = df.iloc[test_index]
train.reset_index(drop=True, inplace=True)
test.reset_index(drop=True, inplace=True)
train.drop(columns=['id'],inplace=True, errors='ignore')
test.drop(columns=['id'],inplace=True, errors='ignore')
display(train.shape, test.shape, train.head(3), test.head(3))

(1168, 54)

(292, 54)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,2,1,0,65.0,4,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,GLQ,Attchd,RFn,3.0,3.0,7.444833,0,0
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,2,0,0,80.0,3,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Gd,ALQ,Attchd,RFn,3.0,3.0,7.141245,0,0
2,223500,11250,7,5,2001,3,486,0,920,5,920,866,1786,4,608,2,6,3,2,1,0,68.0,4,60,RL,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Mn,GLQ,Attchd,RFn,3.0,3.0,7.488294,0,0


Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,176000,3230,6,5,1999,3,419,0,729,4,729,729,1458,3,440,2,5,2,2,1,0,34.0,3,160,FV,Reg,Lvl,Corner,Somerst,Norm,TwnhsE,2Story,Gable,MetalSd,MetalSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,GLQ,Detchd,Unf,3.0,3.0,7.285507,0,0
1,135000,10410,5,7,1916,3,0,0,660,5,808,704,1656,3,180,1,8,3,2,1,0,60.0,3,50,RL,Reg,Lvl,Inside,OldTown,Norm,1Fam,1.5Fin,Gable,HdBoard,HdBoard,CBlock,GasA,Y,SBrkr,Min2,N,WD,Normal,2.0,3.0,No,Unf,Detchd,Unf,2.0,2.0,7.412764,0,0
2,163000,10197,6,5,1961,3,288,374,1362,3,1362,0,1362,3,504,2,6,3,1,1,0,80.0,3,20,RL,IR1,Lvl,Inside,NAmes,Norm,1Fam,1Story,Gable,WdShing,Wd Shng,CBlock,GasA,Y,SBrkr,Typ,Y,COD,Normal,3.0,3.0,No,ALQ,Attchd,Unf,3.0,3.0,7.217443,0,0


In [108]:
train

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,FullBath,HalfBath,MiscVal,LotFrontage,ExterQual,MSSubClass,MSZoning,LotShape,LandContour,LotConfig,Neighborhood,Condition1,BldgType,HouseStyle,RoofStyle,Exterior1st,Exterior2nd,Foundation,Heating,CentralAir,Electrical,Functional,PavedDrive,SaleType,SaleCondition,BsmtQual,BsmtCond,BsmtExposure,BsmtFinType1,GarageType,GarageFinish,GarageQual,GarageCond,GrLivArea_log,MisGarage,MisBsmt
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,2,1,0,65.0,4,60,RL,Reg,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,GLQ,Attchd,RFn,3.0,3.0,7.444833,0,0
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,2,0,0,80.0,3,20,RL,Reg,Lvl,FR2,Veenker,Feedr,1Fam,1Story,Gable,MetalSd,MetalSd,CBlock,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Gd,ALQ,Attchd,RFn,3.0,3.0,7.141245,0,0
2,223500,11250,7,5,2001,3,486,0,920,5,920,866,1786,4,608,2,6,3,2,1,0,68.0,4,60,RL,IR1,Lvl,Inside,CollgCr,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Mn,GLQ,Attchd,RFn,3.0,3.0,7.488294,0,0
3,250000,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,2,1,0,84.0,4,60,RL,IR1,Lvl,FR2,NoRidge,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,Av,GLQ,Attchd,RFn,3.0,3.0,7.695758,0,0
4,143000,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,1,1,700,85.0,3,50,RL,IR1,Lvl,Inside,Mitchel,Norm,1Fam,1.5Fin,Gable,VinylSd,VinylSd,Wood,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,GLQ,Attchd,Unf,3.0,3.0,7.217443,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,175000,7917,6,5,1999,3,0,0,953,5,953,694,1647,3,460,2,7,3,2,1,0,62.0,3,60,RL,Reg,Lvl,Inside,Gilbert,Norm,1Fam,2Story,Gable,VinylSd,VinylSd,PConc,GasA,Y,SBrkr,Typ,Y,WD,Normal,4.0,3.0,No,Unf,Attchd,RFn,3.0,3.0,7.407318,0,0
1164,210000,13175,6,6,1978,3,790,163,1542,3,2073,0,2073,3,500,2,7,3,2,0,0,85.0,3,20,RL,Reg,Lvl,Inside,NWAmes,Norm,1Fam,1Story,Gable,Plywood,Plywood,CBlock,GasA,Y,SBrkr,Min1,Y,WD,Normal,4.0,3.0,No,ALQ,Attchd,Unf,3.0,3.0,7.637234,0,0
1165,266500,9042,7,9,1941,4,275,0,1152,5,1188,1152,2340,4,252,1,9,4,2,0,2500,66.0,5,70,RL,Reg,Lvl,Inside,Crawfor,Norm,1Fam,2Story,Gable,CemntBd,CmentBd,Stone,GasA,Y,SBrkr,Typ,Y,WD,Normal,3.0,4.0,No,GLQ,Attchd,RFn,3.0,3.0,7.758333,0,0
1166,142125,9717,5,6,1950,3,49,1029,1078,4,1078,0,1078,4,240,1,5,2,1,0,0,68.0,3,20,RL,Reg,Lvl,Inside,NAmes,Norm,1Fam,1Story,Hip,MetalSd,MetalSd,CBlock,GasA,Y,FuseA,Typ,Y,WD,Normal,3.0,3.0,Mn,GLQ,Attchd,Unf,3.0,3.0,6.983790,0,0


In [109]:
# fill missing values
display(train.info())

mis_col_mode = ['LotFrontage', 'Electrical']
mis_cat_cols = ['BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish']
mis_num_cols = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']

for col in mis_col_mode:
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(train[col].mode()[0])

for col in mis_cat_cols:
    train[col] = train[col].fillna(value='missing')
    test[col] = test[col].fillna(value='missing')
    
for col in mis_num_cols:
    train[col] = train[col].fillna(value=-1)
    test[col] = test[col].fillna(value=-1)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1168 entries, 0 to 1167
Data columns (total 54 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   SalePrice      1168 non-null   int64  
 1   LotArea        1168 non-null   int64  
 2   OverallQual    1168 non-null   int64  
 3   OverallCond    1168 non-null   int64  
 4   YearBuilt      1168 non-null   int64  
 5   ExterCond      1168 non-null   int64  
 6   BsmtFinSF1     1168 non-null   int64  
 7   BsmtFinSF2     1168 non-null   int64  
 8   TotalBsmtSF    1168 non-null   int64  
 9   HeatingQC      1168 non-null   int64  
 10  1stFlrSF       1168 non-null   int64  
 11  2ndFlrSF       1168 non-null   int64  
 12  GrLivArea      1168 non-null   int64  
 13  KitchenQual    1168 non-null   int64  
 14  GarageArea     1168 non-null   int64  
 15  GarageCars     1168 non-null   int64  
 16  TotRmsAbvGrd   1168 non-null   int64  
 17  BedroomAbvGr   1168 non-null   int64  
 18  FullBath

None

In [110]:
### do target encoding ###

encoder = CrossFoldEncoder(MEstimateEncoder, m=10)
train_encoded = encoder.fit_transform(train, train.SalePrice, cols=cat_cols)
test_encoded = encoder.transform(test)

train.drop(columns=cat_cols, inplace=True)
test.drop(columns=cat_cols,  inplace=True)
train = pd.concat([train, train_encoded], axis = 1)
test = pd.concat([test, test_encoded], axis = 1)

display(train.shape, train.head(), train.count())
train0 = train.copy()
test0 = test.copy()

(1168, 54)

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,GrLivArea_log,Condition1_encoded,GarageType_encoded,GarageFinish_encoded,Neighborhood_encoded,MSSubClass_encoded,Exterior1st_encoded,BsmtExposure_encoded,Exterior2nd_encoded,HouseStyle_encoded,LotShape_encoded,FullBath_encoded,CentralAir_encoded,LotConfig_encoded,MSZoning_encoded,BldgType_encoded,MisBsmt_encoded,LandContour_encoded,BsmtFinType1_encoded,Functional_encoded,MisGarage_encoded,SaleCondition_encoded,PavedDrive_encoded,HalfBath_encoded,RoofStyle_encoded,SaleType_encoded,Foundation_encoded,Heating_encoded,Electrical_encoded
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,0,65.0,4,4.0,3.0,3.0,3.0,7.444833,182674.1645,200574.832714,196950.262584,195399.622658,238929.484626,212712.42914,164756.801323,213377.142953,209957.366562,163215.241198,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.59521,171888.617908,225241.198424,181003.706895,185676.932477
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,0,80.0,3,4.0,3.0,3.0,3.0,7.141245,154532.744697,200574.832714,196950.262584,194339.344749,183254.018872,152679.296674,253912.335302,152980.445564,174270.129347,163215.241198,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,160403.804023,182169.617926,183915.682724,173678.534313,185373.853992,161074.500308,169385.59521,171888.617908,146841.013681,181003.706895,185676.932477
2,223500,11250,7,5,2001,3,486,0,920,5,920,866,1786,4,608,2,6,3,0,68.0,4,4.0,3.0,3.0,3.0,7.488294,182674.1645,200574.832714,196950.262584,195399.622658,238929.484626,212712.42914,184252.040142,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.59521,171888.617908,225241.198424,181003.706895,185676.932477
3,250000,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,0,84.0,4,4.0,3.0,3.0,3.0,7.695758,182674.1645,200574.832714,196950.262584,298505.355979,238929.484626,212712.42914,204991.43268,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.59521,171888.617908,225241.198424,181003.706895,185676.932477
4,143000,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,700,85.0,3,4.0,3.0,3.0,3.0,7.217443,182674.1645,200574.832714,142186.274858,162749.734358,147548.516547,212712.42914,164756.801323,213377.142953,146452.295149,208022.174206,135353.752899,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.59521,171888.617908,186235.470112,181003.706895,185676.932477


SalePrice                1168
LotArea                  1168
OverallQual              1168
OverallCond              1168
YearBuilt                1168
ExterCond                1168
BsmtFinSF1               1168
BsmtFinSF2               1168
TotalBsmtSF              1168
HeatingQC                1168
1stFlrSF                 1168
2ndFlrSF                 1168
GrLivArea                1168
KitchenQual              1168
GarageArea               1168
GarageCars               1168
TotRmsAbvGrd             1168
BedroomAbvGr             1168
MiscVal                  1168
LotFrontage              1168
ExterQual                1168
BsmtQual                 1168
BsmtCond                 1168
GarageQual               1168
GarageCond               1168
GrLivArea_log            1168
Condition1_encoded       1168
GarageType_encoded       1168
GarageFinish_encoded     1168
Neighborhood_encoded     1168
MSSubClass_encoded       1168
Exterior1st_encoded      1168
BsmtExposure_encoded     1168
Exterior2n

In [111]:
train

Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,GrLivArea_log,Condition1_encoded,GarageType_encoded,GarageFinish_encoded,Neighborhood_encoded,MSSubClass_encoded,Exterior1st_encoded,BsmtExposure_encoded,Exterior2nd_encoded,HouseStyle_encoded,LotShape_encoded,FullBath_encoded,CentralAir_encoded,LotConfig_encoded,MSZoning_encoded,BldgType_encoded,MisBsmt_encoded,LandContour_encoded,BsmtFinType1_encoded,Functional_encoded,MisGarage_encoded,SaleCondition_encoded,PavedDrive_encoded,HalfBath_encoded,RoofStyle_encoded,SaleType_encoded,Foundation_encoded,Heating_encoded,Electrical_encoded
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,0,65.0,4,4.0,3.0,3.0,3.0,7.444833,182674.164500,200574.832714,196950.262584,195399.622658,238929.484626,212712.429140,164756.801323,213377.142953,209957.366562,163215.241198,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,0,80.0,3,4.0,3.0,3.0,3.0,7.141245,154532.744697,200574.832714,196950.262584,194339.344749,183254.018872,152679.296674,253912.335302,152980.445564,174270.129347,163215.241198,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,160403.804023,182169.617926,183915.682724,173678.534313,185373.853992,161074.500308,169385.595210,171888.617908,146841.013681,181003.706895,185676.932477
2,223500,11250,7,5,2001,3,486,0,920,5,920,866,1786,4,608,2,6,3,0,68.0,4,4.0,3.0,3.0,3.0,7.488294,182674.164500,200574.832714,196950.262584,195399.622658,238929.484626,212712.429140,184252.040142,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477
3,250000,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,0,84.0,4,4.0,3.0,3.0,3.0,7.695758,182674.164500,200574.832714,196950.262584,298505.355979,238929.484626,212712.429140,204991.432680,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477
4,143000,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,700,85.0,3,4.0,3.0,3.0,3.0,7.217443,182674.164500,200574.832714,142186.274858,162749.734358,147548.516547,212712.429140,164756.801323,213377.142953,146452.295149,208022.174206,135353.752899,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,186235.470112,181003.706895,185676.932477
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,175000,7917,6,5,1999,3,0,0,953,5,953,694,1647,3,460,2,7,3,0,62.0,3,4.0,3.0,3.0,3.0,7.407318,185981.875122,200983.653258,201849.699694,186989.176712,241782.255074,216480.587719,167647.743548,216961.195239,214204.908403,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,174403.270955,184592.414205,186549.244004,176052.874095,187888.374656,215552.488714,173337.318562,173440.733926,225986.794811,182943.828215,188607.889641
1164,210000,13175,6,6,1978,3,790,163,1542,3,2073,0,2073,3,500,2,7,3,0,85.0,3,4.0,3.0,3.0,3.0,7.637234,185981.875122,200983.653258,144669.190105,184168.667677,183563.195085,173306.009220,167647.743548,167945.281561,175006.663858,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,162598.281886,164307.113299,186549.244004,176052.874095,187888.374656,162465.702197,173337.318562,173440.733926,150418.299540,182943.828215,188607.889641
1165,266500,9042,7,9,1941,4,275,0,1152,5,1188,1152,2340,4,252,1,9,4,2500,66.0,5,3.0,4.0,3.0,3.0,7.758333,185981.875122,200983.653258,201849.699694,195649.871199,173653.064742,222571.790187,167647.743548,217021.653600,214204.908403,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,234255.350073,184592.414205,186549.244004,176052.874095,187888.374656,162465.702197,173337.318562,173440.733926,174630.408513,182943.828215,188607.889641
1166,142125,9717,5,6,1950,3,49,1029,1078,4,1078,0,1078,4,240,1,5,2,0,68.0,3,3.0,3.0,3.0,3.0,6.983790,185981.875122,200983.653258,144669.190105,148677.814416,183563.195085,154655.333693,193802.982922,154789.435550,175006.663858,168262.279928,137068.597830,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,234255.350073,184592.414205,186549.244004,176052.874095,187888.374656,162465.702197,217671.576218,173440.733926,150418.299540,182943.828215,129339.184336


In [112]:
add_dummyfeatures(train, test, {'OverallQual':1})
add_dummyfeatures(train, test, {'OverallQual':8})
add_dummyfeatures(train, test, {'OverallQual':9})
add_dummyfeatures(train, test, {'OverallQual':10})
train

[1 1]  variables created
[1 1]  variables created
[1 1]  variables created
[1 1]  variables created


Unnamed: 0,SalePrice,LotArea,OverallQual,OverallCond,YearBuilt,ExterCond,BsmtFinSF1,BsmtFinSF2,TotalBsmtSF,HeatingQC,1stFlrSF,2ndFlrSF,GrLivArea,KitchenQual,GarageArea,GarageCars,TotRmsAbvGrd,BedroomAbvGr,MiscVal,LotFrontage,ExterQual,BsmtQual,BsmtCond,GarageQual,GarageCond,GrLivArea_log,Condition1_encoded,GarageType_encoded,GarageFinish_encoded,Neighborhood_encoded,MSSubClass_encoded,Exterior1st_encoded,BsmtExposure_encoded,Exterior2nd_encoded,HouseStyle_encoded,LotShape_encoded,FullBath_encoded,CentralAir_encoded,LotConfig_encoded,MSZoning_encoded,BldgType_encoded,MisBsmt_encoded,LandContour_encoded,BsmtFinType1_encoded,Functional_encoded,MisGarage_encoded,SaleCondition_encoded,PavedDrive_encoded,HalfBath_encoded,RoofStyle_encoded,SaleType_encoded,Foundation_encoded,Heating_encoded,Electrical_encoded,OverallQual1,OverallQual8,OverallQual9,OverallQual10
0,208500,8450,7,5,2003,3,706,0,856,5,856,854,1710,4,548,2,8,3,0,65.0,4,4.0,3.0,3.0,3.0,7.444833,182674.164500,200574.832714,196950.262584,195399.622658,238929.484626,212712.429140,164756.801323,213377.142953,209957.366562,163215.241198,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477,0.0,0.0,0.0,0.0
1,181500,9600,6,8,1976,3,978,0,1262,5,1262,0,1262,3,460,2,6,3,0,80.0,3,4.0,3.0,3.0,3.0,7.141245,154532.744697,200574.832714,196950.262584,194339.344749,183254.018872,152679.296674,253912.335302,152980.445564,174270.129347,163215.241198,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,160403.804023,182169.617926,183915.682724,173678.534313,185373.853992,161074.500308,169385.595210,171888.617908,146841.013681,181003.706895,185676.932477,0.0,0.0,0.0,0.0
2,223500,11250,7,5,2001,3,486,0,920,5,920,866,1786,4,608,2,6,3,0,68.0,4,4.0,3.0,3.0,3.0,7.488294,182674.164500,200574.832714,196950.262584,195399.622658,238929.484626,212712.429140,184252.040142,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477,0.0,0.0,0.0,0.0
3,250000,14260,8,5,2000,3,655,0,1145,5,1145,1053,2198,4,836,3,9,4,0,84.0,4,4.0,3.0,3.0,3.0,7.695758,182674.164500,200574.832714,196950.262584,298505.355979,238929.484626,212712.429140,204991.432680,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477,0.0,1.0,0.0,0.0
4,143000,14115,5,5,1993,3,732,0,796,5,796,566,1362,3,480,2,5,1,700,85.0,3,4.0,3.0,3.0,3.0,7.217443,182674.164500,200574.832714,142186.274858,162749.734358,147548.516547,212712.429140,164756.801323,213377.142953,146452.295149,208022.174206,135353.752899,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,186235.470112,181003.706895,185676.932477,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,175000,7917,6,5,1999,3,0,0,953,5,953,694,1647,3,460,2,7,3,0,62.0,3,4.0,3.0,3.0,3.0,7.407318,185981.875122,200983.653258,201849.699694,186989.176712,241782.255074,216480.587719,167647.743548,216961.195239,214204.908403,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,174403.270955,184592.414205,186549.244004,176052.874095,187888.374656,215552.488714,173337.318562,173440.733926,225986.794811,182943.828215,188607.889641,0.0,0.0,0.0,0.0
1164,210000,13175,6,6,1978,3,790,163,1542,3,2073,0,2073,3,500,2,7,3,0,85.0,3,4.0,3.0,3.0,3.0,7.637234,185981.875122,200983.653258,144669.190105,184168.667677,183563.195085,173306.009220,167647.743548,167945.281561,175006.663858,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,162598.281886,164307.113299,186549.244004,176052.874095,187888.374656,162465.702197,173337.318562,173440.733926,150418.299540,182943.828215,188607.889641,0.0,0.0,0.0,0.0
1165,266500,9042,7,9,1941,4,275,0,1152,5,1188,1152,2340,4,252,1,9,4,2500,66.0,5,3.0,4.0,3.0,3.0,7.758333,185981.875122,200983.653258,201849.699694,195649.871199,173653.064742,222571.790187,167647.743548,217021.653600,214204.908403,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,234255.350073,184592.414205,186549.244004,176052.874095,187888.374656,162465.702197,173337.318562,173440.733926,174630.408513,182943.828215,188607.889641,0.0,0.0,0.0,0.0
1166,142125,9717,5,6,1950,3,49,1029,1078,4,1078,0,1078,4,240,1,5,2,0,68.0,3,3.0,3.0,3.0,3.0,6.983790,185981.875122,200983.653258,144669.190105,148677.814416,183563.195085,154655.333693,193802.982922,154789.435550,175006.663858,168262.279928,137068.597830,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,234255.350073,184592.414205,186549.244004,176052.874095,187888.374656,162465.702197,217671.576218,173440.733926,150418.299540,182943.828215,129339.184336,0.0,0.0,0.0,0.0


In [113]:
log_transformer_mp_i1(train, test, feature_subset=num_cols)

Skewed columns log-transformed:  ['MiscVal', 'LotArea', 'BsmtFinSF2']


In [115]:
X_train = train.copy()
y_train = X_train.pop('SalePrice')
X_test = test.copy()
y_test = X_test.pop('SalePrice')
print(X_train.shape, X_test.shape, y_train.shape)


(1168, 57) (292, 57) (1168,)


In [117]:
feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols+ord_cols),
    ], remainder = "passthrough")

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), 
                       columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), 
                      columns=feature_transformer.get_feature_names_out())

# there are many dummies... may wish to use pca here later.

print(X_train.shape, X_test.shape, y_train.shape)


(1168, 57) (292, 57) (1168,)


In [118]:
X_train

Unnamed: 0,num__LotArea,num__YearBuilt,num__BsmtFinSF1,num__BsmtFinSF2,num__TotalBsmtSF,num__1stFlrSF,num__2ndFlrSF,num__GrLivArea,num__GarageArea,num__MiscVal,num__LotFrontage,num__TotRmsAbvGrd,num__GarageCars,num__BedroomAbvGr,num__OverallCond,num__OverallQual,num__GrLivArea_log,num__ExterCond,num__HeatingQC,num__KitchenQual,num__ExterQual,num__BsmtQual,num__BsmtCond,num__GarageQual,num__GarageCond,remainder__Condition1_encoded,remainder__GarageType_encoded,remainder__GarageFinish_encoded,remainder__Neighborhood_encoded,remainder__MSSubClass_encoded,remainder__Exterior1st_encoded,remainder__BsmtExposure_encoded,remainder__Exterior2nd_encoded,remainder__HouseStyle_encoded,remainder__LotShape_encoded,remainder__FullBath_encoded,remainder__CentralAir_encoded,remainder__LotConfig_encoded,remainder__MSZoning_encoded,remainder__BldgType_encoded,remainder__MisBsmt_encoded,remainder__LandContour_encoded,remainder__BsmtFinType1_encoded,remainder__Functional_encoded,remainder__MisGarage_encoded,remainder__SaleCondition_encoded,remainder__PavedDrive_encoded,remainder__HalfBath_encoded,remainder__RoofStyle_encoded,remainder__SaleType_encoded,remainder__Foundation_encoded,remainder__Heating_encoded,remainder__Electrical_encoded,remainder__OverallQual1,remainder__OverallQual8,remainder__OverallQual9,remainder__OverallQual10
0,-0.135621,1.053414,0.571322,-0.352717,-0.459328,-0.796429,1.166987,0.374514,0.349494,-0.184376,-0.148847,0.928845,0.302357,0.175223,-0.520955,0.651341,0.534249,-0.230812,0.887512,0.739931,1.036520,0.540864,0.133868,0.254423,0.257168,182674.164500,200574.832714,196950.262584,195399.622658,238929.484626,212712.429140,164756.801323,213377.142953,209957.366562,163215.241198,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477,0.0,0.0,0.0,0.0
1,0.113139,0.161781,1.162129,-0.352717,0.468758,0.263530,-0.793618,-0.475436,-0.065626,-0.184376,0.514546,-0.303986,0.302357,0.175223,2.136142,-0.071134,-0.373783,-0.230812,0.887512,-0.773623,-0.692987,0.540864,0.133868,0.254423,0.257168,154532.744697,200574.832714,196950.262584,194339.344749,183254.018872,152679.296674,253912.335302,152980.445564,174270.129347,163215.241198,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,160403.804023,182169.617926,183915.682724,173678.534313,185373.853992,161074.500308,169385.595210,171888.617908,146841.013681,181003.706895,185676.932477,0.0,0.0,0.0,0.0
2,0.422357,0.987367,0.093463,-0.352717,-0.313029,-0.629342,1.194537,0.518702,0.632530,-0.184376,-0.016168,-0.303986,0.302357,0.175223,-0.520955,0.651341,0.664238,-0.230812,0.887512,0.739931,1.036520,0.540864,0.133868,0.254423,0.257168,182674.164500,200574.832714,196950.262584,195399.622658,238929.484626,212712.429140,184252.040142,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477,0.0,0.0,0.0,0.0
3,0.884599,0.954343,0.460546,-0.352717,0.201304,-0.041926,1.623849,1.300352,1.708068,-0.184376,0.691450,1.545260,1.645146,1.408117,-0.520955,1.373816,1.284765,-0.230812,0.887512,0.739931,1.036520,0.540864,0.133868,0.254423,0.257168,182674.164500,200574.832714,196950.262584,298505.355979,238929.484626,212712.429140,204991.432680,213377.142953,209957.366562,208022.174206,210327.852299,184993.291622,184362.129281,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,225241.198424,181003.706895,185676.932477,0.0,1.0,0.0,0.0
4,0.864673,0.723179,0.627796,-0.352717,-0.596484,-0.953073,0.505799,-0.285715,0.028720,5.340384,0.735677,-0.920401,0.302357,-2.290564,-0.520955,-0.793609,-0.145874,-0.230812,0.887512,-0.773623,-0.692987,0.540864,0.133868,0.254423,0.257168,182674.164500,200574.832714,142186.274858,162749.734358,147548.516547,212712.429140,164756.801323,213377.142953,146452.295149,208022.174206,135353.752899,184993.291622,174936.402071,188446.863686,184561.336263,181771.651065,179299.736926,236454.399884,182169.617926,183915.682724,173678.534313,185373.853992,211831.562451,169385.595210,171888.617908,186235.470112,181003.706895,185676.932477,0.0,0.0,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1163,-0.262643,0.921320,-0.962171,-0.352717,-0.237593,-0.543187,0.799661,0.254990,-0.065626,-0.184376,-0.281525,0.312430,0.302357,0.175223,-0.520955,-0.071134,0.422040,-0.230812,0.887512,-0.773623,-0.692987,0.540864,0.133868,0.254423,0.257168,185981.875122,200983.653258,201849.699694,186989.176712,241782.255074,216480.587719,167647.743548,216961.195239,214204.908403,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,174403.270955,184592.414205,186549.244004,176052.874095,187888.374656,215552.488714,173337.318562,173440.733926,225986.794811,182943.828215,188607.889641,0.0,0.0,0.0,0.0
1164,0.730308,0.227828,0.753777,2.441885,1.108817,2.380838,-0.793618,1.063201,0.123065,-0.184376,0.735677,0.312430,0.302357,0.175223,0.364744,-0.071134,1.109721,-0.230812,-1.183645,-0.773623,-0.692987,0.540864,0.133868,0.254423,0.257168,185981.875122,200983.653258,144669.190105,184168.667677,183563.195085,173306.009220,167647.743548,167945.281561,175006.663858,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,162598.281886,164307.113299,186549.244004,176052.874095,187888.374656,162465.702197,173337.318562,173440.733926,150418.299540,182943.828215,188607.889641,0.0,0.0,0.0,0.0
1165,-0.003608,-0.994039,-0.364847,-0.352717,0.217306,0.070336,1.851133,1.569756,-1.046818,6.412820,-0.104621,1.545260,-1.040431,1.408117,3.021841,0.651341,1.471928,2.577400,0.887512,0.739931,2.766027,-0.496458,1.637306,0.254423,0.257168,185981.875122,200983.653258,201849.699694,195649.871199,173653.064742,222571.790187,167647.743548,217021.653600,214204.908403,168262.279928,213905.401121,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,234255.350073,184592.414205,186549.244004,176052.874095,187888.374656,162465.702197,173337.318562,173440.733926,174630.408513,182943.828215,188607.889641,0.0,0.0,0.0,0.0
1166,0.136756,-0.696828,-0.855739,3.448761,0.048147,-0.216845,-0.793618,-0.824523,-1.103425,-0.184376,-0.016168,-0.920401,-1.040431,-1.057671,0.364744,-0.793609,-0.844731,-0.230812,-0.148066,0.739931,-0.692987,-0.496458,0.133868,0.254423,0.257168,185981.875122,200983.653258,144669.190105,148677.814416,183563.195085,154655.333693,193802.982922,154789.435550,175006.663858,168262.279928,137068.597830,187666.177411,178915.214736,191555.237492,187084.754999,184068.659791,182095.082355,234255.350073,184592.414205,186549.244004,176052.874095,187888.374656,162465.702197,217671.576218,173440.733926,150418.299540,182943.828215,129339.184336,0.0,0.0,0.0,0.0


In [120]:
# 6. Model Fitting #

print(X_train.shape)

lr = LinearRegression()
lr.fit(X_train, y_train)
print('OLS ', mean_squared_error(y_train, lr.predict(X_train))**0.5)


(1168, 57)
OLS  28515.943812401296


In [None]:
# 6. Model Fitting #

print(X_train.shape)

lr = LinearRegression()
lr.fit(X_train[cols_veryveryfew], y_train)
print('OLS ', mean_squared_error(y_train, lr.predict(X_train[cols_veryveryfew])))

time1 = time.time()
svr4 = SVR()
grid_param = {'C': [50000, 100000, 200000, 400000, 600000, 900000]}
svrm4 = GridSearchCV(svr4, grid_param, cv=8, scoring='neg_root_mean_squared_error')
svrm4.fit(X_train[cols_veryveryfew], y_train)
print('SVR 56 cols', 
      svrm4.best_params_, 
      svrm4.best_score_, 
      np.sqrt(mean_squared_error(y_train, svrm4.predict(X_train[cols_veryveryfew]))), 
      time.time()-time1)

xgbb = XGBRegressor(n_estimators=200,
                   max_depth=5,
                   eta=0.06,
                   subsample=0.8,
                   colsample_bytree=0.6)
xgbb.fit(X_train[cols_veryveryfew], y_train)

xgb4 = XGBRegressor()
grid_param = {'n_estimators':[200], 
              'max_depth':[2,3,4,5], 
              'eta':[0.04, 0.06, 0.08, 0.1],
             'subsample':[0.7], 
              'colsample_bytree':[0.5]}
xgbm4 = GridSearchCV(xgb4, grid_param, cv=8, scoring='neg_root_mean_squared_error')
xgbm4.fit(X_train[cols_veryveryfew], y_train)
print('XGB 56 cols', 
      xgbm4.best_params_, 
      xgbm4.best_score_, 
      np.sqrt(mean_squared_error(y_train, xgbm4.predict(X_train[cols_veryveryfew]))), 
      time.time()-time1)

# 7. Model Evaluation #

print('SVR 56', np.sqrt(mean_squared_error(y_test, svrm4.predict(X_test[cols_veryveryfew]))))
print('XGB 56', np.sqrt(mean_squared_error(y_test, xgbm4.predict(X_test[cols_veryveryfew]))))

# sometimes ridge may fail really bad.
print('Total Time is ', time.time()-time0)

# all 3 models perform best with the smallest features set (56 features)

In [None]:
print('train lr 56', np.sqrt(mean_squared_error(y_train, lr.predict(X_train[cols_veryveryfew]))))
print('train SVR 56', np.sqrt(mean_squared_error(y_train, svrm4.predict(X_train[cols_veryveryfew]))))
print('train xgb 56', np.sqrt(mean_squared_error(y_train, xgbb.predict(X_train[cols_veryveryfew]))))
print('test lr 56', np.sqrt(mean_squared_error(y_test, lr.predict(X_test[cols_veryveryfew]))))
print('test SVR 56', np.sqrt(mean_squared_error(y_test, svrm4.predict(X_test[cols_veryveryfew]))))
print('test xgb 56', np.sqrt(mean_squared_error(y_test, xgbb.predict(X_test[cols_veryveryfew]))))