In [1]:
from sklearn.preprocessing import OrdinalEncoder
from sklearn.model_selection import GridSearchCV
from pycaret.regression import setup, compare_models
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import pandas as pd

In [2]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge, OrthogonalMatchingPursuit, LinearRegression, Lasso, Ridge
from lightgbm import LGBMRegressor
from catboost import CatBoostRegressor

# Read train and test sets

In [3]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

# Store both dfs lengths

In [4]:
m_train = df_train.shape[0]
m_test = df_test.shape[0]
m_train, m_test

(1460, 1459)

# Concat the dfs

In [5]:
df = pd.concat([df_train, df_test])
assert df.shape[0] == m_train + m_test

In [6]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


# Save target

In [7]:
target_col, target = "SalePrice", df["SalePrice"]

#  Drop Id and target columns

In [8]:
cols_to_drop = ["Id", target_col]
df.drop(cols_to_drop, axis=1, inplace=True)

# View and fix columns data-types

In [9]:
df.select_dtypes(object).columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [10]:
df.select_dtypes(np.number).columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [11]:
num_to_obj_cols = ['MSSubClass', 'MoSold']
df[num_to_obj_cols] = df[num_to_obj_cols].astype(object)

In [12]:
cols_cat = df.select_dtypes(object).columns.to_list()
cols_num = df.select_dtypes(np.number).columns.to_list()

# Impute categorical columns

In [13]:
cols_cat_na = df[cols_cat].isnull().sum()[df[cols_cat].isnull().sum() > 0]
cols_cat_na

MSZoning           4
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinType2      80
Electrical         1
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageFinish     159
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [14]:
mode_filled_cols = ["MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical", "KitchenQual", "Functional", "SaleType"]
for col in mode_filled_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

none_filled_cols = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]
for col in none_filled_cols:
    df[col].fillna("None", inplace=True)
    
df[cols_cat].isnull().sum().sum()

0

# Change ordinal columns to numeric, and encode accordingly

In [15]:
df.select_dtypes(object).columns

Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'Utilities', 'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1',
       'Condition2', 'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl',
       'Exterior1st', 'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond',
       'Foundation', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinType2', 'Heating', 'HeatingQC', 'CentralAir', 'Electrical',
       'KitchenQual', 'Functional', 'FireplaceQu', 'GarageType',
       'GarageFinish', 'GarageQual', 'GarageCond', 'PavedDrive', 'PoolQC',
       'Fence', 'MiscFeature', 'MoSold', 'SaleType', 'SaleCondition'],
      dtype='object')

In [16]:
oe = OrdinalEncoder(categories=[['Reg', 'IR1', 'IR2', 'IR3']])
df.loc[:, "LotShape"] = oe.fit_transform(df[["LotShape"]])
df["LotShape_sq"] = df["LotShape"] ** 2

In [17]:
oe = OrdinalEncoder(categories=[['Gtl', 'Mod', 'Sev']])
df.loc[:, "LandSlope"] = oe.fit_transform(df[["LandSlope"]])
df["LandSlope_sq"] = df["LandSlope"] ** 2

In [18]:
qual_oe = OrdinalEncoder(categories=[['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']])
for col in ["ExterQual", "ExterCond", 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    df.loc[:, col] = qual_oe.fit_transform(df[[col]])
    df[col + "sq"] = df[col] ** 2

In [19]:
oe = OrdinalEncoder(categories=[['None', 'No', 'Mn', 'Av', 'Gd']])
df.loc[:, "BsmtExposure"] = oe.fit_transform(df[["BsmtExposure"]])
df["BsmtExposure_sq"] = df["BsmtExposure"] ** 2

In [20]:
oe = OrdinalEncoder(categories=[['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']])
for col in ["BsmtFinType1", "BsmtFinType2"]:
    df.loc[:, col] = oe.fit_transform(df[[col]])
    df[col + "sq"] = df[col] ** 2

In [21]:
oe = OrdinalEncoder(categories=[['None', 'Unf', 'RFn', 'Fin']])
df.loc[:, "GarageFinish"] = oe.fit_transform(df[["GarageFinish"]])
df["GarageFinishsq"] = df["GarageFinish"] ** 2

In [22]:
oe = OrdinalEncoder(categories=[['N', 'P', 'Y']])
df.loc[:, "PavedDrive"] = oe.fit_transform(df[["PavedDrive"]])
df["PavedDrivesq"] = df["PavedDrive"] ** 2

In [23]:
oe = OrdinalEncoder(categories=[['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']])
df.loc[:, "Fence"] = oe.fit_transform(df[["Fence"]])
df["Fencesq"] = df["Fence"] ** 2

In [24]:
cols_cat = df.select_dtypes(object).columns.to_list()
cols_num = df.select_dtypes(np.number).columns.to_list()

# Impute numerical columns

In [25]:
cols_num_na = df[cols_num].isnull().sum()[df[cols_num].isnull().sum() > 0]
cols_num_na

LotFrontage     486
MasVnrArea       23
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
BsmtFullBath      2
BsmtHalfBath      2
GarageYrBlt     159
GarageCars        1
GarageArea        1
dtype: int64

In [26]:
zero_filled_cols = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath", "GarageCars", "GarageArea"]
for col in zero_filled_cols:
    df[col].fillna(0, inplace=True)

In [27]:
cols_num_na = df[cols_num].isnull().sum()[df[cols_num].isnull().sum() > 0]
cols_num_na

LotFrontage    486
MasVnrArea      23
GarageYrBlt    159
dtype: int64

In [28]:
def catboost_imputer(df, cols_num_na):
    
    # save columns to impute and drop them from df
    cols_to_impute = df[cols_num_na]
    df.drop(cols_num_na, axis=1, inplace=True)
    
    for col in cols_num_na:
        # define X and y
        X, y = df.copy(), cols_to_impute[col]

        # get train and test sets
        train_indexes, test_indexes = ~y.isnull(), y.isnull()
        X_train, X_test = X.loc[train_indexes, :], X.loc[test_indexes, :]
        y_train, y_test = y.loc[train_indexes], y.loc[test_indexes]
        
        model = CatBoostRegressor(max_depth=8, random_seed=10,
                                  subsample=0.65, n_estimators=1000,
                                  cat_features=cols_cat)
        model.fit(X_train, y_train)
        cols_to_impute.loc[test_indexes, col] = model.predict(X_test)
        df[col] = cols_to_impute[col]
    return df

In [29]:
df = catboost_imputer(df, cols_num_na.index)
df.isnull().sum().sum()

Learning rate set to 0.047118
0:	learn: 22.8559408	total: 66.2ms	remaining: 1m 6s
1:	learn: 22.3971548	total: 84.4ms	remaining: 42.1s
2:	learn: 21.9638828	total: 100ms	remaining: 33.3s
3:	learn: 21.5490582	total: 112ms	remaining: 27.8s
4:	learn: 21.0911381	total: 126ms	remaining: 25.1s
5:	learn: 20.7153817	total: 140ms	remaining: 23.2s
6:	learn: 20.2903763	total: 151ms	remaining: 21.4s
7:	learn: 19.9186439	total: 167ms	remaining: 20.7s
8:	learn: 19.5674084	total: 178ms	remaining: 19.6s
9:	learn: 19.2232179	total: 194ms	remaining: 19.2s
10:	learn: 18.9409731	total: 208ms	remaining: 18.7s
11:	learn: 18.6465703	total: 226ms	remaining: 18.6s
12:	learn: 18.4092670	total: 231ms	remaining: 17.5s
13:	learn: 18.1629147	total: 244ms	remaining: 17.2s
14:	learn: 17.8953425	total: 256ms	remaining: 16.8s
15:	learn: 17.6851373	total: 270ms	remaining: 16.6s
16:	learn: 17.4374575	total: 282ms	remaining: 16.3s
17:	learn: 17.2409281	total: 298ms	remaining: 16.3s
18:	learn: 17.0445667	total: 310ms	remaini

158:	learn: 10.7819900	total: 2.31s	remaining: 12.2s
159:	learn: 10.7570933	total: 2.33s	remaining: 12.2s
160:	learn: 10.7351989	total: 2.34s	remaining: 12.2s
161:	learn: 10.7125567	total: 2.36s	remaining: 12.2s
162:	learn: 10.7106330	total: 2.38s	remaining: 12.2s
163:	learn: 10.7002524	total: 2.39s	remaining: 12.2s
164:	learn: 10.6872709	total: 2.41s	remaining: 12.2s
165:	learn: 10.6485305	total: 2.42s	remaining: 12.2s
166:	learn: 10.6217196	total: 2.44s	remaining: 12.2s
167:	learn: 10.6035469	total: 2.45s	remaining: 12.1s
168:	learn: 10.5668881	total: 2.46s	remaining: 12.1s
169:	learn: 10.5579734	total: 2.48s	remaining: 12.1s
170:	learn: 10.5287511	total: 2.49s	remaining: 12.1s
171:	learn: 10.5114655	total: 2.5s	remaining: 12.1s
172:	learn: 10.5069250	total: 2.52s	remaining: 12.1s
173:	learn: 10.4759288	total: 2.54s	remaining: 12.1s
174:	learn: 10.4623418	total: 2.55s	remaining: 12s
175:	learn: 10.4382841	total: 2.57s	remaining: 12s
176:	learn: 10.4367621	total: 2.57s	remaining: 12s


326:	learn: 8.6775213	total: 4.82s	remaining: 9.92s
327:	learn: 8.6626313	total: 4.84s	remaining: 9.91s
328:	learn: 8.6613156	total: 4.86s	remaining: 9.9s
329:	learn: 8.6547620	total: 4.87s	remaining: 9.89s
330:	learn: 8.6481842	total: 4.89s	remaining: 9.88s
331:	learn: 8.6388078	total: 4.91s	remaining: 9.87s
332:	learn: 8.6275171	total: 4.92s	remaining: 9.86s
333:	learn: 8.6177723	total: 4.94s	remaining: 9.85s
334:	learn: 8.6078695	total: 4.95s	remaining: 9.83s
335:	learn: 8.5939151	total: 4.97s	remaining: 9.82s
336:	learn: 8.5804470	total: 4.98s	remaining: 9.8s
337:	learn: 8.5795076	total: 5s	remaining: 9.79s
338:	learn: 8.5720860	total: 5.01s	remaining: 9.77s
339:	learn: 8.5570511	total: 5.03s	remaining: 9.77s
340:	learn: 8.5551819	total: 5.04s	remaining: 9.75s
341:	learn: 8.5447953	total: 5.06s	remaining: 9.73s
342:	learn: 8.5443489	total: 5.07s	remaining: 9.72s
343:	learn: 8.5432941	total: 5.09s	remaining: 9.71s
344:	learn: 8.5397365	total: 5.1s	remaining: 9.69s
345:	learn: 8.5391

488:	learn: 7.4724767	total: 7.32s	remaining: 7.65s
489:	learn: 7.4547807	total: 7.33s	remaining: 7.63s
490:	learn: 7.4494137	total: 7.35s	remaining: 7.62s
491:	learn: 7.4366966	total: 7.36s	remaining: 7.6s
492:	learn: 7.4320105	total: 7.38s	remaining: 7.58s
493:	learn: 7.4303150	total: 7.39s	remaining: 7.57s
494:	learn: 7.4276184	total: 7.41s	remaining: 7.56s
495:	learn: 7.4241553	total: 7.42s	remaining: 7.54s
496:	learn: 7.4213741	total: 7.44s	remaining: 7.53s
497:	learn: 7.4115983	total: 7.45s	remaining: 7.51s
498:	learn: 7.4109934	total: 7.47s	remaining: 7.5s
499:	learn: 7.3989521	total: 7.48s	remaining: 7.48s
500:	learn: 7.3979975	total: 7.49s	remaining: 7.46s
501:	learn: 7.3894945	total: 7.5s	remaining: 7.44s
502:	learn: 7.3830584	total: 7.52s	remaining: 7.43s
503:	learn: 7.3796488	total: 7.54s	remaining: 7.42s
504:	learn: 7.3788617	total: 7.55s	remaining: 7.41s
505:	learn: 7.3688350	total: 7.57s	remaining: 7.39s
506:	learn: 7.3677290	total: 7.59s	remaining: 7.38s
507:	learn: 7.3

655:	learn: 6.4234120	total: 9.83s	remaining: 5.16s
656:	learn: 6.4193988	total: 9.85s	remaining: 5.14s
657:	learn: 6.4188577	total: 9.86s	remaining: 5.13s
658:	learn: 6.4129344	total: 9.88s	remaining: 5.11s
659:	learn: 6.4022448	total: 9.89s	remaining: 5.09s
660:	learn: 6.3980149	total: 9.91s	remaining: 5.08s
661:	learn: 6.3938606	total: 9.92s	remaining: 5.07s
662:	learn: 6.3853721	total: 9.94s	remaining: 5.05s
663:	learn: 6.3829856	total: 9.95s	remaining: 5.04s
664:	learn: 6.3768269	total: 9.97s	remaining: 5.02s
665:	learn: 6.3747678	total: 9.98s	remaining: 5s
666:	learn: 6.3744637	total: 10s	remaining: 4.99s
667:	learn: 6.3690572	total: 10s	remaining: 4.97s
668:	learn: 6.3667679	total: 10s	remaining: 4.96s
669:	learn: 6.3627755	total: 10s	remaining: 4.95s
670:	learn: 6.3553651	total: 10.1s	remaining: 4.93s
671:	learn: 6.3551924	total: 10.1s	remaining: 4.92s
672:	learn: 6.3548583	total: 10.1s	remaining: 4.9s
673:	learn: 6.3494231	total: 10.1s	remaining: 4.89s
674:	learn: 6.3313267	to

821:	learn: 5.6816992	total: 12.3s	remaining: 2.67s
822:	learn: 5.6710848	total: 12.3s	remaining: 2.65s
823:	learn: 5.6636133	total: 12.4s	remaining: 2.64s
824:	learn: 5.6635131	total: 12.4s	remaining: 2.63s
825:	learn: 5.6625929	total: 12.4s	remaining: 2.61s
826:	learn: 5.6565815	total: 12.4s	remaining: 2.6s
827:	learn: 5.6528880	total: 12.4s	remaining: 2.58s
828:	learn: 5.6492512	total: 12.5s	remaining: 2.57s
829:	learn: 5.6455311	total: 12.5s	remaining: 2.55s
830:	learn: 5.6415751	total: 12.5s	remaining: 2.54s
831:	learn: 5.6373726	total: 12.5s	remaining: 2.53s
832:	learn: 5.6371436	total: 12.5s	remaining: 2.51s
833:	learn: 5.6269527	total: 12.5s	remaining: 2.5s
834:	learn: 5.6177820	total: 12.6s	remaining: 2.48s
835:	learn: 5.6160438	total: 12.6s	remaining: 2.47s
836:	learn: 5.6105268	total: 12.6s	remaining: 2.45s
837:	learn: 5.5978330	total: 12.6s	remaining: 2.44s
838:	learn: 5.5896816	total: 12.6s	remaining: 2.42s
839:	learn: 5.5855332	total: 12.7s	remaining: 2.41s
840:	learn: 5.

980:	learn: 5.1333656	total: 14.8s	remaining: 287ms
981:	learn: 5.1313869	total: 14.9s	remaining: 272ms
982:	learn: 5.1310980	total: 14.9s	remaining: 257ms
983:	learn: 5.1285745	total: 14.9s	remaining: 242ms
984:	learn: 5.1228190	total: 14.9s	remaining: 227ms
985:	learn: 5.1163272	total: 14.9s	remaining: 212ms
986:	learn: 5.1162230	total: 14.9s	remaining: 197ms
987:	learn: 5.1128254	total: 15s	remaining: 182ms
988:	learn: 5.1108771	total: 15s	remaining: 167ms
989:	learn: 5.1107067	total: 15s	remaining: 151ms
990:	learn: 5.1090333	total: 15s	remaining: 136ms
991:	learn: 5.1088759	total: 15s	remaining: 121ms
992:	learn: 5.1033303	total: 15s	remaining: 106ms
993:	learn: 5.1020447	total: 15.1s	remaining: 90.9ms
994:	learn: 5.1011595	total: 15.1s	remaining: 75.8ms
995:	learn: 5.0995208	total: 15.1s	remaining: 60.6ms
996:	learn: 5.0986652	total: 15.1s	remaining: 45.5ms
997:	learn: 5.0928614	total: 15.1s	remaining: 30.3ms
998:	learn: 5.0885991	total: 15.1s	remaining: 15.2ms
999:	learn: 5.0830

149:	learn: 81.7734484	total: 2.1s	remaining: 11.9s
150:	learn: 81.6776456	total: 2.12s	remaining: 11.9s
151:	learn: 81.5408214	total: 2.13s	remaining: 11.9s
152:	learn: 81.3400201	total: 2.15s	remaining: 11.9s
153:	learn: 81.1764663	total: 2.16s	remaining: 11.9s
154:	learn: 81.1016008	total: 2.18s	remaining: 11.9s
155:	learn: 81.0685503	total: 2.2s	remaining: 11.9s
156:	learn: 80.9567263	total: 2.21s	remaining: 11.9s
157:	learn: 80.8711257	total: 2.23s	remaining: 11.9s
158:	learn: 80.8184317	total: 2.24s	remaining: 11.9s
159:	learn: 80.7795823	total: 2.26s	remaining: 11.8s
160:	learn: 80.5674787	total: 2.27s	remaining: 11.8s
161:	learn: 80.5258650	total: 2.28s	remaining: 11.8s
162:	learn: 80.3677336	total: 2.3s	remaining: 11.8s
163:	learn: 80.2927524	total: 2.32s	remaining: 11.8s
164:	learn: 80.2833059	total: 2.33s	remaining: 11.8s
165:	learn: 80.2623442	total: 2.34s	remaining: 11.8s
166:	learn: 80.2354281	total: 2.36s	remaining: 11.8s
167:	learn: 80.1918029	total: 2.37s	remaining: 11

317:	learn: 68.4594066	total: 4.62s	remaining: 9.9s
318:	learn: 68.3430879	total: 4.63s	remaining: 9.89s
319:	learn: 68.2747822	total: 4.65s	remaining: 9.88s
320:	learn: 68.0476668	total: 4.66s	remaining: 9.86s
321:	learn: 68.0229808	total: 4.68s	remaining: 9.86s
322:	learn: 67.8665572	total: 4.7s	remaining: 9.84s
323:	learn: 67.8081819	total: 4.72s	remaining: 9.84s
324:	learn: 67.6009872	total: 4.73s	remaining: 9.82s
325:	learn: 67.5617403	total: 4.75s	remaining: 9.81s
326:	learn: 67.5477355	total: 4.76s	remaining: 9.8s
327:	learn: 67.4164520	total: 4.78s	remaining: 9.79s
328:	learn: 67.3974480	total: 4.79s	remaining: 9.77s
329:	learn: 67.3871775	total: 4.81s	remaining: 9.76s
330:	learn: 67.2327382	total: 4.82s	remaining: 9.75s
331:	learn: 67.1674445	total: 4.84s	remaining: 9.74s
332:	learn: 67.0783570	total: 4.85s	remaining: 9.72s
333:	learn: 67.0384137	total: 4.87s	remaining: 9.71s
334:	learn: 67.0347039	total: 4.88s	remaining: 9.7s
335:	learn: 66.9090413	total: 4.9s	remaining: 9.69

478:	learn: 59.6441550	total: 7.16s	remaining: 7.79s
479:	learn: 59.4804050	total: 7.17s	remaining: 7.77s
480:	learn: 59.4235859	total: 7.2s	remaining: 7.76s
481:	learn: 59.2788615	total: 7.21s	remaining: 7.75s
482:	learn: 59.1871677	total: 7.22s	remaining: 7.73s
483:	learn: 59.1601369	total: 7.24s	remaining: 7.72s
484:	learn: 59.1592962	total: 7.26s	remaining: 7.71s
485:	learn: 59.0486688	total: 7.27s	remaining: 7.69s
486:	learn: 59.0459593	total: 7.29s	remaining: 7.67s
487:	learn: 58.9874110	total: 7.3s	remaining: 7.66s
488:	learn: 58.9805971	total: 7.32s	remaining: 7.65s
489:	learn: 58.9299932	total: 7.33s	remaining: 7.63s
490:	learn: 58.8888016	total: 7.35s	remaining: 7.62s
491:	learn: 58.8361070	total: 7.36s	remaining: 7.6s
492:	learn: 58.7583053	total: 7.38s	remaining: 7.59s
493:	learn: 58.6203806	total: 7.39s	remaining: 7.57s
494:	learn: 58.5940444	total: 7.41s	remaining: 7.56s
495:	learn: 58.5513197	total: 7.42s	remaining: 7.54s
496:	learn: 58.5395823	total: 7.44s	remaining: 7.

643:	learn: 51.2534146	total: 9.69s	remaining: 5.36s
644:	learn: 51.1857800	total: 9.71s	remaining: 5.34s
645:	learn: 51.1668220	total: 9.72s	remaining: 5.33s
646:	learn: 51.1319789	total: 9.73s	remaining: 5.31s
647:	learn: 51.1148930	total: 9.75s	remaining: 5.3s
648:	learn: 51.1115073	total: 9.77s	remaining: 5.28s
649:	learn: 51.0932531	total: 9.79s	remaining: 5.27s
650:	learn: 51.0529087	total: 9.8s	remaining: 5.25s
651:	learn: 50.9899444	total: 9.82s	remaining: 5.24s
652:	learn: 50.9726362	total: 9.83s	remaining: 5.22s
653:	learn: 50.9670119	total: 9.85s	remaining: 5.21s
654:	learn: 50.9650805	total: 9.86s	remaining: 5.19s
655:	learn: 50.9522633	total: 9.88s	remaining: 5.18s
656:	learn: 50.9297549	total: 9.89s	remaining: 5.16s
657:	learn: 50.9172903	total: 9.91s	remaining: 5.15s
658:	learn: 50.7062090	total: 9.92s	remaining: 5.13s
659:	learn: 50.6809023	total: 9.94s	remaining: 5.12s
660:	learn: 50.6446901	total: 9.96s	remaining: 5.11s
661:	learn: 50.6360984	total: 9.97s	remaining: 5

807:	learn: 43.9023789	total: 12.2s	remaining: 2.91s
808:	learn: 43.8884242	total: 12.2s	remaining: 2.89s
809:	learn: 43.8785067	total: 12.3s	remaining: 2.88s
810:	learn: 43.7804780	total: 12.3s	remaining: 2.86s
811:	learn: 43.6604777	total: 12.3s	remaining: 2.85s
812:	learn: 43.5930393	total: 12.3s	remaining: 2.83s
813:	learn: 43.5859010	total: 12.3s	remaining: 2.82s
814:	learn: 43.5717758	total: 12.3s	remaining: 2.8s
815:	learn: 43.4642735	total: 12.4s	remaining: 2.79s
816:	learn: 43.3920222	total: 12.4s	remaining: 2.77s
817:	learn: 43.2553221	total: 12.4s	remaining: 2.76s
818:	learn: 43.1595412	total: 12.4s	remaining: 2.74s
819:	learn: 43.1570263	total: 12.4s	remaining: 2.73s
820:	learn: 43.1465895	total: 12.4s	remaining: 2.71s
821:	learn: 43.1431631	total: 12.5s	remaining: 2.7s
822:	learn: 43.1414973	total: 12.5s	remaining: 2.68s
823:	learn: 43.0734055	total: 12.5s	remaining: 2.67s
824:	learn: 43.0702307	total: 12.5s	remaining: 2.65s
825:	learn: 43.0097517	total: 12.5s	remaining: 2

966:	learn: 38.0706184	total: 14.7s	remaining: 503ms
967:	learn: 38.0469432	total: 14.8s	remaining: 488ms
968:	learn: 37.9626523	total: 14.8s	remaining: 473ms
969:	learn: 37.9566658	total: 14.8s	remaining: 458ms
970:	learn: 37.9549351	total: 14.8s	remaining: 442ms
971:	learn: 37.9506605	total: 14.8s	remaining: 427ms
972:	learn: 37.9423062	total: 14.8s	remaining: 412ms
973:	learn: 37.8487752	total: 14.9s	remaining: 397ms
974:	learn: 37.8396384	total: 14.9s	remaining: 381ms
975:	learn: 37.8313480	total: 14.9s	remaining: 366ms
976:	learn: 37.8198321	total: 14.9s	remaining: 351ms
977:	learn: 37.8189872	total: 14.9s	remaining: 336ms
978:	learn: 37.7746605	total: 14.9s	remaining: 320ms
979:	learn: 37.7471284	total: 15s	remaining: 305ms
980:	learn: 37.7440517	total: 15s	remaining: 290ms
981:	learn: 37.6915636	total: 15s	remaining: 275ms
982:	learn: 37.6877352	total: 15s	remaining: 259ms
983:	learn: 37.6853312	total: 15s	remaining: 244ms
984:	learn: 37.6704924	total: 15s	remaining: 229ms
985:	

137:	learn: 8.1886941	total: 2.1s	remaining: 13.1s
138:	learn: 8.1806850	total: 2.12s	remaining: 13.1s
139:	learn: 8.1623180	total: 2.14s	remaining: 13.1s
140:	learn: 8.1535463	total: 2.16s	remaining: 13.1s
141:	learn: 8.1491183	total: 2.17s	remaining: 13.1s
142:	learn: 8.1331526	total: 2.19s	remaining: 13.1s
143:	learn: 8.1259452	total: 2.21s	remaining: 13.1s
144:	learn: 8.1125066	total: 2.22s	remaining: 13.1s
145:	learn: 8.0999444	total: 2.24s	remaining: 13.1s
146:	learn: 8.0798910	total: 2.25s	remaining: 13.1s
147:	learn: 8.0699539	total: 2.27s	remaining: 13.1s
148:	learn: 8.0553334	total: 2.29s	remaining: 13.1s
149:	learn: 8.0332713	total: 2.3s	remaining: 13s
150:	learn: 8.0163431	total: 2.32s	remaining: 13s
151:	learn: 7.9995767	total: 2.33s	remaining: 13s
152:	learn: 7.9806062	total: 2.35s	remaining: 13s
153:	learn: 7.9678449	total: 2.37s	remaining: 13s
154:	learn: 7.9552898	total: 2.38s	remaining: 13s
155:	learn: 7.9326906	total: 2.4s	remaining: 13s
156:	learn: 7.9153550	total: 

302:	learn: 6.7604684	total: 4.85s	remaining: 11.1s
303:	learn: 6.7456234	total: 4.87s	remaining: 11.1s
304:	learn: 6.7424771	total: 4.88s	remaining: 11.1s
305:	learn: 6.7287897	total: 4.9s	remaining: 11.1s
306:	learn: 6.7257507	total: 4.92s	remaining: 11.1s
307:	learn: 6.7218990	total: 4.93s	remaining: 11.1s
308:	learn: 6.7190769	total: 4.95s	remaining: 11.1s
309:	learn: 6.7098276	total: 4.97s	remaining: 11.1s
310:	learn: 6.7045899	total: 4.99s	remaining: 11s
311:	learn: 6.6988185	total: 5s	remaining: 11s
312:	learn: 6.6942525	total: 5.02s	remaining: 11s
313:	learn: 6.6900009	total: 5.04s	remaining: 11s
314:	learn: 6.6795798	total: 5.05s	remaining: 11s
315:	learn: 6.6660914	total: 5.07s	remaining: 11s
316:	learn: 6.6516345	total: 5.08s	remaining: 11s
317:	learn: 6.6465701	total: 5.1s	remaining: 10.9s
318:	learn: 6.6430089	total: 5.12s	remaining: 10.9s
319:	learn: 6.6293074	total: 5.14s	remaining: 10.9s
320:	learn: 6.6209617	total: 5.15s	remaining: 10.9s
321:	learn: 6.6178523	total: 5.

468:	learn: 5.7035392	total: 7.78s	remaining: 8.8s
469:	learn: 5.6902474	total: 7.8s	remaining: 8.8s
470:	learn: 5.6896802	total: 7.82s	remaining: 8.79s
471:	learn: 5.6843416	total: 7.84s	remaining: 8.77s
472:	learn: 5.6775531	total: 7.86s	remaining: 8.75s
473:	learn: 5.6766719	total: 7.87s	remaining: 8.74s
474:	learn: 5.6678128	total: 7.89s	remaining: 8.72s
475:	learn: 5.6598489	total: 7.91s	remaining: 8.7s
476:	learn: 5.6546262	total: 7.92s	remaining: 8.68s
477:	learn: 5.6452869	total: 7.94s	remaining: 8.67s
478:	learn: 5.6327672	total: 7.95s	remaining: 8.65s
479:	learn: 5.6161698	total: 7.97s	remaining: 8.63s
480:	learn: 5.6112314	total: 7.99s	remaining: 8.62s
481:	learn: 5.6012090	total: 8s	remaining: 8.6s
482:	learn: 5.6007269	total: 8.02s	remaining: 8.59s
483:	learn: 5.5991700	total: 8.04s	remaining: 8.57s
484:	learn: 5.5897476	total: 8.06s	remaining: 8.56s
485:	learn: 5.5864960	total: 8.08s	remaining: 8.54s
486:	learn: 5.5728033	total: 8.1s	remaining: 8.53s
487:	learn: 5.5668507

630:	learn: 4.7179446	total: 10.5s	remaining: 6.14s
631:	learn: 4.7150282	total: 10.5s	remaining: 6.13s
632:	learn: 4.7083235	total: 10.5s	remaining: 6.11s
633:	learn: 4.7016120	total: 10.6s	remaining: 6.1s
634:	learn: 4.6908728	total: 10.6s	remaining: 6.08s
635:	learn: 4.6900010	total: 10.6s	remaining: 6.06s
636:	learn: 4.6730388	total: 10.6s	remaining: 6.04s
637:	learn: 4.6639347	total: 10.6s	remaining: 6.02s
638:	learn: 4.6565054	total: 10.6s	remaining: 6.01s
639:	learn: 4.6462539	total: 10.6s	remaining: 5.99s
640:	learn: 4.6449793	total: 10.7s	remaining: 5.97s
641:	learn: 4.6425022	total: 10.7s	remaining: 5.96s
642:	learn: 4.6351066	total: 10.7s	remaining: 5.94s
643:	learn: 4.6334040	total: 10.7s	remaining: 5.92s
644:	learn: 4.6245454	total: 10.7s	remaining: 5.91s
645:	learn: 4.6179643	total: 10.8s	remaining: 5.89s
646:	learn: 4.6074952	total: 10.8s	remaining: 5.87s
647:	learn: 4.6060977	total: 10.8s	remaining: 5.86s
648:	learn: 4.6002828	total: 10.8s	remaining: 5.84s
649:	learn: 4

794:	learn: 3.9245251	total: 13.2s	remaining: 3.41s
795:	learn: 3.9189291	total: 13.3s	remaining: 3.4s
796:	learn: 3.9167790	total: 13.3s	remaining: 3.38s
797:	learn: 3.9162900	total: 13.3s	remaining: 3.36s
798:	learn: 3.9120407	total: 13.3s	remaining: 3.35s
799:	learn: 3.9069712	total: 13.3s	remaining: 3.33s
800:	learn: 3.9042575	total: 13.3s	remaining: 3.31s
801:	learn: 3.9028253	total: 13.3s	remaining: 3.29s
802:	learn: 3.8963970	total: 13.4s	remaining: 3.28s
803:	learn: 3.8928370	total: 13.4s	remaining: 3.26s
804:	learn: 3.8900300	total: 13.4s	remaining: 3.25s
805:	learn: 3.8895320	total: 13.4s	remaining: 3.23s
806:	learn: 3.8886194	total: 13.4s	remaining: 3.21s
807:	learn: 3.8799657	total: 13.5s	remaining: 3.2s
808:	learn: 3.8789738	total: 13.5s	remaining: 3.18s
809:	learn: 3.8763838	total: 13.5s	remaining: 3.16s
810:	learn: 3.8711149	total: 13.5s	remaining: 3.15s
811:	learn: 3.8672036	total: 13.5s	remaining: 3.13s
812:	learn: 3.8633908	total: 13.5s	remaining: 3.11s
813:	learn: 3.

956:	learn: 3.3664416	total: 15.9s	remaining: 716ms
957:	learn: 3.3637356	total: 16s	remaining: 700ms
958:	learn: 3.3603404	total: 16s	remaining: 683ms
959:	learn: 3.3567089	total: 16s	remaining: 666ms
960:	learn: 3.3565189	total: 16s	remaining: 650ms
961:	learn: 3.3548890	total: 16s	remaining: 633ms
962:	learn: 3.3501539	total: 16s	remaining: 616ms
963:	learn: 3.3498167	total: 16.1s	remaining: 600ms
964:	learn: 3.3497055	total: 16.1s	remaining: 583ms
965:	learn: 3.3466313	total: 16.1s	remaining: 566ms
966:	learn: 3.3395978	total: 16.1s	remaining: 549ms
967:	learn: 3.3394687	total: 16.1s	remaining: 533ms
968:	learn: 3.3357728	total: 16.1s	remaining: 516ms
969:	learn: 3.3343971	total: 16.1s	remaining: 499ms
970:	learn: 3.3338334	total: 16.2s	remaining: 483ms
971:	learn: 3.3335075	total: 16.2s	remaining: 466ms
972:	learn: 3.3317741	total: 16.2s	remaining: 450ms
973:	learn: 3.3308296	total: 16.2s	remaining: 433ms
974:	learn: 3.3268742	total: 16.2s	remaining: 416ms
975:	learn: 3.3239274	to

0

# Feature selection

In [30]:
def get_del_cols(df):
    # return the columns where no more than one value in the categorical features
    # exists in the test data - those features can be ignored
    cols_to_drop = []
    for col in df.select_dtypes(object).columns:
        col_vals = df[col].unique()
        n_vals = len(col_vals)
        n_irrelevant = 0
        for val in col_vals:
            if val not in df[col][m_train:].values:
                n_irrelevant += 1
        if n_irrelevant >= n_vals - 1:
            cols_to_drop.append(col)
    return cols_to_drop

In [31]:
cols_to_drop = get_del_cols(df)
df.drop(cols_to_drop, axis=1, inplace=True)
for col_to_drop in cols_to_drop:
    cols_cat.remove(col_to_drop)
    print("Dropped ", col_to_drop)

Dropped  Utilities


<!-- # Change ordinal columns to numeric, and encode accordingly -->

# Feature Engineering

## Features from the Internet - prefex FE0_

In [32]:
df["FE0_HighQualSF"] = df["GrLivArea"]+df["1stFlrSF"] + df["2ndFlrSF"]+0.5*df["GarageArea"]+0.5*df["TotalBsmtSF"]+1*df["MasVnrArea"]

df["FE0_SqFtPerRoom"] = df["GrLivArea"] / (df["TotRmsAbvGrd"] +
                                           df["FullBath"] +
                                           df["HalfBath"] +
                                           df["KitchenAbvGr"])

## My features - prefex FE1_

In [33]:
n_stories_dict = {
    "1Story": 1.0,
    "1.5Fin": 1.5,
    "1.5Unf": 1.5,
    "2Story": 2.0,
    "2.5Fin": 2.5,
    "2.5Unf": 2.5,
    "SFoyer": 2.0,
    "SLvl": 2.0,
}

df["FE1_n_stories"] = df["HouseStyle"].replace(n_stories_dict)
df["FE1_n_stories"].value_counts()

1.0    1471
2.0    1083
1.5     333
2.5      32
Name: FE1_n_stories, dtype: int64

In [34]:
df["FE1_age_sold"] = df["YrSold"] - df["YearBuilt"]
df["FE1_age_sold_Remod"] = df["YrSold"] - df["YearRemodAdd"]
df["FE1_GarageYrSold"] = df["YrSold"] - df["GarageYrBlt"]

In [35]:
df["CentralAir"] = df["CentralAir"].replace({"Y": 1, "N": 0})
df["CentralAir"].value_counts()

1    2723
0     196
Name: CentralAir, dtype: int64

In [37]:
df['FE1_total_area_hq'] = df["FE0_HighQualSF"] - df['LowQualFinSF']

df['FE1_Total_Home_Quality'] = df['OverallQual'] * df['OverallCond']
df['FE1_Total_bsmt_Quality'] = df['BsmtQual'] * df['BsmtCond']
df['FE1_Total_garage_Quality'] = df['GarageQual'] * df['GarageCond']

df["FE1_weighted_home_area"] = df["FE1_total_area_hq"] * df['FE1_Total_Home_Quality'] 
df["FE1_weighted_bsmt_area"] = df["TotalBsmtSF"] * df['FE1_Total_bsmt_Quality'] 
df["FE1_weighted_garage_area"] = df["GarageArea"] * df['FE1_Total_garage_Quality']
df["FE1_weighted_n_stories"] = df["FE1_n_stories"] * df['OverallQual']


# Split test and train

In [38]:
X_train, X_test = df[:m_train], df[m_train:]
y_train, _ = target[:m_train], target[m_train:]
n = X_train.shape[1]
X_train.shape, y_train.shape

((1460, 110), (1460,))

In [39]:
# df_ohe = pd.get_dummies(df)
# df_ohe[target_col] = target
# _ = setup(data=df_ohe[:m_train], target='SalePrice')
# best = compare_models()

# Training - CatBoost Grid Search 

In [None]:
# cat_features = df.select_dtypes(object).columns.to_list()
# model = CatBoostRegressor(random_state=10, cat_features=cat_features)
# kwargs = {
#     "n_estimators": [50 * n],
#     "max_depth": [8],
#     "subsample": [0.65],
#     "reg_lambda": [0.1], 
# }
# clf = GridSearchCV(model, kwargs, verbose=1, n_jobs=2)
# clf.fit(X_train, y_train)
# print(clf.best_score_)
# print(clf.best_params_)

# CatBoost Params:
    {'max_depth': 8, 'n_estimators': 5500, 'reg_lambda': 0.1, 'subsample': 0.65}

In [None]:
# clf.best_score_

# Training - CatBoost with Best Parameters

In [None]:
model = CatBoostRegressor(max_depth=8, random_seed=10,
                          subsample=0.65, n_estimators=5500,
                          cat_features=cols_cat)
model.fit(X_train, y_train)

In [None]:
print(model.best_score_)
fe = model.get_feature_importance(prettified=True)
fe.head(20)

In [None]:
sub_name = "data/submission_15.csv"
pd.DataFrame(model.predict(X_test), 
            index=range(1461, len(df)+1), 
            columns=['SalePrice']).reset_index().\
            rename(columns={'index': 'id'}).to_csv(sub_name, index=False)

# Score: 0.21233