In [1]:
from sklearn.metrics import mean_squared_error
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, OrdinalEncoder
from sklearn.model_selection import KFold, cross_val_score
from sklearn.impute import KNNImputer
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import mean_squared_error
from pycaret.regression import setup, compare_models
import xgboost as xgb
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import numpy as np
import pandas as pd
import multiprocessing

In [2]:
from sklearn.ensemble import GradientBoostingRegressor, RandomForestRegressor, ExtraTreesRegressor
from sklearn.linear_model import BayesianRidge, OrthogonalMatchingPursuit, LinearRegression, Lasso, Ridge
from xgboost import XGBRegressor
from lightgbm import LGBMRegressor

# Read train and test sets

In [3]:
df_train = pd.read_csv("data/train.csv")
df_test = pd.read_csv("data/test.csv")

# Store both dfs lengths

In [4]:
m_train = df_train.shape[0]
m_test = df_test.shape[0]
m_train, m_test

(1460, 1459)

# Concat the dfs

In [5]:
df = pd.concat([df_train, df_test])
assert df.shape[0] == m_train + m_test

In [6]:
df.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500.0
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500.0
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500.0
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000.0
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000.0


# Save target

In [7]:
target_col, target = "SalePrice", df["SalePrice"]

#  Drop Id and target columns

In [8]:
cols_to_drop = ["Id", target_col]
df.drop(cols_to_drop, axis=1, inplace=True)

# View and fix columns data-types

In [9]:
df.select_dtypes(object).columns

Index(['MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour', 'Utilities',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [10]:
df.select_dtypes(np.number).columns

Index(['MSSubClass', 'LotFrontage', 'LotArea', 'OverallQual', 'OverallCond',
       'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'BsmtFinSF1', 'BsmtFinSF2',
       'BsmtUnfSF', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'TotRmsAbvGrd', 'Fireplaces',
       'GarageYrBlt', 'GarageCars', 'GarageArea', 'WoodDeckSF', 'OpenPorchSF',
       'EnclosedPorch', '3SsnPorch', 'ScreenPorch', 'PoolArea', 'MiscVal',
       'MoSold', 'YrSold'],
      dtype='object')

In [11]:
num_to_obj_cols = ['MSSubClass', 'MoSold']
df[num_to_obj_cols] = df[num_to_obj_cols].astype(object)

In [12]:
cols_cat = df.select_dtypes(object).columns
cols_num = df.select_dtypes(np.number).columns

# Impute categorical columns

In [13]:
cols_cat_na = df[cols_cat].isnull().sum()[df[cols_cat].isnull().sum() > 0]
cols_cat_na

MSZoning           4
Alley           2721
Utilities          2
Exterior1st        1
Exterior2nd        1
MasVnrType        24
BsmtQual          81
BsmtCond          82
BsmtExposure      82
BsmtFinType1      79
BsmtFinType2      80
Electrical         1
KitchenQual        1
Functional         2
FireplaceQu     1420
GarageType       157
GarageFinish     159
GarageQual       159
GarageCond       159
PoolQC          2909
Fence           2348
MiscFeature     2814
SaleType           1
dtype: int64

In [14]:
mode_filled_cols = ["MSZoning", "Utilities", "Exterior1st", "Exterior2nd", "MasVnrType", "Electrical", "KitchenQual", "Functional", "SaleType"]
for col in mode_filled_cols:
    df[col].fillna(df[col].mode()[0], inplace=True)

none_filled_cols = ["Alley", "BsmtQual", "BsmtCond", "BsmtExposure", "BsmtFinType1", "BsmtFinType2", "FireplaceQu", "GarageType", "GarageFinish", "GarageQual", "GarageCond", "PoolQC", "Fence", "MiscFeature"]
for col in none_filled_cols:
    df[col].fillna("None", inplace=True)
    
df[cols_cat].isnull().sum().sum()

0

# Impute numerical columns

In [15]:
cols_num_na = df[cols_num].isnull().sum()[df[cols_num].isnull().sum() > 0]
cols_num_na

LotFrontage     486
MasVnrArea       23
BsmtFinSF1        1
BsmtFinSF2        1
BsmtUnfSF         1
TotalBsmtSF       1
BsmtFullBath      2
BsmtHalfBath      2
GarageYrBlt     159
GarageCars        1
GarageArea        1
dtype: int64

In [16]:
zero_filled_cols = ["BsmtFinSF1", "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "BsmtFullBath", "BsmtHalfBath"]
for col in zero_filled_cols:
    df[col].fillna(0, inplace=True)

In [17]:
cols_num_na = df[cols_num].isnull().sum()[df[cols_num].isnull().sum() > 0]
cols_num_na

LotFrontage    486
MasVnrArea      23
GarageYrBlt    159
GarageCars       1
GarageArea       1
dtype: int64

In [18]:
def gbr_imputer(df, cols_num_na):
    imputer = KNNImputer(n_neighbors=8)
    df_c = df.copy()
    df_d = pd.get_dummies(df_c)
    for col in tqdm(cols_num_na):
        df_i = df_d.copy()
        X, y = df_i.drop(col, axis=1), df[col]
        X[:] = imputer.fit_transform(X)
        
        train_indexes, test_indexes = ~df_i[col].isnull(), df_i[col].isnull()
        X_train, X_test = X.loc[train_indexes, :], X.loc[test_indexes, :]
        y_train, y_test = y.loc[train_indexes], y.loc[test_indexes]
        
        model = GradientBoostingRegressor(n_estimators=X.shape[1] * 2, max_depth=9)
        model.fit(X_train, y_train)
        df.loc[df[col].isnull(), col] = model.predict(X_test)
    return df

In [19]:
df = gbr_imputer(df, cols_num_na.index)
df.isnull().sum().sum()

100%|█████████████████████████████████████████████| 5/5 [02:16<00:00, 27.30s/it]


0

# Feature selection

In [20]:
def get_del_cols(df):
    # return the columns where no more than one value in the categorical features
    # exists in the test data - those features can be ignored
    cols_to_drop = []
    for col in df.select_dtypes(object).columns:
        col_vals = df[col].unique()
        n_vals = len(col_vals)
        n_irrelevant = 0
        for val in col_vals:
            if val not in df[col][m_train:].values:
                n_irrelevant += 1
        if n_irrelevant >= n_vals - 1:
            cols_to_drop.append(col)
    return cols_to_drop

In [21]:
cols_to_drop = get_del_cols(df)
df.drop(cols_to_drop, axis=1, inplace=True)
cols_to_drop

['Utilities']

# Change ordinal columns to numeric, and encode accordingly

In [22]:
df.select_dtypes(object).columns

Index(['MSSubClass', 'MSZoning', 'Street', 'Alley', 'LotShape', 'LandContour',
       'LotConfig', 'LandSlope', 'Neighborhood', 'Condition1', 'Condition2',
       'BldgType', 'HouseStyle', 'RoofStyle', 'RoofMatl', 'Exterior1st',
       'Exterior2nd', 'MasVnrType', 'ExterQual', 'ExterCond', 'Foundation',
       'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
       'Heating', 'HeatingQC', 'CentralAir', 'Electrical', 'KitchenQual',
       'Functional', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual',
       'GarageCond', 'PavedDrive', 'PoolQC', 'Fence', 'MiscFeature',
       'SaleType', 'SaleCondition'],
      dtype='object')

In [23]:
oe = OrdinalEncoder(categories=[['Reg', 'IR1', 'IR2', 'IR3']])
df.loc[:, "LotShape"] = oe.fit_transform(df[["LotShape"]])
df["LotShape"].value_counts()

0.0    1859
1.0     968
2.0      76
3.0      16
Name: LotShape, dtype: int64

In [24]:
oe = OrdinalEncoder(categories=[['Gtl', 'Mod', 'Sev']])
df.loc[:, "LandSlope"] = oe.fit_transform(df[["LandSlope"]])
df["LandSlope"].value_counts()

0.0    2778
1.0     125
2.0      16
Name: LandSlope, dtype: int64

In [25]:
qual_oe = OrdinalEncoder(categories=[['None', 'Po', 'Fa', 'TA', 'Gd', 'Ex']])
for col in ["ExterQual", "ExterCond", 'BsmtQual', 'BsmtCond', 'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']:
    df.loc[:, col] = qual_oe.fit_transform(df[[col]])

In [26]:
oe = OrdinalEncoder(categories=[['None', 'No', 'Mn', 'Av', 'Gd']])
df.loc[:, "BsmtExposure"] = oe.fit_transform(df[["BsmtExposure"]])
df["LandSlope"].value_counts()

0.0    2778
1.0     125
2.0      16
Name: LandSlope, dtype: int64

In [27]:
oe = OrdinalEncoder(categories=[['None', 'Unf', 'LwQ', 'Rec', 'BLQ', 'ALQ', 'GLQ']])
for col in ["BsmtFinType1", "BsmtFinType2"]:
    df.loc[:, col] = oe.fit_transform(df[[col]])

In [28]:
oe = OrdinalEncoder(categories=[['None', 'Unf', 'RFn', 'Fin']])
df.loc[:, "GarageFinish"] = oe.fit_transform(df[["GarageFinish"]])
df["GarageFinish"].value_counts()

1.0    1230
2.0     811
3.0     719
0.0     159
Name: GarageFinish, dtype: int64

In [29]:
oe = OrdinalEncoder(categories=[['N', 'P', 'Y']])
df.loc[:, "PavedDrive"] = oe.fit_transform(df[["PavedDrive"]])
df["PavedDrive"].value_counts()

2.0    2641
0.0     216
1.0      62
Name: PavedDrive, dtype: int64

In [30]:
oe = OrdinalEncoder(categories=[['None', 'MnWw', 'GdWo', 'MnPrv', 'GdPrv']])
df.loc[:, "Fence"] = oe.fit_transform(df[["Fence"]])
df["Fence"].value_counts()

0.0    2348
3.0     329
4.0     118
2.0     112
1.0      12
Name: Fence, dtype: int64

# Feature Engineering

In [31]:
n_stories_dict = {
    "1Story": 1.0,
    "1.5Fin": 1.5,
    "1.5Unf": 1.5,
    "2Story": 2.0,
    "2.5Fin": 2.5,
    "2.5Unf": 2.5,
    "SFoyer": 2.0,
    "SLvl": 2.0,
}

df["n_stories"] = df["HouseStyle"].replace(n_stories_dict)
df["n_stories"].value_counts()

1.0    1471
2.0    1083
1.5     333
2.5      32
Name: n_stories, dtype: int64

In [32]:
df["age_sold"] = df["YrSold"] - df["YearBuilt"]
df["age_sold_Remod"] = df["YrSold"] - df["YearRemodAdd"]
df["GarageYrSold"] = df["YrSold"] - df["GarageYrBlt"]

In [33]:
df["CentralAir"] = df["CentralAir"].replace({"Y": 1, "N": 0})
df["CentralAir"].value_counts()

1    2723
0     196
Name: CentralAir, dtype: int64

In [34]:
df.select_dtypes(np.number).columns

Index(['LotFrontage', 'LotArea', 'LotShape', 'LandSlope', 'OverallQual',
       'OverallCond', 'YearBuilt', 'YearRemodAdd', 'MasVnrArea', 'ExterQual',
       'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1',
       'BsmtFinSF1', 'BsmtFinType2', 'BsmtFinSF2', 'BsmtUnfSF', 'TotalBsmtSF',
       'HeatingQC', 'CentralAir', '1stFlrSF', '2ndFlrSF', 'LowQualFinSF',
       'GrLivArea', 'BsmtFullBath', 'BsmtHalfBath', 'FullBath', 'HalfBath',
       'BedroomAbvGr', 'KitchenAbvGr', 'KitchenQual', 'TotRmsAbvGrd',
       'Fireplaces', 'FireplaceQu', 'GarageYrBlt', 'GarageFinish',
       'GarageCars', 'GarageArea', 'GarageQual', 'GarageCond', 'PavedDrive',
       'WoodDeckSF', 'OpenPorchSF', 'EnclosedPorch', '3SsnPorch',
       'ScreenPorch', 'PoolArea', 'PoolQC', 'Fence', 'MiscVal', 'MoSold',
       'YrSold', 'n_stories', 'age_sold', 'age_sold_Remod', 'GarageYrSold'],
      dtype='object')

In [35]:
df["total_living_area"] = df['TotalBsmtSF'] + df['GrLivArea']
df['HighQualFinSF'] = df["total_living_area"] - df['LowQualFinSF']
df['Total_Home_Quality'] = df['OverallQual'] + df['OverallCond']
df["HighQualSF"] = df["GrLivArea"] + df["1stFlrSF"] + df["2ndFlrSF"] + df["GarageArea"]/2 + df["TotalBsmtSF"]/2 + df["MasVnrArea"]


df["SqFtPerRoom"] = df["GrLivArea"] / (df["TotRmsAbvGrd"] +
                                       df["FullBath"] +
                                       df["HalfBath"] +
                                       df["KitchenAbvGr"])

# One-hot encoding

In [36]:
df = pd.get_dummies(df)
m, n = df.shape
m, n

(2919, 251)

# Split test and train

In [37]:
X_train, X_test = df[:m_train], df[m_train:]
y_train, _ = target[:m_train], target[m_train:]
X_train.shape, y_train.shape

((1460, 251), (1460,))

# Training - XGB Grid Search 

In [None]:
# model = XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2, eval_metric=mean_squared_error)

# kwargs = {
#     "n_estimators": [n, 1.5 * n, 2 * n, int(2.5 * n), 3 * n],
#     "max_depth": [2, 3, 4, 5, 6],
#     "learning_rate": [0.08, 0.1, 0.15],
#     "tree_method": ["auto", "hist"],
#     "random_state": [10,],
#     "subsample": [0.5, 0.65, 0.8],
#     "colsample_bytree": [0.5, 0.65, 0.8],
#     "reg_lambda": [0.1, 1, 2, 5, 10], 
#     "min_child_weight": [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20] 
# }
# clf = GridSearchCV(model, kwargs, verbose=1, n_jobs=2)
# clf.fit(X_train, y_train)
# print(clf.best_score_)
# print(clf.best_params_)

In [None]:
# model = XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2, eval_metric=mean_squared_error)

# kwargs = {
#     "n_estimators": [2 * n],
#     "max_depth": [3, 4, 5, 6],
#     "learning_rate": [0.1],
#     "tree_method": ["auto", "hist"],
#     "random_state": [10,],
#     "subsample": [0.5, 0.65, 0.8],
#     "colsample_bytree": [0.5, 0.65, 0.8],
#     "reg_lambda": [0.1, 1, 2, 5, 10], 
#     "min_child_weight": [0.01, 0.1, 0.5, 1, 2, 5, 10] 
# }
# clf = GridSearchCV(model, kwargs, verbose=1, n_jobs=2)
# clf.fit(X_train, y_train)
# print(clf.best_score_)
# print(clf.best_params_)

In [None]:
model = XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2, eval_metric=mean_squared_error)

kwargs = {
    "n_estimators": [2 * n],
    "max_depth": [3, 4, 5, 6],
    "learning_rate": [0.1],
    "tree_method": ["auto", "hist"],
    "random_state": [10,],
    "subsample": [0.5, 0.65, 0.8],
    "colsample_bytree": [0.5, 0.65, 0.8],
    "reg_lambda": [0.1, 1, 2, 5, 10], 
    "min_child_weight": [0.01, 0.05, 0.1, 0.2, 0.5, 1, 2, 5, 10, 20] 
}
clf = GridSearchCV(model, kwargs, verbose=1, n_jobs=2)
clf.fit(X_train, y_train)
print(clf.best_score_)
print(clf.best_params_)

Fitting 5 folds for each of 3600 candidates, totalling 18000 fits


In [45]:
pd.concat((X_train, y_train), axis=1)

Unnamed: 0,LotFrontage,LotArea,LotShape,LandSlope,OverallQual,OverallCond,YearBuilt,YearRemodAdd,MasVnrArea,ExterQual,...,SaleType_New,SaleType_Oth,SaleType_WD,SaleCondition_Abnorml,SaleCondition_AdjLand,SaleCondition_Alloca,SaleCondition_Family,SaleCondition_Normal,SaleCondition_Partial,SalePrice
0,65.0,8450,0.0,0.0,7,5,2003,2003,196.0,4.0,...,0,0,1,0,0,0,0,1,0,208500.0
1,80.0,9600,0.0,0.0,6,8,1976,1976,0.0,3.0,...,0,0,1,0,0,0,0,1,0,181500.0
2,68.0,11250,1.0,0.0,7,5,2001,2002,162.0,4.0,...,0,0,1,0,0,0,0,1,0,223500.0
3,60.0,9550,1.0,0.0,7,5,1915,1970,0.0,3.0,...,0,0,1,1,0,0,0,0,0,140000.0
4,84.0,14260,1.0,0.0,8,5,2000,2000,350.0,4.0,...,0,0,1,0,0,0,0,1,0,250000.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1455,62.0,7917,0.0,0.0,6,5,1999,2000,0.0,3.0,...,0,0,1,0,0,0,0,1,0,175000.0
1456,85.0,13175,0.0,0.0,6,6,1978,1988,119.0,3.0,...,0,0,1,0,0,0,0,1,0,210000.0
1457,66.0,9042,0.0,0.0,7,9,1941,2006,0.0,5.0,...,0,0,1,0,0,0,0,1,0,266500.0
1458,68.0,9717,0.0,0.0,5,6,1950,1996,0.0,3.0,...,0,0,1,0,0,0,0,1,0,142125.0


In [46]:
s = setup(data=(pd.concat((X_train, y_train), axis=1)), target='SalePrice')
best = compare_models()

Unnamed: 0,Description,Value
0,Session id,429
1,Target,SalePrice
2,Target type,Regression
3,Data shape,"(1460, 252)"
4,Train data shape,"(1021, 252)"
5,Test data shape,"(439, 252)"
6,Numeric features,251
7,Preprocess,True
8,Imputation type,simple
9,Numeric imputation,mean


Unnamed: 0,Model,MAE,MSE,RMSE,R2,RMSLE,MAPE,TT (Sec)
gbr,Gradient Boosting Regressor,16557.8336,790216771.9162,27436.1588,0.8778,0.128,0.0934,0.115
et,Extra Trees Regressor,16899.4082,834884051.4588,28001.205,0.8715,0.1349,0.0978,0.255
lightgbm,Light Gradient Boosting Machine,17290.5537,902065360.3435,29277.8978,0.8606,0.1338,0.0966,0.039
rf,Random Forest Regressor,17900.6856,975283486.4985,30303.88,0.8515,0.1422,0.1034,0.257
xgboost,Extreme Gradient Boosting,18572.4902,1233964108.8,33460.335,0.8142,0.1451,0.1027,0.184
ada,AdaBoost Regressor,23209.4467,1213493802.9275,34285.4394,0.8123,0.1854,0.1469,0.07
llar,Lasso Least Angle Regression,19673.7971,1453952761.2649,35119.1521,0.7751,0.1843,0.1153,0.015
dt,Decision Tree Regressor,25720.0136,1523822411.5928,38453.3251,0.7581,0.1964,0.1457,0.012
omp,Orthogonal Matching Pursuit,20923.5807,1615087778.1012,37121.1498,0.7514,0.1963,0.1223,0.009
ridge,Ridge Regression,21738.6971,1697235036.2458,38285.8815,0.75,0.2086,0.1289,0.009


In [53]:
models = {
    "gbr": GradientBoostingRegressor(n_estimators=2*n, max_depth=3),
    "rf": RandomForestRegressor(),
    "xgb": XGBRegressor(),
    "lgbm": LGBMRegressor(),
    "et": ExtraTreesRegressor(),
#     "br": BayesianRidge(),
#     "omp": OrthogonalMatchingPursuit(),
}

In [57]:
results = {}
for name, model in models.items():
    result = cross_val_score(model, 
                            X_train, 
                            y_train, 
                            scoring="neg_mean_squared_error",
                            cv=KFold(n_splits=7))
    results[name] = -result

In [61]:
means = []
for name, result in results.items():
    means.append(result.mean())
    print(f"{name}: {result.mean():.5f}")
top_models = [name for _,name in sorted(zip(means, models.keys()))]
print(top_models)

gbr: 621700623.66867
rf: 844384618.01942
xgb: 772447499.74578
lgbm: 851821250.65051
et: 816485581.18209
['gbr', 'xgb', 'et', 'rf', 'lgbm']


# Combine models

In [64]:
n_models = len(top_models)
pred = np.zeros(X_test.shape[0])
for model in top_model[:n_models]:
    models[model].fit(X_train, y_train)
    pred += models[model].predict(X_test) / n_models

# Training - XGB with Best Parameters

In [None]:
# model = XGBRegressor(n_jobs=multiprocessing.cpu_count() // 2, eval_metric=mean_squared_error, **clf.best_params_)
# model.fit(X_train, y_train)

In [65]:
sub_name = "data/ensemble0.csv"
pd.DataFrame(pred, 
            index=range(1461, len(df)+1), 
            columns=['SalePrice']).reset_index().\
            rename(columns={'index': 'id'}).to_csv(sub_name, index=False)