## Imports

In [101]:
# imports
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# ml related imports
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import KFold, cross_val_score, train_test_split, GridSearchCV, train_test_split
from sklearn import metrics 
from sklearn.metrics import mean_squared_error
from lightgbm import LGBMRegressor
from xgboost import XGBRegressor
from sklearn.linear_model import Ridge, Lasso, ElasticNet
from catboost import CatBoostRegressor

# silence settingWithCopyWarning
import warnings
from pandas.core.common import SettingWithCopyWarning
warnings.simplefilter(action="ignore", category=SettingWithCopyWarning)

In [2]:
# get the data
train = pd.read_csv('data/train.csv')
test = pd.read_csv('data/test.csv')

In [3]:
# test shape
train.shape, test.shape

((1460, 81), (1459, 80))

## Preprocessing

In [81]:
train_pre = train.copy()
test_pre = test.copy()

In [82]:
# combine data
all_data = pd.concat([train_pre, test_pre], ignore_index=True)

# a lot of the missing values are just encodings for the instance that a specific feaure isn't available
# list of features with worng encodign for NA
feature_NA = ['Alley', 'FireplaceQu', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 
              'MiscFeature', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2']

# assign NA to None to indicate the lack of a certain feature
all_data[feature_NA] = all_data[feature_NA].fillna('None')

# imute missing categorical features mostly with the mode
all_data['MSZoning'] = all_data['MSZoning'].fillna(all_data['MSZoning'].mode()[0])
all_data['Utilities'] = all_data['Utilities'].fillna(all_data['Utilities'].mode()[0])
all_data['Exterior1st'] = all_data['Exterior1st'].fillna(all_data['Exterior1st'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['Exterior2nd'] = all_data['Exterior2nd'].fillna(all_data['Exterior2nd'].mode()[0])
all_data['MasVnrType'] = all_data['MasVnrType'].fillna('None')
all_data['Electrical'] = all_data['Electrical'].fillna(all_data['Electrical'].mode()[0])
all_data['KitchenQual'] = all_data['KitchenQual'].fillna(all_data['KitchenQual'].mode()[0])
all_data['Functional'] = all_data['Functional'].fillna(all_data['Functional'].mode()[0])
all_data['SaleType'] = all_data['SaleType'].fillna(all_data['SaleType'].mode()[0])

# imput missing numerical features (most numercial had only 1-2 missing values in that case I just imputet 0)
all_data['MasVnrArea'] = all_data['MasVnrArea'].fillna(0)
all_data['BsmtFinSF1'] = all_data['BsmtFinSF1'].fillna(0)
all_data['BsmtFinSF2'] = all_data['BsmtFinSF2'].fillna(0)
all_data['BsmtUnfSF'] = all_data['BsmtUnfSF'].fillna(0)
all_data['TotalBsmtSF'] = all_data['TotalBsmtSF'].fillna(0)
all_data['BsmtFullBath'] = all_data['BsmtFullBath'].fillna(0)
all_data['BsmtHalfBath'] = all_data['BsmtHalfBath'].fillna(0)
all_data['GarageCars'] = all_data['GarageCars'].fillna(0)
all_data['GarageArea'] = all_data['GarageArea'].fillna(0)
all_data['GarageYrBlt'] = all_data['GarageYrBlt'].fillna(0)
# Neighorhood should impact the size of of street connected to the property
# code from https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
all_data["LotFrontage"] = all_data.groupby("Neighborhood")["LotFrontage"].transform(
    lambda x: x.fillna(x.median()))

# transform features into different types
# MSSubClasee
all_data['MSSubClass'] = all_data['MSSubClass'].astype('str')
# month sold
MoSold_dict = {1:'Jan', 2:'Feb', 3:'Mar', 4:'Apr', 5:'May', 6:'Jun', 7:'Jul', 8:'Aug', 9:'Sep', 10:'Oct', 11:'Nov', 12:'Dec'}
all_data['MoSold'] = all_data['MoSold'].map(MoSold_dict)

# convert categorical to ordinal features 
ord_feats = ['ExterQual', 'ExterCond', 'BsmtQual', 'BsmtCond', 'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2',
             'HeatingQC', 'KitchenQual', 'FireplaceQu', 'GarageQual', 'GarageCond', 'PoolQC']
map_dict_ord = {'None': 0, 'Po': 1, 'Fa': 2, 'TA':3, 'Gd':4, 'Ex': 5, 'No': 1, 'Mn': 2, 'Av': 3, 'Unf': 1, 
                'LwQ': 2, 'Rec': 3, 'BLQ': 4, 'ALQ': 5, 'GLQ': 6}
for ord_ in ord_feats:
    all_data[ord_] = all_data[ord_].map(map_dict_ord)

# convert categorical to binary
bin_feats = ['Street', 'Utilities', 'CentralAir']
map_dict_bin = {'Pave': 0, 'Grvl': 1, 'AllPub': 0, 'NoSeWa': 1, 'Y': 1, 'N': 0}
for bin_ in bin_feats:
    all_data[bin_] = all_data[bin_].map(map_dict_bin)
    
# dummy varibales
dummy_feats = list(all_data.select_dtypes('object'))
df_dummy = pd.get_dummies(all_data[dummy_feats], drop_first=True)
all_data.drop(columns=dummy_feats, inplace=True)
all_data = pd.concat([all_data, df_dummy], axis=1)


In [83]:
# check missing values
# should only return SalePrice with 1459 missing values
isnull = all_data.isnull().sum()
isnull[isnull > 0]

SalePrice    1459
dtype: int64

### Feature Engineering

In [88]:
all_data['TotalSF'] = all_data['TotalBsmtSF'] + all_data['1stFlrSF'] + all_data['2ndFlrSF']

### Feature Selection 

In [89]:
# drop columns with no predictive value
all_data.drop(columns=['Utilities'], inplace=True)

### Split data in train and test

In [90]:
# split all_data in train and test to perform more preprocessing and feature engineering speratly (prevent data leakage)
train_pre = all_data.loc[:train.shape[0]-1]
test_pre = all_data.loc[train.shape[0]:]

In [91]:
test_pre.reset_index(drop=True, inplace=True)

### Drop outliers

In [92]:
train_pre = train_pre.drop(train_pre[(train_pre['GrLivArea']>4000) & (train_pre['SalePrice']<300000)].index)

### take care of target

In [93]:
train_pre['SalePrice_log'] = np.log(train_pre.SalePrice)

### save ID

In [94]:
# save id 
Id = test_pre['Id']
# drop id
test_pre.drop(columns='Id', inplace=True)
train_pre.drop(columns='Id', inplace=True)

## Modeling

In [95]:
# use train set to make x_train and y_train
x_train = train_pre.drop(columns=['SalePrice_log', 'SalePrice'])
y_train = train_pre['SalePrice_log']
test_pre.drop(columns='SalePrice', inplace=True)

In [96]:
x_train.shape, y_train.shape, test_pre.shape

((1458, 240), (1458,), (1459, 240))

In [97]:
# check if x_train and test_pre are identical
list(set(x_train) - set(test_pre)), list(set(test_pre) - set(x_train))

([], [])

### Cross Validation

In [99]:
# code: https://www.kaggle.com/serigne/stacked-regressions-top-4-on-leaderboard
n_folds = 5

def rmsle_cv(model):
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(train.values)
    rmsle= np.sqrt(-cross_val_score(model, x_train.values, y_train, scoring="neg_mean_squared_error", cv = kf))
    return(rmsle)

In [102]:
# XGBoost
model_xgb = XGBRegressor()
# LightGBM
model_lgb = LGBMRegressor()
# CatBoost
model_cat = CatBoostRegressor(verbose=False)
# Lasso Regression
model_lasso = Lasso()
# Ridge Regression
model_ridge = Ridge()
# ElasticNet
model_elnet = ElasticNet()

# list of models for cross validation
models_list = [model_xgb, model_lgb, model_cat, model_lasso, model_ridge, model_elnet]
model_names = ['model_xgb', 'model_lgb', 'model_cat', 'model_lasso', 'model_ridge', 'model_elnet']

for model, name in zip(models_list, model_names):
    print(name + ' rmsle score:')
    print(np.mean(rmsle_cv(model)))
    print('#'*30)

model_xgb rmsle score:
0.13338894335073376
##############################
model_lgb rmsle score:
0.12705947760877712
##############################
model_cat rmsle score:
0.11583596871896921
##############################
model_lasso rmsle score:
0.16972133840519274
##############################
model_ridge rmsle score:
0.12005726930700245
##############################
model_elnet rmsle score:
0.1617088823853497
##############################


### Blended Model

In [104]:
# train
model_xgb.fit(x_train, y_train)
model_lgb.fit(x_train, y_train)
model_ridge.fit(x_train, y_train)
model_cat.fit(x_train, y_train)

# predict
xgb_preds =model_xgb.predict(test_pre)
lgb_preds = model_lgb.predict(test_pre)
ridge_preds = model_ridge.predict(test_pre)
cat_preds = model_cat.predict(test_pre)

# calculate
preds = (lgb_preds + ridge_preds + cat_preds + xgb_preds)/4

# create DataFrame for submission
submission = pd.DataFrame()
submission['Id'] = Id
# transform log of SalePrice
submission['SalePrice'] = np.exp(preds)
# save DataFrame
submission.to_csv('blended_model_new_4.csv', index=False)
# show submission
submission

Unnamed: 0,Id,SalePrice
0,1461,123562.475466
1,1462,162425.429248
2,1463,185510.117742
3,1464,198046.519728
4,1465,191230.450861
...,...,...
1454,2915,80462.938929
1455,2916,81599.384028
1456,2917,165687.404436
1457,2918,112753.794800
