This script is developed based on 'KProject_HousePrice_i5'

### Outline:
0. Load libraries and custom functions.
1. Load data.
2. Preliminary data analysis: explore features and a target, delete unneeded features, create new features.
3. Train-test split.
4. Missing values. In some cases it may be useful to explore skew and perform log-transform before imputing missing values.
5. Feature engineering. Transform skewed variables, do OHC and scaling.
6. Fit models.
7. Evaluate models.
8. Feature importance, error analysis. Based on the results, go to 2. and iterate.
9. Make predictions.

In [4]:
# 0. Load libraries #

import numpy as np
import pandas as pd
import os, time, warnings
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from scipy.special import inv_boxcox
from xgboost import XGBClassifier, XGBRegressor

pd.set_option('display.max_columns', 20)
pd.set_option('mode.chained_assignment', None)
pd.set_option('display.expand_frame_repr', False)
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    # stolen from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    fig=plt.figure()
    for i, var_name in enumerate(variables):
        ax=fig.add_subplot(n_rows,n_cols,i+1)
        df[var_name].hist(bins=10,ax=ax)
        ax.set_title(var_name+" Distribution")
    fig.tight_layout()  
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """This function speeds up filling missing values for 3 main datasets using different imputation methods.
    Later may replace it with some subclass.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)"""
    # set df_pred to None if it does not exist
    if (cat_fill=='mode'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        df_test[cat_features] = df_test[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value=df_train[cat_features].mode().iloc[0])
            
    if (cat_fill=='missing'):
    
        df_train[cat_features] = df_train[cat_features].fillna(value='missing')
        df_test[cat_features] = df_test[cat_features].fillna(value='missing')
        if (df_pred is not None):
            df_pred[cat_features] = df_pred[cat_features].fillna(value='missing')
        
    if (num_fill=='median'):
        df_train[num_features] = df_train[num_features].fillna(value=df_train[num_features].median())
        df_test[num_features] = df_test[num_features].fillna(value=df_train[num_features].median())
        if (df_pred is not None):
            df_pred[num_features] = df_pred[num_features].fillna(value=df_train[num_features].median())    
    
    all_good = (
    (np.prod(df_train[num_features+cat_features].shape)==df_train[num_features+cat_features].count().sum()) and 
    (np.prod(df_test[num_features+cat_features].shape) == df_test[num_features+cat_features].count().sum()) and 
    (np.prod(df_pred[num_features+cat_features].shape) == df_pred[num_features+cat_features].count().sum()))
    if (all_good):
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
    
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """This function creates new dummy columns for missing features.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])"""
    # set df_pred to None if it does not exist
    for feature_name in features:
        misColName = 'mis'+feature_name
        df_train.loc[df_train[feature_name].isnull(), misColName]=1
        df_train.loc[df_train[feature_name].notnull(), misColName]=0
        df_test.loc[df_test[feature_name].isnull(), misColName]=1
        df_test.loc[df_test[feature_name].notnull(), misColName]=0
        if (df_pred is not None):
            df_pred.loc[df_pred[feature_name].isnull(), misColName]=1
            df_pred.loc[df_pred[feature_name].notnull(), misColName]=0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """This function divides a continuous feature into quantile groups.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)"""
    # set df_pred to None if it does not exist
    _,bin = pd.qcut(df_train[feature], ntiles, retbins = True, labels = False, duplicates = 'drop')
    df_train[feature+'Ntile'] = pd.cut(df_train[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    df_test[feature+'Ntile'] = pd.cut(df_test[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (df_pred is not None):
        df_pred[feature+'Ntile'] = pd.cut(df_pred[feature], labels=False, duplicates = 'drop', bins = bin ,include_lowest = True)
    if (delete_feature==True):
        df_train.drop(columns=[feature], inplace=True)
        df_test.drop(columns=[feature], inplace=True)
        df_pred.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, df_pred, feature_subset=False, min_skew=3):
    """This function divides a continuous feature into quantile groups.
    Example: log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)"""
    # set df_pred to None if it does not exist
    if (feature_subset==False):
        features_totransform = df_train.columns
    else:
        features_totransform = feature_subset.copy()
    skewed_vars = list(df_train.skew()[abs(df_train.skew())>min_skew].index)
    for col in list(set(skewed_vars)&set(features_totransform)):
        df_train[col] = np.log1p(df_train[col])
        df_test[col] = np.log1p(df_test[col])
        if (df_pred is not None):
            df_pred[col] = np.log1p(df_pred[col])
    print('Skewed columns log-transformed: ', list(set(skewed_vars)&set(features_totransform)))
    
    
def add_dummyfeatures(df_train, df_test, df_pred, feature_dict):
    """This function adds dummy feature when some feature is equal to value, specified in a dictionary.
    Example: add_dummyfeatures(X_train, X_test, X_pred, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0})"""
    input_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    for i in range(len(list(feature_dict.items()))):
        feature,value = list(feature_dict.keys())[i], list(feature_dict.values())[i]
        df_train.loc[df_train[feature]==value,(str(feature)+str(value))]=1
        df_train.loc[df_train[feature]!=value,(str(feature)+str(value))]=0
        df_test.loc[df_test[feature]==value,(str(feature)+str(value))]=1
        df_test.loc[df_test[feature]!=value,(str(feature)+str(value))]=0
        df_pred.loc[df_pred[feature]==value,(str(feature)+str(value))]=1
        df_pred.loc[df_pred[feature]!=value,(str(feature)+str(value))]=0
    output_dimensions = np.array([df_train.shape[1], df_test.shape[1], df_pred.shape[1]])
    print(output_dimensions-input_dimensions, ' variables created') 
    

In [5]:
# 1. Load data #

time0 = time.time()
path = '../input/house-prices-advanced-regression-techniques/train.csv'
df = pd.read_csv(path) 
df0 = df.copy()

pred=pd.read_csv('../input/house-prices-advanced-regression-techniques/test.csv')
pred0 = pred.copy()

print(df.shape, pred.shape)
df

irrelevant_features = pd.read_csv('../input/homeprice-features30/KP20_irrel_features_30.csv')

# 2. pEDA #

cols_tokeep = ['Id', 'SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond', 
               'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea',  
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr', 'FullBath', 
               'HalfBath', 'MiscVal', 'LotFrontage', 
               'ExterQual', 'MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
               'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
               'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive',
               'SaleType', 'SaleCondition', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond', 
               'BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
df = df[cols_tokeep]
X_pred = pred[list(set(cols_tokeep) - set(['SalePrice']))]

# preliminary feature engineering:
df['GrLivArea_log'] = np.log1p(df['GrLivArea'])
X_pred['GrLivArea_log'] = np.log1p(X_pred['GrLivArea'])
# w/o logtransform, scatterplot looks better. not sure whether log tranform helps.


# 3. train-test split #

train_y = df['SalePrice']
train_x = df.drop(columns = ['SalePrice'])

ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
#num_cols = [col for col in train_x.columns if train_x[col].nunique() > 12]
num_cols = ['Id', 'LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
            'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 'MasVnrArea',
           'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log']
cat_cols = list(set(train_x.columns)-set(num_cols)-set(ord_cols))
# for now, view ordinal features as categorical features
print("Numerical features ", num_cols, "\n",
      'Ordinal features', ord_cols, '\n',
      "Categorical features ", cat_cols)

train_x[ord_cols] = train_x[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])
X_pred[ord_cols] = X_pred[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])

X_train, X_test, y_train, y_test = train_test_split(train_x, train_y, test_size=0.01, random_state=2)
print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)

# 4. Missing values #

fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols, num_fill = 'median', cat_fill='missing')

# fill na for ordinal columns. missing values if those columns ususally mean that that feature DNE, so 0.
X_train[ord_cols] = X_train[ord_cols].fillna(value=0)
X_test[ord_cols] = X_test[ord_cols].fillna(value=0)
X_pred[ord_cols] = X_pred[ord_cols].fillna(value=0)

# 5. Feature engineering #

# add dummy features
add_dummyfeatures(X_train, X_test, X_pred, {'OverallQual':1, 'OverallQual':8, 'OverallQual':9, 'OverallQual':10})

log_transformer_mp_i1(X_train, X_test, X_pred, feature_subset=num_cols)

feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols+ord_cols),
    ("cat", OneHotEncoder(sparse = False, handle_unknown="ignore"), cat_cols),
    ])

X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), columns=feature_transformer.get_feature_names_out())
X_pred = pd.DataFrame(feature_transformer.transform(X_pred), columns=feature_transformer.get_feature_names_out())

# there are many dummies... may wish to use pca here later.

print(X_train.shape, X_test.shape, y_train.shape, X_pred.shape)
# another way to deal with redundant features is to delete those, which do not help in feature importance:
cols = list(X_train.columns)
cols_few = list(set(cols)-set(list(irrelevant_features.loc[irrelevant_features.freq>19, 'col'])))
cols_veryfew = list(set(cols)-set(list(irrelevant_features.loc[irrelevant_features.freq>15, 'col'])))
cols_veryveryfew = list(set(cols)-set(list(irrelevant_features.loc[irrelevant_features.freq>9, 'col'])))
print('Feature sets: ', len(cols), len(cols_few), len(cols_veryfew), len(cols_veryveryfew))
# after running _v5 of this script for like 30 times, 
# I believe that the feature set of 56 features is the best due to decreasing overfitting.

(1460, 81) (1459, 80)
Numerical features  ['Id', 'LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 'MasVnrArea', 'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log'] 
 Ordinal features ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond'] 
 Categorical features  ['SaleType', 'FullBath', 'MSSubClass', 'Heating', 'Electrical', 'LotShape', 'BsmtExposure', 'RoofStyle', 'Condition1', 'LandContour', 'BldgType', 'MasVnrType', 'GarageFinish', 'Foundation', 'PavedDrive', 'HouseStyle', 'HalfBath', 'Exterior1st', 'GarageType', 'MSZoning', 'Exterior2nd', 'Neighborhood', 'Functional', 'BsmtFinType1', 'SaleCondition', 'LotConfig', 'CentralAir']
(1445, 54) (15, 54) (1445,) (1459, 54)
Missing values imputed successfully
[1 1 1]  variables created
Skewed columns log-transformed:  ['LotArea', 'BsmtFinSF2', 'MiscVal']
(1

In [10]:
# 6. Model Fitting #

print(X_train.shape)

lr = LinearRegression()
lr.fit(X_train[cols_veryveryfew], y_train)
print('OLS ', mean_squared_error(y_train, lr.predict(X_train[cols_veryveryfew])))

time1 = time.time()
svr4 = SVR()
grid_param = {'C': [50000, 100000, 200000, 400000, 600000, 900000]}
svrm4 = GridSearchCV(svr4, grid_param, cv=8, scoring='neg_root_mean_squared_error')
svrm4.fit(X_train[cols_veryveryfew], y_train)
print('SVR 56 cols', 
      svrm4.best_params_, 
      svrm4.best_score_, 
      np.sqrt(mean_squared_error(y_train, svrm4.predict(X_train[cols_veryveryfew]))), 
      time.time()-time1)

xgbb = XGBRegressor(n_estimators=200,
                   max_depth=5,
                   eta=0.06,
                   subsample=0.8,
                   colsample_bytree=0.6)
xgbb.fit(X_train[cols_veryveryfew], y_train)

xgb4 = XGBRegressor()
grid_param = {'n_estimators':[200], 
              'max_depth':[2,3,4,5], 
              'eta':[0.04, 0.06, 0.08, 0.1],
             'subsample':[0.7], 
              'colsample_bytree':[0.5]}
xgbm4 = GridSearchCV(xgb4, grid_param, cv=8, scoring='neg_root_mean_squared_error')
xgbm4.fit(X_train[cols_veryveryfew], y_train)
print('XGB 56 cols', 
      xgbm4.best_params_, 
      xgbm4.best_score_, 
      np.sqrt(mean_squared_error(y_train, xgbm4.predict(X_train[cols_veryveryfew]))), 
      time.time()-time1)

# 7. Model Evaluation #

print('SVR 56', np.sqrt(mean_squared_error(y_test, svrm4.predict(X_test[cols_veryveryfew]))))
print('XGB 56', np.sqrt(mean_squared_error(y_test, xgbm4.predict(X_test[cols_veryveryfew]))))

# sometimes ridge may fail really bad.
print('Total Time is ', time.time()-time0)

# all 3 models perform best with the smallest features set (56 features)

(1445, 224)
OLS  940569570.3653979
SVR 56 cols {'C': 400000} -24838.943023766722 9407.464303121382 22.546037673950195


KeyboardInterrupt: 

In [None]:
# time1 = time.time()
# xgb3 = XGBRegressor()
# grid_param = {'n_estimators':[400,500], 'max_depth':[3,4,5], 'eta':[0.025, 0.035, 0.05, 0.06, 0.07, 0.08], 'subsample':[0.6],
#              'colsample_bytree':[0.2]}
# xgbm3 = GridSearchCV(xgb3, grid_param, scoring='neg_root_mean_squared_error', cv=4, verbose=1)
# xgbm3.fit(X_train[cols_veryveryfew], y_train)
# print('XGB 56 cols', xgbm3.best_params_, xgbm3.best_score_, np.sqrt(mean_squared_error(y_train, xgbm3.predict(X_train[cols_veryveryfew]))), time.time()-time1)
# print('XGB 56', np.sqrt(mean_squared_error(y_test, xgbm3.predict(X_test[cols_veryveryfew]))))


In [22]:
print('train lr 56', np.sqrt(mean_squared_error(y_train, lr.predict(X_train[cols_veryveryfew]))))
print('train SVR 56', np.sqrt(mean_squared_error(y_train, svrm4.predict(X_train[cols_veryveryfew]))))
print('train xgb 56', np.sqrt(mean_squared_error(y_train, xgbb.predict(X_train[cols_veryveryfew]))))
print('test lr 56', np.sqrt(mean_squared_error(y_test, lr.predict(X_test[cols_veryveryfew]))))
print('test SVR 56', np.sqrt(mean_squared_error(y_test, svrm4.predict(X_test[cols_veryveryfew]))))
print('test xgb 56', np.sqrt(mean_squared_error(y_test, xgbb.predict(X_test[cols_veryveryfew]))))

train lr 56 30668.70669534987
train SVR 56 9407.464303121382
train xgb 56 8996.301111825484
test lr 56 22568.400941729713
test SVR 56 33096.39100001603
test xgb 56 24508.902015229287


In [23]:
# 8. Feature importance #

results = permutation_importance(xgbb, X_test[cols_veryveryfew], y_test, n_jobs=-1)
fi = pd.DataFrame({'col':X_test[cols_veryveryfew].columns, 'FI':results.importances_mean})
fi = fi.sort_values('FI', ascending = False)
fi
# OverallQual and GrLivArea ate the two most important features

Unnamed: 0,col,FI
11,num__GrLivArea_log,0.128308
10,num__OverallQual,0.125048
51,num__OverallCond,0.039769
0,num__LotArea,0.036809
15,num__GrLivArea,0.033852
20,cat__Neighborhood_Crawfor,0.028638
49,cat__Exterior1st_BrkFace,0.026266
16,num__YearBuilt,0.026236
50,num__1stFlrSF,0.013787
34,num__BsmtFinSF1,0.012424


In [None]:
pred

In [None]:
#submission_df_vc = pd.DataFrame({'Id': pred.Id, 'SalePrice': yhat}, columns=['Id', 'SalePrice'])
#submission_df_svm = pd.DataFrame({'Id': pred.Id, 'SalePrice': svrm4.predict(X_pred[cols_veryveryfew])}, columns=['Id', 'SalePrice'])
submission_df_bt = pd.DataFrame({'Id': pred.Id, 'SalePrice': xgbm3.predict(X_pred[cols_veryveryfew])}, columns=['Id', 'SalePrice'])

#submission_df_vc.to_csv('KP11_vc.csv',index=False)
#submission_df_svm.to_csv('KP20_svr.csv',index=False)
#submission_df_rf.to_csv('KP11_rf.csv',index=False)
submission_df_bt.to_csv('KP20_bt.csv',index=False)

os.chdir(r'/kaggle/working')

from IPython.display import FileLink
FileLink(r'KP20_bt.csv')

In [None]:
sns.barplot(data=df, x='OverallQual', y='SalePrice')

In [None]:
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice')
# transformed

In [None]:
sns.scatterplot(data=df, x='GrLivArea', y='SalePrice')
# not transformed

In [None]:
train_x[['OverallQual', 'GrLivArea']].skew()

In [None]:
train_x[['GrLivArea']].hist()