## Dev notebook for the Kaggle House Prices project

#### 1. Load environment and data

In [None]:
import numpy as np
import pandas as pd
import os, time, warnings, random, shap
import seaborn as sns
import matplotlib.pyplot as plt

from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.svm import SVC, SVR
from sklearn.preprocessing import LabelBinarizer, LabelEncoder, OrdinalEncoder, OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression, LinearRegression, Lasso, Ridge
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV, train_test_split, KFold
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestRegressor, RandomForestClassifier, VotingClassifier
from sklearn.metrics import accuracy_score, f1_score, r2_score, mean_squared_error
from sklearn.inspection import permutation_importance
from scipy.special import inv_boxcox
from category_encoders import MEstimateEncoder
from xgboost import XGBClassifier, XGBRegressor

# Notebook-wide pandas display options and warning filters.
# Show up to 100 columns when a frame is displayed.
pd.set_option('display.max_columns', 100)
# Turn off the chained-assignment (SettingWithCopy) check.
pd.set_option('mode.chained_assignment', None)
# Do not wrap wide frame representations across multiple lines.
pd.set_option('display.expand_frame_repr', False)
# Suppress all warnings for the rest of the session.
warnings.filterwarnings('ignore')

def draw_histograms(df, variables, n_rows, n_cols):
    """
    Plot one 10-bin histogram per column of `variables` on an n_rows x n_cols grid and show the figure.
    Example: draw_histograms(df, ['LotArea', 'GrLivArea'], 1, 2)
    """
    # adapted from https://stackoverflow.com/questions/29530355/plotting-multiple-histograms-in-grid
    figure = plt.figure()
    # subplot positions are 1-based, hence start=1
    for position, column in enumerate(variables, start=1):
        axis = figure.add_subplot(n_rows, n_cols, position)
        df[column].hist(bins=10, ax=axis)
        axis.set_title(column + " Distribution")
    figure.tight_layout()
    plt.show()


def fillna_mp_i1(df_train, df_test, df_pred, num_features, cat_features, num_fill='median', cat_fill='mode'):
    """
    Impute missing values in place for the train, test and (optional) prediction datasets.
    All fill statistics are computed on df_train only and then applied to every dataset.

    Parameters:
        df_train, df_test: DataFrames, modified in place.
        df_pred: DataFrame modified in place, or None if there is no prediction set.
        num_features: names of the numerical columns to impute.
        cat_features: names of the categorical columns to impute.
        num_fill: 'median' fills numerical columns with the train median;
                  any other value leaves them untouched.
        cat_fill: 'mode' fills categorical columns with the (first) train mode,
                  'missing' fills them with the literal string 'missing';
                  any other value leaves them untouched.

    Returns None; prints whether any missing values remain in the listed columns.
    Example: fillna_mp_i1(X_train, X_test, X_pred, num_cols, cat_cols)
    """
    # df_pred is optional: every step below only touches the datasets that exist
    frames = [df_train, df_test]
    if df_pred is not None:
        frames.append(df_pred)
    num_features = list(num_features)
    cat_features = list(cat_features)

    if cat_features:
        if cat_fill == 'mode':
            # first row of mode() holds one mode per column
            cat_value = df_train[cat_features].mode().iloc[0]
        elif cat_fill == 'missing':
            cat_value = 'missing'
        else:
            cat_value = None
        if cat_value is not None:
            for frame in frames:
                frame[cat_features] = frame[cat_features].fillna(value=cat_value)

    if num_features and num_fill == 'median':
        medians = df_train[num_features].median()
        for frame in frames:
            frame[num_features] = frame[num_features].fillna(value=medians)

    checked_cols = num_features + cat_features
    all_good = all(not frame[checked_cols].isnull().values.any() for frame in frames)
    if all_good:
        print('Missing values imputed successfully')
    else:
        print('There are still some missing values...')
    
    
    
def add_misDummy_mp_i1(df_train, df_test, df_pred, features):
    """
    For every feature in `features`, add (in place) a column 'mis<feature>' that is
    1 where the feature is missing and 0 where it is present.
    df_pred may be None when there is no prediction set.
    Example: add_misDummy_mp_i1(X_train, X_test, X_pred, ['Age'])
    """
    targets = [df_train, df_test]
    if df_pred is not None:
        targets.append(df_pred)
    for feature_name in features:
        flag_col = 'mis' + feature_name
        for frame in targets:
            missing_mask = frame[feature_name].isnull()
            frame.loc[missing_mask, flag_col] = 1
            present_mask = frame[feature_name].notnull()
            frame.loc[present_mask, flag_col] = 0
   

def discretize_mp_i1(df_train, df_test, df_pred, feature, ntiles, delete_feature=False):
    """
    Divide a continuous feature into quantile groups, stored in place as '<feature>Ntile'.
    Bin edges are the quantiles of df_train (duplicate edges dropped) and the same edges
    are applied to every dataset.

    Parameters:
        df_train, df_test: DataFrames, modified in place.
        df_pred: DataFrame modified in place, or None if there is no prediction set.
        feature: name of the column to discretize.
        ntiles: requested number of quantile groups (fewer bins result if edges repeat).
        delete_feature: if true, drop the original column from every dataset afterwards.

    Note: pd.cut assigns NaN to values that fall outside the range of the train edges.
    Example: discretize_mp_i1(X_train, X_test, X_pred, 'Age', 15)
    """
    # df_pred is optional: only process the datasets that exist
    frames = [df_train, df_test]
    if df_pred is not None:
        frames.append(df_pred)

    _, bin_edges = pd.qcut(df_train[feature], ntiles, retbins=True, labels=False, duplicates='drop')
    ntile_col = feature + 'Ntile'
    for frame in frames:
        frame[ntile_col] = pd.cut(frame[feature],
                                  labels=False,
                                  duplicates='drop',
                                  bins=bin_edges,
                                  include_lowest=True)
    if delete_feature:
        for frame in frames:
            frame.drop(columns=[feature], inplace=True)
    print('Discretized ',feature, ' into ', len(bin_edges)-1, ' bins')


def log_transformer_mp_i1(df_train, df_test, feature_subset=False, min_skew=3, df_pred=None):
    """
    Apply log1p in place to the skewed numerical columns of every dataset.
    A column is transformed when the absolute skewness measured on df_train exceeds
    min_skew and the column is listed in feature_subset.

    Parameters:
        df_train, df_test: DataFrames, modified in place.
        feature_subset: iterable of candidate column names; False or None means
                        every column of df_train is a candidate.
        min_skew: absolute-skewness threshold above which a column is transformed.
        df_pred: optional prediction DataFrame, modified in place when given.

    Returns None; prints the list of transformed columns.
    Example: log_transformer_mp_i1(X_train, X_test, feature_subset=num_cols, df_pred=X_pred)
    """
    # identity checks: `== False` is ambiguous for array-like subsets
    if feature_subset is False or feature_subset is None:
        candidates = set(df_train.columns)
    else:
        candidates = set(feature_subset)

    # numeric_only keeps non-numeric columns from breaking the skewness computation
    skewness = df_train.skew(numeric_only=True)
    skewed_vars = list(skewness[skewness.abs() > min_skew].index)
    to_transform = [col for col in skewed_vars if col in candidates]

    # a DataFrame has no unambiguous truth value, so test against None explicitly
    frames = [df_train, df_test]
    if df_pred is not None:
        frames.append(df_pred)

    for col in to_transform:
        for frame in frames:
            frame[col] = np.log1p(frame[col])
    print('Skewed columns log-transformed: ', to_transform)
    
    
def add_dummyfeatures(df_train, df_test, feature_dict, df_pred=None):
    """
    Add (in place) a dummy column for every feature/value pair in feature_dict.
    The new column is named str(feature)+str(value) and is 1 where the feature equals
    the value and 0 elsewhere.

    Parameters:
        df_train, df_test: DataFrames, modified in place.
        feature_dict: mapping {feature name: value to flag}.
        df_pred: optional prediction DataFrame, modified in place when given.

    Returns None; prints how many columns were added to each dataset.
    Example: add_dummyfeatures(X_train, X_test, {'RoomService':0, 'Spa':0, 'VRDeck':0, 'ShoppingMall':0}, df_pred=X_pred)
    """
    # a DataFrame has no unambiguous truth value, so test against None explicitly
    frames = [df_train, df_test]
    if df_pred is not None:
        frames.append(df_pred)

    input_dimensions = np.array([frame.shape[1] for frame in frames])
    for feature, value in feature_dict.items():
        dummy_col = str(feature) + str(value)
        for frame in frames:
            frame.loc[frame[feature] == value, dummy_col] = 1
            frame.loc[frame[feature] != value, dummy_col] = 0
    output_dimensions = np.array([frame.shape[1] for frame in frames])
    print(output_dimensions-input_dimensions, ' variables created')
    

### target encoding ###
# source: https://www.kaggle.com/code/ryanholbrook/feature-engineering-for-house-prices/notebook

class CrossFoldEncoder:
    """
    Cross-fold wrapper around an encoder class.

    fit_transform trains one encoder per fold of a 4-split KFold and encodes each row
    with the encoder that was fitted on the other folds. transform averages the
    encodings produced by all fitted fold encoders.
    """

    def __init__(self, encoder, **kwargs):
        self.cv_ = KFold(n_splits=4)
        self.encoder_ = encoder  # encoder class, instantiated once per fold
        self.kwargs_ = kwargs  # keyword arguments forwarded to every encoder instance

    def fit_transform(self, X, y, cols):
        """
        Fit one encoder per fold and return the encoded `cols` of X, with
        '_encoded' appended to each column name. The fitted encoders are kept
        in self.fitted_encoders_ for use by transform.
        """
        self.fitted_encoders_ = []
        self.cols_ = cols
        fold_frames = []
        # The first index array of each split is used for fitting, the second
        # holds the rows that get encoded by that fold's encoder.
        for fit_rows, apply_rows in self.cv_.split(X):
            fold_encoder = self.encoder_(cols=cols, **self.kwargs_)
            fold_encoder.fit(X.iloc[fit_rows, :], y.iloc[fit_rows])
            encoded_part = fold_encoder.transform(X.iloc[apply_rows, :])
            fold_frames.append(encoded_part[cols])
            self.fitted_encoders_.append(fold_encoder)
        X_encoded = pd.concat(fold_frames)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

    def transform(self, X):
        """
        Encode new data by averaging the encodings of all fold encoders.
        Returns the encoded columns with '_encoded' appended to each name.
        """
        from functools import reduce

        per_fold = [
            fold_encoder.transform(X)[self.cols_]
            for fold_encoder in self.fitted_encoders_
        ]

        def _accumulate(left, right):
            # element-wise sum; a value missing on one side counts as 0
            return left.add(right, fill_value=0)

        summed = reduce(_accumulate, per_fold)
        X_encoded = summed / len(per_fold)
        X_encoded.columns = [name + "_encoded" for name in X_encoded.columns]
        return X_encoded

In [None]:
# Start the wall-clock timer that the last cell of the notebook reports.
time0 = time.time()

# NOTE(review): hard-coded absolute path; both CSV files below are read relative to it.
os.chdir('/home/jupyter/projects_data/house_price')
df = pd.read_csv('train.csv') 
# df.drop(columns = ['Id'], inplace=True)
pred=pd.read_csv('test.csv')
# Keep an unmodified copy of the prediction data.
pred0 = pred.copy()

print(df.shape, pred.shape)
df

#### 2. Data cleaning and EDA

In [None]:
# 2. pEDA #

# Target plus the subset of raw columns used in the rest of the notebook.
cols_tokeep = ['SalePrice', 'LotArea', 'OverallQual', 'OverallCond', 'YearBuilt', 'ExterCond', 
               'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', 'HeatingQC', '1stFlrSF', '2ndFlrSF', 'GrLivArea',  
               'KitchenQual', 'GarageArea', 'GarageCars', 'TotRmsAbvGrd', 'BedroomAbvGr', 'FullBath', 
               'HalfBath', 'MiscVal', 'LotFrontage', 
               'ExterQual', 'MSSubClass', 'MSZoning', 'LotShape', 'LandContour', 'LotConfig', 'Neighborhood',
               'Condition1', 'BldgType', 'HouseStyle', 'RoofStyle', 'Exterior1st', 'Exterior2nd',
               'Foundation', 'Heating', 'CentralAir', 'Electrical', 'Functional', 'PavedDrive',
               'SaleType', 'SaleCondition', 'BsmtQual', 'BsmtCond', 
               'BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish', 'GarageQual', 'GarageCond']
# Explicit copy: the column assignments below must write to an independent frame,
# not to a slice of the raw data.
df = df[cols_tokeep].copy()

# preliminary feature engineering (applied to df and pred alike so both carry the same features):
df['GrLivArea_log'] = np.log1p(df['GrLivArea'])
pred['GrLivArea_log'] = np.log1p(pred['GrLivArea'])
# w/o log transform, scatterplot looks better. not sure whether log transform helps.

# Flag rows with no garage / basement information before those columns are imputed.
df['MisGarage'] = df.GarageType.isnull().astype(int)
df['MisBsmt'] = df.BsmtCond.isnull().astype(int)
pred['MisGarage'] = pred.GarageType.isnull().astype(int)
pred['MisBsmt'] = pred.BsmtCond.isnull().astype(int)
df

In [None]:
# Quality/condition columns graded on the Po < Fa < TA < Gd < Ex scale.
ord_cols = ['ExterCond', 'HeatingQC', 'KitchenQual', 'ExterQual', 'BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']
num_cols = ['LotArea', 'YearBuilt', 'BsmtFinSF1', 'BsmtFinSF2', 'TotalBsmtSF', '1stFlrSF', '2ndFlrSF', 
            'GrLivArea', 'GarageArea', 'MiscVal', 'LotFrontage', 
           'TotRmsAbvGrd', 'GarageCars', 'BedroomAbvGr', 'OverallCond', 'OverallQual', 'GrLivArea_log']
# Every remaining column except the target is treated as categorical.
# Filtering df.columns (instead of converting a set to a list) keeps the column
# order identical from run to run.
_non_categorical = set(num_cols) | set(ord_cols) | {'SalePrice'}
cat_cols = [col for col in df.columns if col not in _non_categorical]
print("Numerical features ", num_cols, "\n",
      'Ordinal features', ord_cols, '\n',
      "Categorical features ", cat_cols)

# Map the ordinal grades to integers 1..5; missing grades stay missing here.
df[ord_cols] = df[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])
pred[ord_cols] = pred[ord_cols].replace(['Po', 'Fa', 'TA', 'Gd', 'Ex'], [1,2,3,4,5])

In [None]:
# Number of distinct values in each categorical column.
df[cat_cols].nunique()

According to feature importances, only the Neighborhood and possibly the Exterior1st categorical features are really useful.
They have too many unique values for one-hot encoding (OHE), and the other categorical features are not important enough to justify OHE.
So use target encoding for all of them.

In [None]:
# Random hold-out split: test_size of the rows go to `test`, the rest to `train`.
test_size = 0.2
df.reset_index(inplace=True, drop=True)
# Uncomment the seed below to make the split reproducible.
#random.seed(2)
test_index = random.sample(list(df.index), int(test_size*df.shape[0]))
# sorted() makes the training-row order explicit instead of relying on set iteration order
train_index = sorted(set(df.index)-set(test_index))
# reset_index returns new frames, so later column assignments never write into a slice of df
train = df.iloc[train_index].reset_index(drop=True)
test = df.iloc[test_index].reset_index(drop=True)
train = train.drop(columns=['id'], errors='ignore')
test = test.drop(columns=['id'], errors='ignore')
display(train.shape, test.shape, train.head(3), test.head(3))

In [None]:
# fill missing values
display(train.info())

mis_col_mode = ['LotFrontage', 'Electrical']
mis_cat_cols = ['BsmtExposure', 'BsmtFinType1', 'GarageType', 'GarageFinish']
mis_num_cols = ['BsmtQual', 'BsmtCond', 'GarageQual', 'GarageCond']

for col in mis_col_mode:
    train[col] = train[col].fillna(train[col].mode()[0])
    test[col] = test[col].fillna(train[col].mode()[0])

for col in mis_cat_cols:
    train[col] = train[col].fillna(value='missing')
    test[col] = test[col].fillna(value='missing')
    
for col in mis_num_cols:
    train[col] = train[col].fillna(value=-1)
    test[col] = test[col].fillna(value=-1)

In [None]:
# Distribution of the target on the training split.
sns.histplot(x='SalePrice', data=train)

In [None]:
# SalePrice against neighborhood (horizontal bars).
sns.barplot(x='SalePrice', y='Neighborhood', data = train)
plt.show()
# SalePrice against overall quality.
sns.barplot(y='SalePrice', x='OverallQual', data = train)
plt.show()
# SalePrice against above-ground living area.
sns.scatterplot(y='SalePrice', x='GrLivArea', data = train)
plt.show()


#### 3. Feature engineering

In [None]:
# Model the natural log of the price; the evaluation cell maps predictions back with exp().
train['SalePrice'] = np.log(train['SalePrice'])
test['SalePrice'] = np.log(test['SalePrice'])

# do target encoding #

# Cross-fold M-estimate encoding of all categorical columns against the log target;
# the test split is encoded with the average of the fold encoders.
encoder = CrossFoldEncoder(MEstimateEncoder, m=10)
train_encoded = encoder.fit_transform(train, train.SalePrice, cols=cat_cols)
test_encoded = encoder.transform(test)

# Replace the raw categorical columns with their '<name>_encoded' counterparts.
train.drop(columns=cat_cols, inplace=True)
test.drop(columns=cat_cols,  inplace=True)
train = pd.concat([train, train_encoded], axis = 1)
test = pd.concat([test, test_encoded], axis = 1)

display(train.shape, train.head(), train.count())
# Snapshots of the encoded frames taken before the later in-place feature engineering.
train0 = train.copy()
test0 = test.copy()

In [None]:
# Indicator columns for selected OverallQual levels. One call per level:
# a single dict cannot hold the same key ('OverallQual') more than once.
add_dummyfeatures(train, test, {'OverallQual':1})
add_dummyfeatures(train, test, {'OverallQual':8})
add_dummyfeatures(train, test, {'OverallQual':9})
add_dummyfeatures(train, test, {'OverallQual':10})

# log1p-transform the numerical columns whose skewness on train exceeds the default threshold.
log_transformer_mp_i1(train, test, feature_subset=num_cols)

train

In [None]:
# Separate features and target; pop() removes SalePrice from the copied feature frames.
X_train = train.copy()
y_train = X_train.pop('SalePrice')
X_test = test.copy()
y_test = X_test.pop('SalePrice')
print(X_train.shape, X_test.shape, y_train.shape)


In [None]:
# Standardize the numerical and ordinal columns; every other column is passed through unchanged.
feature_transformer = ColumnTransformer([
    ("num", StandardScaler(), num_cols+ord_cols),
    ], remainder = "passthrough")

# The scaler is fitted on the training split only and then applied to the test split.
# Column names come from get_feature_names_out().
X_train = pd.DataFrame(feature_transformer.fit_transform(X_train), 
                       columns=feature_transformer.get_feature_names_out())
X_test = pd.DataFrame(feature_transformer.transform(X_test), 
                      columns=feature_transformer.get_feature_names_out())

# there are many dummies... may wish to use pca here later.

print(X_train.shape, X_test.shape, y_train.shape)
display(X_train)

#### 4. Modeling

In [None]:
# Baseline: ordinary least squares on the log target; prints the training RMSE (log scale).
lr = LinearRegression()
lr.fit(X_train, y_train)
print('OLS ', mean_squared_error(y_train, lr.predict(X_train))**0.5)

In [None]:
# XGBoost baseline with hand-picked hyperparameters; prints the training RMSE (log scale).
xgbb = XGBRegressor(n_estimators=200,
                   max_depth=5,
                   eta=0.06,
                   subsample=0.8,
                   colsample_bytree=0.6)
xgbb.fit(X_train, y_train)
print('xgb ', mean_squared_error(y_train, xgbb.predict(X_train))**0.5)

In [None]:
# Grid search over max_depth and eta (the other parameters are fixed), using
# 2-fold cross-validation scored by negative RMSE.
xgbgs = XGBRegressor()
grid_param = {'n_estimators':[200], 
              'max_depth':[2, 3, 4], 
              'eta':[0.05, 0.07, 0.09],
              'subsample':[0.7], 
              'colsample_bytree':[0.5]}
# xgbgs is rebound from the bare regressor to the search object wrapping it.
xgbgs = GridSearchCV(xgbgs, grid_param, cv=2, scoring='neg_root_mean_squared_error')
xgbgs.fit(X_train, y_train)

# Prints the best parameters, the best CV score and the training RMSE (log scale).
print('xgbgs ',       
      xgbgs.best_params_, 
      xgbgs.best_score_, 
      mean_squared_error(y_train, xgbgs.predict(X_train))**0.5)

In [None]:
# Map the log-scale targets back to the original price scale.
# NOTE: y_train / y_test are rebound here, so re-running this cell applies exp() again.
y_train = np.exp(y_train)
y_test = np.exp(y_test)

# performance evaluation: RMSE on the price scale for every fitted model,
# first on the train set, then on the test set
_fitted_models = [('OLS', lr), ('XGB', xgbb), ('XGBgs', xgbgs)]
_eval_splits = [('Train: ', X_train, y_train), ('Test: ', X_test, y_test)]
for _split_title, _features, _target in _eval_splits:
    print(_split_title)
    for _label, _model in _fitted_models:
        _predicted_price = np.exp(_model.predict(_features))
        print(_label, np.sqrt(mean_squared_error(_target, _predicted_price)))

In [None]:
# SVR sucks, so I omit it.
# as usual, XGB GS fails to clearly beat XGB baseline.

#### 5. Feature importance

In [None]:
# template here: https://www.kaggle.com/code/kaanboke/catboost-lightgbm-xgboost-explained-by-shap/notebook
explainerxgbc = shap.TreeExplainer(xgbb)
shap_values_XGBoost_test = explainerxgbc.shap_values(X_test)
shap_values_XGBoost_train = explainerxgbc.shap_values(X_train)

vals = np.abs(shap_values_XGBoost_test).mean(0)
feature_names = X_test.columns
feature_importance = pd.DataFrame(list(zip(feature_names, vals)),
                                 columns=['col_name','feature_importance_vals'])
feature_importance.sort_values(by=['feature_importance_vals'],
                              ascending=False, inplace=True)
#display(feature_importance)

shap.summary_plot(shap_values_XGBoost_test, X_test, plot_type="bar", plot_size=(6,6), max_display=20)
shap.summary_plot(shap_values_XGBoost_train, X_train,plot_type="dot", plot_size=(6,6), max_display=20)

In [None]:
# Seconds elapsed since time0 was set in the data-loading cell.
print('Total time for this Dev script: ', time.time() - time0)