In [1]:
import pandas as pd
import numpy as np
import os

from datetime import datetime

from sklearn.metrics import mean_squared_error, mean_absolute_percentage_error
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV, StratifiedKFold
from sklearn.preprocessing import OneHotEncoder
import xgboost as xgb

import warnings
warnings.filterwarnings('ignore')

In [2]:
def RMSE(y, y_hat):
    return np.sqrt(mean_squared_error(y, y_hat))

def MAPE(y, y_hat):
    return mean_absolute_percentage_error(y_pred=y_hat, y_true=y)

def timer(start_time=None):
    if not start_time:
        start_time = datetime.now()
        return start_time
    elif start_time:
        thour, temp_sec = divmod((datetime.now() - start_time).total_seconds(), 3600)
        tmin, tsec = divmod(temp_sec, 60)
        print('\n Time taken: %i hours %i minutes and %s seconds.' % (thour, tmin, round(tsec, 2)))



In [3]:
" Importing data "
path = os.getcwd() + "\\"
df_train = pd.read_csv(path+"train.csv")
df_test = pd.read_csv(path+"test.csv")
df_test_id = df_test.Id

In [4]:
"""
First things first, lets check for missing values in the dataset
Then, I will drop every collumn with more than 10% of missing values and fill the remaining ones with the column median
"""
cols_to_keep = []
for col in df_train.columns:
    N       = len(df_train[col])
    num_na  = df_train[col].isna().sum()
    perc_na = 100*num_na/N
    print("%s -> %.2f"%(col, perc_na))
    if perc_na <= 10:
        cols_to_keep.append(col)

df_train = df_train.loc[:,cols_to_keep]
df_test  = df_test.loc[:,cols_to_keep[0:-1]]

print("Shape dt_train = (%d, %d)"%(df_train.shape[0],df_train.shape[1]))
print("Shape dt_test = (%d, %d)"%(df_test.shape[0],df_test.shape[1]))

Id -> 0.00
MSSubClass -> 0.00
MSZoning -> 0.00
LotFrontage -> 17.74
LotArea -> 0.00
Street -> 0.00
Alley -> 93.77
LotShape -> 0.00
LandContour -> 0.00
Utilities -> 0.00
LotConfig -> 0.00
LandSlope -> 0.00
Neighborhood -> 0.00
Condition1 -> 0.00
Condition2 -> 0.00
BldgType -> 0.00
HouseStyle -> 0.00
OverallQual -> 0.00
OverallCond -> 0.00
YearBuilt -> 0.00
YearRemodAdd -> 0.00
RoofStyle -> 0.00
RoofMatl -> 0.00
Exterior1st -> 0.00
Exterior2nd -> 0.00
MasVnrType -> 59.73
MasVnrArea -> 0.55
ExterQual -> 0.00
ExterCond -> 0.00
Foundation -> 0.00
BsmtQual -> 2.53
BsmtCond -> 2.53
BsmtExposure -> 2.60
BsmtFinType1 -> 2.53
BsmtFinSF1 -> 0.00
BsmtFinType2 -> 2.60
BsmtFinSF2 -> 0.00
BsmtUnfSF -> 0.00
TotalBsmtSF -> 0.00
Heating -> 0.00
HeatingQC -> 0.00
CentralAir -> 0.00
Electrical -> 0.07
1stFlrSF -> 0.00
2ndFlrSF -> 0.00
LowQualFinSF -> 0.00
GrLivArea -> 0.00
BsmtFullBath -> 0.00
BsmtHalfBath -> 0.00
FullBath -> 0.00
HalfBath -> 0.00
BedroomAbvGr -> 0.00
KitchenAbvGr -> 0.00
KitchenQual -> 0

In [5]:
" Now I will fill every missing value from a numeric column as the column's median wile also filling every string missing value as NA"

for col in df_train.columns:
    if df_train[col].isna().sum() > 0 :
        print("NA prior = ",df_train[col].isna().sum())
        if isinstance(df_train[col].iloc[0], (int, float)):
            print("%s fill with median"%col)
            train_median = np.nanmedian(df_train[col])
            df_train[col] = df_train[col].astype('float64')
            df_train[col].fillna(train_median, inplace = True)
        else:
            print("%s fill with NA as string"%col)
            df_train[col].fillna("NA", inplace = True)
        print("NA after = ",df_train[col].isna().sum())

for col in df_test.columns:
    if df_test[col].isna().sum() > 0 :
        if isinstance(df_test[col].iloc[0], (int, float)):
            print("%s fill with median"%col)
            test_median  = np.nanmedian(df_test[col])
            df_test[col] = df_test[col].astype('float64')
            df_test[col].fillna(test_median, inplace = True)
        else:
            print("%s fill with NA as string"%col)
            df_test[col].fillna("NA", inplace = True)

NA prior =  8
MasVnrArea fill with median
NA after =  0
NA prior =  37
BsmtQual fill with NA as string
NA after =  0
NA prior =  37
BsmtCond fill with NA as string
NA after =  0
NA prior =  38
BsmtExposure fill with NA as string
NA after =  0
NA prior =  37
BsmtFinType1 fill with NA as string
NA after =  0
NA prior =  38
BsmtFinType2 fill with NA as string
NA after =  0
NA prior =  1
Electrical fill with NA as string
NA after =  0
NA prior =  81
GarageType fill with NA as string
NA after =  0
NA prior =  81
GarageYrBlt fill with median
NA after =  0
NA prior =  81
GarageFinish fill with NA as string
NA after =  0
NA prior =  81
GarageQual fill with NA as string
NA after =  0
NA prior =  81
GarageCond fill with NA as string
NA after =  0
MSZoning fill with NA as string
Utilities fill with NA as string
Exterior1st fill with NA as string
Exterior2nd fill with NA as string
MasVnrArea fill with median
BsmtQual fill with NA as string
BsmtCond fill with NA as string
BsmtExposure fill with NA 

In [6]:

" We have to take care of the string columns - lets do a One Hot Encoding to each of them "
for col in df_train.columns:
    if not pd.api.types.is_numeric_dtype(df_train[col][0]):
        print(col)
        encoder = OneHotEncoder()
        df_aux = pd.DataFrame(encoder.fit_transform(df_train[[col]]).toarray())
        unique_values = df_train[col].unique()
        df_aux.columns = [col+"_"+unique_values[i] for i in range(0,len(unique_values))]
        print("Len new columns = ",len(df_aux.columns))
        print("Total columns prior drop = ",df_train.shape[1])
        df_train.drop(col, axis = 1, inplace = True)
        print("Total columns after drop = ",df_train.shape[1])
        df_train = pd.concat([df_train, df_aux], axis = 1)
        print("Total columns after add = ",df_train.shape[1])

for col in df_test.columns:
    if not pd.api.types.is_numeric_dtype(df_test[col][0]):
        print(col)
        encoder = OneHotEncoder()
        df_aux = pd.DataFrame(encoder.fit_transform(df_test[[col]]).toarray())
        unique_values = df_test[col].unique()
        print(unique_values)
        df_aux.columns = [col+"_"+unique_values[i] for i in range(0,len(unique_values))]
        print("Len new columns = ",len(df_aux.columns))
        print("Total columns prior drop = ",df_test.shape[1])
        df_test.drop(col, axis = 1, inplace = True)
        print("Total columns after drop = ",df_test.shape[1])
        df_test = pd.concat([df_test, df_aux], axis = 1)
        print("Total columns after add = ",df_test.shape[1])

common_cols = list(set(df_test.columns) & set(df_train.columns))
sale_price = df_train.SalePrice

df_test = df_test.loc[:, common_cols]
df_train = df_train.loc[:, common_cols]
df_train['SalePrice'] = sale_price

print("Shape dt_train = (%d, %d)"%(df_train.shape[0],df_train.shape[1]))
print("Shape dt_test = (%d, %d)"%(df_test.shape[0],df_test.shape[1]))

MSZoning
Len new columns =  5
Total columns prior drop =  74
Total columns after drop =  73
Total columns after add =  78
Street
Len new columns =  2
Total columns prior drop =  78
Total columns after drop =  77
Total columns after add =  79
LotShape
Len new columns =  4
Total columns prior drop =  79
Total columns after drop =  78
Total columns after add =  82
LandContour
Len new columns =  4
Total columns prior drop =  82
Total columns after drop =  81
Total columns after add =  85
Utilities
Len new columns =  2
Total columns prior drop =  85
Total columns after drop =  84
Total columns after add =  86
LotConfig
Len new columns =  5
Total columns prior drop =  86
Total columns after drop =  85
Total columns after add =  90
LandSlope
Len new columns =  3
Total columns prior drop =  90
Total columns after drop =  89
Total columns after add =  92
Neighborhood
Len new columns =  25
Total columns prior drop =  92
Total columns after drop =  91
Total columns after add =  116
Condition1
Len

In [7]:
" Now that our dataset is clean, lets divide the train dataset into a fit and validation datasets"

X = df_train.drop(columns="SalePrice")
y = df_train.SalePrice

X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.33, random_state=42)

In [8]:
n_estimators  = [1500, 2500]
max_depth     = [3, 5]
max_leaves    = [3, 5]
learning_rate = [0.01, 0.001]
booster       = ['gbtree', 'gblinear'] # gbtree, gblinear or dart
random_state  = [0]
eval_metric   = mean_squared_error  
early_stopping_rounds = [3, 5]

params = {  'n_estimators'  : n_estimators,
            'max_depth'     : max_depth,
            'max_leaves'    : max_leaves,
            'learning_rate' : learning_rate,
            'booster'       : booster,
            'random_state'  : random_state,
            # 'early_stopping_rounds': early_stopping_rounds
}

reg = xgb.XGBRegressor(silent = True)

In [9]:
skf = StratifiedKFold(n_splits=5, shuffle = True, random_state = 0)

random_search = GridSearchCV(reg, param_grid = params, scoring = 'neg_root_mean_squared_error', cv = skf.split(X_train,y_train))

In [10]:
start_time = timer(None) # timing starts from this point for "start_time" variable
random_search.fit(X_train, y_train)
timer(start_time) # timing ends here for "start_time" variable


 Time taken: 0 hours 13 minutes and 24.58 seconds.


In [11]:
print(random_search.best_params_)

random_state = random_search.best_params_["random_state"]
n_estimators = random_search.best_params_["n_estimators"]
max_leaves = random_search.best_params_["max_leaves"]
max_depth = random_search.best_params_["max_depth"]
learning_rate = random_search.best_params_["learning_rate"]
booster = random_search.best_params_["booster"]

reg = xgb.XGBRegressor(silent = True, random_state = random_state,
                        n_estimators = n_estimators, max_leaves = max_leaves,
                        max_depth = max_depth, learning_rate = learning_rate,
                        booster = booster)

{'booster': 'gbtree', 'learning_rate': 0.01, 'max_depth': 5, 'max_leaves': 5, 'n_estimators': 2500, 'random_state': 0}


In [12]:
start_time = timer(None)
reg.fit(X_train,y_train);
timer(start_time)


 Time taken: 0 hours 0 minutes and 7.4 seconds.


In [13]:
y_hat_train = reg.predict(X_train)
y_hat_val   = reg.predict(X_val)

In [14]:
print("MAPE train = ",MAPE(y_train, y_hat_train))
print("MAPE val = ",MAPE(y_val, y_hat_val))

print("RMSE train = ",RMSE(y_train, y_hat_train))
print("RMSE val = ",RMSE(y_val, y_hat_val))

MAPE train =  0.0554371102030655
MAPE val =  0.09624251648633741
RMSE train =  13016.30459074641
RMSE val =  30648.841101424172


In [15]:
reg = xgb.XGBRegressor(silent = True, random_state = random_state,
                        n_estimators = n_estimators, max_leaves = max_leaves,
                        max_depth = max_depth, learning_rate = learning_rate,
                        booster = booster)

start_time = timer(None)
reg.fit(X,y);
timer(start_time)


 Time taken: 0 hours 0 minutes and 4.68 seconds.


In [20]:
X_test = df_test

y_hat      = reg.predict(X)
y_hat_test = reg.predict(X_test)

print("MAPE = ",MAPE(y, y_hat))
print("RMSE = ",RMSE(y, y_hat))

prediction = pd.DataFrame({"Id":df_test_id, "SalePrice":y_hat_test})
prediction.to_csv("test_prediction_xgboost_v0.csv", index=False)

MAPE =  0.06443982043560775
RMSE =  14866.262671218194
