# 6 - Train-Dev-Test preparation and Model Selection


In [1]:
# Import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pyMechkar.analysis import Table1, train_test
import sklearn.metrics as metrics 
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

  import pandas.util.testing as tm


In [2]:
def drop_additional_index_columns(_df):
    # dropping additional index columns that start with "Unnamed" - dropping these columns
    columns_to_drop = [x for x in _df.columns.to_list() if x.startswith("Unnamed")]
    print("dropping coulmns: ", columns_to_drop) # [Unamed..., Unamed..]
    return _df.drop(columns=columns_to_drop, axis=1, inplace=False)

In [3]:
df_variables_selection = pd.read_csv('variables_selection_final.csv')
df_variables_selection = drop_additional_index_columns(df_variables_selection)
display(df_variables_selection.head(5))
# x_selected_columns include the variables that selected in '5 - Feature Selection'
x_selected_columns = df_variables_selection['var'].to_list()

y_column = 'booked_up_target'
print("x_selected_columns: ")
print(x_selected_columns)
# listing_id is not in x_selected_columns - adding it for the serving of the model
x_selected_columns = x_selected_columns + ['listing_id']

dropping coulmns:  ['Unnamed: 0']


Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count,total_count
0,kmean_cluster_availability,1,1,2,1,1,0,1,1,4,6
1,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4,6
2,room_type_Entire home/apt_require_guest_phone_...,1,1,2,0,0,0,1,1,2,4
3,host_response_time_missing_host_response_time_...,1,1,2,0,1,0,0,1,2,4
4,host_response_time_within an hour_require_gues...,1,1,2,0,0,0,1,1,2,4


x_selected_columns: 
['kmean_cluster_availability', 'target_num_of_day_in_period_minus_num_of_day_in_previous_period', 'room_type_Entire home/apt_require_guest_phone_verification_t', 'host_response_time_missing_host_response_time_bed_type_Real Bed', 'host_response_time_within an hour_require_guest_phone_verification_f', 'host_response_time_within an hour_require_guest_phone_verification_t', 'host_response_time_missing_host_response_time_require_guest_phone_verification_f', 'host_is_superhost_t_require_guest_phone_verification_t', 'require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment', 'children���s_dinnerware', 'long_term_stays_allowed', 'property_type_Apartment', 'host_response_time_missing_host_response_time', 'host_response_rate_cat_host_response_rate_missing', 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_0%_to_25%', 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_50%_to_75%', 'avg

In [4]:
# reading the flat table after feature enrichement 
df = pd.read_csv("flat_file_after_feature_enrichment.csv")
df = drop_additional_index_columns(df)
df.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


dropping coulmns:  ['Unnamed: 0']


Unnamed: 0,listing_id,name,target_start_date_period,target_end_date_period,start_date_previous_period,end_date_previous_period,host_id,host_name,neighbourhood,latitude,...,host_total_listings_count_cat_host_total_listings_count_0%_to_25%,host_total_listings_count_cat_host_total_listings_count_50%_to_75%,host_total_listings_count_cat_host_total_listings_count_75%_to_100%,host_total_listings_count_cat_host_total_listings_count_missing,bedrooms_cat_bedrooms_0%_to_25%,bedrooms_cat_bedrooms_75%_to_100%,bedrooms_cat_bedrooms_missing,sqrt_bedrooms_cat_sqrt_bedrooms_0%_to_25%,sqrt_bedrooms_cat_sqrt_bedrooms_75%_to_100%,sqrt_bedrooms_cat_sqrt_bedrooms_missing
0,7071,BrightRoom with sunny greenview!,2019-06-01,2019-08-31,2018-11-07,2019-05-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
1,7071,BrightRoom with sunny greenview!,2019-07-01,2019-09-30,2018-11-07,2019-06-30,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
2,7071,BrightRoom with sunny greenview!,2019-08-01,2019-11-06,2018-11-07,2019-07-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0


In [5]:
# Getting only columns that have been selected in '5 - Feature Selection'
df = df[x_selected_columns + [y_column]]

In [6]:
df.shape

(157864, 55)

## 1. Train-Dev-Test preparation


### Step 1:
Create a Test partition base on 20% of the total data.

This partition will be set aside and will not be used until the end of the project.

The train + dev should be balanced with the test. 

In [7]:
# after few retries, using seed 9 to get perfectly balanced training and test datasets.
df_train_dev, df_test = train_test(data=df, prop=0.8, seed=9, tableone=True)

Begining analysis...
Factorizing... please wait
*******************************************************
[]
[********************************************************
[********************************************************
[]
------ Finished in 59.59532284736633econds -----
 
You got a perfectly balanced training and test datasets
 
                                             Variables    Categories  \
0                                          Individuals             n   
1                           kmean_cluster_availability             1   
2                           kmean_cluster_availability             2   
3                           kmean_cluster_availability             0   
4    target_num_of_day_in_period_minus_num_of_day_i...     Mean (SD)   
..                                                 ...           ...   
146                               distance_from_center  Median (IQR)   
148                                         listing_id     Mean (SD)   
149              

In [8]:
# verify the df_train_dev is shuffeled
df_train_dev[0:10]

Unnamed: 0,kmean_cluster_availability,target_num_of_day_in_period_minus_num_of_day_in_previous_period,room_type_Entire home/apt_require_guest_phone_verification_t,host_response_time_missing_host_response_time_bed_type_Real Bed,host_response_time_within an hour_require_guest_phone_verification_f,host_response_time_within an hour_require_guest_phone_verification_t,host_response_time_missing_host_response_time_require_guest_phone_verification_f,host_is_superhost_t_require_guest_phone_verification_t,require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment,children���s_dinnerware,...,mean_precipitation_days_in_previous_period,mean_precipitation_days_in_target_period,mean_sunshine_hours_in_day_in_previous_period,mean_sunshine_hours_in_day_in_target_period,mean_sunshine_hours_in_month_in_previous_period,mean_sunshine_hours_in_month_in_target_period,distance_from_center,listing_id,booked_up_target,split
136492,1,-169,0,0,0,0,1,0,1,0,...,9.222222,8.0,4.388889,5.333333,133.888889,165.0,3.307769,11119225,1,train
148535,1,-54,0,0,0,0,0,0,0,0,...,9.6,9.0,2.2,7.0,67.0,210.0,3.857295,14418548,1,train
23227,1,-22,0,1,0,0,1,0,0,0,...,9.75,8.666667,1.875,5.666667,55.0,175.0,4.385238,18972711,1,train
123479,0,-169,0,1,0,0,1,0,0,0,...,9.222222,8.0,4.388889,5.333333,133.888889,165.0,3.308469,1211965,1,train
107254,0,-144,0,0,0,0,0,0,0,0,...,9.375,8.0,4.0,6.5,120.625,205.0,6.017371,15779804,1,train
36734,2,-144,0,0,1,0,0,0,1,0,...,9.375,8.0,4.0,6.5,120.625,205.0,2.710462,21193465,0,train
89511,2,-54,0,0,0,0,0,0,0,0,...,9.6,9.0,2.2,7.0,67.0,210.0,1.762141,4441427,0,train
30684,1,3,0,0,0,0,0,0,1,0,...,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,1.544672,28377052,1,train
59388,0,3,0,0,1,0,0,0,1,0,...,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,3.552645,28970028,0,train
88204,1,-114,0,0,0,0,0,0,0,0,...,9.285714,8.666667,3.5,7.333333,106.428571,226.666667,3.164938,3801906,1,train


### Step 2:

Divide the train_dev dataset into 80% train and 20% development/validation.

The train dataset will be used to train the models, while the development dataset will be used for assessment of the model performance (using AUC as the performance measurement).

The train should be balanced with the development dataset.

In [9]:
# after few retries, using seed 9 to get perfectly balanced training and development datasets.

df_train, df_dev = train_test(data=df_train_dev, prop=0.8, seed=9, tableone=True)

Begining analysis...
Factorizing... please wait
*******************************************************
[]
[********************************************************
[********************************************************
[]
------ Finished in 51.79312467575073econds -----
 
You got a perfectly balanced training and test datasets
 
                                             Variables    Categories  \
0                                          Individuals             n   
1                           kmean_cluster_availability             1   
2                           kmean_cluster_availability             2   
3                           kmean_cluster_availability             0   
4    target_num_of_day_in_period_minus_num_of_day_i...     Mean (SD)   
..                                                 ...           ...   
146                               distance_from_center  Median (IQR)   
148                                         listing_id     Mean (SD)   
149              

In [10]:
print(f"df_train shape: {df_train.shape}.  {round(df_train.shape[0]*100/df.shape[0],2)}% of records"  )
print(f"df_dev  shape: {df_dev .shape}.  {round(df_dev .shape[0]*100/df.shape[0],2)}% of records"  )
print(f"df_test shape: {df_test.shape}.  {round(df_test.shape[0]*100/df.shape[0],2)}% of records"  )

df_train shape: (101032, 56).  64.0% of records
df_dev  shape: (25259, 56).  16.0% of records
df_test shape: (31573, 56).  20.0% of records


In [11]:
# Splting each of 3 datasets to X and y
X_train = df_train[x_selected_columns]
y_train = df_train[y_column]
X_dev = df_dev[x_selected_columns]
y_dev = df_dev[y_column]
X_test = df_test[x_selected_columns]
y_test = df_test[y_column]

# 2 . Model Selection


In this section, I am going to run many different prediction models, without changing their parameters.

The outcome is 'booked_up_target' is binary variable which is set to 1 if at least 70 out of 90 days are booked and 0 otherwise. Therefore, I wll use classification models.

Checking the model performance using AUC metric. By using Area Under the Curve (AUC) metric, I will Select the best performing model. 

Area Under the Curve (AUC) is a statistical metric that indicates the degree of accuracy of a classification model.

● Indicates the probability that predicting the outcome is better than chance
● It is an approximation of the concordance statistic (C-statistic).

● Values range from 0.5 to 1.0:

    ○ 0.5 indicates that the model do not perform better than chance
    ○ 1.0 indicates that the model perfectly predicts the outcome.
    
##### I decided to choose Area Under the Curve (AUC) metric because the outcome 'booked_up_target' is unbalanced. We saw in the EDA section that the percentage of "1" value is 72% and "0" value is 28%. Therefore, the best metric that can suit us and also considered a quality one is AUC. 

In [12]:
# Getting as input instance of model object (for eaxample RidgeClassifier()) 
# for training (fit) and predict on validation data set 

def model_classifier_auc(_X_train, _y_train, _X_validation, _y_validation, _model_object):
    
    # Model object should support fit method
    _model_object.fit(_X_train, _y_train)
    y_hat_train = _model_object.predict(_X_train)
    y_hat_validation = _model_object.predict(_X_validation)
    auc_res_train = metrics.roc_auc_score(_y_train, y_hat_train)
    auc_res_validation = metrics.roc_auc_score(_y_validation, y_hat_validation)
    return {"model": type(_model_object).__name__, 
            "model_params": str(_model_object.get_params()), 
            "auc_train": auc_res_train,
            "auc_dev": auc_res_validation}

In [13]:
from datetime import datetime

# running different prediction models
models_objects_to_test = [RidgeClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.RidgeClassifier.html
                          LogisticRegression(), # https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegression.html
                          DecisionTreeClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.tree.DecisionTreeClassifier.html
                          RandomForestClassifier(),# https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.RandomForestClassifier.html
                          AdaBoostClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.AdaBoostClassifier.html
                          GradientBoostingClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.ensemble.GradientBoostingClassifier.html
                          KNeighborsClassifier(), # https://scikit-learn.org/stable/modules/generated/sklearn.neighbors.KNeighborsClassifier.html
                          XGBClassifier()] # https://xgboost.readthedocs.io/en/latest/python/python_api.html

models_predict_train_dev_list = []

for model_obj in models_objects_to_test:
    #print(datetime.now(), "start model_classifier_auc of ", type(model_obj).__name__)
    model_predict_dev_dict = model_classifier_auc(X_train, y_train, X_dev, y_dev, model_obj)
    models_predict_train_dev_list.append(model_predict_dev_dict)

    #print(datetime.now(), "end model_classifier_auc of ", type(model_obj).__name__)

df_models_train_dev_predict = pd.DataFrame(models_predict_train_dev_list)
df_models_train_dev_predict

  overwrite_a=True).T


Unnamed: 0,model,model_params,auc_train,auc_dev
0,RidgeClassifier,"{'alpha': 1.0, 'class_weight': None, 'copy_X':...",0.923442,0.920604
1,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.5,0.5
2,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",1.0,0.984514
3,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.999369,0.986292
4,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.93866,0.935821
5,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.9765,0.976543
6,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.924828,0.855204
7,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.972966,0.972697


##### Base on AUC performance measurement selecting XGBClassifier as the classifier model for this project. It can be seen that the score both in train and validation is high and balanced between them. It can be seen that there are models with higher scores than XGB. I preferred not choosing them for fear of over-fitting. 

# 3 . Model Fine-Tuning

Fine-tune XGBClassifier (selected model). 
1. Create vectors with a wide range of different values for part of the parameters that may affect the performance of the model.
2. Using Grid-Search for finding the best matching parameters.


#### 3.1 create vectors with a wide range of different values for each of the parameters that may affect the performance of XGBClassifier()

https://xgboost.readthedocs.io/en/stable/parameter.html


In [14]:
# XGBClassifier defulat params (these are the params we used in step 2)
XGBClassifier().get_params()

{'base_score': 0.5,
 'booster': 'gbtree',
 'colsample_bylevel': 1,
 'colsample_bynode': 1,
 'colsample_bytree': 1,
 'gamma': 0,
 'learning_rate': 0.1,
 'max_delta_step': 0,
 'max_depth': 3,
 'min_child_weight': 1,
 'missing': None,
 'n_estimators': 100,
 'n_jobs': 1,
 'nthread': None,
 'objective': 'binary:logistic',
 'random_state': 0,
 'reg_alpha': 0,
 'reg_lambda': 1,
 'scale_pos_weight': 1,
 'seed': None,
 'silent': None,
 'subsample': 1,
 'verbosity': 1}

In [15]:
xgb = XGBClassifier()
parameters = {'booster':['gbtree', 'gblinear', 'dart'],
              'gamma':[i/10.0 for i in range(0,5)],
              'max_depth': [i for i in range(3,7)],
              'learning_rate': [i/10.0 for i in range(0,4)],
              }

xgb_grid = GridSearchCV(xgb,
                        parameters,
                        cv = 2,
                        n_jobs = 5,
                        verbose=True)
xgb_grid.fit(X_train, y_train)
print(xgb_grid.best_score_)
print(xgb_grid.best_params_)

Fitting 2 folds for each of 240 candidates, totalling 480 fits


[Parallel(n_jobs=5)]: Using backend LokyBackend with 5 concurrent workers.
[Parallel(n_jobs=5)]: Done  40 tasks      | elapsed:  3.5min
[Parallel(n_jobs=5)]: Done 190 tasks      | elapsed: 13.7min
[Parallel(n_jobs=5)]: Done 440 tasks      | elapsed: 30.0min
[Parallel(n_jobs=5)]: Done 480 out of 480 | elapsed: 33.9min finished


0.9865785097790799
{'booster': 'gbtree', 'gamma': 0.4, 'learning_rate': 0.3, 'max_depth': 6}


In [24]:
xgb_model = XGBClassifier(booster='gbtree', gamma=0.2, max_depth=6, learning_rate=0.3)
print(f"model {type(xgb_model).__name__} params:")
print(xgb_model.get_params())
xgb_model.fit(X_train, y_train)
y_hat_train = xgb_model.predict(X_train)
y_hat_dev = xgb_model.predict(X_dev)
y_hat_test = xgb_model.predict(X_test)
auc_res_train = metrics.roc_auc_score(y_train, y_hat_train)
auc_res_dev = metrics.roc_auc_score(y_dev, y_hat_dev)
auc_res_test = metrics.roc_auc_score(y_test, y_hat_test)

df_xgb_final_res = pd.DataFrame([{"model": type(xgb_model).__name__, 
                                "model_params_final": str(xgb_model.get_params()), 
                                "auc_train_final": auc_res_train,
                                "auc_dev_final": auc_res_dev,
                                "auc_test_final": auc_res_test}])
df_xgb_final_res

model XGBClassifier params:
{'base_score': 0.5, 'booster': 'gbtree', 'colsample_bylevel': 1, 'colsample_bynode': 1, 'colsample_bytree': 1, 'gamma': 0.2, 'learning_rate': 0.3, 'max_delta_step': 0, 'max_depth': 6, 'min_child_weight': 1, 'missing': None, 'n_estimators': 100, 'n_jobs': 1, 'nthread': None, 'objective': 'binary:logistic', 'random_state': 0, 'reg_alpha': 0, 'reg_lambda': 1, 'scale_pos_weight': 1, 'seed': None, 'silent': None, 'subsample': 1, 'verbosity': 1}


Unnamed: 0,model,model_params_final,auc_train_final,auc_dev_final,auc_test_final
0,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.996086,0.98593,0.985762


In [27]:
df_comp = df_models_train_dev_predict.merge(df_xgb_final_res, how="inner", left_on='model', right_on='model')
# model_params, auc_train, auc_dev are columns of xgb model with default params,
# model_params_final, auc_train_final,auc_dev_final, auc_test_final are columns of xgb model after fine tunning

df_comp["auc_train_diff"] = (df_comp['auc_train_final'] - df_comp['auc_train'])*100.0/df_comp['auc_train']
df_comp["auc_dev_diff"] = (df_comp['auc_dev_final'] - df_comp['auc_dev'])*100.0/df_comp['auc_dev']
df_comp

Unnamed: 0,model,model_params,auc_train,auc_dev,model_params_final,auc_train_final,auc_dev_final,auc_test_final,auc_train_diff,auc_dev_diff
0,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.972966,0.972697,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.996086,0.98593,0.985762,2.376169,1.360466


After fine tuning I got:
1. an improvenet of 2.37% in auc train socre. 
2. an improvenet of 1.36% in auc dev socre.
3. auc of the test partition is 0.985 which almost equal to auc of dev (as expected). 