# 6 - Train-Dev-Test preparation


In [1]:
# Import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
from pyMechkar.analysis import Table1, train_test
import sklearn.metrics as metrics 
from sklearn.linear_model import Lasso
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier
import lightgbm as lgb
from sklearn.model_selection import GridSearchCV

  import pandas.util.testing as tm


In [2]:
def drop_additional_index_columns(_df):
    # dropping additional index columns that start with "Unnamed" - dropping these columns
    columns_to_drop = [x for x in _df.columns.to_list() if x.startswith("Unnamed")]
    print("dropping coulmns: ", columns_to_drop) # [Unamed..., Unamed..]
    return _df.drop(columns=columns_to_drop, axis=1, inplace=False)

In [3]:

df_variables_selection = pd.read_csv('variables_selection_final.csv')
df_variables_selection = drop_additional_index_columns(df_variables_selection)
display(df_variables_selection.head(5))
x_selected_columns = df_variables_selection['var'].to_list()
y_column = 'booked_up_target'
print("x_selected_columns: ")
print(x_selected_columns)


dropping coulmns:  ['Unnamed: 0']


Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count,total_count
0,kmean_cluster_availability,1,1,2,1,1,0,1,1,4,6
1,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4,6
2,room_type_Entire home/apt_require_guest_phone_...,1,1,2,0,0,0,1,1,2,4
3,host_response_time_missing_host_response_time_...,1,1,2,0,1,0,0,1,2,4
4,host_response_time_within an hour_require_gues...,1,1,2,0,0,0,1,1,2,4


x_selected_columns: 
['kmean_cluster_availability', 'target_num_of_day_in_period_minus_num_of_day_in_previous_period', 'room_type_Entire home/apt_require_guest_phone_verification_t', 'host_response_time_missing_host_response_time_bed_type_Real Bed', 'host_response_time_within an hour_require_guest_phone_verification_f', 'host_response_time_within an hour_require_guest_phone_verification_t', 'host_response_time_missing_host_response_time_require_guest_phone_verification_f', 'host_is_superhost_t_require_guest_phone_verification_t', 'require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment', 'children���s_dinnerware', 'long_term_stays_allowed', 'property_type_Apartment', 'host_response_time_missing_host_response_time', 'host_response_rate_cat_host_response_rate_missing', 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_0%_to_25%', 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_50%_to_75%', 'avg

In [4]:
# reading the flat table after feature enrichement 
df = pd.read_csv("flat_file_after_feature_enrichment.csv")
df = drop_additional_index_columns(df)
df.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


dropping coulmns:  ['Unnamed: 0']


Unnamed: 0,listing_id,name,target_start_date_period,target_end_date_period,start_date_previous_period,end_date_previous_period,host_id,host_name,neighbourhood,latitude,...,host_total_listings_count_cat_host_total_listings_count_0%_to_25%,host_total_listings_count_cat_host_total_listings_count_50%_to_75%,host_total_listings_count_cat_host_total_listings_count_75%_to_100%,host_total_listings_count_cat_host_total_listings_count_missing,bedrooms_cat_bedrooms_0%_to_25%,bedrooms_cat_bedrooms_75%_to_100%,bedrooms_cat_bedrooms_missing,sqrt_bedrooms_cat_sqrt_bedrooms_0%_to_25%,sqrt_bedrooms_cat_sqrt_bedrooms_75%_to_100%,sqrt_bedrooms_cat_sqrt_bedrooms_missing
0,7071,BrightRoom with sunny greenview!,2019-06-01,2019-08-31,2018-11-07,2019-05-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
1,7071,BrightRoom with sunny greenview!,2019-07-01,2019-09-30,2018-11-07,2019-06-30,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
2,7071,BrightRoom with sunny greenview!,2019-08-01,2019-11-06,2018-11-07,2019-07-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0


In [5]:
# Getting only columns that have been selected in '5 - Feature Selection'
df = df[x_selected_columns + [y_column]]

In [6]:
df_train_dev, df_test = train_test(data=df, prop=0.8, seed=9, tableone=True)

Begining analysis...
Factorizing... please wait
[******************************************************
[]
*******************************************************
*******************************************************
[]
------ Finished in 63.22610020637512econds -----
 
You got a perfectly balanced training and test datasets
 
                                             Variables    Categories  \
0                                          Individuals             n   
1                           kmean_cluster_availability             1   
2                           kmean_cluster_availability             2   
3                           kmean_cluster_availability             0   
4    target_num_of_day_in_period_minus_num_of_day_i...     Mean (SD)   
..                                                 ...           ...   
143      mean_sunshine_hours_in_month_in_target_period  Median (IQR)   
145                               distance_from_center     Mean (SD)   
146                  

In [7]:
# verify the df_train_dev is shuffeled
df_train_dev[0:10]

Unnamed: 0,kmean_cluster_availability,target_num_of_day_in_period_minus_num_of_day_in_previous_period,room_type_Entire home/apt_require_guest_phone_verification_t,host_response_time_missing_host_response_time_bed_type_Real Bed,host_response_time_within an hour_require_guest_phone_verification_f,host_response_time_within an hour_require_guest_phone_verification_t,host_response_time_missing_host_response_time_require_guest_phone_verification_f,host_is_superhost_t_require_guest_phone_verification_t,require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment,children���s_dinnerware,...,mean_precipitation_millimeters_in_target_period,mean_precipitation_days_in_previous_period,mean_precipitation_days_in_target_period,mean_sunshine_hours_in_day_in_previous_period,mean_sunshine_hours_in_day_in_target_period,mean_sunshine_hours_in_month_in_previous_period,mean_sunshine_hours_in_month_in_target_period,distance_from_center,booked_up_target,split
136492,1,-169,0,0,0,0,1,0,1,0,...,46.666667,9.222222,8.0,4.388889,5.333333,133.888889,165.0,3.307769,1,train
148535,1,-54,0,0,0,0,0,0,0,0,...,53.333333,9.6,9.0,2.2,7.0,67.0,210.0,3.857295,1,train
23227,1,-22,0,1,0,0,1,0,0,0,...,43.333333,9.75,8.666667,1.875,5.666667,55.0,175.0,4.385238,1,train
123479,0,-169,0,1,0,0,1,0,0,0,...,46.666667,9.222222,8.0,4.388889,5.333333,133.888889,165.0,3.308469,1,train
107254,0,-144,0,0,0,0,0,0,0,0,...,53.333333,9.375,8.0,4.0,6.5,120.625,205.0,6.017371,1,train
36734,2,-144,0,0,1,0,0,0,1,0,...,53.333333,9.375,8.0,4.0,6.5,120.625,205.0,2.710462,0,train
89511,2,-54,0,0,0,0,0,0,0,0,...,53.333333,9.6,9.0,2.2,7.0,67.0,210.0,1.762141,0,train
30684,1,3,0,0,0,0,0,0,1,0,...,36.666667,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,1.544672,1,train
59388,0,3,0,0,1,0,0,0,1,0,...,36.666667,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,3.552645,0,train
88204,1,-114,0,0,0,0,0,0,0,0,...,61.666667,9.285714,8.666667,3.5,7.333333,106.428571,226.666667,3.164938,1,train


In [8]:
df_train, df_dev = train_test(data=df_train_dev, prop=0.8, seed=9, tableone=True)

Begining analysis...
Factorizing... please wait
[******************************************************
[]
*******************************************************
*******************************************************
[]
------ Finished in 52.98942542076111econds -----
 
You got a perfectly balanced training and test datasets
 
                                             Variables    Categories  \
0                                          Individuals             n   
1                           kmean_cluster_availability             1   
2                           kmean_cluster_availability             2   
3                           kmean_cluster_availability             0   
4    target_num_of_day_in_period_minus_num_of_day_i...     Mean (SD)   
..                                                 ...           ...   
143      mean_sunshine_hours_in_month_in_target_period  Median (IQR)   
145                               distance_from_center     Mean (SD)   
146                  

In [9]:
print(f"df_train shape: {df_train.shape}.  {round(df_train.shape[0]*100/df.shape[0],2)}% of records"  )
print(f"df_dev  shape: {df_dev .shape}.  {round(df_dev .shape[0]*100/df.shape[0],2)}% of records"  )
print(f"df_test shape: {df_test.shape}.  {round(df_test.shape[0]*100/df.shape[0],2)}% of records"  )

df_train shape: (101032, 55).  64.0% of records
df_dev  shape: (25259, 55).  16.0% of records
df_test shape: (31573, 55).  20.0% of records


# 2 . Model Selection

In [10]:
def classificationMetrics(y, yhat):
    
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    res = {'Accuracy': metrics.accuracy_score(y,yhat),
           'Precision':prf1[0][1],
           'Recall': prf1[1][1],
           'f1-score': prf1[2][1],
           'Log-loss': metrics.log_loss(y,yhat),
           'AUC': metrics.roc_auc_score(y,yhat)
          }
    return res

def get_auc(y, yhat):
    """
    """
    prf1 = metrics.precision_recall_fscore_support(y,yhat)
    return metrics.roc_auc_score(y,yhat)
   

In [11]:
X_train = df_train[x_selected_columns]
y_train = df_train[y_column]
X_dev = df_dev[x_selected_columns]
y_dev = df_dev[y_column]
X_test = df_test[x_selected_columns]
y_test = df_test[y_column]

In [12]:
mode_predict_list = []

In [13]:
str(RidgeClassifier().get_params())

"{'alpha': 1.0, 'class_weight': None, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': False, 'random_state': None, 'solver': 'auto', 'tol': 0.001}"

In [23]:

def model_classifier_auc(_X_train, _y_train, _X_dev, _y_dev, _model_object):
    _model_object.fit(_X_train, _y_train)
    y_hat = _model_object.predict(_X_dev)
    auc_res = metrics.roc_auc_score(_y_dev, y_hat)
    return {"model": type(_model_object).__name__, "model_params": str(_model_object.get_params()), "auc": auc_res}

In [26]:
from datetime import datetime

models_objects_to_test = [RidgeClassifier(), LogisticRegression(), DecisionTreeClassifier(), RandomForestClassifier(),
                          AdaBoostClassifier(), GradientBoostingClassifier(),  KNeighborsClassifier(), XGBClassifier()]

models_predict_dev_list = []

for model_obj in models_objects_to_test:
    print(datetime.now(), "start model_classifier_auc of ", type(model_obj).__name__)
    model_predict_dev_dict = model_classifier_auc(X_train, y_train, X_dev, y_dev, model_obj)
    models_predict_dev_list.append(model_predict_dev_dict)

    print(datetime.now(), "end model_classifier_auc of ", type(model_obj).__name__)

models_predict_dev_list

2021-09-23 21:13:29.610091 start model_classifier_auc of  RidgeClassifier
2021-09-23 21:13:29.752004 end model_classifier_auc of  RidgeClassifier
2021-09-23 21:13:29.753003 start model_classifier_auc of  LogisticRegression




2021-09-23 21:13:41.002087 end model_classifier_auc of  LogisticRegression
2021-09-23 21:13:41.003088 start model_classifier_auc of  DecisionTreeClassifier
2021-09-23 21:13:41.742633 end model_classifier_auc of  DecisionTreeClassifier
2021-09-23 21:13:41.743633 start model_classifier_auc of  RandomForestClassifier




2021-09-23 21:13:42.867941 end model_classifier_auc of  RandomForestClassifier
2021-09-23 21:13:42.868940 start model_classifier_auc of  AdaBoostClassifier
2021-09-23 21:13:52.008323 end model_classifier_auc of  AdaBoostClassifier
2021-09-23 21:13:52.009322 start model_classifier_auc of  GradientBoostingClassifier
2021-09-23 21:14:14.142714 end model_classifier_auc of  GradientBoostingClassifier
2021-09-23 21:14:14.142714 start model_classifier_auc of  KNeighborsClassifier
2021-09-23 21:14:29.837066 end model_classifier_auc of  KNeighborsClassifier
2021-09-23 21:14:29.837066 start model_classifier_auc of  XGBClassifier
2021-09-23 21:14:54.219078 end model_classifier_auc of  XGBClassifier


[{'model': 'RidgeClassifier',
  'model_params': "{'alpha': 1.0, 'class_weight': None, 'copy_X': True, 'fit_intercept': True, 'max_iter': None, 'normalize': False, 'random_state': None, 'solver': 'auto', 'tol': 0.001}",
  'auc': 0.9207474695281355},
 {'model': 'LogisticRegression',
  'model_params': "{'C': 1.0, 'class_weight': None, 'dual': False, 'fit_intercept': True, 'intercept_scaling': 1, 'l1_ratio': None, 'max_iter': 100, 'multi_class': 'warn', 'n_jobs': None, 'penalty': 'l2', 'random_state': None, 'solver': 'warn', 'tol': 0.0001, 'verbose': 0, 'warm_start': False}",
  'auc': 0.9390947691327365},
 {'model': 'DecisionTreeClassifier',
  'model_params': "{'class_weight': None, 'criterion': 'gini', 'max_depth': None, 'max_features': None, 'max_leaf_nodes': None, 'min_impurity_decrease': 0.0, 'min_impurity_split': None, 'min_samples_leaf': 1, 'min_samples_split': 2, 'min_weight_fraction_leaf': 0.0, 'presort': False, 'random_state': None, 'splitter': 'best'}",
  'auc': 0.984037705868897

In [27]:
df_models_predict = pd.DataFrame(models_predict_dev_list)
df_models_predict

Unnamed: 0,model,model_params,auc
0,RidgeClassifier,"{'alpha': 1.0, 'class_weight': None, 'copy_X':...",0.920747
1,LogisticRegression,"{'C': 1.0, 'class_weight': None, 'dual': False...",0.939095
2,DecisionTreeClassifier,"{'class_weight': None, 'criterion': 'gini', 'm...",0.984038
3,RandomForestClassifier,"{'bootstrap': True, 'class_weight': None, 'cri...",0.985878
4,AdaBoostClassifier,"{'algorithm': 'SAMME.R', 'base_estimator': Non...",0.935821
5,GradientBoostingClassifier,"{'criterion': 'friedman_mse', 'init': None, 'l...",0.976543
6,KNeighborsClassifier,"{'algorithm': 'auto', 'leaf_size': 30, 'metric...",0.96477
7,XGBClassifier,"{'base_score': 0.5, 'booster': 'gbtree', 'cols...",0.972697


In [18]:
model_classifier_auc(X_train, y_train, X_test, y_test, AdaBoostClassifier())

{'model': 'AdaBoostClassifier',
 'model_params': "{'algorithm': 'SAMME.R', 'base_estimator': None, 'learning_rate': 1.0, 'n_estimators': 50, 'random_state': None}",
 'auc': 0.9388622788882371}