# 6 - Train-Dev-Test preparation


In [1]:
# Import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy import stats
from pyMechkar.analysis import Table1, train_test
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SelectFromModel

  import pandas.util.testing as tm


In [11]:
def drop_additional_index_columns(_df):
    """Return a copy of ``_df`` without columns whose name starts with "Unnamed".

    Parameters
    ----------
    _df : pandas.DataFrame
        Frame to clean. It is not modified.

    Returns
    -------
    pandas.DataFrame
        A new frame with every column whose label is a string beginning with
        "Unnamed" removed. All other columns are kept in their original order.

    Notes
    -----
    Non-string column labels (e.g. integers or tuples) are never dropped;
    they are skipped instead of raising ``AttributeError`` on ``startswith``.
    The list of dropped columns is printed for traceability.
    """
    columns_to_drop = [
        label
        for label in _df.columns
        if isinstance(label, str) and label.startswith("Unnamed")
    ]
    # Message text is kept as-is so recorded notebook outputs stay comparable.
    print("dropping coulmns: ", columns_to_drop)  # e.g. ['Unnamed: 0']
    # DataFrame.drop returns a new frame by default; the caller's frame is untouched.
    return _df.drop(columns=columns_to_drop)

In [12]:

# Load the variable-selection summary and remove any column whose name starts with "Unnamed".
df_variables_selection = (
    pd.read_csv('variables_selection_final.csv')
    .pipe(drop_additional_index_columns)
)
display(df_variables_selection.head())

# Feature names come from the 'var' column; the target column is fixed.
y_column = 'booked_up_target'
x_selected_columns = df_variables_selection.loc[:, 'var'].tolist()
print("x_selected_columns: ", x_selected_columns, sep="\n")


dropping coulmns:  ['Unnamed: 0']


Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count,total_count
0,kmean_cluster_availability,1,1,2,1,1,0,1,1,4,6
1,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4,6
2,room_type_Entire home/apt_require_guest_phone_...,1,1,2,0,0,0,1,1,2,4
3,host_response_time_missing_host_response_time_...,1,1,2,0,1,0,0,1,2,4
4,host_response_time_within an hour_require_gues...,1,1,2,0,0,0,1,1,2,4


x_selected_columns: 
['kmean_cluster_availability', 'target_num_of_day_in_period_minus_num_of_day_in_previous_period', 'room_type_Entire home/apt_require_guest_phone_verification_t', 'host_response_time_missing_host_response_time_bed_type_Real Bed', 'host_response_time_within an hour_require_guest_phone_verification_f', 'host_response_time_within an hour_require_guest_phone_verification_t', 'host_response_time_missing_host_response_time_require_guest_phone_verification_f', 'host_is_superhost_t_require_guest_phone_verification_t', 'require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment', 'children���s_dinnerware', 'long_term_stays_allowed', 'property_type_Apartment', 'host_response_time_missing_host_response_time', 'host_response_rate_cat_host_response_rate_missing', 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_0%_to_25%', 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_50%_to_75%', 'avg

In [13]:
# Read the flat table produced after feature enrichment and remove any
# column whose name starts with "Unnamed".
# NOTE(review): this cell emitted a warning when it was run (its text is
# truncated in the saved output) - possibly a mixed-dtype warning from
# read_csv; confirm and consider passing explicit dtypes.
df = (
    pd.read_csv("flat_file_after_feature_enrichment.csv")
    .pipe(drop_additional_index_columns)
)
df.head(3)

  interactivity=interactivity, compiler=compiler, result=result)


dropping coulmns:  ['Unnamed: 0']


Unnamed: 0,listing_id,name,target_start_date_period,target_end_date_period,start_date_previous_period,end_date_previous_period,host_id,host_name,neighbourhood,latitude,...,host_total_listings_count_cat_host_total_listings_count_0%_to_25%,host_total_listings_count_cat_host_total_listings_count_50%_to_75%,host_total_listings_count_cat_host_total_listings_count_75%_to_100%,host_total_listings_count_cat_host_total_listings_count_missing,bedrooms_cat_bedrooms_0%_to_25%,bedrooms_cat_bedrooms_75%_to_100%,bedrooms_cat_bedrooms_missing,sqrt_bedrooms_cat_sqrt_bedrooms_0%_to_25%,sqrt_bedrooms_cat_sqrt_bedrooms_75%_to_100%,sqrt_bedrooms_cat_sqrt_bedrooms_missing
0,7071,BrightRoom with sunny greenview!,2019-06-01,2019-08-31,2018-11-07,2019-05-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
1,7071,BrightRoom with sunny greenview!,2019-07-01,2019-09-30,2018-11-07,2019-06-30,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
2,7071,BrightRoom with sunny greenview!,2019-08-01,2019-11-06,2018-11-07,2019-07-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0


In [22]:
# Keep only the features chosen in '5 - Feature Selection', followed by the target column.
df = df.loc[:, [*x_selected_columns, y_column]]

In [30]:
# First split: 80% of the rows go to train+dev, the remaining 20% to test.
# A fixed seed keeps the partition reproducible; tableone=True also prints a
# comparison table of the two partitions.
# NOTE(review): the enriched source table showed several rows for the same
# listing_id; if train_test splits row-wise, one listing can end up in both
# partitions - confirm this is acceptable for the evaluation.
df_train_dev, df_test = train_test(
    data=df,
    prop=0.8,
    seed=9,
    tableone=True,
)

Begining analysis...
Factorizing... please wait
[******************************************************
[]
*******************************************************
*******************************************************
[]
------ Finished in 60.05807828903198econds -----
 
You got a perfectly balanced training and test datasets
 
                                             Variables    Categories  \
0                                          Individuals             n   
1                           kmean_cluster_availability             1   
2                           kmean_cluster_availability             2   
3                           kmean_cluster_availability             0   
4    target_num_of_day_in_period_minus_num_of_day_i...     Mean (SD)   
..                                                 ...           ...   
143      mean_sunshine_hours_in_month_in_target_period  Median (IQR)   
145                               distance_from_center     Mean (SD)   
146                  

In [31]:
# Verify that df_train_dev is shuffled: the index of the first rows should not be sequential.
df_train_dev.head(10)

Unnamed: 0,kmean_cluster_availability,target_num_of_day_in_period_minus_num_of_day_in_previous_period,room_type_Entire home/apt_require_guest_phone_verification_t,host_response_time_missing_host_response_time_bed_type_Real Bed,host_response_time_within an hour_require_guest_phone_verification_f,host_response_time_within an hour_require_guest_phone_verification_t,host_response_time_missing_host_response_time_require_guest_phone_verification_f,host_is_superhost_t_require_guest_phone_verification_t,require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment,children���s_dinnerware,...,mean_precipitation_millimeters_in_target_period,mean_precipitation_days_in_previous_period,mean_precipitation_days_in_target_period,mean_sunshine_hours_in_day_in_previous_period,mean_sunshine_hours_in_day_in_target_period,mean_sunshine_hours_in_month_in_previous_period,mean_sunshine_hours_in_month_in_target_period,distance_from_center,booked_up_target,split
136492,1,-169,0,0,0,0,1,0,1,0,...,46.666667,9.222222,8.0,4.388889,5.333333,133.888889,165.0,3.307769,1,train
148535,1,-54,0,0,0,0,0,0,0,0,...,53.333333,9.6,9.0,2.2,7.0,67.0,210.0,3.857295,1,train
23227,1,-22,0,1,0,0,1,0,0,0,...,43.333333,9.75,8.666667,1.875,5.666667,55.0,175.0,4.385238,1,train
123479,0,-169,0,1,0,0,1,0,0,0,...,46.666667,9.222222,8.0,4.388889,5.333333,133.888889,165.0,3.308469,1,train
107254,0,-144,0,0,0,0,0,0,0,0,...,53.333333,9.375,8.0,4.0,6.5,120.625,205.0,6.017371,1,train
36734,2,-144,0,0,1,0,0,0,1,0,...,53.333333,9.375,8.0,4.0,6.5,120.625,205.0,2.710462,0,train
89511,2,-54,0,0,0,0,0,0,0,0,...,53.333333,9.6,9.0,2.2,7.0,67.0,210.0,1.762141,0,train
30684,1,3,0,0,0,0,0,0,1,0,...,36.666667,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,1.544672,1,train
59388,0,3,0,0,1,0,0,0,1,0,...,36.666667,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,3.552645,0,train
88204,1,-114,0,0,0,0,0,0,0,0,...,61.666667,9.285714,8.666667,3.5,7.333333,106.428571,226.666667,3.164938,1,train


In [32]:
# Second split: 80% of train+dev becomes train and 20% becomes dev
# (64% / 16% of all records), using the same seed as the first split.
# NOTE(review): df_train_dev already carries the `split` column added by the
# first train_test call - confirm that train_test overwrites it rather than
# treating it as an input variable.
df_train, df_dev = train_test(
    data=df_train_dev,
    prop=0.8,
    seed=9,
    tableone=True,
)

Begining analysis...
Factorizing... please wait
[******************************************************
[]
*******************************************************
*******************************************************
[]
------ Finished in 51.27347946166992econds -----
 
You got a perfectly balanced training and test datasets
 
                                             Variables    Categories  \
0                                          Individuals             n   
1                           kmean_cluster_availability             1   
2                           kmean_cluster_availability             2   
3                           kmean_cluster_availability             0   
4    target_num_of_day_in_period_minus_num_of_day_i...     Mean (SD)   
..                                                 ...           ...   
143      mean_sunshine_hours_in_month_in_target_period  Median (IQR)   
145                               distance_from_center     Mean (SD)   
146                  

In [44]:
# Report each partition's shape and its share of the full selected-feature table.
total_rows = len(df)
print(f"df_train shape: {df_train.shape}.  {round(len(df_train)*100/total_rows,2)}% of records")
print(f"df_dev  shape: {df_dev.shape}.  {round(len(df_dev)*100/total_rows,2)}% of records")
print(f"df_test shape: {df_test.shape}.  {round(len(df_test)*100/total_rows,2)}% of records")

df_train shape: (101032, 55).  64.0% of records
df_dev  shape: (25259, 55).  16.0% of records
df_test shape: (31573, 55).  20.0% of records


101032