#  5 - Feature Selection
Selection based on voting: using many of the techniques (univariate and multivariate), we make a table with all the variables on the dataset and indicate the recommended variables for each technique, then we select a threshold for the total votings and on this basis we select the variables that will be used to train our models.

In [12]:
# Import libraries:
import pandas as pd
import numpy as np
import seaborn as sns
from sklearn.cluster import KMeans
import matplotlib.pyplot as plt
from scipy import stats
from pyMechkar.analysis import Table1
from sklearn.linear_model import Lasso
from sklearn.svm import LinearSVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.linear_model import RidgeClassifier
from sklearn.feature_selection import SelectFromModel


In [11]:
df = pd.read_csv("flat_file_after_feature_enrichment.csv")

def drop_additional_index_columns(_df):
    # dropping additional index columns that start with "Unnamed" - dropping these columns
    columns_to_drop = [x for x in _df.columns.to_list() if x.startswith("Unnamed")]
    print("dropping coulmns: ", columns_to_drop) # [Unamed..., Unamed..]
    return _df.drop(columns=columns_to_drop, axis=1, inplace=False)

df = drop_additional_index_columns(df)

  interactivity=interactivity, compiler=compiler, result=result)


dropping coulmns:  ['Unnamed: 0']


In [13]:
df.head(5)

Unnamed: 0,listing_id,name,target_start_date_period,target_end_date_period,start_date_previous_period,end_date_previous_period,host_id,host_name,neighbourhood,latitude,...,host_total_listings_count_cat_host_total_listings_count_0%_to_25%,host_total_listings_count_cat_host_total_listings_count_50%_to_75%,host_total_listings_count_cat_host_total_listings_count_75%_to_100%,host_total_listings_count_cat_host_total_listings_count_missing,bedrooms_cat_bedrooms_0%_to_25%,bedrooms_cat_bedrooms_75%_to_100%,bedrooms_cat_bedrooms_missing,sqrt_bedrooms_cat_sqrt_bedrooms_0%_to_25%,sqrt_bedrooms_cat_sqrt_bedrooms_75%_to_100%,sqrt_bedrooms_cat_sqrt_bedrooms_missing
0,7071,BrightRoom with sunny greenview!,2019-06-01,2019-08-31,2018-11-07,2019-05-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
1,7071,BrightRoom with sunny greenview!,2019-07-01,2019-09-30,2018-11-07,2019-06-30,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
2,7071,BrightRoom with sunny greenview!,2019-08-01,2019-11-06,2018-11-07,2019-07-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
3,7071,BrightRoom with sunny greenview!,2019-02-01,2019-04-30,2018-11-07,2019-01-31,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0
4,7071,BrightRoom with sunny greenview!,2019-03-01,2019-05-31,2018-11-07,2019-02-28,17391,Bright,Helmholtzplatz,52.543157,...,1,0,0,0,1,0,0,1,0,0


In [14]:
# Arrange the columns by type

def get_dict_of_df_types(pdf: pd.DataFrame) -> dict:
    """
    Arrange dataframe columns in the dictionary by types:
    For each type having a list of columns
    """
    d_of_columns_types = {}

    for c,t in zip(pdf.columns, pdf.dtypes):
        t_str = str(t)
        if d_of_columns_types.get(t_str) == None:
            d_of_columns_types[t_str] = [c]
        else:
            d_of_columns_types[t_str].append(c)
    return d_of_columns_types



In [15]:
d_of_columns_types = get_dict_of_df_types(df)
d_of_columns_types.keys()

dict_keys(['int64', 'object', 'float64'])

In [16]:
print("object columns:")
d_of_columns_types['object']

object columns:


['name',
 'target_start_date_period',
 'target_end_date_period',
 'start_date_previous_period',
 'end_date_previous_period',
 'host_name',
 'neighbourhood',
 'last_review',
 'listing_url',
 'last_scraped',
 'summary',
 'space',
 'description',
 'experiences_offered',
 'notes',
 'transit',
 'access',
 'interaction',
 'house_rules',
 'neighborhood_overview',
 'host_about',
 'host_since',
 'picture_url',
 'host_url',
 'host_location',
 'host_thumbnail_url',
 'host_picture_url',
 'host_neighbourhood',
 'host_verifications',
 'street',
 'neighbourhood_cleansed',
 'neighbourhood_group_cleansed',
 'city',
 'state',
 'smart_location',
 'market',
 'country_code',
 'country',
 'is_location_exact',
 'amenities',
 'calendar_updated',
 'first_review',
 'calendar_last_scraped',
 'license',
 'concat_comments',
 'amenities_list',
 'host_verifications_list']

In [17]:
print("int64 columns:")
d_of_columns_types['int64']

int64 columns:


['listing_id',
 'host_id',
 'target_num_of_booked_days',
 'booked_up_target',
 'num_of_day_in_previous_period',
 'num_of_booked_days_in_previous_period',
 'kmean_cluster_availability',
 'target_num_of_day_in_period_minus_num_of_day_in_previous_period',
 'room_type_Private room_host_response_time_within an hour',
 'room_type_Private room_host_response_time_within a few hours',
 'room_type_Private room_host_response_time_missing_host_response_time',
 'room_type_Private room_host_response_time_within a day',
 'room_type_Private room_host_response_time_a few days or more',
 'room_type_Entire home/apt_host_response_time_within an hour',
 'room_type_Entire home/apt_host_response_time_within a few hours',
 'room_type_Entire home/apt_host_response_time_missing_host_response_time',
 'room_type_Entire home/apt_host_response_time_within a day',
 'room_type_Entire home/apt_host_response_time_a few days or more',
 'room_type_Shared room_host_response_time_within an hour',
 'room_type_Shared room_ho

In [9]:
print("float64 columns:")
d_of_columns_types['float64']

float64 columns:


['latitude',
 'longitude',
 'scrape_id',
 'zipcode',
 'price',
 'minimum_nights',
 'number_of_reviews',
 'availability_365',
 'accommodates',
 'security_deposit',
 'cleaning_fee',
 'extra_people',
 'maximum_nights',
 'sqrt_calculated_host_listings_count',
 'sigmoid_calculated_host_listings_count',
 'sqrt_guests_included',
 'availability_60_minus_availability_90',
 'availability_60_div_availability_90',
 'availability_30_minus_availability_60',
 'availability_30_div_availability_60',
 'availability_365_minus_availability_90',
 'availability_365_div_availability_90',
 'availability_30_minus_availability_90',
 'availability_30_div_availability_90',
 'availability_365_minus_availability_60',
 'availability_365_div_availability_60',
 'target_num_of_day_in_period_div_num_of_day_in_previous_period',
 'occupancy_last_period_minus_availability_60',
 'occupancy_last_period_div_availability_60',
 'occupancy_last_period_minus_availability_365',
 'occupancy_last_period_div_availability_365',
 'occu

In [18]:
target_colums =  ['num_of_booked_days_in_previous_period', 'num_of_day_in_previous_period', 
                  'target_num_of_booked_days', 'target_num_of_day_in_period', 'booked_up_target']
columns_to_remove = target_colums + ['zipcode', 'availability_60_div_availability_90',
                                     'availability_30_div_availability_60','availability_365_div_availability_90',
                                     'availability_30_div_availability_90','availability_365_div_availability_60',
                                     'occupancy_last_period_div_availability_60',
                                     'occupancy_last_period_div_availability_365',
                                     'occupancy_last_period_div_availability_90']

x_columns = d_of_columns_types['int64'] + d_of_columns_types['float64'] 

x_columns = [col for col in x_columns if col not in columns_to_remove]
y_column = 'booked_up_target'

In [19]:
df[x_columns].head(4)

Unnamed: 0,listing_id,host_id,kmean_cluster_availability,target_num_of_day_in_period_minus_num_of_day_in_previous_period,room_type_Private room_host_response_time_within an hour,room_type_Private room_host_response_time_within a few hours,room_type_Private room_host_response_time_missing_host_response_time,room_type_Private room_host_response_time_within a day,room_type_Private room_host_response_time_a few days or more,room_type_Entire home/apt_host_response_time_within an hour,...,mean_of_max_temperatures_in_target_period,mean_precipitation_millimeters_in_previous_period,mean_precipitation_millimeters_in_target_period,mean_precipitation_days_in_previous_period,mean_precipitation_days_in_target_period,mean_sunshine_hours_in_day_in_previous_period,mean_sunshine_hours_in_day_in_target_period,mean_sunshine_hours_in_month_in_previous_period,mean_sunshine_hours_in_month_in_target_period,distance_from_center
0,7071,17391,1,-114,1,0,0,0,0,0,...,23.333333,43.571429,61.666667,9.285714,8.666667,3.5,7.333333,106.428571,226.666667,4.551287
1,7071,17391,1,-144,1,0,0,0,0,0,...,22.333333,46.875,53.333333,9.375,8.0,4.0,6.5,120.625,205.0,4.551287
2,7071,17391,1,-169,1,0,0,0,0,0,...,18.666667,47.777778,46.666667,9.222222,8.0,4.388889,5.333333,133.888889,165.0,4.551287
3,7071,17391,1,3,1,0,0,0,0,0,...,8.666667,46.666667,36.666667,10.333333,8.333333,1.666667,4.0,48.333333,123.333333,4.551287


## Univariable Analysis

Each variable on the dataset is analyzed by comparing its relationship with the dependent variable.

### 1. Univariable Analysis with spearman correlation

In [20]:
# Checking for correlation between each of the variables and the target (using spearmanr correlation).

def get_spearmanr_of_feature_with_target(_df, _x_columns, _y_column):
    
    rows_list = []
    for x_col in _x_columns:
        # Spearman's rank correlation coefficient test, which is a nonparametric measure of rank correlation
        pearmanr_res  = stats.spearmanr(_df[x_col], _df[_y_column]) # (correlation, pvalue)
        row_dict = {"var": x_col, 
                    "spearmanr_correlation_with_target": pearmanr_res[0],
                    "spearmanr_pvalue_with_target": pearmanr_res[1] }   
        rows_list.append(row_dict)
    
    return pd.DataFrame(rows_list)

In [21]:
df_variables_selection = get_spearmanr_of_feature_with_target(df,x_columns, y_column)
df_variables_selection['spearmanr'] = np.where(df_variables_selection['spearmanr_pvalue_with_target'] < 0.05, 1, 0)
df_variables_selection = df_variables_selection[["var","spearmanr"]]

  c /= stddev[:, None]
  c /= stddev[None, :]
  return (a < x) & (x < b)
  return (a < x) & (x < b)
  cond2 = cond0 & (x <= _a)


### 2. Univariable Analysis with pyMechkar

In [41]:
from datetime import datetime
print(datetime.now(), "start pyMechkar")

tab1 = Table1(data=df[x_columns+[y_column]], y=y_column)
print(datetime.now(), "end pyMechkar")
## Univariable Analysis with pyMechkar.analysis

2021-09-22 08:59:11.181302 start pyMechkar
Begining analysis...
Factorizing... please wait
*************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************************
[]
[***************************************************************************************************

In [45]:
tab1

Unnamed: 0,Variables,Categories,Population,Category_1,Category_0,p_value
0,Individuals,n,157864,113873,43991,
1,listing_id,Mean (SD),"15,715,604.80 (8,551,879.00)","16,068,619.00 (8,126,508.20)","14,801,809.10 (9,504,815.10)",0.0
2,listing_id,Median (IQR),"16,866,381.00 (8,065,953.50-22,583,930.80)","17,416,440.00 (8,767,787.00-22,384,985.00)","15,428,418.00 (5,637,895.00-23,157,940.00)",
4,host_id,Mean (SD),"54,033,548.00 (58,161,613.10)","54,842,706.90 (56,666,286.30)","51,938,998.10 (61,815,526.00)",0.0
5,host_id,Median (IQR),"31,267,110.00 (9,240,002.20-80,675,177.00)","33,836,949.00 (11,180,297.00-81,243,480.00)","23,794,332.00 (5,526,024.00-79,255,369.00)",
...,...,...,...,...,...,...
1647,mean_sunshine_hours_in_month_in_previous_period,Median (IQR),85.80 ( 55.00- 120.60),85.80 ( 55.00- 120.60),85.80 ( 55.00- 106.40),
1649,mean_sunshine_hours_in_month_in_target_period,Mean (SD),190.70 ( 35.60),192.40 ( 34.30),186.30 ( 38.30),0.0
1650,mean_sunshine_hours_in_month_in_target_period,Median (IQR),205.00 ( 165.00- 226.70),205.00 ( 165.00- 226.70),205.00 ( 165.00- 226.70),
1652,distance_from_center,Mean (SD),4.50 ( 2.70),4.40 ( 2.40),5.00 ( 3.30),0.0


In [43]:
tab1.to_csv("feature_selection_variables_selection_using_py_mechkar.csv")

In [30]:
tab1 = pd.read_csv("feature_selection_variables_selection_using_py_mechkar.csv")
tab1[0:20]

Unnamed: 0.1,Unnamed: 0,Variables,Categories,Population,Category_1,Category_0,p_value
0,0,Individuals,n,157864,113873,43991,
1,1,listing_id,Mean (SD),"15,715,604.80 (8,551,879.00)","16,068,619.00 (8,126,508.20)","14,801,809.10 (9,504,815.10)",0.0
2,2,listing_id,Median (IQR),"16,866,381.00 (8,065,953.50-22,583,930.80)","17,416,440.00 (8,767,787.00-22,384,985.00)","15,428,418.00 (5,637,895.00-23,157,940.00)",
3,4,host_id,Mean (SD),"54,033,548.00 (58,161,613.10)","54,842,706.90 (56,666,286.30)","51,938,998.10 (61,815,526.00)",0.0
4,5,host_id,Median (IQR),"31,267,110.00 (9,240,002.20-80,675,177.00)","33,836,949.00 (11,180,297.00-81,243,480.00)","23,794,332.00 (5,526,024.00-79,255,369.00)",
5,7,kmean_cluster_availability,1,"103,012.00 ( 65.30%)","99,754.00 ( 87.60%)","3,258.00 ( 7.40%)",
6,8,kmean_cluster_availability,2,"29,442.00 ( 18.70%)",77.00 ( 0.10%),"29,365.00 ( 66.80%)",0.0
7,9,kmean_cluster_availability,0,"25,410.00 ( 16.10%)","14,042.00 ( 12.30%)","11,368.00 ( 25.80%)",
8,10,target_num_of_day_in_period_minus_num_of_day_i...,Mean (SD),-83.30 ( 58.60),-87.20 ( 57.90),-73.30 ( 59.20),0.0
9,11,target_num_of_day_in_period_minus_num_of_day_i...,Median (IQR),-83.00 ( -144.00- -22.00),-83.00 ( -144.00- -22.00),-83.00 ( -114.00- -22.00),


In [26]:
py_mechkar_selected_varaibles_list = tab1.loc[tab1['p_value']<0.05,'Variables'].unique()
py_mechkar_selected_varaibles_list

array(['listing_id', 'host_id', 'kmean_cluster_availability',
       'target_num_of_day_in_period_minus_num_of_day_in_previous_period',
       'room_type_Private room_host_response_time_within an hour',
       'room_type_Private room_host_response_time_within a few hours',
       'room_type_Private room_host_response_time_missing_host_response_time',
       'room_type_Private room_host_response_time_within a day',
       'room_type_Private room_host_response_time_a few days or more',
       'room_type_Entire home/apt_host_response_time_within an hour',
       'room_type_Entire home/apt_host_response_time_within a few hours',
       'room_type_Entire home/apt_host_response_time_missing_host_response_time',
       'room_type_Entire home/apt_host_response_time_within a day',
       'room_type_Entire home/apt_host_response_time_a few days or more',
       'room_type_Shared room_host_response_time_within an hour',
       'room_type_Shared room_host_response_time_missing_host_response_time',

In [28]:
df_variables_selection['py_mechkar'] = np.where(df_variables_selection['var'].isin(py_mechkar_selected_varaibles_list), 1, 0)

In [65]:
univariable_columns = ['py_mechkar','spearmanr']
df_variables_selection['univariable_analysis_count'] = df_variables_selection[univariable_columns].sum(axis=1)
print("df_variables_selection (only with Univariable Analysis): ")
df_variables_selection

df_variables_selection (only with Univariable Analysis): 


Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count
0,listing_id,1,1,2,0,1,0,0,0,1
1,host_id,1,1,2,0,1,0,0,0,1
2,kmean_cluster_availability,1,1,2,1,1,0,1,1,4
3,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4
4,room_type_Private room_host_response_time_with...,1,1,2,0,0,0,1,0,1
...,...,...,...,...,...,...,...,...,...,...
800,mean_sunshine_hours_in_day_in_previous_period,1,1,2,0,1,1,1,0,3
801,mean_sunshine_hours_in_day_in_target_period,1,1,2,0,1,0,0,1,2
802,mean_sunshine_hours_in_month_in_previous_period,1,1,2,0,1,1,1,1,4
803,mean_sunshine_hours_in_month_in_target_period,1,1,2,1,1,0,1,1,4


In [49]:
print("py_mechkar sum of 'is feature signficant' to target", df_variables_selection['py_mechkar'].sum())
print("py_mechkar sum of 'is feature signficant' to target", df_variables_selection['spearmanr'].sum())
df_variables_selection.groupby(['univariable_analysis_count']).size().to_frame('sum').reset_index()

py_mechkar sum of 'is feature signficant' to target 766
py_mechkar sum of 'is feature signficant' to target 781


Unnamed: 0,univariable_analysis_count,sum
0,0,24
1,1,15
2,2,766


Using univariable analysis:
1. There are 24 vars that don't have correlation at all with the target. 
2. There are 15 vars that have correlation at 1 of the 2 tests.
3. 766 vars out of 805 vars have correlation at both tests. 

## Multivariable Analysis

Using the whole dataset and running predictive models that are able to return a list of recommended features by defining their influence in the model.

In [17]:
X = df[x_columns]
y = df[y_column]

### 1. Multivariable Analysis with Lasso Model


In [18]:
lasso_model = Lasso(alpha=0.01).fit(X, y)
model = SelectFromModel(lasso_model, prefit=True)

df_variables_selection['lasso'] = model.get_support().astype('int64')

  positive)


### 2. Multivariable Analysis with Random Forest Model

In [22]:
rfmod = RandomForestClassifier().fit(X, y)
model = SelectFromModel(rfmod, prefit=True)
df_variables_selection['random_forest'] = model.get_support().astype('int64')

### 3. Multivariable Analysis with Gradient Boosting Model

In [23]:
gbmod = GradientBoostingClassifier().fit(X, y)
model = SelectFromModel(gbmod, prefit=True)
df_variables_selection['gradient_boosting'] = model.get_support().astype('int64')

### 4. Multivariable Analysis with SVM Model

In [26]:
svmmod = LinearSVC(C=0.01, penalty="l1",dual=False).fit(X, y)
model = SelectFromModel(svmmod, prefit=True)
df_variables_selection['linear_svc'] = model.get_support().astype('int64')



### 5. Multivariable Analysis with Ridge Model

In [27]:
ridge_model = RidgeClassifier(alpha=0.01).fit(X, y)
model = SelectFromModel(ridge_model, prefit=True)
df_variables_selection['ridge'] = model.get_support().astype('int64')

  overwrite_a=True).T


In [28]:
df_variables_selection

Unnamed: 0,var,spearmanr,lasso,random_forest,gradient_boosting,linear_svc,ridge
0,listing_id,1,0,1,0,0,0
1,host_id,1,0,1,0,0,0
2,kmean_cluster_availability,1,1,1,0,1,1
3,target_num_of_day_in_period_minus_num_of_day_i...,1,1,1,1,1,0
4,room_type_Private room_host_response_time_with...,1,0,0,0,1,0
...,...,...,...,...,...,...,...
800,mean_sunshine_hours_in_day_in_previous_period,1,0,1,1,1,0
801,mean_sunshine_hours_in_day_in_target_period,1,0,1,0,0,1
802,mean_sunshine_hours_in_month_in_previous_period,1,0,1,1,1,1
803,mean_sunshine_hours_in_month_in_target_period,1,1,1,0,1,1


In [92]:
multivariable_columns = ['lasso','random_forest','gradient_boosting','linear_svc','ridge']

df_variables_selection['multivariable_analysis_count'] = df_variables_selection[multivariable_columns].sum(axis=1)
df_variables_selection.to_csv("feature_selection_variables_selection.csv")

In [93]:
print("df_variables_selection (only with Multivariable Analysis): ")
df_variables_selection[['var'] + multivariable_columns + ['multivariable_analysis_count']]

df_variables_selection (only with Multivariable Analysis): 


Unnamed: 0,var,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count
0,listing_id,0,1,0,0,0,1
1,host_id,0,1,0,0,0,1
2,kmean_cluster_availability,1,1,0,1,1,4
3,target_num_of_day_in_period_minus_num_of_day_i...,1,1,1,1,0,4
4,room_type_Private room_host_response_time_with...,0,0,0,1,0,1
...,...,...,...,...,...,...,...
800,mean_sunshine_hours_in_day_in_previous_period,0,1,1,1,0,3
801,mean_sunshine_hours_in_day_in_target_period,0,1,0,0,1,2
802,mean_sunshine_hours_in_month_in_previous_period,0,1,1,1,1,4
803,mean_sunshine_hours_in_month_in_target_period,1,1,0,1,1,4


In [64]:
print("multivariable analysis summary")
for col in multivariable_columns:
    print(f"{col} sum of 'is feature signficant' to target", df_variables_selection[col].sum())

df_variables_selection.groupby(['multivariable_analysis_count']).size().to_frame('sum').reset_index()

lasso sum of 'is feature signficant' to target 15
random_forest sum of 'is feature signficant' to target 59
gradient_boosting sum of 'is feature signficant' to target 16
linear_svc sum of 'is feature signficant' to target 134
ridge sum of 'is feature signficant' to target 200


Unnamed: 0,multivariable_analysis_count,sum
0,0,482
1,1,270
2,2,22
3,3,17
4,4,11
5,5,3


### Univariable and Multivariable Analysis

In [66]:
df_variables_selection['total_count'] = df_variables_selection[univariable_columns + multivariable_columns].sum(axis=1)
df_variables_selection

Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count,total_count
0,listing_id,1,1,2,0,1,0,0,0,1,3
1,host_id,1,1,2,0,1,0,0,0,1,3
2,kmean_cluster_availability,1,1,2,1,1,0,1,1,4,6
3,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4,6
4,room_type_Private room_host_response_time_with...,1,1,2,0,0,0,1,0,1,3
...,...,...,...,...,...,...,...,...,...,...,...
800,mean_sunshine_hours_in_day_in_previous_period,1,1,2,0,1,1,1,0,3,5
801,mean_sunshine_hours_in_day_in_target_period,1,1,2,0,1,0,0,1,2,4
802,mean_sunshine_hours_in_month_in_previous_period,1,1,2,0,1,1,1,1,4,6
803,mean_sunshine_hours_in_month_in_target_period,1,1,2,1,1,0,1,1,4,6


In [78]:
df_variables_selection_count_summary = df_variables_selection.groupby(['total_count']).size().to_frame('sum').reset_index()
df_variables_selection_count_summary['count_rate'] = df_variables_selection_count_summary['sum']*100.0/(df_variables_selection_count_summary['sum'].sum())
df_variables_selection_count_summary

Unnamed: 0,total_count,sum,count_rate
0,0,10,1.242236
1,1,22,2.732919
2,2,471,58.509317
3,3,249,30.931677
4,4,22,2.732919
5,5,17,2.111801
6,6,11,1.36646
7,7,3,0.372671


In [102]:
# selecting total count of 4 vars (and above)
print("num of selected vars:")
print(df_variables_selection_count_summary[df_variables_selection_count_summary['total_count'] >= 4 ]['sum'].sum(axis=0))
print("\n\nselected vars:")
df_variables_selection_final =  df_variables_selection[df_variables_selection['total_count']>=4]
df_variables_selection_final.to_csv("variables_selection_final.csv")
df_variables_selection_final

num of selected vars:
53


selected vars:


Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count,total_count
2,kmean_cluster_availability,1,1,2,1,1,0,1,1,4,6
3,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4,6
64,room_type_Entire home/apt_require_guest_phone_...,1,1,2,0,0,0,1,1,2,4
125,host_response_time_missing_host_response_time_...,1,1,2,0,1,0,0,1,2,4
156,host_response_time_within an hour_require_gues...,1,1,2,0,0,0,1,1,2,4
157,host_response_time_within an hour_require_gues...,1,1,2,0,0,0,1,1,2,4
160,host_response_time_missing_host_response_time_...,1,1,2,0,1,0,0,1,2,4
227,host_is_superhost_t_require_guest_phone_verifi...,1,1,2,0,0,0,1,1,2,4
431,require_guest_phone_verification_f_concat_comm...,1,1,2,0,0,0,1,1,2,4
567,children���s_dinnerware,1,1,2,0,0,0,1,1,2,4


In [90]:
print("most significat vars:")
df_variables_selection[df_variables_selection['total_count']>=6].sort_values('total_count', ascending=False)

most significat vars:


Unnamed: 0,var,spearmanr,py_mechkar,univariable_analysis_count,lasso,random_forest,gradient_boosting,linear_svc,ridge,multivariable_analysis_count,total_count
783,availability_365_minus_availability_90,1,1,2,1,1,1,1,1,5,7
785,availability_365_minus_availability_60,1,1,2,1,1,1,1,1,5,7
789,occupancy_last_period_minus_availability_90,1,1,2,1,1,1,1,1,5,7
2,kmean_cluster_availability,1,1,2,1,1,0,1,1,4,6
3,target_num_of_day_in_period_minus_num_of_day_i...,1,1,2,1,1,1,1,0,4,6
699,avg_dollar_price_in_previous_period_cat_avg_do...,1,1,2,1,1,0,1,1,4,6
772,availability_365,1,1,2,1,1,0,1,1,4,6
787,occupancy_last_period_minus_availability_60,1,1,2,1,1,0,1,1,4,6
788,occupancy_last_period_minus_availability_365,1,1,2,0,1,1,1,1,4,6
791,mean_temperatures_in_target_period,1,1,2,0,1,1,1,1,4,6


We can see in the above table that the most sigificant vars are transformation vars that have been created in the feature enrichement section. 

In [105]:
print("selected vars: ")
df_variables_selection_final['var'].to_list()

selected vars: 


['kmean_cluster_availability',
 'target_num_of_day_in_period_minus_num_of_day_in_previous_period',
 'room_type_Entire home/apt_require_guest_phone_verification_t',
 'host_response_time_missing_host_response_time_bed_type_Real Bed',
 'host_response_time_within an hour_require_guest_phone_verification_f',
 'host_response_time_within an hour_require_guest_phone_verification_t',
 'host_response_time_missing_host_response_time_require_guest_phone_verification_f',
 'host_is_superhost_t_require_guest_phone_verification_t',
 'require_guest_phone_verification_f_concat_comments_sentiment_missing_concat_comments_sentiment',
 'children���s_dinnerware',
 'long_term_stays_allowed',
 'property_type_Apartment',
 'host_response_time_missing_host_response_time',
 'host_response_rate_cat_host_response_rate_missing',
 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_0%_to_25%',
 'avg_dollar_price_in_previous_period_cat_avg_dollar_price_in_previous_period_50%_to_75%',
 'avg_doll