In [1]:
import pandas as pd
from pprint import pprint
from sklearn.feature_selection import SelectKBest

In [2]:
data = pd.read_csv('hotel_bookings.csv')

## Data preprocessing

In [3]:
def data_overview(df):
    print("Rows :  ", df.shape[0])
    print("Columns:  ", df.shape[1])
    print()
    print(df.dtypes)
    print()
    print('Missing rates : ')
    print(df.isnull().sum().sort_values(ascending=False)[:10]/df.shape[0])
    print()
    print('Unique values:')
    print(df.nunique().sort_values(ascending=False)[:10])
    print()
    print("List of columns:")
    pprint(df.columns.to_list())

In [4]:
data_overview(data)

Rows :   119390
Columns:   32

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
age

In [5]:
data.drop(['company','adr','agent','country','children','reservation_status_date'],inplace=True,axis=1)

In [6]:
num_col = []
cat_col = []
X = data.drop('is_canceled', axis=1)
y = data['is_canceled']
for col in X.columns.to_list():
    if X[col].dtype == object:
        cat_col.append(col)
    else:
        num_col.append(col)
print('numerical features: ')
print(num_col)
print('categorical features: ')
print(cat_col)


numerical features: 
['lead_time', 'arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'babies', 'is_repeated_guest', 'previous_cancellations', 'previous_bookings_not_canceled', 'booking_changes', 'days_in_waiting_list', 'required_car_parking_spaces', 'total_of_special_requests']
categorical features: 
['hotel', 'arrival_date_month', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'deposit_type', 'customer_type', 'reservation_status']


In [7]:
from sklearn.preprocessing import LabelEncoder

cat_col_index = []

le = LabelEncoder()
for col in X.columns:
    if X[col].dtype == object:
        cat_col_index.append(1)
        X[col] = le.fit_transform(X[col])
    else:
        cat_col_index.append(0)

print(cat_col_index)

[1, 0, 0, 1, 0, 0, 0, 0, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 0, 1, 0, 1, 0, 0, 1]


In [8]:
X_num=X[num_col]
X_cat=X[cat_col]

## Feature selection

### Variance Threshold
+ numerical features

In [9]:
from sklearn.feature_selection import VarianceThreshold

In [10]:
vt = VarianceThreshold(threshold=3)
vt.fit(X_num)

VarianceThreshold(threshold=3)

In [11]:
# check variances for all num_cols
dict_variance = {}
for i, j in zip(X_num.columns.values, vt.variances_):
    dict_variance[i] = j
print(dict_variance)

{'lead_time': 11419.625860118034, 'arrival_date_year': 0.5005180197455278, 'arrival_date_week_number': 185.09823929289158, 'arrival_date_day_of_month': 77.10232038380376, 'stays_in_weekend_nights': 0.9972205588924927, 'stays_in_week_nights': 3.6415234872671034, 'adults': 0.33554049428790456, 'babies': 0.009493731855978594, 'is_repeated_guest': 0.030893830639671306, 'previous_cancellations': 0.7128979583873327, 'previous_bookings_not_canceled': 2.242298331424404, 'booking_changes': 0.42549899617103926, 'days_in_waiting_list': 309.57160983611016, 'required_car_parking_spaces': 0.06016724306623273, 'total_of_special_requests': 0.6285240747033594}


In [12]:
# check filtered num_cols
ls = list()
for i,j in dict_variance.items():
    if j>=3:
        ls.append(i)
print(ls)

X_num_filtered = pd.DataFrame(vt.fit_transform(X_num),columns=ls)

['lead_time', 'arrival_date_week_number', 'arrival_date_day_of_month', 'stays_in_week_nights', 'days_in_waiting_list']


### Mutual Information
+ categorical or numerical features
+ use mutual_info_classif for classification tasks
+ use mutual_info_regression for regression tasks
+ use an boolean array to index categorical features, send it to discrete_features
+ pro: can detect inter-variable relations

In [13]:
from sklearn.feature_selection import mutual_info_classif
mic = mutual_info_classif(X, y, discrete_features=cat_col_index, n_neighbors=3, copy=True, random_state=None)

In [14]:
dict_feature = {}
for i, j in zip(X.columns.values, mic):
    dict_feature[i] = j
print(dict_feature)

{'hotel': 0.009540756251329385, 'lead_time': 0.08367780013832715, 'arrival_date_year': 0.004160849961124535, 'arrival_date_month': 0.00354206442571936, 'arrival_date_week_number': 0.0046747839807406155, 'arrival_date_day_of_month': 0.0015786912079938098, 'stays_in_weekend_nights': 0.004256788611889828, 'stays_in_week_nights': 0.007021939085966489, 'adults': 0.009675442338508145, 'babies': 0.0005076352707731324, 'meal': 0.0025039180651031945, 'market_segment': 0.04101441669311767, 'distribution_channel': 0.02693803591662869, 'is_repeated_guest': 0.0017224223446075104, 'previous_cancellations': 0.03954334834292328, 'previous_bookings_not_canceled': 0.010869318874948242, 'reserved_room_type': 0.0029930098506292246, 'assigned_room_type': 0.024786529452321737, 'booking_changes': 0.01975101031877391, 'deposit_type': 0.13173483868329372, 'days_in_waiting_list': 0.014149958478510749, 'customer_type': 0.017112171963946432, 'required_car_parking_spaces': 0.0282172045060034, 'total_of_special_req

In [15]:
# sort features
ls = sorted(dict_feature.items(), key = lambda item:item[1], reverse=True)
print(ls)

[('reservation_status', 0.670333575031889), ('deposit_type', 0.13173483868329372), ('lead_time', 0.08367780013832715), ('market_segment', 0.04101441669311767), ('total_of_special_requests', 0.040728962719306816), ('previous_cancellations', 0.03954334834292328), ('required_car_parking_spaces', 0.0282172045060034), ('distribution_channel', 0.02693803591662869), ('assigned_room_type', 0.024786529452321737), ('booking_changes', 0.01975101031877391), ('customer_type', 0.017112171963946432), ('days_in_waiting_list', 0.014149958478510749), ('previous_bookings_not_canceled', 0.010869318874948242), ('adults', 0.009675442338508145), ('hotel', 0.009540756251329385), ('stays_in_week_nights', 0.007021939085966489), ('arrival_date_week_number', 0.0046747839807406155), ('stays_in_weekend_nights', 0.004256788611889828), ('arrival_date_year', 0.004160849961124535), ('arrival_date_month', 0.00354206442571936), ('reserved_room_type', 0.0029930098506292246), ('meal', 0.0025039180651031945), ('is_repeated_

In [16]:
# keep top 5 features
k = 5
ls_new = []
for i in range(k):
    ls_new.append(ls[i][0])

X_filtered = X[ls_new]

In [17]:
X_filtered.head()

Unnamed: 0,reservation_status,deposit_type,lead_time,market_segment,total_of_special_requests
0,1,0,342,3,0
1,1,0,737,3,0
2,1,0,7,3,0
3,1,0,13,2,0
4,1,0,14,6,1


### chi-square test
+ categorical features
+ classification tasks

In [18]:
from sklearn.feature_selection import chi2

In [19]:
(chi2,pval)=chi2(X_cat,y)
dict_feature = {}
for i, j in zip(X_cat.columns.values, chi2):
    dict_feature[i] = j
print(dict_feature)

{'hotel': 1478.7736762614823, 'arrival_date_month': 0.6037383087348858, 'meal': 76.65972752493069, 'market_segment': 136.87282078299256, 'distribution_channel': 1068.4837051992145, 'reserved_room_type': 1305.3276630649993, 'assigned_room_type': 9822.322343920914, 'deposit_type': 23515.169301053516, 'customer_type': 86.35982720364787, 'reservation_status': 38297.81334112092}


In [20]:
# sort features
ls = sorted(dict_feature.items(), key = lambda item:item[1], reverse=True)
print(ls)

[('reservation_status', 38297.81334112092), ('deposit_type', 23515.169301053516), ('assigned_room_type', 9822.322343920914), ('hotel', 1478.7736762614823), ('reserved_room_type', 1305.3276630649993), ('distribution_channel', 1068.4837051992145), ('market_segment', 136.87282078299256), ('customer_type', 86.35982720364787), ('meal', 76.65972752493069), ('arrival_date_month', 0.6037383087348858)]


In [21]:
# keep top 5 features
k = 5
ls_new = []
for i in range(k):
    ls_new.append(ls[i][0])

X_cat_filtered = X_cat[ls_new]

In [22]:
X_cat_filtered.head()

Unnamed: 0,reservation_status,deposit_type,assigned_room_type,hotel,reserved_room_type
0,1,0,2,1,2
1,1,0,2,1,2
2,1,0,2,1,0
3,1,0,0,1,0
4,1,0,0,1,0


### ANOVA F-test
+ numerical features
+ use f_classif for classification tasks
+ use f_regression for regression tasks
+ con: can only detect linear correlations

In [23]:
from sklearn.feature_selection import f_classif

In [24]:
(f_classif,pval)=f_classif(X_num,y)
dict_feature = {}
for i, j in zip(X_num.columns.values, f_classif):
    dict_feature[i] = j
print(dict_feature)

{'lead_time': 11222.19824775232, 'arrival_date_year': 33.145333976572005, 'arrival_date_week_number': 7.926811357141699, 'arrival_date_day_of_month': 4.486514962782778, 'stays_in_weekend_nights': 0.38299323858604195, 'stays_in_week_nights': 73.26402252202055, 'adults': 431.598080853803, 'babies': 126.16762626474032, 'is_repeated_guest': 864.6070812664263, 'previous_cancellations': 1465.864983065515, 'previous_bookings_not_canceled': 394.0720458051765, 'booking_changes': 2541.731400613087, 'days_in_waiting_list': 351.56776824240154, 'required_car_parking_spaces': 4744.260084629089, 'total_of_special_requests': 6957.100858553981}


In [25]:
# sort features
ls = sorted(dict_feature.items(), key = lambda item:item[1], reverse=True)
print(ls)

[('lead_time', 11222.19824775232), ('total_of_special_requests', 6957.100858553981), ('required_car_parking_spaces', 4744.260084629089), ('booking_changes', 2541.731400613087), ('previous_cancellations', 1465.864983065515), ('is_repeated_guest', 864.6070812664263), ('adults', 431.598080853803), ('previous_bookings_not_canceled', 394.0720458051765), ('days_in_waiting_list', 351.56776824240154), ('babies', 126.16762626474032), ('stays_in_week_nights', 73.26402252202055), ('arrival_date_year', 33.145333976572005), ('arrival_date_week_number', 7.926811357141699), ('arrival_date_day_of_month', 4.486514962782778), ('stays_in_weekend_nights', 0.38299323858604195)]


In [26]:
# keep top 5 features
k = 5
ls_new = []
for i in range(k):
    ls_new.append(ls[i][0])

In [27]:
X_num_filtered = X_num[ls_new]
X_num_filtered.head()

Unnamed: 0,lead_time,total_of_special_requests,required_car_parking_spaces,booking_changes,previous_cancellations
0,342,0,0,3,0
1,737,0,0,4,0
2,7,0,0,0,0
3,13,0,0,0,0
4,14,1,0,0,0
