In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
import category_encoders as ce 

from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier
from xgboost import XGBClassifier
from catboost import CatBoostClassifier
from sklearn.ensemble import ExtraTreesClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

In [2]:
data = pd.read_csv('hotel_bookings.csv')

In [3]:
data.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [4]:
data.isnull().sum()

hotel                                  0
is_canceled                            0
lead_time                              0
arrival_date_year                      0
arrival_date_month                     0
arrival_date_week_number               0
arrival_date_day_of_month              0
stays_in_weekend_nights                0
stays_in_week_nights                   0
adults                                 0
children                               4
babies                                 0
meal                                   0
country                              488
market_segment                         0
distribution_channel                   0
is_repeated_guest                      0
previous_cancellations                 0
previous_bookings_not_canceled         0
reserved_room_type                     0
assigned_room_type                     0
booking_changes                        0
deposit_type                           0
agent                              16340
company         

In [5]:
data = data.fillna({"children":0, "country":"UNKNOWN", "agent":0, "company":0})

In [6]:
data.isnull().sum()

hotel                             0
is_canceled                       0
lead_time                         0
arrival_date_year                 0
arrival_date_month                0
arrival_date_week_number          0
arrival_date_day_of_month         0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
meal                              0
country                           0
market_segment                    0
distribution_channel              0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
reserved_room_type                0
assigned_room_type                0
booking_changes                   0
deposit_type                      0
agent                             0
company                           0
days_in_waiting_list              0
customer_type                     0
adr                         

In [7]:
data.dtypes

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [8]:
threshold = 400
data.drop(data.loc[data['lead_time'] > threshold].index, inplace=True)

In [9]:
data.drop(data.loc[(data['stays_in_week_nights'] + data['stays_in_weekend_nights']) <= 0].index, inplace=True)

In [10]:
data.drop(data.loc[(data['adults'] + data['children'] + data['babies']) <= 0].index, inplace=True)

In [11]:
filtered = (data['is_repeated_guest'] == 1) & ((data['previous_cancellations'] + data['previous_bookings_not_canceled']) == 0)
data.drop(data.loc[filtered].index, inplace=True)

In [12]:
data.drop(data.loc[data['adr'] < 0].index, inplace=True)

In [13]:
data.shape

(116140, 32)

In [14]:
data.drop(['reservation_status'], axis=1, inplace=True)

In [15]:
data.corr()['is_canceled'].abs().sort_values(ascending=False)

is_canceled                       1.000000
lead_time                         0.280478
total_of_special_requests         0.227638
required_car_parking_spaces       0.196022
booking_changes                   0.141153
previous_cancellations            0.111502
company                           0.082371
is_repeated_guest                 0.071373
previous_bookings_not_canceled    0.057261
adults                            0.056647
days_in_waiting_list              0.054681
adr                               0.050926
agent                             0.041828
babies                            0.031916
stays_in_week_nights              0.025323
arrival_date_year                 0.014869
children                          0.008183
arrival_date_day_of_month         0.004817
arrival_date_week_number          0.004182
stays_in_weekend_nights           0.003051
Name: is_canceled, dtype: float64

In [16]:
# date = pd.to_datetime(data['reservation_status_date'])

# data['year'] = date.dt.year
# data['month'] = date.dt.month
# data['day'] = date.dt.day

data.drop(['reservation_status_date'] , axis = 1, inplace = True)

In [17]:
X = data.drop(['is_canceled'], axis=1)
y = data['is_canceled']

In [18]:
numericalFeatures = list(X._get_numeric_data().columns)
categoricalFeatures = list(set(X.columns) - set(numericalFeatures))

for col in categoricalFeatures:
    print(f"{col}({X[col].unique().size}): \n{X[col].unique()}\n") 

meal(5): 
['BB' 'FB' 'HB' 'SC' 'Undefined']

country(178): 
['GBR' 'PRT' 'USA' 'ESP' 'IRL' 'FRA' 'UNKNOWN' 'ROU' 'NOR' 'OMN' 'ARG'
 'POL' 'DEU' 'BEL' 'CHE' 'CN' 'GRC' 'ITA' 'NLD' 'DNK' 'RUS' 'SWE' 'AUS'
 'EST' 'CZE' 'BRA' 'FIN' 'MOZ' 'BWA' 'LUX' 'SVN' 'ALB' 'IND' 'CHN' 'MEX'
 'MAR' 'UKR' 'SMR' 'LVA' 'PRI' 'SRB' 'CHL' 'AUT' 'BLR' 'LTU' 'TUR' 'ZAF'
 'AGO' 'ISR' 'CYM' 'ZMB' 'CPV' 'ZWE' 'DZA' 'KOR' 'CRI' 'HUN' 'ARE' 'TUN'
 'JAM' 'HRV' 'HKG' 'IRN' 'GEO' 'AND' 'GIB' 'URY' 'JEY' 'CAF' 'CYP' 'COL'
 'GGY' 'KWT' 'NGA' 'MDV' 'VEN' 'SVK' 'FJI' 'KAZ' 'PAK' 'IDN' 'LBN' 'PHL'
 'SEN' 'SYC' 'AZE' 'BHR' 'NZL' 'THA' 'DOM' 'MKD' 'MYS' 'ARM' 'JPN' 'LKA'
 'CUB' 'CMR' 'BIH' 'MUS' 'COM' 'SUR' 'UGA' 'BGR' 'CIV' 'JOR' 'SYR' 'SGP'
 'BDI' 'SAU' 'VNM' 'PLW' 'EGY' 'PER' 'MLT' 'MWI' 'ECU' 'MDG' 'ISL' 'UZB'
 'NPL' 'BHS' 'MAC' 'TGO' 'TWN' 'DJI' 'STP' 'KNA' 'ETH' 'IRQ' 'HND' 'RWA'
 'QAT' 'KHM' 'MCO' 'BGD' 'IMN' 'TJK' 'NIC' 'BEN' 'VGB' 'TZA' 'GAB' 'GHA'
 'TMP' 'GLP' 'KEN' 'LIE' 'GNB' 'MNE' 'UMI' 'MYT' 'FRO' 'MMR' 'PAN' 

In [19]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(81298, 29)
(34842, 29)
(81298,)
(34842,)


In [20]:
encoder = ce.cat_boost.CatBoostEncoder(cols = categoricalFeatures)
encoder.fit(X_train, y_train)
X_train = encoder.transform(X_train)
X_test = encoder.transform(X_test)

# X_enc = X

In [21]:
X_train.head()

Unnamed: 0,hotel,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,...,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests
15234,0.281068,0,2016,0.31976,10,1,0,3,1,0.0,...,0.251118,2,0.286885,47.0,0.0,0,0.401368,30.0,0,0
22794,0.281068,20,2016,0.402724,15,4,1,1,2,0.0,...,0.251118,1,0.286885,250.0,0.0,0,0.401368,89.0,0,0
105910,0.411091,2,2017,0.415226,26,28,0,1,1,0.0,...,0.251118,1,0.286885,0.0,45.0,0,0.401368,0.0,0,2
62154,0.411091,27,2017,0.302612,1,3,0,2,2,0.0,...,0.439127,0,0.286885,9.0,0.0,0,0.401368,106.0,0,1
14121,0.281068,6,2017,0.33491,5,1,0,2,1,0.0,...,0.022905,0,0.286885,0.0,356.0,0,0.25762,0.0,0,1


In [22]:
scaler = StandardScaler().fit(X_train,y_train)
X_train = pd.DataFrame(scaler.transform(X_train), columns=X_train.columns)
X_test = pd.DataFrame(scaler.transform(X_test), columns=X_test.columns)

In [23]:
# fs = SelectKBest(score_func=chi2, k='all')
# fs.fit(X_enc, y)
# columns = X_enc.columns.to_list()
# scores = {columns[i]:fs.scores_[i] for i in range(len(columns))}
# dict(sorted(scores.items(), key=lambda item: item[1], reverse=True))

In [24]:
y_train = y_train.reset_index(drop=True)
# X_train = X_train[:10000]
# y_train = y_train[:10000]

y_test = y_test.reset_index(drop=True)
# X_test = X_test[:10000]
# y_test = y_test[:10000]

In [25]:
train = pd.concat([X_train[:10000], y_train[:10000]], axis=1, join='inner')
test = pd.concat([X_test[:10000], y_test[:10000]], axis=1, join='inner')

In [26]:
train.to_csv('train.csv',index=False, header=False)
X_test[:10000].to_csv('test.csv',index=False, header=False)

In [27]:
lr = LogisticRegression(max_iter=2000)
lr.fit(X_train, y_train)

y_pred_lr = lr.predict(X_test)

acc_lr = accuracy_score(y_test, y_pred_lr)
conf = confusion_matrix(y_test, y_pred_lr)
clf_report = classification_report(y_test, y_pred_lr)

print(f"Accuracy Score of Logistic Regression is : {acc_lr}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of Logistic Regression is : 0.8036852075081797
Confusion Matrix : 
[[20114  1928]
 [ 4912  7888]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.80      0.91      0.85     22042
           1       0.80      0.62      0.70     12800

    accuracy                           0.80     34842
   macro avg       0.80      0.76      0.78     34842
weighted avg       0.80      0.80      0.80     34842



In [28]:
dtc = DecisionTreeClassifier()
dtc.fit(X_train, y_train)

y_pred_dtc = dtc.predict(X_test)

acc_dtc = accuracy_score(y_test, y_pred_dtc)
conf = confusion_matrix(y_test, y_pred_dtc)
clf_report = classification_report(y_test, y_pred_dtc)

print(f"Accuracy Score of Decision Tree is : {acc_dtc}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of Decision Tree is : 0.8503243212215142
Confusion Matrix : 
[[19315  2727]
 [ 2488 10312]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.89      0.88      0.88     22042
           1       0.79      0.81      0.80     12800

    accuracy                           0.85     34842
   macro avg       0.84      0.84      0.84     34842
weighted avg       0.85      0.85      0.85     34842



In [29]:
rd_clf = RandomForestClassifier()
rd_clf.fit(X_train, y_train)

y_pred_rd_clf = rd_clf.predict(X_test)

acc_rd_clf = accuracy_score(y_test, y_pred_rd_clf)
conf = confusion_matrix(y_test, y_pred_rd_clf)
clf_report = classification_report(y_test, y_pred_rd_clf)

print(f"Accuracy Score of Random Forest is : {acc_rd_clf}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of Random Forest is : 0.8919981631364445
Confusion Matrix : 
[[20612  1430]
 [ 2333 10467]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.90      0.94      0.92     22042
           1       0.88      0.82      0.85     12800

    accuracy                           0.89     34842
   macro avg       0.89      0.88      0.88     34842
weighted avg       0.89      0.89      0.89     34842



In [30]:
knn = KNeighborsClassifier()
knn.fit(X_train, y_train)

y_pred_knn = knn.predict(X_test)

acc_knn = accuracy_score(y_test, y_pred_knn)
conf = confusion_matrix(y_test, y_pred_knn)
clf_report = classification_report(y_test, y_pred_knn)

print(f"Accuracy Score of KNN is : {acc_knn}")
print(f"Confusion Matrix : \n{conf}")
print(f"Classification Report : \n{clf_report}")

Accuracy Score of KNN is : 0.8393318408816945
Confusion Matrix : 
[[19578  2464]
 [ 3134  9666]]
Classification Report : 
              precision    recall  f1-score   support

           0       0.86      0.89      0.87     22042
           1       0.80      0.76      0.78     12800

    accuracy                           0.84     34842
   macro avg       0.83      0.82      0.83     34842
weighted avg       0.84      0.84      0.84     34842



In [31]:
cat = CatBoostClassifier(iterations=100, silent=True)
cat.fit(X_train, y_train)

y_pred_cat = cat.predict(X_test)

acc_cat = accuracy_score(y_test, y_pred_cat)
clf_report = classification_report(y_test, y_pred_cat)
print(f'Accuracy: {acc_cat}')
print(f'Classification Report : \n{clf_report}')

Accuracy: 0.8782216864703519
Classification Report : 
              precision    recall  f1-score   support

           0       0.90      0.91      0.90     22042
           1       0.85      0.82      0.83     12800

    accuracy                           0.88     34842
   macro avg       0.87      0.87      0.87     34842
weighted avg       0.88      0.88      0.88     34842

