In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.preprocessing import LabelEncoder
import numpy as np
from sklearn.metrics import roc_auc_score

In [2]:
df = pd.read_csv("Hotel Reservations.csv")
df.head()


Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled


In [3]:
# convert the booking statuse to 0 and 1. Not cancelled = 0, cancelled = 1
df['booking_status_encoded'] = np.where(np.array(df['booking_status']=="Canceled"),1,0)
df.head()



Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,booking_status_encoded
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,0
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,0
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,1
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,1
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,1


# Baseline model-- Logistic Regression


## Prepare dataset for modeling

In [7]:
# drop BookingID
df = df.drop("Booking_ID", axis = 1)
df.head()


Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,booking_status_encoded
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,0
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,0
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,1
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,1
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,1


In [8]:
df.shape

(36275, 19)

In [15]:

y = df["booking_status_encoded"]
y

0        0
1        0
2        1
3        1
4        1
        ..
36270    0
36271    1
36272    0
36273    1
36274    0
Name: booking_status_encoded, Length: 36275, dtype: int64

In [16]:
X = df.iloc[:,:17]
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests
0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0
1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1
2,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0
3,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0
4,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0


In [17]:
X = pd.get_dummies(X, columns=['type_of_meal_plan', 'room_type_reserved',"market_segment_type"], drop_first=True)
X.head()

Unnamed: 0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,...,room_type_reserved_Room_Type 2,room_type_reserved_Room_Type 3,room_type_reserved_Room_Type 4,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online
0,2,0,1,2,0,224,2017,10,2,0,...,0,0,0,0,0,0,0,0,1,0
1,2,0,2,3,0,5,2018,11,6,0,...,0,0,0,0,0,0,0,0,0,1
2,1,0,2,1,0,1,2018,2,28,0,...,0,0,0,0,0,0,0,0,0,1
3,2,0,0,2,0,211,2018,5,20,0,...,0,0,0,0,0,0,0,0,0,1
4,2,0,1,1,0,48,2018,4,11,0,...,0,0,0,0,0,0,0,0,0,1


## Modeling

In [18]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)
X_train.shape


(24304, 27)

In [19]:
LR_model = LogisticRegression(random_state=0).fit(X_train, y_train)


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [20]:
# score = correct predictions / total number of data
score = LR_model.score(X_test, y_test)
print(score)


0.8051123548575725


In [21]:
y_pred = LR_model.predict(X_test)
# y_pred_prob = LR_model.predict_proba(X_test)
# y_pred_prob
# print(roc_auc_score(y, LR_model.predict_proba(X_test)[:, 1]))

In [22]:
print(classification_report(y_test,y_pred))

              precision    recall  f1-score   support

           0       0.82      0.90      0.86      8010
           1       0.75      0.61      0.68      3961

    accuracy                           0.81     11971
   macro avg       0.79      0.76      0.77     11971
weighted avg       0.80      0.81      0.80     11971



## Logistic regression with ridge

In [23]:
from sklearn.linear_model import RidgeClassifierCV
LR_ridge = RidgeClassifierCV(alphas=[1e-3, 1e-2, 1e-1, 1]).fit(X_train, y_train)
LR_ridge.score(X_test, y_test)

0.8010191295631108

In [24]:
y_pred_ridge = LR_ridge.predict(X_test)
y_pred_ridge

array([0, 0, 0, ..., 0, 0, 0])

In [25]:
print(classification_report(y_test, y_pred_ridge))


              precision    recall  f1-score   support

           0       0.81      0.91      0.86      8010
           1       0.76      0.58      0.66      3961

    accuracy                           0.80     11971
   macro avg       0.79      0.74      0.76     11971
weighted avg       0.80      0.80      0.79     11971



# Second Model: Neural Network

In [26]:
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()
X_train_standardize = scaler.fit_transform(X_train)
X_test_standardize = scaler.fit_transform(X_test)



In [27]:
nn1 = MLPClassifier(solver='lbfgs', 
                    # alpha=1e-5,
                    # hidden_layer_sizes=(6,), 
                    random_state=123)
nn1.fit(X_train_standardize,y_train)
y_predNN = nn1.predict(X_test_standardize)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
  self.n_iter_ = _check_optimize_result("lbfgs", opt_res, self.max_iter)


In [32]:
print(classification_report(y_test,y_predNN))

              precision    recall  f1-score   support

           0       0.88      0.91      0.89      8010
           1       0.80      0.75      0.78      3961

    accuracy                           0.86     11971
   macro avg       0.84      0.83      0.84     11971
weighted avg       0.86      0.86      0.86     11971



In [33]:
params = {'hidden_layer_sizes': [(30,),(50,),(70,),(100,)],
         'learning_rate_init': [0.0001, 0.001,0.01, 0.1,1],
         }
nn_model = MLPClassifier()


In [34]:
gs_nn1 = GridSearchCV(nn_model,
                      param_grid=params,
                      scoring='roc_auc',
                      cv=10)
gs_nn1.fit(X_train_standardize,y_train)
gs_nn1.best_params_



{'hidden_layer_sizes': (100,), 'learning_rate_init': 0.01}

In [35]:
gs_knn_pred = gs_nn1.predict(X_test_standardize)



In [36]:
print(classification_report(y_test,gs_knn_pred))

              precision    recall  f1-score   support

           0       0.89      0.90      0.90      8010
           1       0.80      0.77      0.78      3961

    accuracy                           0.86     11971
   macro avg       0.84      0.84      0.84     11971
weighted avg       0.86      0.86      0.86     11971



In [42]:
roc_auc_score(y_test, gs_nn1.predict_proba(X_test_standardize)[:, 1])

0.9212882407467818