In [1]:
import pandas as pd
import numpy as np
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score, RepeatedKFold
from sklearn.metrics import accuracy_score, classification_report
from sklearn.pipeline import make_pipeline
from sklearn.svm import LinearSVC, SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

from keras.models import Sequential, load_model
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier

import pickle

# Remove pandas' display limits so that every column and every row of a
# displayed object is rendered.
# NOTE(review): with max_rows=None, displaying a large frame in full is no
# longer truncated; prefer .head()/.sample() when inspecting data.
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [2]:
def model_evaluate(model, X_test, y_test):
    """Print a classification report for ``model`` on a held-out set.

    Parameters
    ----------
    model : fitted estimator exposing ``predict``
    X_test : features passed to ``model.predict``
    y_test : true labels the predictions are compared against

    Returns
    -------
    None
        The report is printed, not returned.
    """
    report = classification_report(y_test, model.predict(X_test))
    print(report)

In [3]:
# Load the hotel data from a CSV file located in the working directory.
hotel = pd.read_csv("hotel-data.csv")

In [4]:
# Column names, non-null counts and dtypes of the raw frame.
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [5]:
# Frequency of each reservation_status value (missing values counted too).
hotel.reservation_status.value_counts(dropna=False)

Check-Out    75166
Canceled     43017
No-Show       1207
Name: reservation_status, dtype: int64

In [6]:
# Class balance of the is_canceled column (missing values counted too).
hotel.is_canceled.value_counts(dropna=False)

0    75166
1    44224
Name: is_canceled, dtype: int64

In [7]:
# Number of rows per value of the hotel column.
hotel.hotel.value_counts(dropna=False)

City Hotel      79330
Resort Hotel    40060
Name: hotel, dtype: int64

In [8]:
# Number of rows per arrival month (missing values counted too).
hotel.arrival_date_month.value_counts(dropna=False)

August       13877
July         12661
May          11791
October      11160
April        11089
June         10939
September    10508
March         9794
February      8068
November      6794
December      6780
January       5929
Name: arrival_date_month, dtype: int64

In [9]:
# Preview the first five rows of the raw frame.
hotel.head()

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,meal,country,market_segment,distribution_channel,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,reserved_room_type,assigned_room_type,booking_changes,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,3,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,0.0,0,BB,PRT,Direct,Direct,0,0,0,C,C,4,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Direct,Direct,0,0,0,A,C,0,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,0.0,0,BB,GBR,Corporate,Corporate,0,0,0,A,A,0,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,0.0,0,BB,GBR,Online TA,TA/TO,0,0,0,A,A,0,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [10]:
#sns.pairplot(hotel, hue='is_canceled', vars=['adults', 'babies', 'required_car_parking_spaces'])



## Data Preparation

In [11]:
# Remove rows that list no guests at all (0 adults, 0 children and 0 babies),
# printing the shape before and after to show how many rows are dropped.
print(hotel.shape)
# .copy() makes the filtered frame an independent object, so the column
# assignments performed on `hotel` in later cells do not trigger pandas'
# SettingWithCopyWarning.
hotel = hotel[~((hotel.children == 0) & (hotel.adults == 0) & (hotel.babies == 0))].copy()
print(hotel.shape)

(119390, 32)
(119210, 32)


In [12]:
# Convert reservation_status_date to datetime so that its .dt components can
# be extracted in the next cell.
hotel['reservation_status_date'] = pd.to_datetime(hotel['reservation_status_date'])

In [13]:
# Add year / month / day columns taken from reservation_status_date.
# NOTE(review): reservation_status_date presumably records when the final
# reservation_status (Check-Out / Canceled / No-Show) was set, i.e. after the
# outcome is known. `month` and `day` are later used as predictors of
# is_canceled, so they may leak the target — confirm before relying on the
# reported accuracies.
hotel = hotel.assign(year = hotel['reservation_status_date'].dt.year,
                     month = hotel['reservation_status_date'].dt.month,
                     day = hotel['reservation_status_date'].dt.day)

In [14]:
# Names of all int64 columns; the target is_canceled is among them and is
# removed again when X is built.
# NOTE(review): only int64 is selected, so float64 columns (e.g. `children`,
# which hotel.info() reports as float64) are left out of the feature set.
# Confirm that this omission is intended.
NUMERICAL_COLUMNS = hotel.select_dtypes(include='int64').columns
NUMERICAL_COLUMNS

Index(['is_canceled', 'lead_time', 'arrival_date_year',
       'arrival_date_week_number', 'arrival_date_day_of_month',
       'stays_in_weekend_nights', 'stays_in_week_nights', 'adults', 'babies',
       'is_repeated_guest', 'previous_cancellations',
       'previous_bookings_not_canceled', 'booking_changes',
       'days_in_waiting_list', 'required_car_parking_spaces',
       'total_of_special_requests', 'year', 'month', 'day'],
      dtype='object')

In [15]:
# Categorical variables that are one-hot encoded.
CATEGORICAL_COLUMNS = [
    'hotel',
    'meal',
    'reserved_room_type',
    'distribution_channel',
    'customer_type',
]
# Encode them, then remove one indicator per variable so that the removed
# level acts as that variable's baseline.
dummies = pd.get_dummies(hotel[CATEGORICAL_COLUMNS]).drop(
    columns=[
        'meal_FB',
        'reserved_room_type_L',
        'distribution_channel_Undefined',
        'customer_type_Group',
        'hotel_Resort Hotel',
    ]
)
DUMMIES_COLUMNS = dummies.columns
DUMMIES_COLUMNS

Index(['hotel_City Hotel', 'meal_BB', 'meal_HB', 'meal_SC', 'meal_Undefined',
       'reserved_room_type_A', 'reserved_room_type_B', 'reserved_room_type_C',
       'reserved_room_type_D', 'reserved_room_type_E', 'reserved_room_type_F',
       'reserved_room_type_G', 'reserved_room_type_H',
       'distribution_channel_Corporate', 'distribution_channel_Direct',
       'distribution_channel_GDS', 'distribution_channel_TA/TO',
       'customer_type_Contract', 'customer_type_Transient',
       'customer_type_Transient-Party'],
      dtype='object')

In [16]:
# Rows per hotel value after the zero-guest rows were removed.
hotel.hotel.value_counts()

City Hotel      79163
Resort Hotel    40047
Name: hotel, dtype: int64

In [17]:
# Append the dummy columns to the main frame, printing the shape before and
# after to confirm how many columns are added.
print(hotel.shape)
# Remove any dummy columns left over from an earlier execution of this cell
# first; otherwise re-running it would append the same columns again and
# leave duplicate column names in `hotel`.
hotel = pd.concat([hotel.drop(columns=DUMMIES_COLUMNS, errors='ignore'), dummies], axis=1)
print(hotel.shape)

(119210, 35)
(119210, 55)


In [18]:
# Feature matrix: the int64 columns plus the dummy columns, without the
# target and without the two year columns. Target: is_canceled.
X = hotel.loc[:, NUMERICAL_COLUMNS.append(DUMMIES_COLUMNS)].drop(
    columns=['is_canceled', 'arrival_date_year', 'year']
)
y = hotel['is_canceled']

In [19]:
# Class balance of the target after the zero-guest rows were removed.
hotel.is_canceled.value_counts(dropna=False)

0    75011
1    44199
Name: is_canceled, dtype: int64

In [20]:
# 75% training and 25% test (test_size = 0.25); the fixed random_state keeps
# the split identical between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=47)

## Model 1

In [115]:
# Model 1: linear SVM with hinge loss trained on the unscaled features.
# With dual=True the solver uses a random number generator, so random_state
# is fixed (same seed as the rest of the notebook) to make the fit
# reproducible between runs.
model = LinearSVC(loss='hinge', dual=True, random_state=47)
model.fit(X_train, y_train)



LinearSVC(loss='hinge')

In [116]:
# Classification report of Model 1 on the held-out test set.
model_evaluate(model, X_test, y_test)

              precision    recall  f1-score   support

           0       0.66      1.00      0.80     18744
           1       0.99      0.14      0.25     11059

    accuracy                           0.68     29803
   macro avg       0.83      0.57      0.52     29803
weighted avg       0.78      0.68      0.59     29803



## Model 2

In [115]:
# Model 2: the same linear SVM as Model 1, preceded by feature standardisation.
# random_state is fixed (same seed as the rest of the notebook) because the
# dual solver is randomised; this makes the fit reproducible between runs.
model2 = make_pipeline(StandardScaler(), LinearSVC(loss='hinge', dual=True, random_state=47))

In [116]:
# Fit scaler and SVM together on the training split only.
model2.fit(X_train, y_train)



Pipeline(steps=[('standardscaler', StandardScaler()),
                ('linearsvc', LinearSVC(loss='hinge'))])

In [117]:
# Classification report of Model 2 on the held-out test set.
# NOTE(review): the saved output reports a support of 29848 rows while the
# other reports show 29803, so it was presumably produced with a different
# split — re-run to refresh it.
model_evaluate(model2, X_test, y_test)

              precision    recall  f1-score   support

           0       0.72      0.92      0.81     18788
           1       0.74      0.39      0.51     11060

    accuracy                           0.72     29848
   macro avg       0.73      0.65      0.66     29848
weighted avg       0.73      0.72      0.70     29848



## Model 3
Takes more than 20 min to run

In [118]:
# Model 3: standardised features followed by an RBF-kernel SVM
# (gamma=0.5, C=0.1). Slow to fit, per the section note above.
model3 = make_pipeline(StandardScaler(),  SVC(kernel='rbf', gamma=0.5, C=0.1))
model3.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc', SVC(C=0.1, gamma=0.5))])

In [119]:
# Classification report of Model 3 on the held-out test set.
model_evaluate(model3, X_test, y_test)

## Model 4
Takes more than 10 min to run

In [21]:
# Model 4: standardised features followed by a degree-2 polynomial-kernel SVM
# (gamma='auto', coef0=1, C=5).
model_SVC = make_pipeline(StandardScaler(),  SVC(kernel='poly', degree=2, gamma='auto', coef0=1, C=5))

In [22]:
# Fit scaler and SVM together on the training split only.
model_SVC.fit(X_train, y_train)

Pipeline(steps=[('standardscaler', StandardScaler()),
                ('svc',
                 SVC(C=5, coef0=1, degree=2, gamma='auto', kernel='poly'))])

In [23]:
# Classification report of Model 4 on the held-out test set.
model_evaluate(model_SVC, X_test, y_test)

              precision    recall  f1-score   support

           0       0.84      0.99      0.91     18634
           1       0.97      0.69      0.80     11169

    accuracy                           0.88     29803
   macro avg       0.91      0.84      0.86     29803
weighted avg       0.89      0.88      0.87     29803



## Model 5

In [21]:
# Learn the scaling statistics from the training split only, then apply the
# same transformation to both splits.
scaler = StandardScaler().fit(X_train)
X_train_scale = scaler.transform(X_train)
X_test_scale = scaler.transform(X_test)

In [22]:
# define the keras model
def base_model():
    """Build and compile the small feed-forward binary classifier.

    The network is Dense(4, relu) -> Dense(4, linear) -> Dense(1, sigmoid).
    The input width is read from the global ``X_train``. The model is compiled
    with binary cross-entropy loss, the Adam optimizer and accuracy as metric.

    Returns
    -------
    The compiled (untrained) ``Sequential`` model.
    """
    layers = [
        Dense(4, input_dim=X_train.shape[1], activation='relu'),
        Dense(4, activation='linear'),
        Dense(1, activation='sigmoid'),
    ]
    network = Sequential(layers)
    network.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
    return network

# Wrap the builder in KerasClassifier and train it on the standardised
# training features (150 epochs, batches of 64, silent).
# NOTE(review): no random seed is set for the network, so the trained weights
# and the reported accuracy may differ between runs.
model_NN = KerasClassifier(build_fn=base_model, epochs=150, batch_size=64, verbose = 0)    
model_NN.fit(X_train_scale, y_train)

  model_NN = KerasClassifier(build_fn=base_model, epochs=150, batch_size=64, verbose = 0)


<keras.callbacks.History at 0x29b1aa19fa0>

In [23]:
# Test-set accuracy of the network, computed twice as a cross-check: once with
# accuracy_score and once as the mean of element-wise matches.
# The printed message is Portuguese for "The accuracy on the test set is".
y_pred = model_NN.predict(X_test_scale)
print("A acurácia no conjunto de teste é", accuracy_score(y_test, y_pred))
print((y_pred.flatten() == y_test).mean())

A acurácia no conjunto de teste é 0.9536288293124853
0.9536288293124853


In [24]:
# Raw weight arrays of the underlying Keras model.
model_NN.model.get_weights()

[array([[-1.64736323e-02, -3.87834907e-01, -3.03591136e-02,
         -1.83908176e-02],
        [ 2.07626891e+00,  6.66125178e-01,  2.30386519e+00,
          2.37278557e+00],
        [ 2.71893535e-02,  5.29851690e-02,  1.99238975e-02,
          1.46708628e-02],
        [-4.08742949e-03,  2.20627680e-01,  1.15973270e-02,
          1.16515979e-02],
        [ 4.31069732e-02,  1.92771982e-02,  4.21384312e-02,
          4.45233621e-02],
        [-4.30018175e-03,  7.57506639e-02, -6.08413829e-04,
          2.85979849e-03],
        [-1.83870066e-02,  4.55308296e-02, -8.40887148e-03,
         -6.72754971e-03],
        [-4.98415157e-03,  1.08087219e-01,  3.39088799e-03,
         -1.13998279e-02],
        [-2.28371215e+00, -3.42886376e+00, -4.44209278e-02,
         -2.59327907e-02],
        [ 2.11088695e-02, -1.87743977e-01,  2.52732616e-02,
         -1.86996433e-04],
        [-1.21159886e-03,  2.44830638e-01, -2.83251703e-03,
          7.39394361e-03],
        [ 3.92711209e-03,  5.66932783e-02, 

## Model 6

In [21]:
# Weak learner: a decision tree limited to depth 2.
base_estimator = DecisionTreeClassifier(max_depth = 2, random_state=47)
# Model 6: AdaBoost over 50 such trees.
model_adaboost = AdaBoostClassifier(base_estimator = base_estimator,
                                    n_estimators = 50,
                                    learning_rate = 1,
                                    algorithm = 'SAMME.R',
                                    random_state = 47)

# 10-fold cross-validation repeated 5 times (50 fits in total).
# Note that it runs on the full X, y, i.e. including the rows that also make
# up X_test, so it is a separate estimate from the test-set report below.
cv = RepeatedKFold(n_splits = 10, n_repeats = 5, random_state = 47)
scores = cross_val_score(model_adaboost, X, y, cv = cv)                                               

In [24]:
# Mean score over all cross-validation folds and repeats.
scores.mean()

0.9520728126834997

In [25]:
# Fit the AdaBoost model on the training split for the test-set evaluation.
model_adaboost.fit(X_train, y_train)                                                 

AdaBoostClassifier(base_estimator=DecisionTreeClassifier(max_depth=2,
                                                         random_state=47),
                   learning_rate=1, random_state=47)

In [26]:
# Classification report of Model 6 on the held-out test set.
model_evaluate(model_adaboost, X_test, y_test)

              precision    recall  f1-score   support

           0       0.93      0.99      0.96     18631
           1       0.98      0.88      0.93     11172

    accuracy                           0.95     29803
   macro avg       0.95      0.94      0.94     29803
weighted avg       0.95      0.95      0.95     29803



In [43]:
# Feature importances of the fitted ensemble, in the column order of X_train.
model_adaboost.feature_importances_

array([0.25, 0.16, 0.04, 0.01, 0.02, 0.01, 0.  , 0.  , 0.04, 0.04, 0.02,
       0.04, 0.01, 0.04, 0.18, 0.03, 0.01, 0.  , 0.  , 0.  , 0.01, 0.  ,
       0.01, 0.01, 0.01, 0.  , 0.  , 0.01, 0.01, 0.  , 0.01, 0.01, 0.  ,
       0.  , 0.02, 0.  ])

In [45]:
# Weight of each boosted estimator.
# NOTE(review): the saved output lists 100 weights although n_estimators is 50
# in the definition above, so it presumably comes from a differently
# configured run — re-run to refresh it.
model_adaboost.estimator_weights_

array([1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.,
       1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1., 1.])

## Saving objects

In [27]:
#with open('model_SVC.pickle','wb') as f:
#   pickle.dump(model_SVC, f)

# with open('model_adaboost.pickle','wb') as f:
#    pickle.dump(model_adaboost, f)

# with open('scaler.pickle','wb') as f:
#   pickle.dump(scaler, f)

#model_NN.model.save('model_NN.h5')
#del model_NN

## Feature Importances

In [24]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of the neural network, scored by accuracy on the
# scaled test set with 5 iterations and a fixed seed.
# cv='prefit' presumably tells eli5 to use the already-trained model as is
# rather than refitting it — confirm against the eli5 documentation.
perm = PermutationImportance(model_NN, 
                             scoring = 'accuracy',
                             cv = 'prefit',
                             n_iter = 5,
                             random_state = 47)
perm.fit(X_test_scale, y_test)



PermutationImportance(estimator=<keras.wrappers.scikit_learn.KerasClassifier object at 0x000001741E092910>,
                      random_state=47, scoring='accuracy')

In [4]:
#hotel.groupby(['is_canceled', 'arrival_date_week_number']).count()

In [26]:
# Display the importances, labelling the features with the column names of
# X_test.
eli5.show_weights(perm, feature_names = X_test.columns.tolist())

Weight,Feature
0.4551  ± 0.0029,arrival_date_week_number
0.4465  ± 0.0027,month
0.1649  ± 0.0024,day
0.1102  ± 0.0019,customer_type_Transient
0.0779  ± 0.0018,customer_type_Transient-Party
0.0599  ± 0.0010,reserved_room_type_A
0.0542  ± 0.0011,reserved_room_type_D
0.0296  ± 0.0009,previous_cancellations
0.0254  ± 0.0020,distribution_channel_TA/TO
0.0212  ± 0.0014,reserved_room_type_E


In [25]:
# Persist the fitted PermutationImportance object to disk.
# NOTE(review): only unpickle this file from a trusted source; loading a
# pickle can execute arbitrary code.
with open('importances.pickle','wb') as f:
    pickle.dump(perm, f)

INFO:tensorflow:Assets written to: ram://28d6d433-4cae-4186-987a-59123cd1d7c5/assets


## Prediction

In [68]:
# Reload the saved network and re-check its test accuracy.
# NOTE(review): 'model_NN.h5' must already exist; the call that writes it is
# commented out in the "Saving objects" cell. This also rebinds `model_NN`
# from the KerasClassifier wrapper to the loaded Keras model.
model_NN = load_model('model_NN.h5')

# The loaded model's predictions are rounded to 0/1 before scoring.
# The printed message is Portuguese for "The accuracy on the test set is".
y_pred = np.round(model_NN.predict(X_test_scale))
print("A acurácia no conjunto de teste é", accuracy_score(y_test, y_pred))
print((y_pred.flatten() == y_test).mean())

A acurácia no conjunto de teste é 0.9536288293124853
0.9536288293124853


In [111]:
# A single hand-written booking used to demonstrate prediction.
# The keys are listed in the same order as the columns of X, and that order
# matters: `.values` below discards the names, so the resulting array is
# interpreted purely by position when it is scaled and passed to the model.
sample = {'lead_time': 142,
 'arrival_date_week_number': 17,
 'arrival_date_day_of_month': 22,
 'stays_in_weekend_nights': 2,
 'stays_in_week_nights': 3,
 'adults': 2,
 'babies': 0,
 'is_repeated_guest': 0,
 'previous_cancellations': 0,
 'previous_bookings_not_canceled': 0,
 'booking_changes': 0,
 'days_in_waiting_list': 0,
 'required_car_parking_spaces': 0,
 'total_of_special_requests': 0,
 'month': 1,
 'day': 18,
 'hotel_City Hotel': 1,
 'meal_BB': 1,
 'meal_HB': 0,
 'meal_SC': 0,
 'meal_Undefined': 0,
 'reserved_room_type_A': 1,
 'reserved_room_type_B': 0,
 'reserved_room_type_C': 0,
 'reserved_room_type_D': 0,
 'reserved_room_type_E': 0,
 'reserved_room_type_F': 0,
 'reserved_room_type_G': 0,
 'reserved_room_type_H': 0,
 'distribution_channel_Corporate': 0,
 'distribution_channel_Direct': 0,
 'distribution_channel_GDS': 0,
 'distribution_channel_TA/TO': 1,
 'customer_type_Contract': 0,
 'customer_type_Transient': 1,
 'customer_type_Transient-Party': 0}
# One-row array of shape (1, number of keys), in key order.
new_data = pd.DataFrame([sample]).values

In [137]:
def predict_cancellation(model, new_data):
    """Predict a cancellation label for every row of ``new_data``.

    Parameters
    ----------
    model : object exposing ``predict(new_data)``
        Expected to return one score per input row, either as a 1-D array or
        as an ``(n_rows, 1)`` column (a single-output binary classifier).
    new_data : array-like
        Feature rows, passed to ``model.predict`` unchanged.

    Returns
    -------
    list of (float, str)
        One ``(rounded_prediction, label)`` tuple per input row, where the
        label is ``'Not Cancelled'`` for 0 and ``'Cancelled'`` for 1.

    Raises
    ------
    KeyError
        If a rounded prediction is neither 0 nor 1.
    """
    # The raw output is not exactly one or zero, so it is rounded.
    # The whole output is flattened rather than indexed with [0]: indexing
    # would keep only the first row's prediction and silently drop the rest
    # of a multi-row batch.
    predictions = np.round(np.asarray(model.predict(new_data))).ravel()

    pred_to_label = {0: 'Not Cancelled', 1: 'Cancelled'}

    # Plain Python numbers keep the output and the dictionary lookup
    # independent of the numpy scalar type returned by the model.
    return [(float(pred), pred_to_label[int(pred)]) for pred in predictions]

In [115]:
# Apply the scaler fitted on the training split to the new sample.
new_data_scaled= scaler.transform(new_data)

In [138]:
# Predict the label of the scaled sample with the loaded network and print it.
# The __main__ guard only matters if this code is moved into a module.
if __name__ == "__main__":
    predictions = predict_cancellation(model_NN, new_data_scaled)                        
    print(predictions)

[(1.0, 'Cancelled')]
