## Hotel Cancellation Classification

#### 00: Setup and Libraries

In [41]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [42]:
from EnsembleLearner import EnsembleLearner

In [43]:
include_weeks = False

#### 01: Data Loading

In [44]:
# load data
data_orig = pd.read_csv('data/train.csv')

In [45]:
# check some descriptives
data_orig.dtypes
#data_orig.describe()

hotel                              object
is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_month                 object
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
meal                               object
country                            object
market_segment                     object
distribution_channel               object
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
reserved_room_type                 object
assigned_room_type                 object
booking_changes                     int64
deposit_type                       object
agent                             

In [46]:
tdf = pd.read_csv('data/test.csv')

In [47]:
tdf['arrival_date_year'].unique()

array([2015, 2016, 2017], dtype=int64)

#### 02: Preprocessing

In [48]:
# drop columns that do not have explanatory value
data_clean = data_orig.drop(['reservation_status_date', 'name', 'email', 'phone-number', 'credit_card'], axis=1)

In [49]:
# check numeric columns if it makes sense to leave them as numeric or should be converted to factors
data_clean.dtypes[data_clean.dtypes != 'object']

is_canceled                         int64
lead_time                           int64
arrival_date_year                   int64
arrival_date_week_number            int64
arrival_date_day_of_month           int64
stays_in_weekend_nights             int64
stays_in_week_nights                int64
adults                              int64
children                          float64
babies                              int64
is_repeated_guest                   int64
previous_cancellations              int64
previous_bookings_not_canceled      int64
booking_changes                     int64
agent                             float64
company                           float64
days_in_waiting_list                int64
adr                               float64
required_car_parking_spaces         int64
total_of_special_requests           int64
dtype: object

In [50]:
# function to create categorical dummies for a column
def create_dummies(data, column):
    data_clean = pd.concat([
        data.drop(column, axis=1),
        pd.get_dummies(data[column])
    ], axis=1)
    return data_clean

Introduce dummies for numerical values where categorical makes more sense

##### 02.1 Preprocessing Time Variables

In [51]:
# arrival year
data_clean = create_dummies(data_clean, 'arrival_date_year')

In [52]:
# arrival week number, drop month
if include_weeks:
    data_clean = create_dummies(data_clean, 'arrival_date_week_number')
else:
    data_clean.drop('arrival_date_week_number', axis=1, inplace=True)

data_clean.drop('arrival_date_month', axis=1, inplace=True)

In [53]:
data_orig.groupby('arrival_date_week_number').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('cancelled_mean', ascending=False) \
    .head(3)

Unnamed: 0,arrival_date_week_number,cancelled_mean,count
24,25,0.455906,1871
17,18,0.45065,2077
19,20,0.437309,1962


In [54]:
# arrival date day of month - bin to four groups
data_clean['arrival_date_period_of_month'] = pd.cut(data_clean['arrival_date_day_of_month'], bins=4)
data_clean = create_dummies(data_clean, 'arrival_date_period_of_month')

##### 02.2 Preprocessing Agent

In [55]:
# check agent
agent_analysis = data_clean.groupby('agent').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 250')

agent_analysis

Unnamed: 0,agent,cancelled_mean,count
13,14.0,0.178878,2566
6,7.0,0.1375,2480
176,250.0,0.172048,1982
168,241.0,0.132885,1189
26,28.0,0.074232,1172
38,40.0,0.077135,726
218,314.0,0.184783,644
27,29.0,0.806316,475
73,85.0,0.204082,392
170,243.0,0.065527,351


In [56]:
# keep those agents which seem to have explanatory power and enough bookings and make dummies
data_clean['agent_keep'] = np.where((data_clean['agent'].isin(list(agent_analysis['agent']))), data_clean['agent'], np.nan)
data_clean = create_dummies(data_clean, 'agent_keep')
data_clean.drop('agent', axis=1, inplace=True)

##### 02.3 Preprocessing Company

In [57]:
# check agent
company_analysis = data_clean.groupby('company').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 100')

company_analysis.head(5)

Unnamed: 0,company,cancelled_mean,count
17,40.0,0.083591,646
122,223.0,0.142857,560
20,45.0,0.125,168
84,153.0,0.22,150
93,174.0,0.168224,107


In [58]:
# keep those companies which seem to have explanatory power and enough bookings and make dummies
data_clean['company_keep'] = np.where((data_clean['company'].isin(list(company_analysis['company']))), data_clean['company'], np.nan)
data_clean = create_dummies(data_clean, 'company_keep')
data_clean.drop('company', axis=1, inplace=True)

##### 02.4 Preprocessing Country

In [59]:
# check country
country_analysis = data_clean.groupby('country').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 500')

country_analysis.head(5)   

Unnamed: 0,country,cancelled_mean,count
55,GBR,0.199788,8489
52,FRA,0.187578,7245
39,DEU,0.167825,5178
71,IRL,0.245937,2338
12,BEL,0.197612,1675


In [60]:
# keep those countries which seem to have explanatory power and enough bookings and make dummies
data_clean['country_keep'] = np.where((data_clean['country'].isin(list(country_analysis['country']))), data_clean['country'], np.nan)
data_clean = create_dummies(data_clean, 'country_keep')
data_clean.drop('country', axis=1, inplace=True)

##### 02.5 Preprocessing market_segment

In [61]:
# check country
market_segment_analysis = data_clean.groupby('market_segment').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 500')

market_segment_analysis

Unnamed: 0,market_segment,cancelled_mean,count
3,Direct,0.152395,8852
2,Corporate,0.186261,3683
1,Complementary,0.132438,521


In [62]:
# keep all - make dummies
data_clean = create_dummies(data_clean, 'market_segment')

##### 02.6 Perprocessing distribution_channel

In [63]:
data_clean = pd.get_dummies(data_clean)

In [64]:
len(data_clean.columns)

101

In [65]:
# same as above with market_segment, distribution_channel

In [66]:
print(np.mean(data_clean['is_canceled']))
print(np.std(data_clean['is_canceled']))

0.37161573615569105
0.48323646468096937


In [67]:
print(np.mean(data_clean['is_canceled']) + 0.5*np.std(data_clean['is_canceled']))
print(np.mean(data_clean['is_canceled']) - 0.5*np.std(data_clean['is_canceled']))

0.6132339684961757
0.12999750381520636


In [68]:
data_clean.dtypes[data_clean.dtypes == 'object']

Series([], dtype: object)

In [69]:
# check for nans
pd.DataFrame(data_clean.isnull().sum()).reset_index() \
    .rename(columns = {0: 'null_sum'}) \
    .query('null_sum > 0')

Unnamed: 0,index,null_sum
6,children,2


In [70]:
# only two where there are nans for children, assuming no children traveled
data_clean['children'] = data_clean['children'].fillna(0)

In [71]:
# split data in train and evaluation set
dep_variable = 'is_canceled'
X = data_clean.drop(dep_variable, axis = 1)
X = (X-X.mean())/X.std()
y = data_clean[dep_variable]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)
X_train = X_train.iloc[0:1000]
y_train = y_train.iloc[0:1000]
X_val = X_val[0:100]
y_val = y_val[0:100]

#### 03: Training and Forecasting

In [72]:
from sklearn.metrics import accuracy_score
import pandas as pd

def matrix_acc(pred_matrix, truth_series):
    acc_dict = dict()
    for col in pred_matrix.columns:
        acc_dict[col] = round(accuracy_score(pred_matrix[col], truth_series), 4)
    return pd.DataFrame([acc_dict])

In [90]:
import EnsembleLearner
import importlib
importlib.reload(EnsembleLearner)
from EnsembleLearner import EnsembleLearner

models = ['knn', 'GaussianNB', 'DecisionTree', 'SVM','RandomForest', 'LogisticRegression'] # , 'MLP', 'XGBoost'
EL = EnsembleLearner(X = X_train, y = y_train, models = models, ensemble_learner = 'DecisionTree')
training_pred_df = EL.train_ensemble_learner(return_training_predictions=True, verbose=True)
print(matrix_acc(training_pred_df.drop('truth', axis=1), training_pred_df['truth']))
pred = EL.predict(X_val)
print(matrix_acc(pred, y_val))

Model knn done training Ensemble Learner Training
Model GaussianNB done training Ensemble Learner Training
Model DecisionTree done training Ensemble Learner Training
Model SVM done training Ensemble Learner Training
Model RandomForest done training Ensemble Learner Training
Model LogisticRegression done training Ensemble Learner Training
Model knn done training Final Model Training
Model GaussianNB done training Final Model Training
Model DecisionTree done training Final Model Training
Model SVM done training Final Model Training
Model RandomForest done training Final Model Training
Model LogisticRegression done training Final Model Training
   EL_pred     knn  GaussianNB  DecisionTree   SVM  RandomForest  \
0   0.8267  0.7433      0.5167          0.73  0.77           0.8   

   LogisticRegression  
0              0.7867  
   EL_pred   knn  GaussianNB  DecisionTree   SVM  RandomForest  \
0     0.86  0.73        0.51          0.79  0.81          0.86   

   LogisticRegression  
0       

#### 04: Prediction on Test Set

In [None]:
# here need to ensure that all dummy variables and columns of test set include the original ones