## Hotel Cancellation Classification

#### 00: Setup and Libraries

In [15]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [31]:
from EnsembleLearner import EnsembleLearner

#### 01: Data Loading

In [17]:
# load data
data_orig = pd.read_csv('data/train.csv')

In [152]:
# check some descriptives
#data_orig.dtypes
#data_orig.describe()

#### 02: Preprocessing

In [18]:
# drop columns that do not have explanatory value - include country later again
data_clean = data_orig.drop(['reservation_status_date', 'name', 'email', 'phone-number', 'credit_card', 'country'], axis = 1)

# make dummies from categorical values
data_clean = pd.get_dummies(data_clean)

# binarise time variables
#data_clean = pd.concat(
#    [
#        data_clean.drop(['arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month'], axis = 1),
#        pd.get_dummies(data_clean['arrival_date_year']),
#        pd.get_dummies(data_clean['arrival_date_week_number']),
#        pd.get_dummies(data_clean['arrival_date_day_of_month'])
#    ],
#    axis = 1
#)

# binarise agent and company variable
#data_clean = pd.concat(
#    [
#        data_clean.drop(['agent', 'company'], axis = 1),
#        pd.get_dummies(data_clean['agent']),
#        pd.get_dummies(data_clean['company'])
#    ],
#    axis = 1
#)

In [19]:
# check for nans
pd.DataFrame(data_clean.isnull().sum()).reset_index() \
    .rename(columns = {0: 'null_sum'}) \
    .query('null_sum > 0')

Unnamed: 0,index,null_sum
8,children,2
14,agent,11403
15,company,79106


In [20]:
# only two where there are nans for children, assuming no children traveled
data_clean['children'] = data_clean['children'].fillna(0)

In [21]:
data_clean.drop(['agent', 'company'], axis = 1, inplace=True)

In [22]:
len(list(data_clean.columns))

79

In [23]:
data_clean.shape

(83807, 79)

In [45]:
# split data in train and evaluation set
dep_variable = 'is_canceled'
X = data_clean.drop(dep_variable, axis = 1)
X = (X-X.mean())/X.std()
y = data_clean[dep_variable]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)

#### 03: Training and Forecasting

In [46]:
EL = EnsembleLearner(X = X_train, y = y_train, models = ['knn', 'GaussianNB', 'DecisionTree'], ensemble_learner = 'DecisionTree')

In [48]:
EL.train_ensemble_learner()

In [49]:
pred = EL.predict(X_val)

In [52]:
from sklearn.metrics import accuracy_score

print(accuracy_score(pred['EL_pred'], y_val))
print(accuracy_score(pred['knn'], y_val))
print(accuracy_score(pred['GaussianNB'], y_val))
print(accuracy_score(pred['DecisionTree'], y_val))

0.8284810881756354
0.8212027204390884
0.7480014318100465
0.8173249015630593


#### 04: Prediction on Test Set

In [None]:
# here need to ensure that all dummy variables and columns of test set include the original ones