## Hotel Cancellation Classification

#### 00: Setup and Libraries

In [1]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [2]:
from EnsembleLearner import EnsembleLearner

#### 01: Data Loading

In [3]:
# load data
data_orig = pd.read_csv('data/train.csv')

In [5]:
# check some descriptives
#data_orig.dtypes
#data_orig.describe()

#### 02: Preprocessing

In [4]:
# drop columns that do not have explanatory value - include country later again
data_clean = data_orig.drop(['reservation_status_date', 'name', 'email', 'phone-number', 'credit_card', 'country'], axis = 1)

# make dummies from categorical values
data_clean = pd.get_dummies(data_clean)

# binarise time variables
#data_clean = pd.concat(
#    [
#        data_clean.drop(['arrival_date_year', 'arrival_date_week_number', 'arrival_date_day_of_month'], axis = 1),
#        pd.get_dummies(data_clean['arrival_date_year']),
#        pd.get_dummies(data_clean['arrival_date_week_number']),
#        pd.get_dummies(data_clean['arrival_date_day_of_month'])
#    ],
#    axis = 1
#)

# binarise agent and company variable
#data_clean = pd.concat(
#    [
#        data_clean.drop(['agent', 'company'], axis = 1),
#        pd.get_dummies(data_clean['agent']),
#        pd.get_dummies(data_clean['company'])
#    ],
#    axis = 1
#)

In [5]:
# check for nans
pd.DataFrame(data_clean.isnull().sum()).reset_index() \
    .rename(columns = {0: 'null_sum'}) \
    .query('null_sum > 0')

Unnamed: 0,index,null_sum
8,children,2
14,agent,11403
15,company,79106


In [6]:
# only two where there are nans for children, assuming no children traveled
data_clean['children'] = data_clean['children'].fillna(0)

In [7]:
data_clean.drop(['agent', 'company'], axis = 1, inplace=True)

In [8]:
len(list(data_clean.columns))

79

In [11]:
data_clean.shape

(83807, 79)

In [99]:
# split data in train and evaluation set
dep_variable = 'is_canceled'
X = data_clean.drop(dep_variable, axis = 1)
X = (X-X.mean())/X.std()
y = data_clean[dep_variable]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)
X_train = X_train.iloc[0:1000]
y_train = y_train.iloc[0:1000]
X_val = X_val[0:100]
y_val = y_val[0:100]

#### 03: Training and Forecasting

In [119]:
from sklearn.metrics import accuracy_score
import pandas as pd

def matrix_acc(pred_matrix, truth_series):
    acc_dict = dict()
    for col in pred_matrix.columns:
        acc_dict[col] = round(accuracy_score(pred_matrix[col], truth_series), 4)
    return pd.DataFrame([acc_dict])

In [123]:
import ELmodule
import importlib
importlib.reload(ELmodule)
from ELmodule import EnsembleLearner

EL = EnsembleLearner(X = X_train, y = y_train, models = ['knn', 'GaussianNB', 'DecisionTree', 'MLP'], ensemble_learner = 'DecisionTree')
training_pred_df = EL.train_ensemble_learner(return_training_predictions=True)
matrix_acc(training_pred_df.drop('truth', axis=1), training_pred_df['truth'])

Model knn done training for Ensemble Learner
Model GaussianNB done training for Ensemble Learner
Model DecisionTree done training for Ensemble Learner




Model MLP done training for Ensemble Learner


Unnamed: 0,EL_pred,knn,GaussianNB,DecisionTree,MLP
0,0.7933,0.76,0.51,0.7133,0.7733


In [124]:
pred = EL.predict(X_val)
matrix_acc(pred, y_val)

Model knn done training for Final Predictions
Model GaussianNB done training for Final Predictions
Model DecisionTree done training for Final Predictions
Model MLP done training for Final Predictions




Unnamed: 0,EL_pred,knn,GaussianNB,DecisionTree,MLP
0,0.8,0.81,0.46,0.74,0.8


#### 04: Prediction on Test Set

In [None]:
# here need to ensure that all dummy variables and columns of test set include the original ones