## Hotel Cancellation Classification

#### 00: Setup and Libraries

In [139]:
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split

In [140]:
from EnsembleLearner import EnsembleLearner

In [141]:
include_weeks = True

#### 01: Data Loading

In [142]:
# load data
data_orig = pd.read_csv('data/train.csv')
test_data = pd.read_csv('data/test.csv')

In [None]:
# add artificial is_canceled column so the test set can follow the transformations done on the train set
test_data['is_canceled'] = np.nan

# bind the datasets
data = pd.concat([data_orig, test_data], axis = 0)

In [143]:
# check some descriptives
#data_orig.dtypes
#data_orig.describe()

#### 02: Preprocessing

In [144]:
# drop columns that do not have explanatory value
data_clean = data_orig.drop(['reservation_status_date', 'name', 'email', 'phone-number', 'credit_card'], axis=1)

In [145]:
# function to create categorical dummies for a column
def create_dummies(data, column):
    data_clean = pd.concat([
        data.drop(column, axis=1),
        pd.get_dummies(data[column])
    ], axis=1)
    return data_clean

##### 02.1 Preprocessing Time Variables

In [146]:
# arrival year
data_clean = create_dummies(data_clean, 'arrival_date_year')

In [147]:
# arrival week number, drop month
if include_weeks:
    data_clean = create_dummies(data_clean, 'arrival_date_week_number')
else:
    data_clean.drop('arrival_date_week_number', axis=1, inplace=True)

data_clean.drop('arrival_date_month', axis=1, inplace=True)

In [148]:
data_orig.groupby('arrival_date_week_number').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('cancelled_mean', ascending=False) \
    .head(3)

Unnamed: 0,arrival_date_week_number,cancelled_mean,count
24,25,0.455906,1871
17,18,0.45065,2077
19,20,0.437309,1962


In [149]:
# arrival date day of month - bin to four groups
data_clean['arrival_date_period_of_month'] = pd.cut(data_clean['arrival_date_day_of_month'], bins=4)
data_clean = create_dummies(data_clean, 'arrival_date_period_of_month')

##### 02.2 Preprocessing Agent

In [150]:
# check agent
agent_analysis = data_clean.groupby('agent').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 250')

agent_analysis.head(5)

Unnamed: 0,agent,cancelled_mean,count
13,14.0,0.178878,2566
6,7.0,0.1375,2480
176,250.0,0.172048,1982
168,241.0,0.132885,1189
26,28.0,0.074232,1172


In [151]:
# keep those agents which seem to have explanatory power and enough bookings and make dummies
data_clean['agent_keep'] = np.where((data_clean['agent'].isin(list(agent_analysis['agent']))), data_clean['agent'], np.nan)
data_clean = create_dummies(data_clean, 'agent_keep')
data_clean.drop('agent', axis=1, inplace=True)

##### 02.3 Preprocessing Company

In [152]:
# check agent
company_analysis = data_clean.groupby('company').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 100')

company_analysis.head(5)

Unnamed: 0,company,cancelled_mean,count
17,40.0,0.083591,646
122,223.0,0.142857,560
20,45.0,0.125,168
84,153.0,0.22,150
93,174.0,0.168224,107


In [153]:
# keep those companies which seem to have explanatory power and enough bookings and make dummies
data_clean['company_keep'] = np.where((data_clean['company'].isin(list(company_analysis['company']))), data_clean['company'], np.nan)
data_clean = create_dummies(data_clean, 'company_keep')
data_clean.drop('company', axis=1, inplace=True)

##### 02.4 Preprocessing Country

In [154]:
# check country
country_analysis = data_clean.groupby('country').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 500')

country_analysis.head(5)   

Unnamed: 0,country,cancelled_mean,count
55,GBR,0.199788,8489
52,FRA,0.187578,7245
39,DEU,0.167825,5178
71,IRL,0.245937,2338
12,BEL,0.197612,1675


In [155]:
# keep those countries which seem to have explanatory power and enough bookings and make dummies
data_clean['country_keep'] = np.where((data_clean['country'].isin(list(country_analysis['country']))), data_clean['country'], np.nan)
data_clean = create_dummies(data_clean, 'country_keep')
data_clean.drop('country', axis=1, inplace=True)

##### 02.5 Preprocessing market_segment

In [156]:
# check country
market_segment_analysis = data_clean.groupby('market_segment').agg(
    cancelled_mean = ('is_canceled', 'mean'),
    count = ('is_canceled', 'count')
)\
    .reset_index() \
    .sort_values('count', ascending=False) \
    .query('(cancelled_mean >= 0.75 | cancelled_mean < 0.25) & count > 500')

market_segment_analysis.head(5)

Unnamed: 0,market_segment,cancelled_mean,count
3,Direct,0.152395,8852
2,Corporate,0.186261,3683
1,Complementary,0.132438,521


In [157]:
# keep all - make dummies
data_clean = create_dummies(data_clean, 'market_segment')

##### 02.6 Perprocessing other object columns

In [158]:
# show columns that are still not numerical
obj_columns = list(data_clean.columns[data_clean.dtypes == 'object'])
obj_columns

['hotel',
 'meal',
 'distribution_channel',
 'reserved_room_type',
 'assigned_room_type',
 'deposit_type',
 'customer_type']

In [159]:
# not too many so keep all
data_clean[obj_columns].nunique()

hotel                    2
meal                     5
distribution_channel     5
reserved_room_type      10
assigned_room_type      12
deposit_type             3
customer_type            4
dtype: int64

In [160]:
# make dummies for all the object columns
for attribute in obj_columns:
    data_clean = create_dummies(data_clean, attribute)

In [161]:
# check that all object columns are converted
all(data_clean.dtypes != 'object')

True

In [162]:
len(data_clean.columns)

154

##### 02.7 Preprocessing NAs

In [163]:
# check for nans
pd.DataFrame(data_clean.isnull().sum()).reset_index() \
    .rename(columns = {0: 'null_sum'}) \
    .query('null_sum > 0')

Unnamed: 0,index,null_sum
6,children,2


In [164]:
# only two where there are nans for children, assuming no children traveled
data_clean['children'] = data_clean['children'].fillna(0)

##### Splitting data

In [165]:
# split data again
test_set = data_clean.loc[data_clean['is_canceled'].isnull()]
train_set = data_clean.loc[not data_clean['is_canceled'].isnull()]

# split data in train and evaluation set
dep_variable = 'is_canceled'
X = data_clean.drop(dep_variable, axis = 1)
X = (X-X.mean())/X.std() # standardise data
y = data_clean[dep_variable]
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=21)

#### for development ####
#X_train = X_train.iloc[0:1000]
#y_train = y_train.iloc[0:1000]
#X_val = X_val[0:100]
#y_val = y_val[0:100]
#import EnsembleLearner
#import importlib
#importlib.reload(EnsembleLearner)
#from EnsembleLearner import EnsembleLearner

#### 03: Training and Forecasting

In [166]:
# function to evaluate the prediction outcome
from sklearn.metrics import accuracy_score

def matrix_acc(pred_matrix, truth_series):
    acc_dict = dict()
    for col in pred_matrix.columns:
        acc_dict[col] = round(accuracy_score(pred_matrix[col], truth_series), 4)
    return pd.DataFrame([acc_dict])

In [167]:
# choose the models
models = ['knn', 'GaussianNB', 'DecisionTree', 'SVM','RandomForest', 'LogisticRegression', 'MLP', 'XGBoost'] # , 'MLP', 'XGBoost'

# establish the ensemble learner class
EL = EnsembleLearner(X = X_train, y = y_train, models = models, ensemble_learner = 'DecisionTree')

# train the models and the ensemble learner
training_pred_df = EL.train_ensemble_learner(return_training_predictions=True, verbose=True)
matrix_acc(training_pred_df.drop('truth', axis=1), training_pred_df['truth'])

Model knn done training Ensemble Learner Training
Model GaussianNB done training Ensemble Learner Training
Model DecisionTree done training Ensemble Learner Training
Model SVM done training Ensemble Learner Training
Model RandomForest done training Ensemble Learner Training


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Model LogisticRegression done training Ensemble Learner Training


In [None]:
# check the performance on the test set
pred = EL.predict(X_val)
matrix_acc(pred, y_val)

#### 04: Prediction on Test Set

In [None]:
# train model again on whole data

# choose the models
models = ['knn', 'GaussianNB', 'DecisionTree', 'SVM','RandomForest', 'LogisticRegression', 'MLP', 'XGBoost'] # , 'MLP', 'XGBoost'

# establish the ensemble learner class
EL = EnsembleLearner(X = X, y = y, models = models, ensemble_learner = 'DecisionTree')

# train the models and the ensemble learner
training_pred_df = EL.train_ensemble_learner(return_training_predictions=True, verbose=True)
matrix_acc(training_pred_df.drop('truth', axis=1), training_pred_df['truth'])

In [None]:
# predict on the test set
final_prediction = EL.predict(test_set.drop(dep_variable, axis = 1))