# INSY695-078 Final Project: Analysis of Hotel Booking Cancellations

End to end project using Predictive Modeling and Causal Inference for Hotel Booking Cancellation Insights

## 1. Import the Required Libraries

In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

import numpy as np
import xgboost as xgb
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
import sklearn.metrics as metrics
from sklearn.metrics import *
from sklearn.metrics import classification_report, RocCurveDisplay, ConfusionMatrixDisplay
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.ensemble import IsolationForest
from imblearn.over_sampling import RandomOverSampler
import joblib

import warnings
warnings.filterwarnings('ignore')

## 2. Data Preprocessing

In [2]:
# Load the data
df = pd.read_csv('/Users/zy/Documents/GitHub/hotel_cancellation_ML2/Part 1/hotel_booking.csv')

### 2.1 Feature Engineering

In [3]:
df.sort_values(by=['name', 'arrival_date_year', 'arrival_date_month', 'arrival_date_day_of_month'], inplace=True)

# Create Number of bookings column counting the number of bookings by name prior to arrival year, month and day
df['num_bookings'] = df.groupby('name').cumcount()

# Arrival date year does not make sense for the model we want to build, so we drop it
df = df.drop(['arrival_date_year'], axis=1)

# Keep country names with more than 1000 bookings rest as 'Other'   
country_counts = df['country'].value_counts()
df['country'] = np.where(df['country'].isin(country_counts.index[country_counts > 1000]), df['country'], 'Other')

### 2.2 Reservation Status, Reservation Status Date are Updated after is cancelled , so it is dropped

In [4]:
# Drop the 'reservation_status_date' and 'reservation_status_days_difference' columns
df = df.drop(['reservation_status_date', 'reservation_status','assigned_room_type'], axis=1)

### 2.3 Removing Personal identification information

In [5]:
df = df.drop(['name', 'email','phone-number', 'credit_card'], axis=1)

### 2.4 Since the number of agents and company is very high and there are many missing values, we can omit these columns

In [6]:
# Drop the 'agent' and 'company' columns
df = df.drop(['agent', 'company'], axis=1)

### 2.5 Replace missing values with the 0 for the children column


In [7]:
df['children'] = df['children'].fillna(0)

## 3. Split the Data into train, validation, and test set

In [8]:
train, eval = train_test_split(df, test_size=0.3, random_state=42)
test, val = train_test_split(eval, test_size=0.5, random_state=42)

## 4. Categorical Encoding

In [9]:
# Using get_dummies to convert categorical columns to numerical columns
train = pd.get_dummies(train, columns=['hotel','arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type'], drop_first=True)

## 5. Handling Outliers in the dataset

In [10]:
iforest = IsolationForest(n_estimators=100, random_state=42, contamination=0.02)
pred = iforest.fit_predict(train)
score = iforest.decision_function(train)

from numpy import where
anom_index = where(pred== -1)
values = train.iloc[anom_index]

#Remove outliers
train = train.drop(values.index)

## 6. Define the target and features

In [11]:
# Split the data into features and target
X_train = train.drop('is_canceled', axis=1)
y_train = train['is_canceled']

## 7. Feature Selection

In [12]:
randomforest = RandomForestClassifier(random_state=42)
model = randomforest.fit(X_train,y_train)
model.feature_importances_
pd.DataFrame(list(zip(X_train.columns,model.feature_importances_)), columns = ['predictor','feature importance']).sort_values("feature importance")[:10]

Unnamed: 0,predictor,feature importance
54,market_segment_Undefined,1.2e-05
58,distribution_channel_Undefined,1.8e-05
66,reserved_room_type_L,3.7e-05
69,deposit_type_Refundable,0.000135
67,reserved_room_type_P,0.000181
56,distribution_channel_GDS,0.00023
70,customer_type_Group,0.000233
48,market_segment_Complementary,0.000416
65,reserved_room_type_H,0.000594
7,babies,0.000801


In [13]:
#Drop columns with low feature importance 'reserved_room_type_L','market_segment_Undefined','distribution_channel_Undefined'
X_train = X_train.drop(['reserved_room_type_L','market_segment_Undefined','distribution_channel_Undefined'], axis=1)

## 8. Standardize the Data

In [14]:
sc = StandardScaler()
X_train_std = sc.fit_transform(X_train)

## 9. Preprocessing for Validation & Test Data

In [15]:
#Replace missing values with the 0 for the children column
val['children'] = val['children'].fillna(0)

# Using get_dummies to convert categorical columns to numerical columns
val = pd.get_dummies(val, columns=['hotel','arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type'], drop_first=True)

# Split the data into features and target
X_val = val.drop('is_canceled', axis=1)
y_val = val['is_canceled']

#Drop columns with low feature importance 'reserved_room_type_L','market_segment_Undefined','distribution_channel_Undefined' if these columns exist
if 'reserved_room_type_L' in X_val.columns:
    X_val = X_val.drop(['reserved_room_type_L'], axis=1)
if 'market_segment_Undefined' in X_val.columns:
    X_val = X_val.drop(['market_segment_Undefined'], axis=1)
if 'distribution_channel_Undefined' in X_val.columns:
    X_val = X_val.drop(['distribution_channel_Undefined'], axis=1)

#standardize the data
sc = StandardScaler()
X_val_std = sc.fit_transform(X_val)


#Replace missing values with the 0 for the children column
test['children'] = test['children'].fillna(0)

# Using get_dummies to convert categorical columns to numerical columns
test = pd.get_dummies(test, columns=['hotel','arrival_date_month', 'meal', 'country', 'market_segment', 'distribution_channel', 'reserved_room_type', 'deposit_type', 'customer_type'], drop_first=True)

# Split the data into features and target
X_test = test.drop('is_canceled', axis=1)
y_test = test['is_canceled']

#Drop columns with low feature importance 'reserved_room_type_L','market_segment_Undefined','distribution_channel_Undefined' if these columns exist
if 'reserved_room_type_L' in X_test.columns:
    X_test = X_test.drop(['reserved_room_type_L'], axis=1)
if 'market_segment_Undefined' in X_test.columns:
    X_test = X_test.drop(['market_segment_Undefined'], axis=1)
if 'distribution_channel_Undefined' in X_test.columns:
    X_test = X_test.drop(['distribution_channel_Undefined'], axis=1)

#standardize the data
sc = StandardScaler()
X_test_std = sc.fit_transform(X_test)

## 10. Balancing the Classes for training data

In [16]:
#Use RandomOverSampler to handle imbalanced data
ros = RandomOverSampler(random_state=42)
X_train_ros, y_train_ros = ros.fit_resample(X_train_std, y_train)

pd.Series(y_train_ros).value_counts()

0    51281
1    51281
Name: is_canceled, dtype: int64

In [17]:
train_X = X_train.copy()
cols = list(train_X)

X_train_ros=pd.DataFrame(X_train_ros)
X_train_ros.columns=cols
#Renaming column name of Target variable
y_train_ros=pd.DataFrame(y_train_ros)
y_train_ros.columns = ['is_canceled']
scaled_train_df = pd.concat([X_train_ros,y_train_ros], axis=1)

X_train_ros & y_train_ros are the final features and target dataframes after balancing classes to be used for model training.

## 11. Training for the best selected model

In [18]:
clf_rf_best = RandomForestClassifier(
    **{
        "min_samples_leaf": 1,
        "n_estimators": 200,
        "random_state": 42,
    })
clf_rf_best.fit(X_train_ros, y_train_ros)
print("Accuracy score of the best model: ", accuracy_score(y_val, clf_rf_best.predict(X_val)))
print("\nROC-AUC score of the best model: ", roc_auc_score(y_val, clf_rf_best.predict(X_val)))

Accuracy score of the best model:  0.6282316153889106

ROC-AUC score of the best model:  0.6514754434856518


## 12. Hyperparameter Tuning

In [19]:
!pip install optuna


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m


In [20]:
pip install optuna-dashboard


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m23.3.2[0m[39;49m -> [0m[32;49m24.0[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
Note: you may need to restart the kernel to use updated packages.


In [21]:
from sklearn.model_selection import RandomizedSearchCV, cross_val_score
from sklearn.metrics import accuracy_score, roc_auc_score
from bayes_opt import BayesianOptimization
from ray.tune.schedulers import PopulationBasedTraining
from ray import tune

from hyperopt import hp, fmin, tpe, Trials, STATUS_OK
from ray.tune.schedulers import AsyncHyperBandScheduler
from ray.tune.schedulers import HyperBandForBOHB
import mlflow

import optuna
import sklearn.datasets
import sklearn.ensemble
import sklearn.metrics
import sklearn.model_selection
from sklearn.svm import SVR
from optuna.trial import TrialState

### 12.5 Optuna

In [22]:
# Import Optuna directly
import optuna

# Define an objective function to be minimized.
def objective(trial):
    # Invoke suggest methods of a Trial object to generate hyperparameters.
    regressor_name = trial.suggest_categorical('classifier', ['SVR', 'RandomForest'])
    if regressor_name == 'SVR':
        svr_c = trial.suggest_float('svr_c', 1e-10, 1e10, log=True)
        regressor_obj = SVR(C=svr_c)
    else:
        rf_max_depth = trial.suggest_int('rf_max_depth', 2, 32)
        regressor_obj = sklearn.ensemble.RandomForestRegressor(max_depth=rf_max_depth)
        
    X, y = sklearn.datasets.load_boston(return_X_y=True)
    X_train, X_val, y_train, y_val = sklearn.model_selection.train_test_split(X, y, random_state=0)
    regressor_obj.fit(X_train, y_train)
    y_pred = regressor_obj.predict(X_val)
    error = sklearn.metrics.mean_squared_error(y_val, y_pred)
    return error  # An objective value linked with the Trial object.

# Create a new study.
study = optuna.create_study(direction='minimize')

# Invoke optimization of the objective function.
study.optimize(objective, n_trials=100)

# Get the best hyperparameters
best_params_optuna = study.best_params

[I 2024-04-22 18:52:59,937] A new study created in memory with name: no-name-cc1f7808-b4f0-4e1a-935e-0d0869d6f4ce
[W 2024-04-22 18:52:59,947] Trial 0 failed with parameters: {'classifier': 'RandomForest', 'rf_max_depth': 22} because of the following error: ImportError('\n`load_boston` has been removed from scikit-learn since version 1.2.\n\nThe Boston housing prices dataset has an ethical problem: as\ninvestigated in [1], the authors of this dataset engineered a\nnon-invertible variable "B" assuming that racial self-segregation had a\npositive impact on house prices [2]. Furthermore the goal of the\nresearch that led to the creation of this dataset was to study the\nimpact of air quality but it did not give adequate demonstration of the\nvalidity of this assumption.\n\nThe scikit-learn maintainers therefore strongly discourage the use of\nthis dataset unless the purpose of the code is to study and educate\nabout ethical issues in data science and machine learning.\n\nIn this special ca

ImportError: 
`load_boston` has been removed from scikit-learn since version 1.2.

The Boston housing prices dataset has an ethical problem: as
investigated in [1], the authors of this dataset engineered a
non-invertible variable "B" assuming that racial self-segregation had a
positive impact on house prices [2]. Furthermore the goal of the
research that led to the creation of this dataset was to study the
impact of air quality but it did not give adequate demonstration of the
validity of this assumption.

The scikit-learn maintainers therefore strongly discourage the use of
this dataset unless the purpose of the code is to study and educate
about ethical issues in data science and machine learning.

In this special case, you can fetch the dataset from the original
source::

    import pandas as pd
    import numpy as np

    data_url = "http://lib.stat.cmu.edu/datasets/boston"
    raw_df = pd.read_csv(data_url, sep="\s+", skiprows=22, header=None)
    data = np.hstack([raw_df.values[::2, :], raw_df.values[1::2, :2]])
    target = raw_df.values[1::2, 2]

Alternative datasets include the California housing dataset and the
Ames housing dataset. You can load the datasets as follows::

    from sklearn.datasets import fetch_california_housing
    housing = fetch_california_housing()

for the California housing dataset and::

    from sklearn.datasets import fetch_openml
    housing = fetch_openml(name="house_prices", as_frame=True)

for the Ames housing dataset.

[1] M Carlisle.
"Racist data destruction?"
<https://medium.com/@docintangible/racist-data-destruction-113e3eff54a8>

[2] Harrison Jr, David, and Daniel L. Rubinfeld.
"Hedonic housing prices and the demand for clean air."
Journal of environmental economics and management 5.1 (1978): 81-102.
<https://www.researchgate.net/publication/4974606_Hedonic_housing_prices_and_the_demand_for_clean_air>


In [None]:
best_params_optuna