# Predicting Cancellation Rates

In this notebook, you will build a machine learning model to predict whether or not a customer cancelled a hotel booking.

We will use a dataset on hotel bookings from the article ["Hotel booking demand datasets"](https://www.sciencedirect.com/science/article/pii/S2352340918315191), published in the Elsevier journal [Data in Brief](https://www.sciencedirect.com/journal/data-in-brief). The abstract of the article states:

> This data article describes two datasets with hotel demand data. One of the hotels (H1) is a resort hotel and the other is a city hotel (H2). Both datasets share the same structure, with 31 variables describing the 40,060 observations of H1 and 79,330 observations of H2. Each observation represents a hotel booking. Both datasets comprehend bookings due to arrive between the 1st of July of 2015 and the 31st of August 2017, including bookings that effectively arrived and bookings that were canceled. 

For convenience, the two datasets have been combined into a single CSV file, `hotel_bookings.csv`. Let us start by importing the libraries needed to load, visualize, and model the data.

In [None]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix
import matplotlib.pyplot as plt
import lightgbm as lgb


# Load the combined resort-hotel + city-hotel bookings from the working directory.
hotel_bookings = pd.read_csv('hotel_bookings.csv')

# Lookup table from English month name to calendar month number
# (January -> 1 ... December -> 12), built by numbering the names in order.
month_mapping = {
    month_name: month_number
    for month_number, month_name in enumerate(
        [
            "January", "February", "March", "April", "May", "June",
            "July", "August", "September", "October", "November", "December",
        ],
        start=1,
    )
}

# Replace the month names with their numbers; any value that is not a key of
# the lookup table becomes NaN.
arrival_month_number = hotel_bookings['arrival_date_month'].map(month_mapping)
hotel_bookings['arrival_date_month'] = arrival_month_number

# Assemble one datetime per booking from its separate year / month / day columns.
arrival_date_parts = {
    'year': hotel_bookings['arrival_date_year'],
    'month': hotel_bookings['arrival_date_month'],
    'day': hotel_bookings['arrival_date_day_of_month'],
}
hotel_bookings['arrival_date'] = pd.to_datetime(arrival_date_parts)

# Order bookings chronologically so the hold-out set is the most recent 20%.
# A stable sort keeps bookings that share an arrival date in their original
# file order; the default quicksort gives no such guarantee, so rows tied at
# the cutoff could otherwise fall on either side of the split.
hotel_bookings_sorted = hotel_bookings.sort_values('arrival_date', kind='stable')

# First 80% of the rows (by arrival date) train the models, the rest test them.
cutoff_index = int(len(hotel_bookings_sorted) * 0.8)

# Replacement values for the columns that are filled when missing.
missing_value_fill = {'children': 0, 'country': 'Unknown', 'agent': 'Unknown', 'company': 'Unknown'}

# Columns treated as categorical (cast to object so they are one-hot encoded
# rather than used as numbers).
categorical_vars = ['agent', 'company', 'country', 'hotel', 'arrival_date_month', 'meal', 'market_segment', 'distribution_channel', 'reserved_room_type', 'assigned_room_type', 'customer_type']


def _fill_and_cast(bookings):
    """Return a cleaned copy of ``bookings`` without modifying the input.

    Missing values in the columns listed in ``missing_value_fill`` are
    replaced, then every column in ``categorical_vars`` is cast to ``object``.

    Both steps return new frames. The train/test frames are ``.iloc`` slices
    of ``hotel_bookings_sorted``; mutating such a slice in place
    (``fillna(inplace=True)`` or column assignment) is the chained-assignment
    pattern that triggers ``SettingWithCopyWarning`` and whose effect depends
    on pandas' copy-on-write mode.

    Parameters
    ----------
    bookings : pandas.DataFrame
        A subset of the bookings table containing the columns named in
        ``missing_value_fill`` and ``categorical_vars``.

    Returns
    -------
    pandas.DataFrame
        New frame with the same rows and columns as ``bookings``.

    Raises
    ------
    KeyError
        If a column in ``categorical_vars`` is absent from ``bookings``.
    """
    object_dtypes = {column: 'object' for column in categorical_vars}
    return bookings.fillna(missing_value_fill).astype(object_dtypes)


# The same preprocessing is applied to both halves of the split.
train_data = _fill_and_cast(hotel_bookings_sorted.iloc[:cutoff_index])
test_data = _fill_and_cast(hotel_bookings_sorted.iloc[cutoff_index:])

# The split must neither lose nor duplicate bookings.
assert len(train_data) + len(test_data) == len(hotel_bookings_sorted)

# Columns excluded from the feature matrix: the target itself, the helper
# datetime used only for ordering, and the two reservation-status columns.
non_feature_columns = ['is_canceled', 'arrival_date', 'reservation_status_date', 'reservation_status']

# Training set: one-hot encode and drop the first level of each categorical
# column. The columns produced here define the model's feature schema.
X_train = pd.get_dummies(train_data.drop(columns=non_feature_columns), drop_first=True)
y_train = train_data['is_canceled']

# Test set: encode EVERY level (drop_first=False). `drop_first` removes the
# first level present in the frame being encoded, so encoding train and test
# separately with drop_first=True can drop different baselines (e.g. when the
# test period does not contain every month or country seen in training).
# Test rows in their own dropped level would then be all zeros and be
# indistinguishable from the training baseline.
X_test = pd.get_dummies(test_data.drop(columns=non_feature_columns), drop_first=False)
y_test = test_data['is_canceled']

# join='left' keeps exactly the training columns, in training order:
#   * the training baseline level (absent from X_train) is removed from X_test,
#     so those rows correctly become all zeros;
#   * levels that occur only in the test set are removed as well, instead of
#     being added to X_train as constant-zero columns;
#   * training columns missing from the test set are added to X_test as 0.
X_train, X_test = X_train.align(X_test, join='left', axis=1, fill_value=0)

# Both matrices must expose the same features in the same order.
assert X_train.columns.equals(X_test.columns)

# Candidate classifiers, each created with the same fixed random_state.
models = {
    "RandomForest": RandomForestClassifier(random_state=42),
    "GradientBoosting": GradientBoostingClassifier(random_state=42),
    "LightGBM": lgb.LGBMClassifier(random_state=42),
}

# Scalar metrics, listed in the order they are stored and printed.
scalar_metrics = (
    ('Accuracy', accuracy_score),
    ('Precision', precision_score),
    ('Recall', recall_score),
    ('F1 Score', f1_score),
)

# Maps model name -> dict of metric label -> value (plus the confusion matrix).
results = {}

for name, model in models.items():
    # Fit on the training matrix, then predict the held-out bookings.
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)

    # Evaluate every scalar metric on the same predictions.
    scores = {label: metric(y_test, y_pred) for label, metric in scalar_metrics}
    confusion = confusion_matrix(y_test, y_pred)

    results[name] = {**scores, 'Confusion Matrix': confusion}

    # Plain-text report for this model.
    print(f"Results for {name}:")
    for label, value in scores.items():
        print(f"{label}: {value}")
    print("Confusion Matrix:")
    print(confusion)
    print("\n")

# Bar positions for at most the ten highest-ranked features (fewer if the
# feature matrix has fewer than ten columns).
top_positions = range(X_train.shape[1])[:10]

# One bar chart per fitted model that exposes `feature_importances_`.
for name, model in models.items():
    if not hasattr(model, 'feature_importances_'):
        continue

    importances = model.feature_importances_
    # Feature indices ordered from most to least important; keep the top ten.
    ranked = np.argsort(importances)[::-1]
    top_features = ranked[:10]

    plt.figure(figsize=(10, 6))
    plt.title(f"Feature importances for {name}")
    plt.bar(top_positions, importances[top_features], color="r", align="center")
    # Label each bar with its feature name, rotated so long names stay legible.
    plt.xticks(top_positions, X_train.columns[top_features], rotation=90)
    plt.xlim([-1, 10])
    plt.show()

# Trailing show() kept from the original cell; every figure created in the
# loop has already been shown there.
plt.show()
