#### In this case study we build a machine learning model that predicts hotel booking cancellations
#### in Portugal. Using historical data, it tries to predict whether a booking will be cancelled
#### or will not be cancelled.
#### Another goal of this project is to find the characteristics of customers who cancelled
#### and to find patterns in cancelled bookings through exploratory data analysis.


In [None]:
# Importing all the required libraries for the project.


# Data analysis and wrangling
import pandas as pd
import numpy as np
import scipy.stats as stats

# Visualization
# pyplot is used as `plt` by every plotting cell below.
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns

# Filter warning
import warnings
warnings.filterwarnings("ignore")

# Machine learning
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from matplotlib.colors import ListedColormap, LinearSegmentedColormap
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score 
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.metrics import accuracy_score, recall_score, precision_score, f1_score, roc_auc_score
from sklearn.metrics import classification_report, RocCurveDisplay, ConfusionMatrixDisplay

In [None]:
# reading csv file data

hotel_data=pd.read_csv("C:\\Users\\admin\\Downloads\\hotel_bookings.csv")
hotel_data.head(10)

In [None]:
#Initial data exploration to find total number of columns and rows of our dataset and to find the data type of each column

hotel_data.info()

In [None]:
hotel_data.shape

In [None]:
hotel_data.describe()

# Handling Missing Values

In [None]:
# Finding the percentage of all the null values in every column

hotel_data.isnull().sum()/len(hotel_data)*100

##### **From the above we can conclude that about 94% of the data in the "company" column is missing, hence we can drop that column; the other columns that contain null values can be treated by imputing values.**

In [None]:
# Dropping column "company"

hotel_data=hotel_data.drop(['company'], axis=1)
hotel_data.head()

In [None]:
# Imputing null values for columns- "country","agent","children"

hotel_data["agent"]=hotel_data["agent"].fillna(hotel_data["agent"].median())

In [None]:
hotel_data["children"]=hotel_data["children"].fillna(hotel_data["children"].median())

In [None]:
# Only about 0.4% of the "country" values are missing, so each gap is forward-filled
# with the value of the previous row.
# Series.ffill() is the supported spelling; fillna(method='ffill') is deprecated in pandas.
hotel_data["country"] = hotel_data["country"].ffill()

In [None]:
hotel_data.isnull().sum()

#### **From above we can see that our data is now free from all the missing values

# EDA

#### In this part, I would like to visualize some features and show statistical relationship with target variable. This analysis will help to get overall view and deep familiarity of the data, detect extreme values and identify obvious errors.


In [None]:
hotel_data.head()

In [None]:
# Visualising no. of booking v/s type of hotel. According to data we have two types hotel
# and here we are going to visualise which type has most no.of bookings

plt.figure(figsize=(6,6))
sns.countplot(x='hotel', data = hotel_data, palette="rocket")
plt.title('Hotel Types', weight='bold')
plt.xlabel('Hotel', fontsize=12)
plt.ylabel('Count', fontsize=12)

#### **From above graph we can see no of bookings count is more in city hotel than resort hotel

In [None]:
# In this graph we are going to visualise no. of cancellation v/s no. of bookings. 
# This graph will tell us how many booking are being cancelled out of total booking

plt.figure(figsize=(6,6))
sns.countplot(x='is_canceled', data = hotel_data, palette="cubehelix")
plt.title("Cancellation v/s no. of booking", weight='bold')
plt.xlabel('no. of cancellation', fontsize=12)
plt.ylabel('Count', fontsize=12)

#### **From the above we can conclude that approx. 63% of people did not cancel their booking and approx. 37% did cancel. The data is therefore slightly imbalanced but not highly imbalanced, so there is no actual need for an oversampling or undersampling technique.**

In [None]:
# We are going to visualise cancellation done by different type of customers out of total no. of bookings done by them.

group_customertype_cancel = hotel_data.groupby([ 'customer_type', 'is_canceled']).size().unstack()
group_customertype_cancel.plot(kind='bar', stacked=True, cmap='spring', figsize=(6,6))
plt.title('Customer Type vs Booking Cancellation Status', weight='bold')
plt.xlabel('Type of customer', fontsize=12)
plt.xticks(rotation=360)
plt.ylabel('Count', fontsize=12)

#### **From above Graph we can conclude that most of the bookings and cancellation are done by Transient customers only.

In [None]:
# Visualising no. of bookings from the top 10 countries.

# value_counts() sorts by frequency (descending), so the first 10 index labels are the
# ten most frequent countries. Series.value_counts() replaces the deprecated
# top-level pd.value_counts().
top_countries = hotel_data['country'].value_counts().iloc[:10].index

plt.figure(figsize=(6,16))
sns.countplot(x='country', data=hotel_data, order=top_countries, palette="brg")
plt.title('Top 10 Country of Origin', weight='bold')
plt.xlabel('Country', fontsize=12)
plt.ylabel('Count', fontsize=12)


#### **From above we concluded that almost 40% of the booking are done from Portugal itself.

In [None]:
# "Market_segment" feature exploration

# Bars are ordered from the most to the least frequent segment.
# Series.value_counts() replaces the deprecated top-level pd.value_counts().
segment_order = hotel_data['market_segment'].value_counts().index

plt.figure(figsize=(10,10))
sns.countplot(x=hotel_data['market_segment'], palette='spring_r', order=segment_order)
plt.title('Market Segment Types', weight='bold')
plt.xlabel('Market Segment', fontsize=12)
plt.ylabel('Count', fontsize=12)

#### **From above we concluded that most of the bookings are done via Online Travel Agencies

In [None]:
# Graph to show change in average daily rate of different types of rooms.

# Only bookings that were not cancelled are plotted.
data = hotel_data[hotel_data['is_canceled'] == 0]

# sns.catplot is a figure-level function: it creates its own figure, sized by
# `height` and `aspect`. A preceding plt.figure() call would only leave behind an
# empty extra figure, so none is created here.
sns.catplot(data=data,x="reserved_room_type",y="adr",hue="hotel",height=5, aspect=.8)
plt.title('ADR v/s Type of reserved rooms in different hotel', weight='bold')
plt.xlabel('Type of Rooms', fontsize=12)
plt.ylabel('ADR', fontsize=12)

#### **From above graph we can conclude that Average daily rate varies with the type of room reserved by guest in different types of hotel. 

In [None]:
# We are going to visualise how ADR varies in different months in different hotels.

# Only bookings that were not cancelled are plotted.
data = hotel_data[hotel_data['is_canceled'] == 0]

# sns.catplot is a figure-level function: it creates its own figure, sized by
# `height` and `aspect`. A preceding plt.figure() call would only leave behind an
# empty extra figure, so none is created here.
sns.catplot(data=data,x="arrival_date_month",y="adr",hue="hotel",height=5,aspect=2,palette='spring_r')
plt.title('ADR v/s Different months in different hotels', weight='bold')
plt.xlabel('Months', fontsize=12)
plt.ylabel('ADR', fontsize=12)

#### **From above graph we can conclude that most of the bookings are done in Spring and summer season of Portugal. ADR goes down from the month of September to February. Also most of the bookings are from Resort hotel.

In [None]:
# We are now going to visualise no. of booking cancellation in different months because of ADR.

hotel_data['adr'] = hotel_data['adr'].astype(float)
plt.figure(figsize=(15,10))
sns.barplot(x='arrival_date_month', y='adr', hue='is_canceled', dodge=True, palette= 'PuBu_r', data=hotel_data)
plt.title('Arrival Month vs ADR vs Booking Cancellation Status', weight='bold')
plt.xlabel('Arrival Month', fontsize=12)
plt.ylabel('ADR', fontsize=12)

#### **Here we conclude that, since the average daily rate is high in spring and summer, most of the cancellations also occur in those same seasons, due to the high rates.**

In [None]:
# We are now going to visualise the impact of special request on the no. of cancellation out of total bookings.

group_cancel_request = hotel_data.groupby([ 'total_of_special_requests', 'is_canceled']).size().unstack()
group_cancel_request.plot(kind='bar', stacked=True, cmap='Accent', figsize=(6,6))
plt.title('Total Special Request vs Booking Cancellation Status', weight='bold')
plt.xlabel('Number of Special Request', fontsize=12)
plt.xticks(rotation=360)
plt.ylabel('Count', fontsize=12)

#### **From the above graph we can clearly see that almost 40% of the bookings are canceled when no special request is made by the guest.**

In [None]:
# Now as we can see children and babies make no difference so we will combine these two features to make one

hotel_data['all_children'] = hotel_data['children'] + hotel_data['babies']

In [None]:
hotel_data=hotel_data.drop(["children","babies"],axis=1)

In [None]:
hotel_data.head()

In [None]:
hotel_data.info()

In [None]:
# Converting type of column "all_children"

hotel_data['all_children']= hotel_data['all_children'].astype(int)

In [None]:
# Finding the correlation of each numeric column with every other numeric column

plt.figure(figsize = (24, 12))

# Restrict the frame to numeric columns before calling corr(): hotel_data still holds
# object (string) columns at this point, and DataFrame.corr() raises on non-numeric
# data in pandas >= 2.0 instead of silently skipping it.
corr = hotel_data.select_dtypes(include="number").corr()
sns.heatmap(corr, annot = True, linewidths = 1)
plt.show()

### _From the above heatmap we can conclude that the features "hotel", "reserved_room_type", "previous_bookings_not_canceled" and "market_segment" are strongly correlated with "agent", "assigned_room_type", "previous_cancellations" and "distribution_channel". Hence we can drop "agent", "assigned_room_type", "previous_cancellations" and "distribution_channel" from the data. We can also see that "arrival_date_day_of_month", "days_in_waiting_list" and "arrival_date_week_number" are not much correlated with "is_canceled" and hence do not provide much insight into the number of cancellations. Therefore we will drop these three features as well._

### _Also reservation_status is a categorical feature that indicates the current status of a reservation. It can have values like 'Canceled', 'Check-Out', and 'No-Show'. This feature is directly related to the target variable is_canceled for the following reasons:

##### If the reservation_status is 'Canceled', it implies that is_canceled should be 1.
##### Similarly, if the reservation_status is 'Check-Out', it implies that the booking was not canceled, and hence is_canceled should be 0.
##### Therefore, knowing the reservation_status directly gives us the value of the target variable, leading to data leakage if it is used as a feature in the model. So, it is important to remove this feature to build a model that can actually predict cancellations. Therefore, reservation_status is considered as an irrelevant feature and should be omitted.

### _Since reservation_status_date includes date type data which also is directly related to target variable as change in this date can provide us direct info whether booking was canceled before that date or not. hence we will omit this too._

### _We can also drop arrival_date_year as it is only providing information for certain years hence can not be used for future predictions therefore it can be considered as irrelevant features._


In [None]:
hotel_data=hotel_data.drop(["agent","assigned_room_type","distribution_channel",
                            "arrival_date_week_number","reservation_status_date",
                            "previous_cancellations","arrival_date_day_of_month",
                            "days_in_waiting_list","reservation_status","arrival_date_year"],axis=1)

In [None]:
hotel_data.head()

In [None]:
hotel_data.shape

In [None]:
hotel_data.info()

#### **From above we can now see that data is left with 1 float dtype column, 12 int dtype column and 8 object i.e string dtype columns



In [None]:
# Columns that are categorical by meaning, whatever dtype pandas currently stores them as.
categorical_columns = [
    'hotel', 'arrival_date_month', 'meal', 'country', 'market_segment',
    'is_repeated_guest', 'reserved_room_type', 'deposit_type', 'agent',
    'customer_type']

# Cast each listed column that is still present in the frame to str (reported by
# pandas as the object dtype); names that are not in hotel_data are skipped.
present_columns = [name for name in categorical_columns if name in hotel_data.columns]
for name in present_columns:
    hotel_data[name] = hotel_data[name].astype(str)

In [None]:
hotel_data.info()

In [None]:
# Statisticals for categorical data
hotel_data.describe(include="object").T

#### _Here we can see column country has 177 unique values which is quite large to handle hence for ease we can drop this feature as well._

In [None]:
hotel_data=hotel_data.drop(["country"], axis=1)

In [None]:
# Statistics for numerical data

hotel_data.describe().T

### **From the above statistics we can see there is some noisy data in the columns adr, all_children and adults.**

#### "adr" = there are negative values in this column, which is absurd because this column holds the average daily rate and that can't be negative.
#### "adults" = there are 0 values in the adults column, meaning there is no adult, and it isn't possible for a child to check in to a hotel without an adult.
#### "all_children" = there is a max value of 10 children in this column, which is quite unusual, hence it is considered an outlier and should be omitted.

# Handling Noisy Data

In [None]:
noisy_data = {
    'adr':      hotel_data[hotel_data['adr'] < 0],
    'adults':   hotel_data[hotel_data['adults'] == 0],
    'all_children': hotel_data[hotel_data['all_children'] == 10],}

noisy_data_count = {key: len(value) for key, value in noisy_data.items()}
noisy_data_count

#### **Here we can see that there is one negative adr value, so we will replace it with the median; there are 386 rows with 0 adults, and since that number is small, removing those rows won't impact our data much; and there are 2 rows with 10 children, which are considered outliers because that value is unusually large compared to the other values of that column, hence they should be omitted.**

In [None]:
# Replace negative adr with median of adr column
hotel_data.loc[hotel_data['adr'] < 0, 'adr'] = hotel_data['adr'].median()

In [None]:
# Deleting rows with 0 adults
hotel_data=hotel_data.loc[hotel_data["adults"]!=0]

In [None]:
# Deleting rows with 10 children
hotel_data=hotel_data.loc[hotel_data["all_children"]!=10]

In [None]:
# Resetting the index
hotel_data.reset_index(drop=True, inplace=True)

In [None]:
#Checking if noisy data is treated or not.
noisy_data_handled = {
    'adr': hotel_data[hotel_data['adr'] < 0],
    'adults': hotel_data[hotel_data['adults'] == 0],
    'all_children': hotel_data[hotel_data['all_children'] == 10]}

noisy_data_handled_count = {key: len(value) for key, value in noisy_data_handled.items()}
noisy_data_handled_count

# Encoding

In [None]:
# One hot encoding- it is used for data which is not in order.
# Label encoding- it is used for data which is in order.

### One Hot Encoding

The following features are nominal variables and should be one-hot encoded:

`hotel`

`meal`

`market_segment`

`reserved_room_type`

`deposit_type`

`customer_type`

In [None]:
one_hot_cols = ['hotel', 'meal', 'market_segment', 'reserved_room_type', 'deposit_type', 'customer_type',"is_repeated_guest"]
hotel_data = pd.get_dummies(hotel_data, columns=one_hot_cols, drop_first=True)


# Label Encoding

The following feature is an ordinal variable and should be label encoded:

`arrival_date_month`


In [None]:
months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 
          'August', 'September', 'October', 'November', 'December']
hotel_data["arrival_date_month"]= hotel_data["arrival_date_month"].apply(lambda i:months.index(i)+1)

In [None]:
hotel_data.head()

In [None]:
hotel_data.describe().T

In [None]:
hotel_data.info()

# Model Training and Selection

### Train/Test Split

In [None]:
X= hotel_data.drop(["is_canceled"],axis=1)
X.shape

In [None]:
y= hotel_data["is_canceled"]
y.shape

In [None]:
# 80/20 split, stratified on y so both parts keep the same cancellation ratio.
# random_state is fixed so the split (and every score computed from it) is reproducible,
# in line with the random_state=0 used for the classifier and the CV folds.
X_train,X_test,y_train,y_test=train_test_split(X,y,train_size=0.80,stratify=y,random_state=0)

In [None]:
X_train.shape

In [None]:
y_train.shape

# Decision Tree Classifier

In [None]:
dt_base = DecisionTreeClassifier(random_state=0)

### Hyperparameter Tuning

In [None]:
def tune_clf_hyperparameters(clf, param_grid, X_train, y_train, scoring='f1', n_splits=5):
    '''
    Search `param_grid` exhaustively for the best hyperparameters of `clf`.

    Every combination in the grid is scored with GridSearchCV using stratified,
    shuffled k-fold cross-validation on the training data.

    Parameters:
        clf: the classifier to tune.
        param_grid: hyperparameter grid handed to GridSearchCV.
        X_train, y_train: training features and labels the search is fitted on.
        scoring: metric used to rank the combinations (default 'f1').
        n_splits: number of cross-validation folds (default 5).

    Returns:
        A tuple (best_estimator, best_params): the search's `best_estimator_`
        and the dictionary of hyperparameters that produced it.
    '''
    # Stratified folds keep the class distribution the same in every fold;
    # the fixed random_state makes the shuffling repeatable.
    folds = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=0)

    # n_jobs=-1 is passed so the candidates can be evaluated in parallel.
    search = GridSearchCV(clf, param_grid, cv=folds, scoring=scoring, n_jobs=-1)
    search.fit(X_train, y_train)

    return search.best_estimator_, search.best_params_

In [None]:
# Hyperparameter grid for DT
param_grid_dt = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [13, 14, 15],
    'min_samples_split': [2, 3, 4],
    'min_samples_leaf': [1, 2, 3],
    'class_weight': [{0: 1, 1: w} for w in [1, 2, 3]]
}

##### Since the data is slightly imbalanced and we want to optimize for class 1, we have included the class_weight parameter in our grid. In the grid above, the weight for class 0 is always 1, while the weight for class 1 varies from 1 to 3. This will help the model to focus more on class 1.

In [1]:
# Call the function for hyperparameter tuning
best_dt, best_dt_hyperparams = tune_clf_hyperparameters(dt_base, param_grid_dt, X_train, y_train)

NameError: name 'tune_clf_hyperparameters' is not defined

In [None]:
print('DT Optimal Hyperparameters: \n', best_dt_hyperparams)

### Dt Model Evaluation

In [None]:
def metrics_calculator(clf, X_test, y_test, model_name):
    '''
    Summarise the test-set performance of a fitted classifier.

    Accuracy, precision, recall, F1-score and ROC AUC are computed on the test data,
    with class 1 as the positive label for precision, recall and F1.

    Returns:
        A one-column DataFrame named `model_name`, indexed by metric name, whose
        values are percentage strings rounded to two decimals (ending in '%').
    '''
    predicted = clf.predict(X_test)

    accuracy = accuracy_score(y_test, predicted)
    precision = precision_score(y_test, predicted, pos_label=1)
    recall = recall_score(y_test, predicted, pos_label=1)
    f1 = f1_score(y_test, predicted, pos_label=1)
    # AUC is computed from the second predict_proba column rather than from hard labels.
    auc = roc_auc_score(y_test, clf.predict_proba(X_test)[:, 1])

    metric_names = ['Accuracy', 'Precision (Class 1)', 'Recall (Class 1)',
                    'F1-score (Class 1)', 'AUC (Class 1)']
    scores = pd.DataFrame(data=[accuracy, precision, recall, f1, auc],
                          index=metric_names,
                          columns=[model_name])

    # Turn the fractions into percentage strings.
    return (scores * 100).round(2).astype(str) + '%'

In [None]:
def model_evaluation(clf, X_train, X_test, y_train, y_test, model_name):
    '''
    Print and plot a full performance report for a fitted classifier.

    Classification reports are printed for the training set and then the test set.
    A three-panel figure is then shown for the test data: confusion matrix,
    ROC curve, and a table of the metrics returned by metrics_calculator.
    Nothing is returned.
    '''
    sns.set(font_scale=1.2)

    # Same report layout for both splits: heading, separator line, report text.
    separator = "-" * 55
    report_inputs = (
        ("\n\t  Classification report for training set", X_train, y_train),
        ("\n\t   Classification report for test set", X_test, y_test),
    )
    for heading, features, labels in report_inputs:
        predictions = clf.predict(features)
        print(heading)
        print(separator)
        print(classification_report(labels, predictions))

    # One row of three panels; the table panel is half as wide as the two plots.
    _, (matrix_ax, roc_ax, table_ax) = plt.subplots(
        1, 3, figsize=(15, 5), dpi=100, gridspec_kw={'width_ratios': [2, 2, 1]}
    )

    # White-to-royal-blue colormap, used reversed for the confusion matrix.
    white_to_blue = LinearSegmentedColormap.from_list(
        'royalblue', [(0, (1,1,1)), (1, (0.25,0.41,0.88))]
    )
    ConfusionMatrixDisplay.from_estimator(
        clf, X_test, y_test, colorbar=False, cmap=white_to_blue.reversed(), ax=matrix_ax
    )
    matrix_ax.set_title('Confusion Matrix for Test Data')
    matrix_ax.grid(False)

    # ROC curve for the test data.
    RocCurveDisplay.from_estimator(clf, X_test, y_test, ax=roc_ax)
    roc_ax.set_xlabel('False Positive Rate')
    roc_ax.set_ylabel('True Positive Rate')
    roc_ax.set_title('ROC Curve for Test Data (Positive label: 1)')

    # Metrics table drawn on an axes with its frame and ticks switched off.
    summary = metrics_calculator(clf, X_test, y_test, model_name)
    metrics_table = table_ax.table(cellText=summary.values, colLabels=summary.columns,
                                   rowLabels=summary.index, loc='center')
    metrics_table.scale(0.6, 2)
    metrics_table.set_fontsize(12)
    table_ax.axis('tight')
    table_ax.axis('off')

    # Colour every cell that sits in table row 0.
    for (row, _col), cell in metrics_table.get_celld().items():
        if row == 0:
            cell.set_color('royalblue')

    plt.tight_layout()
    plt.show()
    

In [None]:
model_evaluation(best_dt, X_train, X_test, y_train, y_test, 'Decision Tree')

In [None]:
dt_result = metrics_calculator(best_dt, X_test, y_test, 'Decision Tree')
dt_result

# Random Forest Classifier

In [None]:
# rf_base = RandomForestClassifier(random_state=0, n_jobs=-1)

In [None]:
# param_grid_rf = {
#     'n_estimators': [100, 150],
#     'criterion': ['entropy'],
#     'max_depth': [16, 18],
#     'min_samples_split': [2, 3, 4],
#     'min_samples_leaf': [1, 2, 3],
#     'class_weight': [{0: 1, 1: w} for w in [1, 2, 3]]
# }

In [None]:
# # Using the tune_clf_hyperparameters function to get the best estimator
# best_rf, best_rf_hyperparams = tune_clf_hyperparameters(rf_base, param_grid_rf, X_train, y_train)

# print('RF Optimal Hyperparameters: \n', best_rf_hyperparams)

### RF Model Evaluation

In [None]:
# model_evaluation(best_rf, X_train, X_test, y_train, y_test, 'Random Forest')

In [None]:
# rf_result = metrics_calculator(best_rf, X_test, y_test, 'Random Forest')
# rf_result

# Conclusion

From above we can see that Random Forest gives better accuracy than Decision Tree