In [None]:
Importing necessary libraries and data

In [1]:
import numpy as np
import pandas as pd

from sklearn.model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LinearRegression, LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.ensemble import (
    BaggingClassifier,
    RandomForestClassifier,
    ExtraTreesClassifier,
    AdaBoostClassifier,
    StackingClassifier,
    GradientBoostingClassifier,
    HistGradientBoostingClassifier,
)

from lightgbm import LGBMClassifier
from sklearn.model_selection import cross_val_score, cross_validate, cross_val_predict

# To tune a model
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV


from utils import (
    col_out,
    model_summary,
    adj_r2_score,
    model_performance_regression,
    confusion_matrix_helper,
    model_performance_classification_sklearn,
)


import sklearn.metrics as metrics
from sklearn.metrics import (
    mean_absolute_error,
    mean_squared_error,
    r2_score,
    classification_report,
    confusion_matrix,
    ConfusionMatrixDisplay,
    f1_score,
    accuracy_score,
    recall_score,
    precision_score,
    roc_auc_score,
    precision_recall_curve,
    roc_curve,
)
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn import set_config

# Render scikit-learn estimators as interactive diagrams in cell output.
set_config(display="diagram")

# Ordered (old, new) substring substitutions applied to every raw CSV header.
# Order matters: "_p" is rewritten before "_c", and spaces are stripped last-but-one,
# which is how "Unnamed: 0" ends up as "Unnamed:0".
_HEADER_SUBSTITUTIONS = (
    ("'", ""),
    ("tag_", ""),
    ("_p", "P"),
    ("_c", "C"),
    (" ", ""),
    ("-", ""),
)


def _normalize_header(raw_name):
    """Return ``raw_name`` with every header substitution applied in order."""
    cleaned = raw_name
    for old, new in _HEADER_SUBSTITUTIONS:
        cleaned = cleaned.replace(old, new)
    return cleaned


# Load the reservations file, tidy the column names and reset to a fresh 0..n-1 index.
HotelRes_data = (
    pd.read_csv("hotelReservation.csv")
    .rename(columns=_normalize_header)
    .reset_index(drop=True)
)

In [2]:
# Peek at the last rows to confirm the file loaded and the headers were renamed.
HotelRes_data.tail()

Unnamed: 0,Unnamed:0,Booking_ID,no_of_adults,no_ofChildren,no_of_weekend_nights,no_of_week_nights,type_of_mealPlan,requiredCarParking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_ofPreviousCancellations,no_ofPrevious_bookings_notCanceled,avgPricePer_room,no_of_special_requests,booking_status
36270,36270,INN36271,3.0,0,2.0,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.8,1,Not_Canceled
36271,36271,INN36272,2.0,0,1.0,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,36272,INN36273,2.0,0,2.0,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,36273,INN36274,2.0,0,0.0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.5,0,Canceled
36274,36274,INN36275,2.0,0,1.0,2,Meal Plan 1,0,Room_Type 1,207,2018,12,30,Offline,0,0,0,161.67,0,Not_Canceled


In [3]:
# Remove the "Unnamed:0" column (it mirrors the row index in the preview above).
# errors="ignore" keeps this cell idempotent: re-running it after the column is
# already gone no longer raises KeyError.
HotelRes_data = HotelRes_data.drop(columns=["Unnamed:0"], errors="ignore")


In [4]:
# Count missing values per column.
HotelRes_data.isna().sum()

Booking_ID                              0
no_of_adults                          100
no_ofChildren                           0
no_of_weekend_nights                  199
no_of_week_nights                       0
type_of_mealPlan                        0
requiredCarParking_space                0
room_type_reserved                    145
lead_time                               0
arrival_year                            0
arrival_month                           0
arrival_date                            0
market_segment_type                   611
repeated_guest                          0
no_ofPreviousCancellations              0
no_ofPrevious_bookings_notCanceled      0
avgPricePer_room                        0
no_of_special_requests                  0
booking_status                          0
dtype: int64

In [5]:
# Summary statistics (count, mean, std, quartiles, min/max) for the numeric columns.
HotelRes_data.describe()

Unnamed: 0,no_of_adults,no_ofChildren,no_of_weekend_nights,no_of_week_nights,requiredCarParking_space,lead_time,arrival_year,arrival_month,arrival_date,repeated_guest,no_ofPreviousCancellations,no_ofPrevious_bookings_notCanceled,avgPricePer_room,no_of_special_requests
count,36175.0,36275.0,36076.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0,36275.0
mean,1.844976,0.105279,0.810871,2.2043,0.030986,85.232557,2017.820427,7.423653,15.596995,0.025637,0.023349,0.153411,103.423539,0.619655
std,0.518701,0.402648,0.870437,1.410905,0.173281,85.930817,0.383836,3.069894,8.740447,0.158053,0.368331,1.754171,35.089424,0.786236
min,0.0,0.0,0.0,0.0,0.0,0.0,2017.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0
25%,2.0,0.0,0.0,1.0,0.0,17.0,2018.0,5.0,8.0,0.0,0.0,0.0,80.3,0.0
50%,2.0,0.0,1.0,2.0,0.0,57.0,2018.0,8.0,16.0,0.0,0.0,0.0,99.45,0.0
75%,2.0,0.0,2.0,3.0,0.0,126.0,2018.0,10.0,23.0,0.0,0.0,0.0,120.0,1.0
max,4.0,10.0,7.0,17.0,1.0,443.0,2018.0,12.0,31.0,1.0,13.0,58.0,540.0,5.0


In [None]:
Data Preprocessing

In [6]:
# Text-valued features of the hotel reservation data that need encoding.
categorical_columns = ["type_of_mealPlan", "room_type_reserved", "market_segment_type"]

# Every remaining column that is not the target or the booking identifier is
# treated as numeric. Index.difference returns the names in sorted order.
numeric_columns = list(
    HotelRes_data.columns.difference(
        [*categorical_columns, "booking_status", "Booking_ID"]
    )
)
numeric_columns

['arrival_date',
 'arrival_month',
 'arrival_year',
 'avgPricePer_room',
 'lead_time',
 'no_ofChildren',
 'no_ofPreviousCancellations',
 'no_ofPrevious_bookings_notCanceled',
 'no_of_adults',
 'no_of_special_requests',
 'no_of_week_nights',
 'no_of_weekend_nights',
 'repeated_guest',
 'requiredCarParking_space']

In [7]:
# Preprocessing pipeline with per-column-group handling: a single-step Pipeline
# wrapping a ColumnTransformer that imputes, log-transforms, scales and encodes.
# Columns not listed in any group are passed through unchanged.
colu_transformer = make_pipeline(
    ColumnTransformer(
        transformers=[
            (
                # Numeric columns that contain missing values:
                # KNN imputation -> log1p -> standardization.
                "imputeScale",
                Pipeline(
                    [
                        ("KNNImpute", KNNImputer(n_neighbors=3)),
                        (
                            "logTransform",
                            # col_out comes from utils and supplies the output
                            # feature names -- NOTE(review): confirm its contract.
                            FunctionTransformer(
                                np.log1p, feature_names_out=col_out, validate=True
                            ),
                        ),
                        ("scaler", StandardScaler()),
                    ]
                ),
                ["no_of_adults", "no_of_weekend_nights"],
            ),
            (
                # Numeric columns without an imputation step: log1p -> standardization.
                "scale",
                Pipeline(
                    [
                        (
                            "logTransform",
                            FunctionTransformer(
                                np.log1p, feature_names_out=col_out, validate=True
                            ),
                        ),
                        ("standardScale", StandardScaler()),
                    ]
                ),
                ["lead_time", "avgPricePer_room", "no_of_week_nights"],
            ),
            (
                # Categorical columns: fill gaps with an explicit "is_missing"
                # label, then one-hot encode (first level of each column dropped).
                "imputeOneHot",
                Pipeline(
                    [
                        (
                            "SimpleImpute",
                            SimpleImputer(
                                # np.nan, not np.NaN: the capitalised alias was
                                # removed in NumPy 2.0 and raises AttributeError.
                                missing_values=np.nan,
                                strategy="constant",
                                fill_value="is_missing",
                            ),
                        ),
                        ("oneHotEncode", OneHotEncoder(drop="first")),
                    ]
                ),
                categorical_columns,
            ),
        ],
        verbose_feature_names_out=False,
        remainder="passthrough",
    )
)

In [8]:
# Simpler preprocessing variant: mean-impute and standardize the numeric columns,
# constant-impute and one-hot encode the categorical columns.
numeric_preprocessor = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="mean")),
        ("scaler", StandardScaler()),
    ]
)

# Missing categories become the literal label "missing"; categories unseen
# during fit are ignored at transform time instead of raising.
categorical_preprocessor = Pipeline(
    steps=[
        ("imputer", SimpleImputer(strategy="constant", fill_value="missing")),
        ("encoder", OneHotEncoder(handle_unknown="ignore")),
    ]
)

col_transformer = ColumnTransformer(
    transformers=[
        ("numeric", numeric_preprocessor, numeric_columns),
        ("categorical", categorical_preprocessor, categorical_columns),
    ]
)

In [9]:
# Binary target: 1 when the booking was canceled, 0 otherwise. A vectorized
# comparison replaces the per-row Python lambda; values and dtype are unchanged.
y = HotelRes_data["booking_status"].eq("Canceled").astype("int64")

# Feature frame: everything except the target and the booking identifier.
X = HotelRes_data.drop(columns=["booking_status", "Booking_ID"])
print(X.head(4))

# Hold out 30% of the rows for testing; the fixed seed keeps the split reproducible.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=42
)

   no_of_adults  no_ofChildren  no_of_weekend_nights  no_of_week_nights  \
0           2.0              0                   1.0                  2   
1           2.0              0                   2.0                  3   
2           1.0              0                   2.0                  1   
3           2.0              0                   0.0                  2   

  type_of_mealPlan  requiredCarParking_space room_type_reserved  lead_time  \
0      Meal Plan 1                         0        Room_Type 1        224   
1     Not Selected                         0        Room_Type 1          5   
2      Meal Plan 1                         0        Room_Type 1          1   
3      Meal Plan 1                         0        Room_Type 1        211   

   arrival_year  arrival_month  arrival_date market_segment_type  \
0          2017             10             2             Offline   
1          2018             11             6              Online   
2          2018              