# Importing Libraries and data

In [4]:
# Importing Libraries
import pandas as pd
import numpy as np
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import PowerTransformer
from sklearn.preprocessing import MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split
from imblearn.pipeline import Pipeline
from sklearn.impute import KNNImputer
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest, f_regression,RFE
from sklearn.compose import ColumnTransformer
from xgboost import XGBRegressor
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier
from xgboost import XGBClassifier
from sklearn.metrics import make_scorer, accuracy_score, precision_score, recall_score, f1_score, roc_auc_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import GridSearchCV

In [5]:
## To Ignore Warnings
from warnings import filterwarnings
filterwarnings('ignore')

In [6]:
# Reading Data
df = pd.read_csv("Hotel Reservations.csv", index_col = "Booking_ID")
df.head(10)

Unnamed: 0_level_0,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
Booking_ID,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1
INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled
INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled
INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled
INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled
INN00006,2,0,0,2,Meal Plan 2,0,Room_Type 1,346,2018,9,13,Online,0,0,0,115.0,1,Canceled
INN00007,2,0,1,3,Meal Plan 1,0,Room_Type 1,34,2017,10,15,Online,0,0,0,107.55,1,Not_Canceled
INN00008,2,0,1,3,Meal Plan 1,0,Room_Type 4,83,2018,12,26,Online,0,0,0,105.61,1,Not_Canceled
INN00009,3,0,0,4,Meal Plan 1,0,Room_Type 1,121,2018,7,6,Offline,0,0,0,96.9,1,Not_Canceled
INN00010,2,0,0,5,Meal Plan 1,0,Room_Type 4,44,2018,10,18,Online,0,0,0,133.44,3,Not_Canceled


In [7]:
#Seperating Categorical and Numerical Columns and Storing the Target Column Name
label = 'booking_status'
categorical_columns = df.drop(label, axis = 1).select_dtypes("object").columns.to_list()
numerical_columns = df.select_dtypes(["float", "int"]).columns.to_list()

In [8]:
#Encoding the Target Column
df[label] = df[label].map({"Canceled" : 1, "Not_Canceled" : 0})
df[label]

Booking_ID
INN00001    0
INN00002    0
INN00003    1
INN00004    1
INN00005    1
           ..
INN36271    0
INN36272    1
INN36273    0
INN36274    1
INN36275    0
Name: booking_status, Length: 36275, dtype: int64

# Feature Engineering

In [9]:
#Total Number of Members for each Group
df["no_of_members"] = df["no_of_adults"] + df["no_of_children"]
#Total Number of Nights They will Stay
df["no_of_nights"] = df["no_of_weekend_nights"] + df["no_of_week_nights"]

# Pipeline

In [10]:
#scaler = PowerTransformer(method = "box-cox", standardize = False)
X = df.drop(label, axis = 1)
y = df[label]

In [11]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = .2, random_state = 42)

In [12]:
#Classification Models
models = [
    ('Logistic Regression', LogisticRegression(random_state = 42)),
    ('SVC', SVC(random_state = 42)),
    ('DTC', DecisionTreeClassifier(random_state = 42)),
    ('KNC', KNeighborsClassifier()),
    ('RFC', RandomForestClassifier(random_state = 42)),
    ("XGC", XGBClassifier(random_state = 42))
]

In [13]:
#Feature Selection Object
RFE_selector = RFE(XGBRegressor(), n_features_to_select=12)

In [14]:
#Column Transformation Object
preprocessor = ColumnTransformer(transformers = [
    ("num", RobustScaler(), numerical_columns),
    ("cat", OneHotEncoder(sparse = False, drop = "first"), categorical_columns),
])

In [15]:
# Classification Scoring Methods for Cross Validation
scoring = {
    'accuracy': make_scorer(accuracy_score),
    'precision': make_scorer(precision_score),
    'recall': make_scorer(recall_score),
    'f1': make_scorer(f1_score),
    'roc_auc': make_scorer(roc_auc_score)
}

In [16]:
# looping Each Model to choose the best 
for model_name, model in models:
  steps = []
  #Column Transformation Step
  steps.append(("preprocessor", preprocessor))
  #Oversampling Step
  steps.append(("smote", SMOTE()))
  #Feature Selection Step
  steps.append(("RFE_Selector", RFE_selector))
  #Model Training Step
  steps.append((model_name, model))
  #Defining Pipeline
  pipeline = Pipeline(steps = steps)
  #Evaluation Using Cross Validation
  cv_results = cross_validate(pipeline, X, y, scoring=scoring, cv=5, return_train_score=True)

  #Evaluating Based on Accuracy
  print("test :", cv_results['test_accuracy'].mean())
  print("train:", cv_results['train_accuracy'].mean())
  print('*' * 50)

test : 0.7789662301860785
train: 0.7798897312198485
**************************************************


# Hyperparameter Tuning

In [None]:
# Define your parameter grid for GridSearchCV
param_grid = {
    'XGC__n_estimators': [50, 100, 200],
    'XGC__max_depth': [3, 4, 5],
    'XGC__learning_rate': [0.1, 0.2, 0.3]
}

In [None]:
#XGBClassifier Pipeline
steps = []
steps.append(("preprocessor", preprocessor))
steps.append(("smote", SMOTE()))
steps.append(("RFE_selector", RFE_selector))
steps.append(("XGC", XGBClassifier(random_state=42)))

pipeline = Pipeline(steps = steps)
cv_results = cross_validate(pipeline, X, y, scoring=scoring, cv=5, return_train_score=True)
print("test :", cv_results['test_accuracy'].mean())
print("train:", cv_results['train_accuracy'].mean())
print('*' * 50)

test : 0.8851274982770503
train: 0.9068022053756032
**************************************************


In [None]:
#Tuning the Hyperparameters using GridSearchCV
grid_search = GridSearchCV(estimator = pipeline, param_grid = param_grid, scoring=scoring["accuracy"], n_jobs=-1, return_train_score=True)

grid_search.fit(X, y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_

print("Best Parameters:", best_params)
print("Best Score:", best_score)

Best Parameters: {'XGC__learning_rate': 0.3, 'XGC__max_depth': 5, 'XGC__n_estimators': 200}
Best Score: 0.8885182632667126


In [None]:
#Creating and Training the Pipeline
steps = []
steps.append(("preprocessor", preprocessor))
steps.append(("smote", SMOTE()))
steps.append(("RFE_selector", RFE_selector))
steps.append(("XGC", XGBClassifier(random_state=42, learning_rate = 0.3, max_depth = 5, n_estimators = 200)))

pipeline = Pipeline(steps=steps)
pipeline.fit(X, y)

In [None]:
#Saving the Pipeline
joblib.dump(pipeline, "pipeline.joblib")