In [14]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB

from sklearn.metrics import classification_report, accuracy_score

import joblib


In [2]:
df = pd.read_csv(r"/content/hotel_bookings.csv")
df.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
required_columns = [
    'lead_time',
    'arrival_date_month',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'is_repeated_guest',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests',
    'is_canceled'   # target
]

df = df[required_columns]
df.head()


Unnamed: 0,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,is_canceled
0,342,July,0,0,2,0.0,0,0,0,0,3,0,0.0,0,0,0
1,737,July,0,0,2,0.0,0,0,0,0,4,0,0.0,0,0,0
2,7,July,0,1,1,0.0,0,0,0,0,0,0,75.0,0,0,0
3,13,July,0,1,1,0.0,0,0,0,0,0,0,75.0,0,0,0
4,14,July,0,2,2,0.0,0,0,0,0,0,0,98.0,0,1,0


In [5]:
df.isnull().sum()


Unnamed: 0,0
lead_time,0
arrival_date_month,0
stays_in_weekend_nights,0
stays_in_week_nights,0
adults,0
children,4
babies,0
is_repeated_guest,0
previous_cancellations,0
previous_bookings_not_canceled,0


In [6]:
df.fillna(0, inplace=True)


In [7]:
df.isnull().sum()


Unnamed: 0,0
lead_time,0
arrival_date_month,0
stays_in_weekend_nights,0
stays_in_week_nights,0
adults,0
children,0
babies,0
is_repeated_guest,0
previous_cancellations,0
previous_bookings_not_canceled,0


In [8]:
X = df.drop('is_canceled', axis=1)
y = df['is_canceled']


In [9]:
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
    stratify=y
)


In [10]:
num_features = [
    'lead_time',
    'stays_in_weekend_nights',
    'stays_in_week_nights',
    'adults',
    'children',
    'babies',
    'previous_cancellations',
    'previous_bookings_not_canceled',
    'booking_changes',
    'days_in_waiting_list',
    'adr',
    'required_car_parking_spaces',
    'total_of_special_requests'
]

cat_features = [
    'arrival_date_month',
    'is_repeated_guest'
]


In [11]:
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ]
)


### Decision Tree

In [12]:
dt_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(class_weight='balanced', random_state=42))
])


## Hyperparameter tuning

In [15]:
dt_params = {
    'model__max_depth': [4, 6, 8, 10],
    'model__min_samples_split': [5, 10, 20]
}

dt_grid = GridSearchCV(
    dt_pipeline,
    dt_params,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

dt_grid.fit(X_train, y_train)


In [16]:
dt_best = dt_grid.best_estimator_
dt_preds = dt_best.predict(X_test)

print("Decision Tree Best Params:", dt_grid.best_params_)
print(classification_report(y_test, dt_preds))

dt_acc = accuracy_score(y_test, dt_preds)


Decision Tree Best Params: {'model__max_depth': 10, 'model__min_samples_split': 5}
              precision    recall  f1-score   support

           0       0.83      0.78      0.80     15033
           1       0.65      0.72      0.69      8845

    accuracy                           0.76     23878
   macro avg       0.74      0.75      0.74     23878
weighted avg       0.76      0.76      0.76     23878



## KNN model

In [24]:
knn_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', KNeighborsClassifier(
        n_neighbors=7,      # sensible default
        weights='distance'  # improves performance
    ))
])
knn_pipeline.fit(X_train, y_train)

## Hyperparameter tuning

In [25]:
'''knn_params = {
    'model__n_neighbors': [3,5,7,9],
    'model__weights': ['uniform', 'distance']
}

knn_grid = GridSearchCV(
    knn_pipeline,
    knn_params,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

knn_grid.fit(X_train, y_train)
'''



"knn_params = {\n    'model__n_neighbors': [3,5,7,9],\n    'model__weights': ['uniform', 'distance']\n}\n\nknn_grid = GridSearchCV(\n    knn_pipeline,\n    knn_params,\n    cv=5,\n    scoring='f1',\n    n_jobs=-1\n)\n\nknn_grid.fit(X_train, y_train)\n"

In [26]:
knn_preds = knn_pipeline.predict(X_test)

print("KNN Classification Report")
print(classification_report(y_test, knn_preds))

knn_acc = accuracy_score(y_test, knn_preds)


KNN Classification Report
              precision    recall  f1-score   support

           0       0.84      0.89      0.86     15033
           1       0.79      0.70      0.74      8845

    accuracy                           0.82     23878
   macro avg       0.81      0.80      0.80     23878
weighted avg       0.82      0.82      0.82     23878



##NAIVE BAYES

In [27]:
nb_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', GaussianNB())
])


## Hyperparameter tuning

In [28]:
nb_params = {
    'model__var_smoothing': [1e-9, 1e-8, 1e-7]
}

nb_grid = GridSearchCV(
    nb_pipeline,
    nb_params,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

nb_grid.fit(X_train, y_train)


In [29]:
nb_best = nb_grid.best_estimator_
nb_preds = nb_best.predict(X_test)

print("Naive Bayes Best Params:", nb_grid.best_params_)
print(classification_report(y_test, nb_preds))

nb_acc = accuracy_score(y_test, nb_preds)


Naive Bayes Best Params: {'model__var_smoothing': 1e-07}
              precision    recall  f1-score   support

           0       0.91      0.17      0.28     15033
           1       0.41      0.97      0.57      8845

    accuracy                           0.46     23878
   macro avg       0.66      0.57      0.43     23878
weighted avg       0.73      0.46      0.39     23878



## Comparision report

In [30]:
comparison_df = pd.DataFrame({
    'Model': ['Decision Tree', 'KNN', 'Naive Bayes'],
    'Accuracy': [dt_acc, knn_acc, nb_acc]
})

comparison_df


Unnamed: 0,Model,Accuracy
0,Decision Tree,0.755172
1,KNN,0.820295
2,Naive Bayes,0.464779


In [32]:
joblib.dump(dt_best, "hotel_booking_dt_model.pkl")
joblib.dump(knn_preds, "hotel_booking_knn_model.pkl")
joblib.dump(nb_best, "hotel_booking_nb_model.pkl")


['hotel_booking_nb_model.pkl']