In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression

from sklearn.metrics import classification_report, accuracy_score

import joblib


In [2]:
# Location of the raw bookings CSV.
# NOTE(review): this is an absolute, user-specific path — adjust it before
# running the notebook on another machine.
csv_path = r"C:\Users\user\Desktop\numcat\hotel_bookings.csv"
df = pd.read_csv(csv_path)

# Preview the first rows of the loaded frame.
df.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


In [3]:
# Print the frame's column names, non-null counts and dtypes.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 119390 entries, 0 to 119389
Data columns (total 32 columns):
 #   Column                          Non-Null Count   Dtype  
---  ------                          --------------   -----  
 0   hotel                           119390 non-null  object 
 1   is_canceled                     119390 non-null  int64  
 2   lead_time                       119390 non-null  int64  
 3   arrival_date_year               119390 non-null  int64  
 4   arrival_date_month              119390 non-null  object 
 5   arrival_date_week_number        119390 non-null  int64  
 6   arrival_date_day_of_month       119390 non-null  int64  
 7   stays_in_weekend_nights         119390 non-null  int64  
 8   stays_in_week_nights            119390 non-null  int64  
 9   adults                          119390 non-null  int64  
 10  children                        119386 non-null  float64
 11  babies                          119390 non-null  int64  
 12  meal            

In [4]:
# Columns retained for modelling. The last entry, `is_canceled`, is the label
# that is later split off as `y`; everything before it is a model input.
required_columns = [
    # arrival and stay
    "lead_time", "arrival_date_month",
    "stays_in_weekend_nights", "stays_in_week_nights",
    # party size
    "adults", "children", "babies",
    # guest history
    "is_repeated_guest", "previous_cancellations", "previous_bookings_not_canceled",
    # booking details
    "booking_changes", "days_in_waiting_list", "adr",
    "required_car_parking_spaces", "total_of_special_requests",
    # string-valued descriptors
    "meal", "market_segment", "deposit_type", "customer_type",
    # label
    "is_canceled",
]

# Narrow the frame to exactly these columns, in this order.
df = df.loc[:, required_columns]
df.head()


Unnamed: 0,lead_time,arrival_date_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,is_repeated_guest,previous_cancellations,previous_bookings_not_canceled,booking_changes,days_in_waiting_list,adr,required_car_parking_spaces,total_of_special_requests,meal,market_segment,deposit_type,customer_type,is_canceled
0,342,July,0,0,2,0.0,0,0,0,0,3,0,0.0,0,0,BB,Direct,No Deposit,Transient,0
1,737,July,0,0,2,0.0,0,0,0,0,4,0,0.0,0,0,BB,Direct,No Deposit,Transient,0
2,7,July,0,1,1,0.0,0,0,0,0,0,0,75.0,0,0,BB,Direct,No Deposit,Transient,0
3,13,July,0,1,1,0.0,0,0,0,0,0,0,75.0,0,0,BB,Corporate,No Deposit,Transient,0
4,14,July,0,2,2,0.0,0,0,0,0,0,0,98.0,0,1,BB,Online TA,No Deposit,Transient,0


In [5]:
# Count missing values per column before any imputation.
df.isna().sum()


lead_time                         0
arrival_date_month                0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          4
babies                            0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
days_in_waiting_list              0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
meal                              0
market_segment                    0
deposit_type                      0
customer_type                     0
is_canceled                       0
dtype: int64

In [6]:
# Replace missing values with 0. Reassigning (instead of inplace=True) keeps
# the cell free of in-place mutation and safe to re-run.
# In the null counts shown just above, only `children` has gaps (4 rows).
df = df.fillna(0)


In [7]:
# Re-count missing values per column after the fill.
df.isna().sum()


lead_time                         0
arrival_date_month                0
stays_in_weekend_nights           0
stays_in_week_nights              0
adults                            0
children                          0
babies                            0
is_repeated_guest                 0
previous_cancellations            0
previous_bookings_not_canceled    0
booking_changes                   0
days_in_waiting_list              0
adr                               0
required_car_parking_spaces       0
total_of_special_requests         0
meal                              0
market_segment                    0
deposit_type                      0
customer_type                     0
is_canceled                       0
dtype: int64

In [8]:
# Separate the label column from the model inputs.
target_col = 'is_canceled'
y = df[target_col]
X = df.drop(columns=[target_col])


In [9]:
# Hold out 20% of the rows for testing; stratify on the label so both splits
# keep the same class balance, and fix the seed for repeatable splits.
split_options = dict(test_size=0.2, random_state=42, stratify=y)
X_train, X_test, y_train, y_test = train_test_split(X, y, **split_options)


In [10]:
# Columns sent to the scaler in the preprocessing step.
num_features = [
    "lead_time",
    "stays_in_weekend_nights", "stays_in_week_nights",
    "adults", "children", "babies",
    "previous_cancellations", "previous_bookings_not_canceled",
    "booking_changes", "days_in_waiting_list",
    "adr",
    "required_car_parking_spaces", "total_of_special_requests",
]

# Columns sent to the one-hot encoder. `is_repeated_guest` is integer-coded
# but is deliberately listed here rather than with the scaled columns.
cat_features = [
    "arrival_date_month",
    "is_repeated_guest",
    "meal", "market_segment", "deposit_type", "customer_type",
]



In [11]:
# Shared preprocessing step used by every model pipeline below:
#   - 'num': standardise the numeric columns,
#   - 'cat': one-hot encode the categorical columns; categories unseen during
#            fitting are encoded as all zeros instead of raising.
#
# sparse_threshold=0 forces a dense output array. With the default (0.3) the
# transformer switches to a sparse matrix whenever the stacked output's
# density drops below the threshold, and GaussianNB (used further down)
# rejects sparse input — so adding more one-hot columns could silently break
# that pipeline.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', StandardScaler(), num_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), cat_features)
    ],
    sparse_threshold=0
)


### Decision Tree

In [12]:
# Preprocessing followed by a class-weighted decision tree with a fixed seed.
dt_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', DecisionTreeClassifier(random_state=42, class_weight='balanced')),
])

# Search space: 3 depths x 2 split sizes x 2 leaf sizes = 12 candidates.
dt_params = dict(
    model__max_depth=[10, 15, 20],
    model__min_samples_split=[10, 20],
    model__min_samples_leaf=[5, 10],
)

# 5-fold cross-validated search scored on F1, using all available cores.
dt_grid = GridSearchCV(
    estimator=dt_pipeline,
    param_grid=dt_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)
dt_grid.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out split.
dt_best = dt_grid.best_estimator_
dt_pred = dt_best.predict(X_test)
dt_acc = accuracy_score(y_test, dt_pred)

print("Decision Tree")
print(classification_report(y_test, dt_pred))



Decision Tree
              precision    recall  f1-score   support

           0       0.85      0.84      0.85     15033
           1       0.74      0.75      0.75      8845

    accuracy                           0.81     23878
   macro avg       0.80      0.80      0.80     23878
weighted avg       0.81      0.81      0.81     23878



In [13]:
# Random forest with fixed hyperparameters (no grid search).
# RandomForestClassifier is imported with the other estimators in the
# notebook's import cell rather than here, so a fresh "Run All" shows every
# dependency up front.
rf_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', RandomForestClassifier(
        n_estimators=600,
        max_depth=None,                       # trees grow until the leaf/split limits stop them
        min_samples_split=5,
        min_samples_leaf=2,
        max_features='sqrt',
        class_weight='balanced_subsample',    # class weights recomputed per bootstrap sample
        random_state=42,
        n_jobs=-1
    ))
])

rf_pipeline.fit(X_train, y_train)

# Evaluate on the held-out split.
rf_pred = rf_pipeline.predict(X_test)

print("Random Forest")
print(classification_report(y_test, rf_pred))
rf_acc = accuracy_score(y_test, rf_pred)


Random Forest
              precision    recall  f1-score   support

           0       0.87      0.90      0.89     15033
           1       0.82      0.77      0.80      8845

    accuracy                           0.85     23878
   macro avg       0.85      0.84      0.84     23878
weighted avg       0.85      0.85      0.85     23878



## KNN model

In [14]:
# k-nearest-neighbours on the preprocessed features: 11 neighbours,
# Euclidean distance, closer neighbours weighted more heavily.
knn_model = KNeighborsClassifier(n_neighbors=11, weights='distance', metric='euclidean')
knn_pipeline = Pipeline(steps=[
    ('preprocessor', preprocessor),
    ('model', knn_model),
])

# Fit on the training split, then score the held-out split.
knn_pipeline.fit(X_train, y_train)
knn_pred = knn_pipeline.predict(X_test)
knn_acc = accuracy_score(y_test, knn_pred)

print("KNN")
print(classification_report(y_test, knn_pred))


KNN
              precision    recall  f1-score   support

           0       0.85      0.91      0.88     15033
           1       0.82      0.73      0.78      8845

    accuracy                           0.84     23878
   macro avg       0.84      0.82      0.83     23878
weighted avg       0.84      0.84      0.84     23878



## NAIVE BAYES

In [16]:
# Gaussian naive Bayes on top of the shared preprocessing step.
nb_steps = [('preprocessor', preprocessor), ('model', GaussianNB())]
nb_pipeline = Pipeline(steps=nb_steps)


## Hyperparameter tuning

In [17]:
# Only the variance-smoothing term is tuned for naive Bayes.
nb_params = dict(model__var_smoothing=[1e-9, 1e-8, 1e-7])

# 5-fold cross-validated search scored on F1, using all available cores.
nb_grid = GridSearchCV(
    estimator=nb_pipeline,
    param_grid=nb_params,
    scoring='f1',
    cv=5,
    n_jobs=-1,
)

nb_grid.fit(X_train, y_train)


In [18]:
# Score the refitted best naive Bayes pipeline on the held-out split.
nb_best = nb_grid.best_estimator_
nb_preds = nb_best.predict(X_test)
nb_acc = accuracy_score(y_test, nb_preds)

print("Naive Bayes Best Params:", nb_grid.best_params_)
print(classification_report(y_test, nb_preds))


Naive Bayes Best Params: {'model__var_smoothing': 1e-09}
              precision    recall  f1-score   support

           0       0.86      0.43      0.57     15033
           1       0.47      0.88      0.62      8845

    accuracy                           0.59     23878
   macro avg       0.67      0.65      0.59     23878
weighted avg       0.72      0.59      0.59     23878



## Logistic Regression

In [19]:
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC


In [20]:
# Class-weighted logistic regression. LogisticRegression is already imported
# earlier in the notebook, so it is not re-imported in this cell.
# max_iter is raised well above scikit-learn's default of 100.
log_pipeline = Pipeline([
    ('preprocessor', preprocessor),
    ('model', LogisticRegression(
        max_iter=2000,
        class_weight='balanced'
    ))
])

# Tune the inverse regularisation strength C.
log_params = {
    'model__C': [0.1, 1, 10]
}

# 5-fold cross-validated search scored on F1, using all available cores.
log_grid = GridSearchCV(
    log_pipeline,
    log_params,
    cv=5,
    scoring='f1',
    n_jobs=-1
)

log_grid.fit(X_train, y_train)

# Evaluate the refitted best pipeline on the held-out split.
log_best = log_grid.best_estimator_
log_pred = log_best.predict(X_test)

print("Logistic Regression")
print(classification_report(y_test, log_pred))
log_acc = accuracy_score(y_test, log_pred)


Logistic Regression
              precision    recall  f1-score   support

           0       0.83      0.82      0.83     15033
           1       0.71      0.72      0.71      8845

    accuracy                           0.79     23878
   macro avg       0.77      0.77      0.77     23878
weighted avg       0.79      0.79      0.79     23878



## Comparison report

In [21]:
# Earlier three-model comparison, kept for reference only. It is disabled with
# real comments rather than a bare string literal, which the notebook would
# evaluate and echo as cell output. The five-model comparison in the following
# cell supersedes it.
#
# comparison_df = pd.DataFrame({
#     'Model': ['Decision Tree', 'KNN', 'Naive Bayes'],
#     'Accuracy': [dt_acc, knn_acc, nb_acc]
# })
#
# comparison_df

"comparison_df = pd.DataFrame({\n    'Model': ['Decision Tree', 'KNN', 'Naive Bayes'],\n    'Accuracy': [dt_acc, knn_acc, nb_acc]\n})\n\ncomparison_df\n"

In [22]:
# Held-out accuracy for each model, one (name, accuracy) row per model in the
# order the rows are indexed.
accuracy_rows = [
    ('Decision Tree', dt_acc),
    ('KNN', knn_acc),
    ('Naive Bayes', nb_acc),
    ('Random Forest', rf_acc),
    ('Logistic Regression', log_acc),
    # ('SVM', svm_acc),
]
comparison_df = pd.DataFrame(accuracy_rows, columns=['Model', 'Accuracy'])

# Show the models from most to least accurate.
ranked_df = comparison_df.sort_values(by='Accuracy', ascending=False)
print(ranked_df)


                 Model  Accuracy
3        Random Forest  0.853840
1                  KNN  0.842826
0        Decision Tree  0.810704
4  Logistic Regression  0.785116
2          Naive Bayes  0.594397


In [23]:
# Fitted pipelines keyed by display name.
models = {
    'Decision Tree': dt_best,
    'KNN': knn_pipeline,
    'Naive Bayes': nb_best,
    'Random Forest': rf_pipeline,
    'Logistic Regression': log_best,
   # 'SVM': svm_best
}

# Held-out accuracy of each pipeline, under the same display names.
accuracies = {
    'Decision Tree': dt_acc,
    'KNN': knn_acc,
    'Naive Bayes': nb_acc,
    'Random Forest': rf_acc,
    'Logistic Regression': log_acc,
    #'SVM': svm_acc
}

# Guard against the two dicts drifting apart when a model is added or removed.
assert models.keys() == accuracies.keys(), "models and accuracies must list the same names"

# Pick the highest-accuracy model (ties go to the first one listed).
# NOTE(review): the winner is chosen on the same X_test split that the reports
# above use, so the printed accuracy is an optimistic estimate for the saved
# model. The grid searches optimise F1 while this selection uses accuracy.
best_model_name = max(accuracies, key=accuracies.get)
best_model = models[best_model_name]

# The emoji were stored as mojibake (UTF-8 bytes decoded with the wrong
# codec); they are restored to the intended characters here.
print(f"\n🏆 Best Model: {best_model_name}")
print(f"🎯 Accuracy: {accuracies[best_model_name]:.4f}")

# Persist the winning pipeline (preprocessing + estimator) and its name.
joblib.dump(best_model, "hotel_booking_best_model.pkl")

with open("best_model_name.txt", "w") as f:
    f.write(best_model_name)



üèÜ Best Model: Random Forest
üéØ Accuracy: 0.8538


In [24]:
# Optional per-model exports, intentionally disabled. They are written as
# comments so the cell does not evaluate a string literal and echo it as
# output. The KNN line saves the fitted `knn_pipeline`; the earlier draft
# referenced `knn_preds`, which is not defined in the cells shown (and
# `knn_pred` holds predictions, not a model).
#
# joblib.dump(dt_best, "hotel_booking_dt_model.pkl")
# joblib.dump(knn_pipeline, "hotel_booking_knn_model.pkl")
# joblib.dump(nb_best, "hotel_booking_nb_model.pkl")

'joblib.dump(dt_best, "hotel_booking_dt_model.pkl")\njoblib.dump(knn_preds, "hotel_booking_knn_model.pkl")\njoblib.dump(nb_best, "hotel_booking_nb_model.pkl")\n'