In [13]:
import os

import joblib
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from imblearn.over_sampling import SMOTE
from sklearn.compose import ColumnTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder,RobustScaler
from sklearn.tree import DecisionTreeClassifier

In [14]:
# Build the CSV path relative to the current working directory and load it.
FILE_PATH = os.path.join(
    os.getcwd(),
    'Dataset',
    'first inten project.csv',
)
df = pd.read_csv(filepath_or_buffer=FILE_PATH)

In [15]:
# Normalise the header names: lower-case, trim surrounding whitespace,
# then turn the remaining spaces into underscores.
lowered_names = df.columns.str.lower()
trimmed_names = lowered_names.str.strip()
df.columns = trimmed_names.str.replace(' ', '_')

In [16]:
# Derive the party size and the total stay length, then retire the columns
# they were built from (together with the booking identifier column).
df['number_of_people'] = df['number_of_adults'].add(df['number_of_children'])
df['number_of_nights'] = df['number_of_weekend_nights'].add(df['number_of_week_nights'])

superseded_columns = [
    'booking_id',
    'number_of_adults',
    'number_of_children',
    'number_of_weekend_nights',
    'number_of_week_nights',
]
# `columns=` already targets the column axis, so no separate `axis` argument is needed.
df.drop(columns=superseded_columns, inplace=True)

In [17]:
# 2018 is not a leap year, so '2018-2-29' is not a real calendar date.
# It is replaced with '2018-2-28', the closest valid date.
invalid_date_fixes = {'2018-2-29': '2018-2-28'}
df['date_of_reservation'] = df['date_of_reservation'].replace(invalid_date_fixes)

In [18]:
# Parse the reservation dates into datetime values.
# format='mixed' infers the format of each entry individually, and
# errors='coerce' turns anything unparseable into NaT instead of raising.
reservation_strings = df['date_of_reservation']
df['date_of_reservation'] = pd.to_datetime(
    reservation_strings,
    errors='coerce',
    format='mixed',
)

In [19]:
# Break the reservation date into its calendar components (year, month, day)
# and discard the original datetime column afterwards.
reservation_dt = df['date_of_reservation'].dt
for part in ('year', 'month', 'day'):
    df[part] = getattr(reservation_dt, part)
df.drop(columns=['date_of_reservation'], inplace=True)

In [20]:
# Move the target column 'booking_status' to the last position.
cols = df.columns.tolist()
cols.pop(cols.index('booking_status'))  # raises ValueError if the target is missing
df = df[[*cols, 'booking_status']]


In [21]:
# Encode the target as binary labels: 1 = canceled, 0 = not canceled.
status_to_label = {'Canceled': 1, 'Not_Canceled': 0}
encoded_status = df['booking_status'].map(status_to_label)

# Series.map turns every value that is missing from the dict into NaN. That
# happens for an unexpected label, and also when this cell is re-run on the
# already-encoded 0/1 column. Fail loudly instead of training on NaN targets.
unmapped_mask = encoded_status.isna()
if unmapped_mask.any():
    unexpected_values = df.loc[unmapped_mask, 'booking_status'].unique().tolist()
    raise ValueError(
        "booking_status contains values that cannot be encoded "
        f"(expected {list(status_to_label)}): {unexpected_values}"
    )

df['booking_status'] = encoded_status.astype('int64')

In [22]:
# Separate the features from the target, then hold out 20% of the rows for testing.
# NOTE(review): the split is not stratified; consider `stratify=y` so both splits
# keep the same class ratio.
target_column = 'booking_status'
X = df.drop(columns=target_column)
y = df[target_column]

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [23]:
# Inspect the training features: column names, non-null counts and dtypes.
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Index: 29028 entries, 387 to 15795
Data columns (total 15 columns):
 #   Column               Non-Null Count  Dtype  
---  ------               --------------  -----  
 0   type_of_meal         29028 non-null  object 
 1   car_parking_space    29028 non-null  int64  
 2   room_type            29028 non-null  object 
 3   lead_time            29028 non-null  int64  
 4   market_segment_type  29028 non-null  object 
 5   repeated             29028 non-null  int64  
 6   p-c                  29028 non-null  int64  
 7   p-not-c              29028 non-null  int64  
 8   average_price        29028 non-null  float64
 9   special_requests     29028 non-null  int64  
 10  number_of_people     29028 non-null  int64  
 11  number_of_nights     29028 non-null  int64  
 12  year                 29028 non-null  int32  
 13  month                29028 non-null  int32  
 14  day                  29028 non-null  int32  
dtypes: float64(1), int32(3), int64(8), object(3)

In [None]:
# Share of each target class in the training split
# (normalize=True returns proportions instead of raw counts).
y_train.value_counts(normalize=True)

booking_status
0    0.673178
1    0.326822
Name: proportion, dtype: float64

In [25]:
# Names of the columns that get scaled (numeric) and one-hot encoded (categorical).
# Selecting from X_train first means a misspelled column name fails right here.
numeric_features = X_train[
    [
        'lead_time',
        'p-c',
        'p-not-c',
        'average_price',
        'special_requests',
        'number_of_people',
        'number_of_nights',
    ]
].columns.tolist()
categorical_features = X_train.select_dtypes(include=['object']).columns.tolist()


In [26]:
# Optional exploration (left disabled): distribution plot of each numeric feature.
# for col in numeric_features:
#     plt.figure(figsize=(8, 4))
#     sns.histplot(X_train[col], kde=True)
#     plt.title(f'Distribution of {col}')
#     plt.xlabel(col)
#     plt.ylabel('Frequency')
#     plt.show()
# Display the two feature lists that feed the preprocessing step.
numeric_features, categorical_features

(['lead_time',
  'p-c',
  'p-not-c',
  'average_price',
  'special_requests',
  'number_of_people',
  'number_of_nights'],
 ['type_of_meal', 'room_type', 'market_segment_type'])

In [27]:
# Column-wise preprocessing: robust scaling for the numeric features and one-hot
# encoding for the categorical ones (categories unseen during fit are ignored).
# remainder='drop' is the ColumnTransformer default, written out here because it
# matters: columns named in neither list (car_parking_space, repeated, year,
# month, day) are discarded and never reach the model.
# NOTE(review): confirm that excluding those columns is intentional.
preprocessor = ColumnTransformer(
    transformers=[
        ('num', RobustScaler(), numeric_features),
        ('cat', OneHotEncoder(handle_unknown='ignore'), categorical_features),
    ],
    remainder='drop',
)

In [28]:
# Chain the preprocessing and a seeded random forest so both are fitted,
# applied and saved as a single estimator.
pipeline = Pipeline(
    steps=[
        ('preprocessing', preprocessor),
        ('model', RandomForestClassifier(random_state=42)),
    ]
)

In [29]:
# Fit the preprocessing steps and the classifier together on the training split.
pipeline.fit(X_train, y_train)

In [30]:
# Predict labels for the held-out test split with the fitted pipeline.
y_pred = pipeline.predict(X_test)

In [31]:
# Score the held-out predictions, then print the headline metrics followed by
# the confusion matrix and the per-class report.
accuracy = accuracy_score(y_test, y_pred)
precision = precision_score(y_test, y_pred)
recall = recall_score(y_test, y_pred)
f1 = f1_score(y_test, y_pred)

print("Random Forest Model Performance:")
print(f"Accuracy: {accuracy*100:.2f}%")
print(f"Precision: {precision:.4f}")
print(f"Recall: {recall:.4f}")
print(f"F1-Score: {f1:.4f}")

print("\nConfusion Matrix:")
print(confusion_matrix(y_test, y_pred))

print("\nClassification Report:")
print(classification_report(y_test, y_pred))

Random Forest Model Performance:
Accuracy: 88.34%
Precision: 0.8533
Recall: 0.7823
F1-Score: 0.8162

Confusion Matrix:
[[4532  323]
 [ 523 1879]]

Classification Report:
              precision    recall  f1-score   support

           0       0.90      0.93      0.91      4855
           1       0.85      0.78      0.82      2402

    accuracy                           0.88      7257
   macro avg       0.87      0.86      0.87      7257
weighted avg       0.88      0.88      0.88      7257



In [None]:
# Persist the fitted pipeline (preprocessing + model) to disk with joblib.
# joblib is imported with the other dependencies in the notebook's import cell.
# joblib files are pickle-based, so only load this artifact from a trusted source.
joblib.dump(pipeline, "hotel_model.pkl")

['hotel_model.pkl']