# Libraries

In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from sklearn.metrics import mean_squared_error, r2_score
import warnings
warnings.filterwarnings("ignore")


# Loading the dataset and Display the first few rows

In [4]:
file_path = "/Users/mandeepsinghsahani/Desktop/hotel_bookings 3.csv"
df = pd.read_csv(file_path)

df.head()


Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,...,deposit_type,agent,company,days_in_waiting_list,customer_type,adr,required_car_parking_spaces,total_of_special_requests,reservation_status,reservation_status_date
0,Resort Hotel,0,342,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
1,Resort Hotel,0,737,2015,July,27,1,0,0,2,...,No Deposit,,,0,Transient,0.0,0,0,Check-Out,2015-07-01
2,Resort Hotel,0,7,2015,July,27,1,0,1,1,...,No Deposit,,,0,Transient,75.0,0,0,Check-Out,2015-07-02
3,Resort Hotel,0,13,2015,July,27,1,0,1,1,...,No Deposit,304.0,,0,Transient,75.0,0,0,Check-Out,2015-07-02
4,Resort Hotel,0,14,2015,July,27,1,0,2,2,...,No Deposit,240.0,,0,Transient,98.0,0,1,Check-Out,2015-07-03


# Data Preprocessing

In [6]:
df = df.drop(columns=['reservation_status', 'reservation_status_date'])

df.fillna(0, inplace=True)

df = pd.get_dummies(df, drop_first=True)

df.head()


Unnamed: 0,is_canceled,lead_time,arrival_date_year,arrival_date_week_number,arrival_date_day_of_month,stays_in_weekend_nights,stays_in_week_nights,adults,children,babies,...,assigned_room_type_H,assigned_room_type_I,assigned_room_type_K,assigned_room_type_L,assigned_room_type_P,deposit_type_Non Refund,deposit_type_Refundable,customer_type_Group,customer_type_Transient,customer_type_Transient-Party
0,0,342,2015,27,1,0,0,2,0.0,0,...,False,False,False,False,False,False,False,False,True,False
1,0,737,2015,27,1,0,0,2,0.0,0,...,False,False,False,False,False,False,False,False,True,False
2,0,7,2015,27,1,0,1,1,0.0,0,...,False,False,False,False,False,False,False,False,True,False
3,0,13,2015,27,1,0,1,1,0.0,0,...,False,False,False,False,False,False,False,False,True,False
4,0,14,2015,27,1,0,2,2,0.0,0,...,False,False,False,False,False,False,False,False,True,False


# Feature Engineering (New Features)

In [7]:
df['is_weekend'] = df['arrival_date_day_of_month'].apply(lambda x: 1 if x in [5, 6] else 0)

df['total_guests'] = df['adults'] + df['children'] + df['babies']

df['stay_duration'] = df['stays_in_weekend_nights'] + df['stays_in_week_nights']

df[['is_weekend', 'total_guests', 'stay_duration']].head()


Unnamed: 0,is_weekend,total_guests,stay_duration
0,0,2.0,0
1,0,2.0,0
2,0,1.0,1
3,0,1.0,1
4,0,2.0,2


# Split Data into Train & Test

In [8]:
X = df.drop(columns=['adr'])  
y = df['adr'] 

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)


 # Train Optimized Random Forest

In [9]:
rf_model = RandomForestRegressor(n_estimators=200, max_depth=10, random_state=42)
rf_model.fit(X_train, y_train)

y_pred_rf = rf_model.predict(X_test)

rmse_rf = mean_squared_error(y_test, y_pred_rf, squared=False)
r2_rf = r2_score(y_test, y_pred_rf)

print("Random Forest Performance:")
print(f"RMSE: {rmse_rf}")
print(f"R² Score: {r2_rf}")


Optimized Random Forest Performance:
RMSE: 24.002648015077764
R² Score: 0.7480843082508424


# Train XGBoost Model

In [10]:
# Initialize and train XGBoost model
xgb_model = xgb.XGBRegressor(objective='reg:squarederror', n_estimators=200, 
                             learning_rate=0.1, max_depth=10, subsample=0.8, 
                             colsample_bytree=0.8, random_state=42)
xgb_model.fit(X_train, y_train)

# Make predictions
y_pred_xgb = xgb_model.predict(X_test)

# Evaluate XGBoost
rmse_xgb = mean_squared_error(y_test, y_pred_xgb, squared=False)
r2_xgb = r2_score(y_test, y_pred_xgb)

print("XGBoost Model Performance:")
print(f"RMSE: {rmse_xgb}")
print(f"R² Score: {r2_xgb}")


XGBoost Model Performance:
RMSE: 14.435161280352652
R² Score: 0.9088870331683627


# Hyperparameter Tuning for XGBoost

In [11]:
param_grid = {
    'n_estimators': [100, 200, 300],
    'max_depth': [5, 10, 15],
    'learning_rate': [0.01, 0.1, 0.2],
    'subsample': [0.7, 0.8, 1.0],
    'colsample_bytree': [0.7, 0.8, 1.0]
}

xgb_tune = xgb.XGBRegressor(objective='reg:squarederror', random_state=42)

grid_search = GridSearchCV(estimator=xgb_tune, param_grid=param_grid, 
                           cv=3, n_jobs=-1, verbose=2, scoring='neg_root_mean_squared_error')

grid_search.fit(X_train, y_train)

best_xgb_params = grid_search.best_params_
print("Best Hyperparameters for XGBoost:", best_xgb_params)


Fitting 3 folds for each of 243 candidates, totalling 729 fits
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.7; total time=   4.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=   4.6s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.7; total time=   4.9s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=1.0; total time=   4.7s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.7; total time=   4.9s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=   5.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=   5.0s
[CV] END colsample_bytree=0.7, learning_rate=0.01, max_depth=5, n_estimators=100, subsample=0.8; total time=   5.1s
[CV] END 

# Train XGBoost with Best Parameters

In [12]:
# Train the best XGBoost model
best_xgb = xgb.XGBRegressor(**best_xgb_params, objective='reg:squarederror', random_state=42)
best_xgb.fit(X_train, y_train)

# Make predictions
y_pred_best_xgb = best_xgb.predict(X_test)

# Evaluate the final XGBoost model
rmse_best_xgb = mean_squared_error(y_test, y_pred_best_xgb, squared=False)
r2_best_xgb = r2_score(y_test, y_pred_best_xgb)

print("Final XGBoost Performance:")
print(f"RMSE: {rmse_best_xgb}")
print(f"R² Score: {r2_best_xgb}")


Final XGBoost Performance:
RMSE: 13.155813905641393
R² Score: 0.9243215287333372
