In [2]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

df = pd.read_csv('./hotel_bookings.csv')
hotel_df = df[['hotel', 'is_canceled', 'lead_time', 'arrival_date_year',
               'arrival_date_month', 'days_in_waiting_list', 'adr', 'adults', 
               'children', 'babies']]

hotel_df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,days_in_waiting_list,adr,adults,children,babies
0,Resort Hotel,0,342,2015,July,0,0.00,2,0.0,0
1,Resort Hotel,0,737,2015,July,0,0.00,2,0.0,0
2,Resort Hotel,0,7,2015,July,0,75.00,1,0.0,0
3,Resort Hotel,0,13,2015,July,0,75.00,1,0.0,0
4,Resort Hotel,0,14,2015,July,0,98.00,2,0.0,0
...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,0,96.14,2,0.0,0
119386,City Hotel,0,102,2017,August,0,225.43,3,0.0,0
119387,City Hotel,0,34,2017,August,0,157.71,2,0.0,0
119388,City Hotel,0,109,2017,August,0,104.40,2,0.0,0


In [3]:
hotel_not_canceled_df= hotel_df[hotel_df['is_canceled'] == 0]
guest_not_canceled = hotel_not_canceled_df.adults + hotel_not_canceled_df.children + hotel_not_canceled_df.babies
indices_to_drop = guest_not_canceled[(hotel_df['is_canceled'] == 0) & (guest_not_canceled == 0.0)].index

hotel_df = hotel_df.drop(indices_to_drop)
hotel_df

Unnamed: 0,hotel,is_canceled,lead_time,arrival_date_year,arrival_date_month,days_in_waiting_list,adr,adults,children,babies
0,Resort Hotel,0,342,2015,July,0,0.00,2,0.0,0
1,Resort Hotel,0,737,2015,July,0,0.00,2,0.0,0
2,Resort Hotel,0,7,2015,July,0,75.00,1,0.0,0
3,Resort Hotel,0,13,2015,July,0,75.00,1,0.0,0
4,Resort Hotel,0,14,2015,July,0,98.00,2,0.0,0
...,...,...,...,...,...,...,...,...,...,...
119385,City Hotel,0,23,2017,August,0,96.14,2,0.0,0
119386,City Hotel,0,102,2017,August,0,225.43,3,0.0,0
119387,City Hotel,0,34,2017,August,0,157.71,2,0.0,0
119388,City Hotel,0,109,2017,August,0,104.40,2,0.0,0


In [4]:
# most common lead time
most_common_lead_time = hotel_not_canceled_df['lead_time'].mode().values[0]

# 2015~2017 monthly occupancy rate of hotels
hotel_df['total_guests'] = hotel_df['adults'] + hotel_df['children'] + hotel_df['babies']

guests_by_month = hotel_df.groupby(['arrival_date_year', 'arrival_date_month'])['total_guests'].sum().reset_index()
guests_table = pd.pivot_table(guests_by_month, values='total_guests', index='arrival_date_month',
                              columns='arrival_date_year', aggfunc='sum', fill_value=0, observed=False)
months_order = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
guests_table = guests_table.reindex(months_order)
average_guests_by_month = guests_table.mean(axis=1).round(2)

# average_guests_by_month의 각 월별 값들을 내림차순으로 정렬하여 저장
sorted_average_guests = average_guests_by_month.sort_values(ascending=False)

# the months with the most waiting list days
waiting_list = hotel_not_canceled_df.groupby(['arrival_date_month']).agg({'days_in_waiting_list': 'sum'}).reindex(months_order, axis=0)

# the months with the most waiting list days의 각 월별 값들을 내림차순으로 정렬
sorted_waiting_list = waiting_list.sort_values(by='days_in_waiting_list', ascending=False)

# 2015~2017 the monthly average price per guest per day
adr1 = hotel_not_canceled_df.groupby(['arrival_date_year', 'arrival_date_month']).agg({'adr': 'mean' })
adr1_unstack = adr1.unstack(level=0).reindex(months_order, axis=0).round(2)
adr1_unstack['average'] = adr1_unstack.mean(axis=1).round(1)

In [5]:
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error

X = hotel_not_canceled_df[['lead_time', 'arrival_date_month', 'days_in_waiting_list', 'adr']]
y = hotel_not_canceled_df['lead_time'] 

X = pd.get_dummies(X, columns=['arrival_date_month'])

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

rf_model = RandomForestRegressor(random_state=42)
rf_model.fit(X_train_scaled, y_train)

In [10]:
user_lead_time = int(input("Enter the remaining time until the reservation date (days): "))
user_month = input("Enter the month you want to book (ex-January): ")
user_adr = float(input("Enter the cost of a day's stay on the date of your reservation: "))

guests_occupancy = sorted_average_guests.index.get_loc(user_month) + 1 
adr_comparison = adr1_unstack.loc[user_month, 'average']
waiting_list_rank = sorted_waiting_list.index.get_loc(user_month) + 1

user_data = pd.DataFrame({
    'lead_time': [user_lead_time],
    'days_in_waiting_list': [0],
    'adr': [user_adr]
})

all_months = ['January', 'February', 'March', 'April', 'May', 'June', 'July', 'August', 'September', 'October', 'November', 'December']
for month in all_months:
    user_data['arrival_date_month_' + month] = 0

user_data['arrival_date_month_' + user_month] = 1
user_data = user_data[['lead_time', 'days_in_waiting_list', 'adr'] + ['arrival_date_month_' + month for month in all_months]]

training_feature_order = X.columns.tolist()

user_data = user_data[training_feature_order]
user_data_scaled = scaler.transform(user_data)
prediction = rf_model.predict(user_data_scaled)[0]

# 모델 예측 성능 측정
y_pred = rf_model.predict(X_test_scaled)

mse = mean_squared_error(y_test, y_pred)

print(f'Mean Squared Error: {mse}')


Enter the remaining time until the reservation date (days): 45
Enter the month you want to book (ex-January): March
Enter the cost of a day's stay on the date of your reservation: 320
Mean Squared Error: 0.0005281495277371286
