In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import random
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore")
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score
from sklearn.decomposition import PCA
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score,classification_report, precision_recall_curve
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score
import joblib
import os
from dotenv import load_dotenv
load_dotenv()

True

In [2]:
# Load the hotel data from the CSV file
hotel_path = os.getenv("hotel_path")
hotel = pd.read_csv(hotel_path)

In [3]:
hotel.head()

Unnamed: 0,travelCode,userCode,name,place,days,price,total,date
0,0,0,Hotel A,Florianopolis (SC),4,313.02,1252.08,09/26/2019
1,2,0,Hotel K,Salvador (BH),2,263.41,526.82,10-10-2019
2,7,0,Hotel K,Salvador (BH),3,263.41,790.23,11/14/2019
3,11,0,Hotel K,Salvador (BH),4,263.41,1053.64,12-12-2019
4,13,0,Hotel A,Florianopolis (SC),1,313.02,313.02,12/26/2019


In [4]:
hotel.shape

(40552, 8)

In [5]:
hotel.describe()

Unnamed: 0,travelCode,userCode,days,price,total
count,40552.0,40552.0,40552.0,40552.0,40552.0
mean,67911.794461,666.963726,2.499679,214.439554,536.229513
std,39408.199333,391.136794,1.119326,76.742305,319.331482
min,0.0,0.0,1.0,60.39,60.39
25%,33696.75,323.0,1.0,165.99,247.62
50%,67831.0,658.0,2.0,242.88,495.24
75%,102211.25,1013.0,4.0,263.41,742.86
max,135942.0,1339.0,4.0,313.02,1252.08


In [6]:
hotel.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 40552 entries, 0 to 40551
Data columns (total 8 columns):
 #   Column      Non-Null Count  Dtype  
---  ------      --------------  -----  
 0   travelCode  40552 non-null  int64  
 1   userCode    40552 non-null  int64  
 2   name        40552 non-null  object 
 3   place       40552 non-null  object 
 4   days        40552 non-null  int64  
 5   price       40552 non-null  float64
 6   total       40552 non-null  float64
 7   date        40552 non-null  object 
dtypes: float64(2), int64(3), object(3)
memory usage: 2.5+ MB


In [7]:
hotel.isnull().sum()

travelCode    0
userCode      0
name          0
place         0
days          0
price         0
total         0
date          0
dtype: int64

In [8]:
# Converting date column to datetime format to handle inconsistencies in date format
hotel['date'] = pd.to_datetime(hotel['date'], errors='coerce')

In [9]:
# Label encoding for categorical columns to convert them into numeric values
label_encoder_name = LabelEncoder()
hotel['name'] = label_encoder_name.fit_transform(hotel['name'])

label_encoder_place = LabelEncoder()
hotel['place'] = label_encoder_place.fit_transform(hotel['place'])

In [10]:
# Selecting features and target variable 
X = hotel[['travelCode', 'userCode', 'days', 'price', 'total']]
y_name = hotel['name']
y_place = hotel['place']
y_price = hotel['price']

#  Splitting the hotel into training and testing sets  
X_train, X_test, y_name_train, y_name_test = train_test_split(X, y_name, test_size=0.2, random_state=42)
_, _, y_place_train, y_place_test = train_test_split(X, y_place, test_size=0.2, random_state=42)
_, _, y_price_train, y_price_test = train_test_split(X, y_price, test_size=0.2, random_state=42)

In [11]:
# Train the model for hotel name prediction
model_name = RandomForestClassifier()
model_name.fit(X_train, y_name_train)

# Train the model for hotel place prediction
model_place = RandomForestClassifier()
model_place.fit(X_train, y_place_train)

# Train the model for hotel price prediction  
model_price = RandomForestRegressor()
model_price.fit(X_train, y_price_train)

# Make predictions
y_name_pred = model_name.predict(X_test)
y_place_pred = model_place.predict(X_test)
y_price_pred = model_price.predict(X_test)

# Print the classification report
print("Hotel Name Prediction Report:\n", classification_report(y_name_test, y_name_pred))
print("*" * 60)
print("Hotel Place Prediction Report:\n", classification_report(y_place_test, y_place_pred))
print("*" * 60)
print("Hotel Price Prediction Report:\n", mean_squared_error(y_price_test, y_price_pred))


Hotel Name Prediction Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       655
           1       1.00      1.00      1.00      1006
           2       1.00      1.00      1.00       894
           3       1.00      1.00      1.00       969
           4       1.00      1.00      1.00       841
           5       1.00      1.00      1.00       896
           6       1.00      1.00      1.00       997
           7       1.00      1.00      1.00      1025
           8       1.00      1.00      1.00       828

    accuracy                           1.00      8111
   macro avg       1.00      1.00      1.00      8111
weighted avg       1.00      1.00      1.00      8111

************************************************************
Hotel Place Prediction Report:
               precision    recall  f1-score   support

           0       1.00      1.00      1.00       828
           1       1.00      1.00      1.00       841
        

In [12]:
# Sample data for fitting label encoder (ensure these cover all possible labels)
hotel_names = ['Hotel A', 'Hotel K', 'Hotel Z']
hotel_places = ['Florianopolis (SC)', 'Salvador (BH)', 'Aracaju (SE)']

# Initialize and fit the LabelEncoder for hotel names and places
label_encoder_name = LabelEncoder()
label_encoder_place = LabelEncoder()

label_encoder_name.fit(hotel_names)
label_encoder_place.fit(hotel_places)

# Example: Making a prediction
sample_data = pd.DataFrame({
    'travelCode': [0],
    'userCode': [0],
    'days': [4],
    'price': [313.02],
    'total': [1252.08]
})

# Assuming model_name, model_place, and model_price are already trained
predicted_name = model_name.predict(sample_data)
predicted_place = model_place.predict(sample_data)
predicted_price = model_price.predict(sample_data)

# Check if the predicted labels are within the range of the fitted labels
print(f"Predicted name index: {predicted_name[0]}")
print(f"Predicted place index: {predicted_place[0]}")

# Ensure the predictions are within the range of labels known to the encoder
if max(predicted_name) < len(label_encoder_name.classes_) and max(predicted_place) < len(label_encoder_place.classes_):
    # Inverse transform to get original labels
    print("Predicted Hotel Name:", label_encoder_name.inverse_transform(predicted_name))
    print("Predicted Hotel Place:", label_encoder_place.inverse_transform(predicted_place))
else:
    print("Error: Predicted labels are out of the known range of the encoder")

print("Predicted Hotel Price:", predicted_price)


Predicted name index: 0
Predicted place index: 3
Error: Predicted labels are out of the known range of the encoder
Predicted Hotel Price: [313.02]


In [13]:
def predict_hotel(travelCode, userCode, days, price, total):
    sample_data = pd.DataFrame({
        'travelCode': [travelCode],
        'userCode': [userCode],
        'days': [days],
        'price': [price],
        'total': [total]
    })

    predicted_name = model_name.predict(sample_data)
    predicted_place = model_place.predict(sample_data)
    predicted_price = model_price.predict(sample_data)

    # Defensive check for unseen labels
    if predicted_place[0] >= len(label_encoder_place.classes_):
        place = f"Unknown label: {predicted_place[0]}"
    else:
        place = label_encoder_place.inverse_transform(predicted_place)[0]

    if predicted_name[0] >= len(label_encoder_name.classes_):
        name = f"Unknown label: {predicted_name[0]}"
    else:
        name = label_encoder_name.inverse_transform(predicted_name)[0]

    return {
        'name': name,
        'place': place,
        'price': predicted_price[0]
    }

# Example prediction
print(predict_hotel(0, 0, 4, 313.02, 1252.08))

{'name': np.str_('Hotel A'), 'place': 'Unknown label: 3', 'price': np.float64(313.02000000000703)}


In [14]:
# Format: predict_hotel(travelCode, userCode, days, price, total)

# 1. Short stay, low price
print(predict_hotel(1, 2, 2, 150.0, 300.0))

# 2. Medium stay, medium price
print(predict_hotel(3, 5, 5, 220.0, 1100.0))

# 3. Long stay, high price
print(predict_hotel(7, 1, 10, 450.0, 4500.0))

# 4. Edge case: minimum values
print(predict_hotel(0, 0, 1, 100.0, 100.0))

# 5. Edge case: maximum plausible values
print(predict_hotel(15, 10, 30, 1000.0, 30000.0))

{'name': np.str_('Hotel K'), 'place': 'Unknown label: 8', 'price': np.float64(139.09999999999954)}
{'name': 'Unknown label: 8', 'place': np.str_('Aracaju (SE)'), 'price': np.float64(208.03999999999968)}
{'name': np.str_('Hotel A'), 'place': 'Unknown label: 3', 'price': np.float64(313.02000000000703)}
{'name': np.str_('Hotel K'), 'place': 'Unknown label: 8', 'price': np.float64(139.0999999999998)}
{'name': np.str_('Hotel A'), 'place': 'Unknown label: 3', 'price': np.float64(313.02000000000703)}


In [15]:
# Save models and encoders
joblib.dump(model_name, 'model_name.joblib')
joblib.dump(model_place, 'model_place.joblib')
joblib.dump(model_price, 'model_price.joblib')
joblib.dump(label_encoder_name, 'label_encoder_name.joblib')
joblib.dump(label_encoder_place, 'label_encoder_place.joblib')

['label_encoder_place.joblib']