In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split, RandomizedSearchCV
from sklearn.metrics import accuracy_score, classification_report
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier
import xgboost as xgb
from sklearn.preprocessing import LabelEncoder

# Load the hotel reservations dataset into a DataFrame
base_hotel = pd.read_csv(filepath_or_buffer='/content/Hotel Reservations.csv')

# Classificação de preço
def classify_price(price, low=85, high=115):
    """Bucket an average room price into one of three ordinal classes.

    Parameters
    ----------
    price : number
        Average price per room.
    low : number, default 85
        Inclusive upper bound of class 1.
    high : number, default 115
        Exclusive upper bound of class 2; prices at or above it are class 3.

    Returns
    -------
    int
        1 when ``price <= low``, 2 when ``low < price < high``, 3 otherwise.
        A value for which both comparisons are false (e.g. NaN) also yields 3.
    """
    if price <= low:
        return 1
    # Reaching this point already implies price > low for ordinary numbers.
    if price < high:
        return 2
    return 3

# Derive the target column from the raw price, then discard the raw price
base_hotel['label_avg_price_per_room'] = base_hotel['avg_price_per_room'].apply(classify_price)
base_hotel = base_hotel.drop('avg_price_per_room', axis=1)

# Move the label (appended last above) to the first position
colunas = ['label_avg_price_per_room', *base_hotel.columns[:-1]]
base_hotel = base_hotel[colunas]

# Columns excluded from the feature set
colunas_para_remover = [
    'Booking_ID',
    'arrival_date',
    'market_segment_type',
    'repeated_guest',
    'no_of_previous_cancellations',
    'no_of_previous_bookings_not_canceled',
    'booking_status',
]

# Drop only the ones that are actually present in the frame
colunas_existentes = [nome for nome in colunas_para_remover if nome in base_hotel.columns]
base_hotel = base_hotel.drop(columns=colunas_existentes).copy()

# One-hot encode the categorical columns; each dummy is prefixed with its source column name
colunas_categoricas = [
    'type_of_meal_plan',
    'required_car_parking_space',
    'room_type_reserved',
    'arrival_year',
    'arrival_month',
    'no_of_special_requests',
]
base_hotel = pd.get_dummies(base_hotel, prefix=colunas_categoricas, columns=colunas_categoricas)

# Split into target (first column) and float32 feature matrix (remaining columns)
y = base_hotel.iloc[:, 0].values
X = base_hotel.iloc[:, 1:].values.astype('float32')

# Encode the labels so that they start at 0
label_encoder = LabelEncoder().fit(y)
y_encoded = label_encoder.transform(y)

# 70/30 train/test split with a fixed seed
X_treinamento, X_teste, Y_treinamento, Y_teste = train_test_split(
    X,
    y_encoded,
    test_size=0.3,
    random_state=0,
)

# Hyperparameter space sampled by RandomizedSearchCV for XGBoost
param_distributions_xgb = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 15],
    learning_rate=[0.05, 0.1],
    subsample=[0.9, 1.0],
    colsample_bytree=[0.8, 0.9],
    gamma=[0, 0.2],
    min_child_weight=[1, 3],
    reg_alpha=[0, 0.1],
    reg_lambda=[1, 1.5],
)

# XGBoost classifier; the class count is taken from the encoded labels
n_classes = len(np.unique(y_encoded))
model_xgb = xgb.XGBClassifier(eval_metric='mlogloss', num_class=n_classes)

# Randomized search: 20 sampled configurations, 3-fold CV, scored by accuracy
random_search_xgb = RandomizedSearchCV(
    model_xgb,
    param_distributions_xgb,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)

# Run the search and keep the best estimator it found
best_model_xgb = random_search_xgb.fit(X_treinamento, Y_treinamento).best_estimator_

# Predict on the test split and map encoded classes back to the original labels
Y_teste_decoded = label_encoder.inverse_transform(Y_teste)
Y_pred_xgb_encoded = best_model_xgb.predict(X_teste)
Y_pred_xgb = label_encoder.inverse_transform(Y_pred_xgb_encoded)

# Report accuracy and per-class metrics for XGBoost
xgb_accuracy = accuracy_score(Y_teste_decoded, Y_pred_xgb)
xgb_report = classification_report(Y_teste_decoded, Y_pred_xgb)
print("XGBoost Accuracy Score:", xgb_accuracy)
print("XGBoost Classification Report:\n", xgb_report)

# Random Forest baseline: 100 trees, fixed seed, fitted on the training split
model_rf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_treinamento, Y_treinamento)

# Predict on the test split and map encoded classes back to the original labels
Y_pred_rf_encoded = model_rf.predict(X_teste)
Y_pred_rf = label_encoder.inverse_transform(Y_pred_rf_encoded)

# Report accuracy and per-class metrics for Random Forest
rf_accuracy = accuracy_score(Y_teste_decoded, Y_pred_rf)
rf_report = classification_report(Y_teste_decoded, Y_pred_rf)
print("Random Forest Accuracy Score:", rf_accuracy)
print("Random Forest Classification Report:\n", rf_report)

# MLP baseline.
# Multi-layer perceptrons are sensitive to feature scale, and the feature matrix
# mixes 0/1 dummy columns with raw numeric columns. The scaler lives inside the
# pipeline so it is fitted on the training split only and reapplied at predict time.
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler

model_mlp = make_pipeline(
    StandardScaler(),
    MLPClassifier(hidden_layer_sizes=(100,), max_iter=500, random_state=0),
)
model_mlp.fit(X_treinamento, Y_treinamento)

# Predict on the test split and map encoded classes back to the original labels
Y_pred_mlp_encoded = model_mlp.predict(X_teste)
Y_pred_mlp = label_encoder.inverse_transform(Y_pred_mlp_encoded)

# Report accuracy and per-class metrics for the MLP
print("MLP Accuracy Score:", accuracy_score(Y_teste_decoded, Y_pred_mlp))
print("MLP Classification Report:\n", classification_report(Y_teste_decoded, Y_pred_mlp))

Fitting 3 folds for each of 20 candidates, totalling 60 fits
XGBoost Accuracy Score: 0.7953689240099238
XGBoost Classification Report:
               precision    recall  f1-score   support

           1       0.80      0.79      0.80      3258
           2       0.76      0.76      0.76      4006
           3       0.83      0.84      0.83      3619

    accuracy                           0.80     10883
   macro avg       0.80      0.80      0.80     10883
weighted avg       0.80      0.80      0.80     10883

Random Forest Accuracy Score: 0.7848019847468529
Random Forest Classification Report:
               precision    recall  f1-score   support

           1       0.79      0.78      0.79      3258
           2       0.75      0.75      0.75      4006
           3       0.82      0.83      0.82      3619

    accuracy                           0.78     10883
   macro avg       0.79      0.79      0.79     10883
weighted avg       0.78      0.78      0.78     10883

MLP Accuracy Sc

# Carregamento e pré-processamento dos dados
# Treinamento e ajuste de hiperparâmetros para Random Forest, MLP e XGBoost
# Avaliação dos modelos
# Uso de Voting Classifier para combinar os modelos
# Validação cruzada

In [None]:
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import train_test_split, GridSearchCV, cross_val_score
from sklearn.ensemble import RandomForestClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from sklearn.metrics import accuracy_score, classification_report

# 1. Load and preprocess the data
base_hotel = pd.read_csv(filepath_or_buffer='/content/Hotel Reservations.csv')

# Função para classificar o preço
def classify_price(price, low=85, high=115):
    """Map an average room price to a price class (1, 2 or 3).

    Parameters
    ----------
    price : number
        Average price per room.
    low : number, default 85
        Prices less than or equal to this value are class 1.
    high : number, default 115
        Prices strictly between ``low`` and ``high`` are class 2; everything
        else is class 3.

    Returns
    -------
    int
        The price class. A value for which both comparisons are false
        (e.g. NaN) falls through to class 3.
    """
    if price <= low:
        label = 1
    elif price < high:  # price > low is already implied for ordinary numbers
        label = 2
    else:
        label = 3
    return label

# Create the target column 'label_avg_price_per_room' and drop the raw price
base_hotel['label_avg_price_per_room'] = base_hotel['avg_price_per_room'].apply(classify_price)
base_hotel = base_hotel.drop(columns=['avg_price_per_room'])

# Reorder the columns so the label (appended last above) comes first
colunas = ['label_avg_price_per_room'] + list(base_hotel.columns[:-1])
base_hotel = base_hotel[colunas]

# Columns excluded from the feature set; only the ones present are dropped
colunas_para_remover = ['Booking_ID', 'arrival_date', 'market_segment_type',
                         'repeated_guest', 'no_of_previous_cancellations',
                         'no_of_previous_bookings_not_canceled', 'booking_status']
colunas_existentes = [col for col in colunas_para_remover if col in base_hotel.columns]
base_hotel = base_hotel.drop(columns=colunas_existentes).copy()

# One-hot encode the categorical columns
base_hotel = pd.get_dummies(base_hotel, prefix=['type_of_meal_plan', 'required_car_parking_space',
                                                'room_type_reserved', 'arrival_year', 'arrival_month',
                                                'no_of_special_requests'],
                             columns=['type_of_meal_plan', 'required_car_parking_space',
                                      'room_type_reserved', 'arrival_year', 'arrival_month',
                                      'no_of_special_requests'])

# Define X (features) and y (label).
# Cast the features to float32 explicitly: when the dummy columns are boolean
# (the pandas >= 2.0 default), `.values` on the mixed frame is an object array
# and each estimator would otherwise have to coerce it implicitly.
X = base_hotel.iloc[:, 1:].values
X = np.array(X).astype('float32')
y = base_hotel.iloc[:, 0].values

# Encode the labels so that they start at 0
label_encoder = LabelEncoder()
y_encoded = label_encoder.fit_transform(y)

# Split the data into training and test sets (70/30, fixed seed)
X_treinamento, X_teste, Y_treinamento, Y_teste = train_test_split(X, y_encoded, test_size=0.3, random_state=0)

In [None]:
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from xgboost import XGBClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.neural_network import MLPClassifier

# Reduced GridSearchCV grid for XGBoost (8 combinations in total)
param_grid_xgb = dict(
    n_estimators=[100, 150],
    max_depth=[10, 15],
    learning_rate=[0.1],
    subsample=[0.8],
    colsample_bytree=[0.8, 0.9],
    gamma=[0],
    reg_alpha=[0],
    reg_lambda=[1],
)

# Exhaustive search with 2-fold CV, scored by accuracy
model_xgb = XGBClassifier(random_state=0)
grid_search_xgb = GridSearchCV(
    model_xgb,
    param_grid_xgb,
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=2,
)
best_xgb_model = grid_search_xgb.fit(X_treinamento, Y_treinamento).best_estimator_

print("Melhores hiperparâmetros para XGBoost:\n", grid_search_xgb.best_params_)

Fitting 2 folds for each of 8 candidates, totalling 16 fits
Melhores hiperparâmetros para XGBoost:
 {'colsample_bytree': 0.9, 'gamma': 0, 'learning_rate': 0.1, 'max_depth': 15, 'n_estimators': 100, 'reg_alpha': 0, 'reg_lambda': 1, 'subsample': 0.8}


In [None]:

# RandomizedSearchCV search space for Random Forest
param_dist_rf = dict(
    n_estimators=[50, 100, 200],
    max_depth=[10, 15, 20],
    min_samples_split=[2, 5, 10],
    min_samples_leaf=[1, 2, 4],
    bootstrap=[True, False],
)

# 20 sampled configurations, 3-fold CV, scored by accuracy, fixed seed
model_rf = RandomForestClassifier(random_state=0)
random_search_rf = RandomizedSearchCV(
    model_rf,
    param_dist_rf,
    n_iter=20,
    scoring='accuracy',
    cv=3,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)
best_rf_model = random_search_rf.fit(X_treinamento, Y_treinamento).best_estimator_

print("Melhores hiperparâmetros para Random Forest:\n", random_search_rf.best_params_)



Fitting 3 folds for each of 20 candidates, totalling 60 fits
Melhores hiperparâmetros para Random Forest:
 {'n_estimators': 100, 'min_samples_split': 2, 'min_samples_leaf': 2, 'max_depth': 20, 'bootstrap': False}


In [None]:
# Reduced RandomizedSearchCV configuration for the MLP.
# The MLP is wrapped in a pipeline with a StandardScaler because multi-layer
# perceptrons are sensitive to feature scale; keeping the scaler inside the
# pipeline means it is re-fitted on the training part of every CV fold.
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Parameter names carry the 'mlp__' prefix so they reach the pipeline's MLP step
param_dist_mlp = {
    'mlp__hidden_layer_sizes': [(50,), (100,)],
    'mlp__activation': ['relu'],
    'mlp__solver': ['adam'],
    'mlp__alpha': [0.0001, 0.001],
    'mlp__learning_rate': ['constant']
}

model_mlp = Pipeline([
    ('scaler', StandardScaler()),
    ('mlp', MLPClassifier(max_iter=200, random_state=0)),  # reduced number of iterations
])

# The space above has only 4 combinations; asking for more iterations than that
# makes RandomizedSearchCV warn and cap the value, so cap it explicitly.
n_combinations = int(np.prod([len(values) for values in param_dist_mlp.values()]))
random_search_mlp = RandomizedSearchCV(
    estimator=model_mlp,
    param_distributions=param_dist_mlp,
    n_iter=min(10, n_combinations),
    scoring='accuracy',
    cv=2,
    n_jobs=-1,
    verbose=2,
    random_state=0,
)
random_search_mlp.fit(X_treinamento, Y_treinamento)
best_mlp_model = random_search_mlp.best_estimator_

print("Melhores hiperparâmetros para MLP:\n", random_search_mlp.best_params_)

Fitting 2 folds for each of 4 candidates, totalling 8 fits




Melhores hiperparâmetros para MLP:
 {'solver': 'adam', 'learning_rate': 'constant', 'hidden_layer_sizes': (100,), 'alpha': 0.001, 'activation': 'relu'}


In [None]:
from sklearn.metrics import accuracy_score, classification_report

# 3. Model evaluation


def evaluate_model(name, model, X_eval, y_eval_encoded, encoder):
    """Print accuracy and a classification report for one fitted model.

    Predictions and ground truth are both mapped back through ``encoder`` so the
    report rows show the original price classes instead of the 0-based codes.

    Parameters
    ----------
    name : str
        Label used in the printed headings.
    model : fitted estimator exposing ``predict``
    X_eval : feature matrix to predict on
    y_eval_encoded : encoded ground-truth labels for ``X_eval``
    encoder : fitted label encoder exposing ``inverse_transform``

    Returns
    -------
    tuple
        ``(encoded_predictions, decoded_predictions)``.
    """
    pred_encoded = model.predict(X_eval)
    pred_decoded = encoder.inverse_transform(pred_encoded)
    true_decoded = encoder.inverse_transform(y_eval_encoded)
    print(f"{name} Accuracy Score:", accuracy_score(true_decoded, pred_decoded))
    print(f"{name} Classification Report:\n", classification_report(true_decoded, pred_decoded))
    return pred_encoded, pred_decoded


# XGBoost
Y_pred_xgb_encoded, Y_pred_xgb = evaluate_model("XGBoost", best_xgb_model, X_teste, Y_teste, label_encoder)

# Random Forest
Y_pred_rf_encoded, Y_pred_rf = evaluate_model("Random Forest", best_rf_model, X_teste, Y_teste, label_encoder)

# MLP
Y_pred_mlp_encoded, Y_pred_mlp = evaluate_model("MLP", best_mlp_model, X_teste, Y_teste, label_encoder)

XGBoost Accuracy Score: 0.799411926858403
XGBoost Classification Report:
               precision    recall  f1-score   support

           0       0.81      0.78      0.80      3258
           1       0.76      0.77      0.77      4006
           2       0.83      0.85      0.84      3619

    accuracy                           0.80     10883
   macro avg       0.80      0.80      0.80     10883
weighted avg       0.80      0.80      0.80     10883

Random Forest Accuracy Score: 0.7811265276118717
Random Forest Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.75      0.78      3258
           1       0.73      0.76      0.75      4006
           2       0.82      0.83      0.82      3619

    accuracy                           0.78     10883
   macro avg       0.78      0.78      0.78     10883
weighted avg       0.78      0.78      0.78     10883

MLP Accuracy Score: 0.7083524763392447
MLP Classification Report:
           

In [None]:
# 4. Voting Classifier

from sklearn.ensemble import VotingClassifier

# Hard-voting ensemble built from the three tuned estimators.
# NOTE: VotingClassifier.fit() fits clones of the estimators it is given, so the
# tuned hyperparameters are reused but every member is trained again here.
voting_clf = VotingClassifier(
    estimators=[
        ('xgb', best_xgb_model),  # tuned XGBoost configuration
        ('rf', best_rf_model),    # tuned Random Forest configuration
        ('mlp', best_mlp_model)   # tuned MLP configuration
    ],
    voting='hard'  # 'hard' voting predicts the class that receives the most votes
)

# Train the ensemble
voting_clf.fit(X_treinamento, Y_treinamento)

# Predict with the ensemble and map the codes back to the original price classes
Y_pred_voting_encoded = voting_clf.predict(X_teste)
Y_pred_voting = label_encoder.inverse_transform(Y_pred_voting_encoded)
Y_teste_decoded = label_encoder.inverse_transform(Y_teste)

# Evaluate on the decoded labels so the report rows show the real classes
print("Voting Classifier Accuracy Score:", accuracy_score(Y_teste_decoded, Y_pred_voting))
print("Voting Classifier Classification Report:\n", classification_report(Y_teste_decoded, Y_pred_voting))

Voting Classifier Accuracy Score: 0.789855738307452
Voting Classifier Classification Report:
               precision    recall  f1-score   support

           0       0.80      0.77      0.79      3258
           1       0.75      0.76      0.75      4006
           2       0.82      0.84      0.83      3619

    accuracy                           0.79     10883
   macro avg       0.79      0.79      0.79     10883
weighted avg       0.79      0.79      0.79     10883



In [None]:
from sklearn.model_selection import cross_val_score
import numpy as np

# 5. Cross-validation

# One entry per model: (estimator, heading for the fold scores, heading for the mean)
cv_plan = [
    (best_xgb_model, "XGBoost Cross-Validation Scores:", "XGBoost Mean Cross-Validation Score:"),
    (best_rf_model, "Random Forest Cross-Validation Scores:", "Random Forest Mean Cross-Validation Score:"),
    (best_mlp_model, "MLP Cross-Validation Scores:", "MLP Mean Cross-Validation Score:"),
]

# 5-fold cross-validation of each model on the full feature matrix
fold_scores_per_model = []
for estimator, scores_heading, mean_heading in cv_plan:
    fold_scores = cross_val_score(estimator, X, y_encoded, cv=5)
    print(scores_heading, fold_scores)
    print(mean_heading, np.mean(fold_scores))
    fold_scores_per_model.append(fold_scores)

# Keep the per-model score arrays available under their original names
cv_scores_xgb, cv_scores_rf, cv_scores_mlp = fold_scores_per_model

XGBoost Cross-Validation Scores: [0.79517574 0.80551344 0.79421089 0.80096485 0.79159201]
XGBoost Mean Cross-Validation Score: 0.7974913852515507
Random Forest Cross-Validation Scores: [0.77725706 0.78842178 0.773949   0.78001378 0.77574087]
Random Forest Mean Cross-Validation Score: 0.7790764989662302
MLP Cross-Validation Scores: [0.69069607 0.7145417  0.7061337  0.70640937 0.69345279]
MLP Mean Cross-Validation Score: 0.7022467263955893
