# Tratamento da base de dados

In [None]:
import pandas as pd 
import numpy as np
import xgboost as xgb
import numpy as np
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Load the raw hotel reservations dataset from the working directory.
base_hotel = pd.read_csv(filepath_or_buffer='HotelReservations.csv')

In [None]:
# Discard the booking identifier and the three count features that are not
# used from here on.
# Columns of the raw CSV, in order:
# Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
base_hotel = base_hotel.drop(
    columns=[
        'no_of_previous_bookings_not_canceled',
        'Booking_ID',
        'no_of_weekend_nights',
        'no_of_week_nights',
    ]
)

In [None]:
# Build the classification target from avg_price_per_room.
#
# The price is bucketed into three right-closed intervals and encoded directly
# as the integer class index:
#   0 -> price <= 85
#   1 -> 85 < price <= 115
#   2 -> price > 115
# labels=False makes pd.cut return the bin index itself, so there is no
# categorical intermediate (labels 1/2/3) and no follow-up .replace() remap.
# NOTE: a missing price would produce NaN here (and a float column).
base_hotel['label_avg_price_per_room'] = pd.cut(
    base_hotel['avg_price_per_room'],
    bins=[-float('inf'), 85, 115, float('inf')],
    labels=False,
)

# Move the target to the first column: the frames are later exported as
# headerless CSVs and the label is read back from column 0.
target_column = 'label_avg_price_per_room'
columns = [target_column] + [col for col in base_hotel.columns if col != target_column]
base_hotel = base_hotel[columns]

# Show the resulting DataFrame.
print(base_hotel)


In [None]:
# The raw price must not remain as a feature: the target label was derived
# from it.
base_hotel = base_hotel.drop(columns='avg_price_per_room')

In [None]:

# Bare expression: in a notebook this renders the DataFrame as the cell output.
base_hotel

In [None]:
# Training split: the first 28,000 rows, taken in file order (no shuffling).
base_treinamento = base_hotel.iloc[:28000]
print(base_treinamento.shape)

In [None]:
# Test split: every row from position 28,000 onwards.
base_teste = base_hotel.iloc[28000:]
print(base_teste.shape)


In [None]:
# One-hot encode the categorical features of both splits.
categorical_columns = ['room_type_reserved', 'market_segment_type', 'type_of_meal_plan', 'booking_status']

# dtype=int forces 0/1 indicator columns. Recent pandas versions default to
# bool, which would be exported as True/False in the CSV files.
base_treinamento = pd.get_dummies(base_treinamento, columns=categorical_columns, dtype=int)
base_teste = pd.get_dummies(base_teste, columns=categorical_columns, dtype=int)

# The two splits are encoded independently, so a category that is absent from
# one of them would give the frames different (or shifted) columns. The CSVs
# are written without a header, so such a mismatch would go unnoticed.
# Align the test frame to the training columns: categories unseen in the test
# split become all-zero columns; categories unseen in training are dropped.
base_teste = base_teste.reindex(columns=base_treinamento.columns, fill_value=0)

In [None]:
# Bare expression: in a notebook this renders the encoded test frame as the cell output.
base_teste


In [None]:
# Split the test frame into features and label for local evaluation.
# Column 0 holds the label; every remaining column is a feature. Slicing with
# an open upper bound (instead of a hard-coded 29) keeps all features even if
# the number of encoded columns changes.
# dtype=float yields a purely numeric array even when some columns are bool.
X_teste = base_teste.iloc[:, 1:].to_numpy(dtype=float)
y_teste = base_teste.iloc[:, 0].to_numpy()

In [None]:
# Sanity check: the feature matrix should have one column fewer than the frame.
print(base_teste.shape, X_teste.shape, sep='\n')

In [None]:
# Print the shapes of both splits and of the extracted test arrays, one per line.
print(
    base_treinamento.shape,
    base_teste.shape,
    X_teste.shape,
    y_teste.shape,
    sep='\n',
)


In [None]:
# Export both splits as CSV without header row and without the index column;
# these are the files uploaded to S3 below.
base_treinamento.to_csv(
    path_or_buf='hotel_reservations_train_xgboost.csv',
    index=False,
    header=False,
)
base_teste.to_csv(
    path_or_buf='hotel_reservations_test_xgboost.csv',
    index=False,
    header=False,
)

# Configurações SageMaker

In [None]:
import sagemaker
import boto3
from sagemaker import Session

# --- AWS sessions -----------------------------------------------------------
profile_name = "ester"
# Uses the default credential chain; pass profile_name=profile_name to
# boto3.Session() to use the named profile instead.
boto_session = boto3.Session()
boto3.setup_default_session()

session = sagemaker.Session(boto_session=boto_session)

# --- S3 layout --------------------------------------------------------------
# Alternative bucket: "bucketcsvdataset"
bucket_name = 'sagemaker-curso-bucket'

subpasta_modelo = 'modelos/hotel-reservationXGBoost3V1-7-1/xgboost'
subpasta_dataset = 'datasets/hotel-reservations'
key_train = 'hotel-train-data-xgboost'
key_test = 'hotel-test-data-xgboost'

# --- Execution role ---------------------------------------------------------
# Alternative (full ARN):
# 'arn:aws:iam::730335509042:role/service-role/AmazonSageMaker-ExecutionRole-20240702T161917'
role = "AmazonSageMaker-ExecutionRole-20240702T173175"

# --- Derived S3 URIs --------------------------------------------------------
s3_train_data = f's3://{bucket_name}/{subpasta_dataset}/train/{key_train}'
s3_test_data = f's3://{bucket_name}/{subpasta_dataset}/test/{key_test}'
output_location = f's3://{bucket_name}/{subpasta_modelo}/output'

print(s3_train_data, s3_test_data, output_location, sep='\n')

In [None]:
import os
# Object key of the training CSV inside the bucket ('/'-separated).
s3_path = '/'.join([subpasta_dataset, 'train', key_train])

# Stream the local training file to S3 under that key.
with open('hotel_reservations_train_xgboost.csv', 'rb') as f:
    train_object = boto_session.resource('s3').Bucket(bucket_name).Object(s3_path)
    train_object.upload_fileobj(f)

print(s3_path)

In [None]:
import os
# Object key of the test CSV inside the bucket ('/'-separated).
s3_path = '/'.join([subpasta_dataset, 'test', key_test])

# Stream the local test file to S3 under that key.
with open('hotel_reservations_test_xgboost.csv', 'rb') as f:
    test_object = boto_session.resource('s3').Bucket(bucket_name).Object(s3_path)
    test_object.upload_fileobj(f)

print(s3_path)

# Treinamento XGBoost

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/ecr-sa-east-1.html
from sagemaker import image_uris
# Look up the XGBoost 1.7-1 training image URI for the region of the default
# boto3 session.
container = image_uris.retrieve(
    'xgboost',
    boto3.Session().region_name,
    version='1.7-1',
)

In [None]:
# Training hyperparameters, passed as strings: DART booster, 200 boosting
# rounds, 3-class softmax objective evaluated with multi-class log loss.
hyperparameters = dict(
    booster="dart",
    num_round="200",
    num_class="3",
    objective="multi:softmax",
    eval_metric="mlogloss",
    min_child_weight="6",
)

In [None]:

# Estimator API reference:
# https://sagemaker.readthedocs.io/en/stable/api/training/estimators.html
#
# Single ml.m5.large spot instance; both the run time and the total wait time
# are capped at 3600 seconds. Model artifacts are written to output_location.
xgboost = sagemaker.estimator.Estimator(
    image_uri=container,
    role=role,
    sagemaker_session=session,
    hyperparameters=hyperparameters,
    output_path=output_location,
    instance_type='ml.m5.large',
    instance_count=1,
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)

In [None]:
# Describe the two CSV inputs stored under S3 prefixes; the uploaded test file
# is used as the validation channel.
train_input = sagemaker.inputs.TrainingInput(
    s3_train_data,
    s3_data_type='S3Prefix',
    content_type='csv',
)
validation_input = sagemaker.inputs.TrainingInput(
    s3_test_data,
    s3_data_type='S3Prefix',
    content_type='csv',
)
data_channels = dict(train=train_input, validation=validation_input)

In [None]:
# Launch the training job under an explicit job name.
job = 'XGBoost-V25'
xgboost.fit(inputs=data_channels, job_name=job)

# Inferências

In [None]:
import xgboost as xgb

import os

# S3 key of the artifact produced by the training job and the local path it
# is downloaded to.
model_file_key = 'modelos/hotel-reservationXGBoost3V1-7-1/xgboost/output/XGBoost-V25/output/model.tar.gz'
local_model_path = '../model/model.tar.gz'

# The download does not create missing directories, so make sure the target
# folder exists before fetching the archive.
os.makedirs(os.path.dirname(local_model_path), exist_ok=True)

s3 = boto3.client('s3')
s3.download_file(bucket_name, model_file_key, local_model_path)

In [None]:
import tarfile
import os
# Unpack the downloaded model archive into the current working directory.
with tarfile.open(local_model_path, 'r:gz') as tar:
    tar_list = tar.getnames()
    print("Files in the tar archive:", tar_list)

    # Extract all files. Where the interpreter supports extraction filters,
    # use the 'data' filter so archive members cannot escape the target
    # directory; older interpreters fall back to the plain call.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()


# Peek at the first bytes of the extracted model file as a quick sanity check
# that it exists and is readable.
model_file = 'xgboost-model'
if os.path.exists(model_file):
    with open(model_file, 'rb') as f:
        file_header = f.read(4)
        print("File header:", file_header)

In [None]:
# Path of the model file extracted from the archive.
model_file = 'xgboost-model'

# Create an empty booster and load the trained model into it.
model = xgb.Booster()
model.load_model(fname=model_file)

In [None]:
# Score the held-out test features with the downloaded model.
dtest = xgb.DMatrix(X_teste)
previsoes = model.predict(dtest)

# Round the predictions to the nearest integer so they can be compared with
# the integer class labels.
previsoes_rounded = np.round(previsoes).astype(int)

# Compute and print the accuracy.
accuracy = accuracy_score(y_teste, previsoes_rounded)
print(f'Acurácia: {accuracy * 100:.2f}%')

# Classification report (per-class precision / recall / F1).
print(classification_report(y_teste, previsoes_rounded))

# Confusion matrix. Passing labels explicitly keeps the matrix 3x3 and aligned
# with the tick labels even if a class is absent from both the true and the
# predicted labels.
class_labels = [0, 1, 2]
conf_matrix = confusion_matrix(y_teste, previsoes_rounded, labels=class_labels)
plt.figure(figsize=(7, 4))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', xticklabels=class_labels, yticklabels=class_labels)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()

In [None]:
%%sh
# Remove the artifacts created by the inference cells: the model file that was
# extracted into the working directory and the archive that was downloaded to
# ../model/. -f keeps the cell from failing when a file is already gone.
rm -f xgboost-model
rm -f ../model/model.tar.gz