# Arquivo para treinamento do modelo, inferências e testes

In [22]:
# Register the project directories on the Python import path - use this as a
# template for your own project.
import sys

_PROJECT_ROOT = '/home/luizdantas/sprints-4-5-pb-aws-maio'
# Sub-directories relative to the root; the empty entry is the root itself.
_PROJECT_SUBDIRS = (
    '',
    '/data',
    '/data/processed',
    '/data/external',
    '/notebooks',
    '/notebooks/exploratory',
    '/notebooks/modeling',
)
# Entries are appended in the order listed above.
sys.path.extend(_PROJECT_ROOT + subdir for subdir in _PROJECT_SUBDIRS)

In [None]:
import pandas as pd 
import numpy as np
from data.processed.data_prepare import prepared_base

# Load data
# `prepared_base` is imported from data.processed.data_prepare. The assignment
# only binds a second name to that same object; no copy is made here.
base_hotel = prepared_base
# Bare expression: as the last statement of a notebook cell it displays the value.
base_hotel

In [None]:
# Replace booleans: cast every boolean column to int and leave the other
# columns as they are.
def _bool_column_to_int(column):
    """Return *column* cast to int when its dtype is bool, otherwise unchanged."""
    if column.dtype == 'bool':
        return column.astype(int)
    return column


base_hotel = base_hotel.apply(_bool_column_to_int)

In [None]:
# Positional split: the first 28000 rows form the training set and every
# remaining row forms the test set (no shuffling is applied here).
_TRAIN_ROWS = 28000
base_train = base_hotel.iloc[:_TRAIN_ROWS, :]
base_test = base_hotel.iloc[_TRAIN_ROWS:, :]

In [None]:
# Split the test frame into plain arrays: positions 1 through 28 become the
# feature matrix and position 0 becomes the target vector.
_feature_columns = slice(1, 29)
_target_column = 0
X_test = base_test.iloc[:, _feature_columns].values
y_test = base_test.iloc[:, _target_column].values

In [None]:
# Sanity check: print the shape of each dataset, one per line, in the order
# train frame, test frame, test features, test target.
for _dataset in (base_train, base_test, X_test, y_test):
    print(_dataset.shape)

In [19]:
# Local CSV destinations for the two splits.
base_train_path = '../../data/raw/hotel_reservations_train_xgboost.csv'
base_test_path = '../../data/raw/hotel_reservations_test_xgboost.csv'

# Both files are written without a header row and without the index column.
for _frame, _destination in (
    (base_train, base_train_path),
    (base_test, base_test_path),
):
    _frame.to_csv(_destination, header=False, index=False)

# Configurações SageMaker

In [None]:
import os
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before reading from it.
load_dotenv()

# Environment variables (each one is None when the variable is not set)
_environment = os.environ
profile_name = _environment.get("PROFILE_NAME")
role_arn = _environment.get("ROLE_ARN")

In [None]:
import sagemaker
import boto3
from sagemaker import Session

# Create and configure the sessions: one boto3 session bound to the configured
# profile, the same profile as boto3's default session, and a SageMaker
# session built on top of the boto3 one.
boto_session = boto3.Session(profile_name=profile_name)
boto3.setup_default_session(profile_name=profile_name)
session = sagemaker.Session(boto_session)
role = role_arn

# Create the bucket
bucket_name = "bucket-sprint5-compassuol"
s3_client = boto_session.client('s3')

# S3 rejects CreateBucket in any region other than us-east-1 unless the request
# names that region as LocationConstraint; for us-east-1 the configuration must
# be omitted. Build the request accordingly from the session's region.
create_bucket_kwargs = {'Bucket': bucket_name}
bucket_region = boto_session.region_name
if bucket_region and bucket_region != 'us-east-1':
    create_bucket_kwargs['CreateBucketConfiguration'] = {
        'LocationConstraint': bucket_region,
    }

try:
    response = s3_client.create_bucket(**create_bucket_kwargs)
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    print(f"O bucket '{bucket_name}' já existe e é de sua propriedade.")
except s3_client.exceptions.BucketAlreadyExists:
    print(f"O bucket '{bucket_name}' já existe, mas não é de sua propriedade.")
except Exception as e:
    # Best effort: report the failure and let the notebook continue.
    print(f"Erro ao criar o bucket: {e}")
else:
    print(f"Bucket '{bucket_name}' criado com sucesso.")

In [None]:
# Key prefixes and object names used inside the bucket.
subpasta_modelo = 'modelos/hotel-reservations/xgboost'
subpasta_dataset = 'datasets/hotel-reservations'
key_train = 'hotel-train-data-xgboost'
key_test = 'hotel-test-data-xgboost'

# Full S3 URIs: the two datasets share a prefix, the model output has its own.
_dataset_root = f's3://{bucket_name}/{subpasta_dataset}'
s3_train_data = f'{_dataset_root}/train/{key_train}'
s3_test_data = f'{_dataset_root}/test/{key_test}'
output_location = f's3://{bucket_name}/{subpasta_modelo}/output'

# Echo the output location and the credentials configuration being used.
for _value in (output_location, role_arn, profile_name):
    print(_value)

In [None]:
import os
# Build the object key for the training file; backslashes are normalised to
# forward slashes so the key is the same regardless of the local separator.
s3_path = os.path.join(subpasta_dataset, 'train', key_train).replace('\\', '/')
_train_object = boto_session.resource('s3').Bucket(bucket_name).Object(s3_path)

# Stream the local training CSV to that object.
with open(base_train_path, 'rb') as _train_file:
    _train_object.upload_fileobj(_train_file)

print(s3_path)

In [None]:
# Build the object key for the test file; backslashes are normalised to
# forward slashes so the key is the same regardless of the local separator.
s3_path = os.path.join(subpasta_dataset, 'test', key_test).replace('\\', '/')
_test_object = boto_session.resource('s3').Bucket(bucket_name).Object(s3_path)

# Stream the local test CSV to that object.
with open(base_test_path, 'rb') as _test_file:
    _test_object.upload_fileobj(_test_file)

print(s3_path)

# Treinamento XGBoost

In [None]:
from sagemaker import image_uris

# Resolve the XGBoost training image for the region of the profile-bound
# session. A fresh `boto3.Session()` does not inherit the profile given to
# `boto_session`, so its region may differ from the one the job runs in, or be
# None, and the image must come from the same region as the training job.
container = image_uris.retrieve(
    framework='xgboost',
    region=boto_session.region_name,
    version='1.7-1',
)

In [None]:
# Hyperparameters forwarded to the SageMaker XGBoost container.
# `num_round` (number of boosting rounds) is required by the built-in
# algorithm; a training job submitted without it is rejected.
# NOTE(review): 100 is only a starting value - tune it, and add any further
# hyperparameters (objective, max_depth, eta, ...) the experiment needs.
params = {
    'num_round': 100,
}

In [None]:
# Training job settings gathered in one mapping: a single ml.m5.large spot
# instance, with both the run limit and the wait limit set to 3600 seconds.
_estimator_settings = dict(
    image_uri=container,
    role=role,
    instance_count=1,
    instance_type='ml.m5.large',
    output_path=output_location,
    sagemaker_session=session,
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
    hyperparameters=params,
)
xgboost = sagemaker.estimator.Estimator(**_estimator_settings)

In [None]:
def _csv_channel(s3_uri):
    """Build a CSV TrainingInput that treats *s3_uri* as an S3 prefix."""
    return sagemaker.inputs.TrainingInput(
        s3_data=s3_uri,
        content_type='csv',
        s3_data_type='S3Prefix',
    )


# The test split is passed to the job as its validation channel.
train_input = _csv_channel(s3_train_data)
validation_input = _csv_channel(s3_test_data)
data_channels = dict(train=train_input, validation=validation_input)

In [None]:
# Launch the training job under a fixed name.
# NOTE(review): the model download later in this file hard-codes this same
# name inside its S3 key, so the two must be changed together.
job = 'XGBoost-Sprint5'
xgboost.fit(inputs=data_channels, job_name=job)

# Inferences

In [None]:
import xgboost as xgb


# Where the trained artifact lives in the bucket and where to store it locally.
local_model_path = 'model.tar.gz'
model_file_key = 'modelos/hotel-reservations/xgboost/output/XGBoost-Sprint5/output/model.tar.gz'

# Fetch the archive using boto3's default session.
s3 = boto3.client('s3')
s3.download_file(
    Bucket=bucket_name,
    Key=model_file_key,
    Filename=local_model_path,
)

In [None]:
import tarfile

# Unpack the downloaded model archive into the current working directory.
with tarfile.open(local_model_path, 'r:gz') as tar:
    tar_list = tar.getnames()
    print("Files in the tar archive:", tar_list)

    # Extract all files. Where the interpreter supports extraction filters,
    # use the 'data' filter so members cannot escape the destination directory
    # (absolute paths, '..' components, links pointing outside) and so no
    # deprecation warning is raised on Python 3.12+. Older interpreters fall
    # back to the unfiltered behaviour.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()


# Peek at the first four bytes of the extracted model file, if it is there,
# to help identify the serialization format.
model_file = 'xgboost-model'
if os.path.exists(model_file):
    with open(model_file, 'rb') as f:
        file_header = f.read(4)
        print("File header:", file_header)

In [None]:
# Name of the model file extracted from the archive.
model_file = 'xgboost-model'

# Load model: the Booster constructor loads the given file on creation.
model = xgb.Booster(model_file=model_file)

In [None]:
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Score the held-out features with the loaded booster.
dtest = xgb.DMatrix(X_test)
previsoes = model.predict(dtest)

# Round predictions to the nearest integer so they can be compared to the labels
previsoes_rounded = np.round(previsoes).astype(int)

# Compute and print the accuracy
accuracy = accuracy_score(y_test, previsoes_rounded)
print(f'Acurácia: {accuracy * 100:.2f}%')

# Classification report
print(classification_report(y_test, previsoes_rounded))

# Confusion matrix
# Derive the class labels from the values actually present in the targets and
# the rounded predictions, then use that one list for both the matrix and the
# axis ticks. With hard-coded ticks [0, 1, 2] the axes would be mislabelled (or
# mismatched in length) whenever a class is absent or a rounded prediction
# falls outside 0..2. When exactly the classes 0, 1 and 2 occur, the plot is
# the same as before.
class_labels = sorted(set(previsoes_rounded.tolist()) | set(y_test.tolist()))
conf_matrix = confusion_matrix(y_test, previsoes_rounded, labels=class_labels)
plt.figure(figsize=(7, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
)
plt.xlabel('Predicted Label')
plt.ylabel('True Label')
plt.title('Confusion Matrix')
plt.show()