# Arquivo para treinamento do modelo, inferências e testes

In [1]:
import sys
from pathlib import Path

# Take the project root as the directory two levels above the notebook's
# current working directory.
notebook_dir = Path.cwd()
project_root = notebook_dir.parents[1]

# Append the project root and its data/notebook folders to the Python import
# path, in this exact order, so modules living there can be imported below.
extra_import_dirs = (
    project_root,
    project_root / 'data',
    project_root / 'data' / 'processed',
    project_root / 'data' / 'external',
    project_root / 'notebooks',
    project_root / 'notebooks' / 'exploratory',
    project_root / 'notebooks' / 'modeling',
)
sys.path.extend(str(directory) for directory in extra_import_dirs)

In [None]:
import pandas as pd 
import numpy as np
from data.processed.data_prepare import prepared_base

# Load data: bind the dataset exported by data.processed.data_prepare to a
# notebook-level name used by the following cells.
base_hotel = prepared_base
# Bare last expression so the notebook renders the object as the cell output.
base_hotel

In [None]:
def _bool_to_int(column):
    """Return the column cast to int when its dtype is bool, otherwise unchanged."""
    if column.dtype == 'bool':
        return column.astype(int)
    return column


# Replace boolean columns with their integer (0/1) representation.
base_hotel = base_hotel.apply(_bool_to_int)

In [None]:
# Positional train/test split: the first 28000 rows go to training and every
# remaining row goes to testing.
# NOTE(review): nothing is shuffled here — confirm the rows are not ordered in
# a way that biases the split.
split_row = 28000
base_train = base_hotel.iloc[:split_row, :]
base_test = base_hotel.iloc[split_row:, :]

In [None]:
# Column 0 holds the target variable; columns 1 to 28 (inclusive) hold the
# feature variables.
target_position = 0
feature_positions = slice(1, 29)

# Plain NumPy arrays for the test split.
y_test = base_test.iloc[:, target_position].values
X_test = base_test.iloc[:, feature_positions].values


In [None]:
# Sanity check: print the shape of each split and of the test arrays, one per line.
for split in (base_train, base_test, X_test, y_test):
    print(split.shape)

In [None]:
# Destination files for the train and test splits (relative to the notebook's
# working directory).
base_train_path = '../../data/raw/hotel_reservations_train_xgboost.csv'
base_test_path = '../../data/raw/hotel_reservations_test_xgboost.csv'

# Write both splits without a header row and without the index column.
for split_frame, csv_path in (
    (base_train, base_train_path),
    (base_test, base_test_path),
):
    split_frame.to_csv(csv_path, header=False, index=False)

# Configurações SageMaker

In [None]:
import os
from dotenv import load_dotenv

# Load environment variables through python-dotenv before reading them.
load_dotenv()

# AWS settings read from the environment; each one is None when the variable
# is not set.
profile_name = os.environ.get("PROFILE_NAME")
role_arn = os.environ.get("ROLE_ARN")

In [None]:
import sagemaker
import boto3
from sagemaker import Session

# Create the boto3 session for the configured profile, register the same
# profile as boto3's default session, and wrap it in a SageMaker session.
boto_session = boto3.Session(profile_name=profile_name)
boto3.setup_default_session(profile_name=profile_name)
session = sagemaker.Session(boto_session)
role = role_arn

# Create the project bucket.
bucket_name = "bucket-sprint5-compassuol"
s3_client = boto_session.client('s3')

# S3 only accepts a CreateBucket request without a location constraint in
# us-east-1; in every other region the request must name the region, otherwise
# it is rejected and the bucket is never created.
bucket_region = boto_session.region_name
create_bucket_args = {'Bucket': bucket_name}
if bucket_region and bucket_region != 'us-east-1':
    create_bucket_args['CreateBucketConfiguration'] = {
        'LocationConstraint': bucket_region
    }

try:
    response = s3_client.create_bucket(**create_bucket_args)
    print(f"Bucket '{bucket_name}' criado com sucesso.")
except s3_client.exceptions.BucketAlreadyOwnedByYou:
    # The bucket is already ours, so the rest of the notebook can use it.
    print(f"O bucket '{bucket_name}' já existe e é de sua propriedade.")
except s3_client.exceptions.BucketAlreadyExists:
    # The name is taken by another account.
    print(f"O bucket '{bucket_name}' já existe, mas não é de sua propriedade.")
except Exception as e:
    # Any other failure is reported and the notebook keeps running (best effort).
    print(f"Erro ao criar o bucket: {e}")

In [None]:
# S3 layout: key prefixes for model artifacts and datasets, plus the object
# names of the train and test files.
subpasta_modelo = 'modelos/hotel-reservations/xgboost'
subpasta_dataset = 'datasets/hotel-reservations'
key_train = 'hotel-train-data-xgboost'
key_test = 'hotel-test-data-xgboost'

# Full S3 URIs for the training data, the test data and the model output.
s3_train_data = f's3://{bucket_name}/{subpasta_dataset}/train/{key_train}'
s3_test_data = f's3://{bucket_name}/{subpasta_dataset}/test/{key_test}'
output_location = f's3://{bucket_name}/{subpasta_modelo}/output'

# Echo the resolved configuration, one value per line.
# NOTE(review): this writes the role ARN and profile name into the saved
# notebook output — confirm that is acceptable before sharing the notebook.
for value in (output_location, role_arn, profile_name):
    print(value)

In [None]:
# Sending Csv files to S3
import os
# Object key for the training file; backslashes are replaced so the key uses
# forward slashes regardless of the OS path separator.
s3_path = os.path.join(subpasta_dataset, 'train', key_train).replace('\\', '/')
train_object = boto_session.resource('s3').Bucket(bucket_name).Object(s3_path)

# Stream the local training CSV into the S3 object.
with open(base_train_path, 'rb') as train_file:
    train_object.upload_fileobj(train_file)

print(s3_path)

In [None]:
# Object key for the test file; backslashes are replaced so the key uses
# forward slashes regardless of the OS path separator.
s3_path = os.path.join(subpasta_dataset, 'test', key_test).replace('\\', '/')
test_object = boto_session.resource('s3').Bucket(bucket_name).Object(s3_path)

# Stream the local test CSV into the S3 object.
with open(base_test_path, 'rb') as test_file:
    test_object.upload_fileobj(test_file)

print(s3_path)

# Treinamento XGBoost

In [None]:
from sagemaker import image_uris

# Retrieve the URI of the Docker image for the XGBoost framework.
# The region is read from the profile-bound session created earlier: a fresh
# boto3.Session() does not carry that profile and can resolve a different
# region (or none), which would point at an image in the wrong region.
container = image_uris.retrieve(
    framework='xgboost',               # Machine learning framework
    region=boto_session.region_name,   # Region of the session used everywhere else
    version='1.7-1',                   # XGBoost framework version
)


In [None]:
# Hyperparameters for the XGBoost training job, assembled from two groups.

# Learning task: 3-class classification ('multi:softmax') monitored with
# multiclass log-loss.
task_settings = {
    'objective': 'multi:softmax',
    'num_class': 3,
    'eval_metric': 'mlogloss',
}

# Booster settings, stored as strings.
# NOTE(review): the long decimals look like the output of a tuning run — confirm their origin.
booster_settings = {
    'gamma': '0.6307462738756113',             # Minimum loss reduction to split a leaf further
    'lambda': '0.13870950469471877',           # L2 regularization term on weights
    'colsample_bytree': '0.9580688142306052',  # Column subsample ratio per tree
    'eta': '0.05375196116547447',              # Learning rate (step size shrinkage)
    'max_depth': '9',                          # Maximum tree depth
    'min_child_weight': '2.629400053328948',   # Minimum sum of instance weight in a child
    'subsample': '0.9553898205800991',         # Row subsample ratio
}

# Merge in the original key order and finish with the number of boosting rounds.
params = {**task_settings, **booster_settings, 'num_round': 1000}


In [None]:
# Settings for the SageMaker training job that runs the XGBoost container.
estimator_settings = {
    'image_uri': container,            # Container image of the XGBoost algorithm
    'role': role,                      # IAM role SageMaker assumes to access AWS resources
    'instance_count': 1,               # Number of training instances
    'instance_type': 'ml.m5.large',    # Instance type used for training
    'output_path': output_location,    # S3 path where model artifacts are stored
    'sagemaker_session': session,      # SageMaker session object
    'use_spot_instances': True,        # Train on EC2 Spot instances
    'max_run': 3600,                   # Maximum training run time, in seconds
    'max_wait': 3600,                  # Maximum wait time for spot instances, in seconds
    'hyperparameters': params,         # XGBoost hyperparameters
}

# Create the XGBoost Estimator object from those settings.
xgboost = sagemaker.estimator.Estimator(**estimator_settings)


In [None]:
def _csv_channel(s3_uri):
    """Build a TrainingInput for CSV data stored under the given S3 prefix."""
    return sagemaker.inputs.TrainingInput(
        s3_data=s3_uri,
        content_type='csv',
        s3_data_type='S3Prefix',
    )


# Inputs for the training data and for the validation data (the test split).
train_input = _csv_channel(s3_train_data)
validation_input = _csv_channel(s3_test_data)

# Data channels handed to the training job.
data_channels = {'train': train_input, 'validation': validation_input}


In [None]:
# Launch the training job on the configured channels under a fixed job name.
# NOTE(review): SageMaker is expected to reject a job name already used in the
# same account/region, so re-running this cell likely needs a new name — confirm.
job = 'XGBoost-Sprint5'
xgboost.fit(inputs=data_channels, job_name=job)

# Inferences

In [None]:
import xgboost as xgb


# Key of the trained model artifact inside the bucket. It is derived from the
# model prefix and job name used for training, so it follows them if either
# changes instead of silently pointing at a stale path.
model_file_key = f'{subpasta_modelo}/output/{job}/output/model.tar.gz'
local_model_path = 'model.tar.gz'

# S3 client bound to the same profile-based session as the rest of the notebook.
s3 = boto_session.client('s3')

# Download the model archive from S3 to the working directory.
s3.download_file(bucket_name, model_file_key, local_model_path)

In [None]:
import tarfile  # Import the tarfile module for working with tar archives

# Directory the archive is unpacked into (the current working directory).
extract_root = os.path.realpath(os.getcwd())

# Open the tar archive in read mode
with tarfile.open(local_model_path, 'r:gz') as tar:
    # Get a list of all files in the tar archive
    tar_list = tar.getnames()
    print("Files in the tar archive:", tar_list)

    # Refuse archives whose member names would land outside the extraction
    # directory (absolute paths or '..' components).
    for member_name in tar_list:
        member_target = os.path.realpath(os.path.join(extract_root, member_name))
        if os.path.commonpath([extract_root, member_target]) != extract_root:
            raise ValueError(f"Unsafe path in model archive: {member_name!r}")

    # Extract all files; use tarfile's 'data' extraction filter when the
    # running Python version provides it.
    if hasattr(tarfile, 'data_filter'):
        tar.extractall(filter='data')
    else:
        tar.extractall()

# Define the model file name
model_file = 'xgboost-model'

# Fail here with a clear message instead of letting a later cell fail on a
# missing file.
if not os.path.exists(model_file):
    raise FileNotFoundError(
        f"'{model_file}' was not found after extracting '{local_model_path}'"
    )

# Read the first 4 bytes of the model file to show its header
with open(model_file, 'rb') as f:
    file_header = f.read(4)
    print("File header:", file_header)


In [None]:
# Name of the model file in the working directory.
model_file = 'xgboost-model'

# Create an empty Booster, then load the model file into it.
model = xgb.Booster()
model.load_model(fname=model_file)

In [None]:
# Import necessary libraries for evaluation and visualization
from sklearn.metrics import accuracy_score, classification_report, confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns

# Class ids of the 3-class model; used for the confusion matrix rows/columns
# and for the heatmap tick labels.
class_labels = [0, 1, 2]

# Convert the test data into the DMatrix format required by XGBoost
dtest = xgb.DMatrix(X_test)

# Make predictions using the trained model
previsoes = model.predict(dtest)

# Round predictions to the nearest integer so they can be compared with y_test
previsoes_rounded = np.round(previsoes).astype(int)

# Calculate and print the accuracy
accuracy = accuracy_score(y_test, previsoes_rounded)
print(f'Acurácia: {accuracy * 100:.2f}%')

# Print the classification report
print(classification_report(y_test, previsoes_rounded))

# Generate the confusion matrix. The label set is pinned so the matrix is
# always 3x3 and lines up with the tick labels, even when a class is missing
# from both the test targets and the predictions.
conf_matrix = confusion_matrix(y_test, previsoes_rounded, labels=class_labels)

# Plot the confusion matrix on an explicit figure/axes pair
fig, ax = plt.subplots(figsize=(7, 4))
sns.heatmap(
    conf_matrix,
    annot=True,
    fmt='d',
    cmap='Blues',
    xticklabels=class_labels,
    yticklabels=class_labels,
    ax=ax,
)
ax.set_xlabel('Predicted Label')
ax.set_ylabel('True Label')
ax.set_title('Confusion Matrix')
plt.show()
