## Treinamento do modelo

In [125]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
import sagemaker
import boto3
from sagemaker import Session

df = pd.read_csv("dataframe/Hotel Reservations tratado.csv")

In [126]:
df.columns

Index(['label_avg_price_per_room', 'room_type_reserved_1',
       'room_type_reserved_2', 'room_type_reserved_3', 'room_type_reserved_4',
       'room_type_reserved_5', 'room_type_reserved_6', 'room_type_reserved_7',
       'arrival_year_2017', 'arrival_year_2018', 'no_of_adults_0',
       'no_of_adults_1', 'no_of_adults_2', 'no_of_adults_3', 'no_of_adults_4',
       'no_of_children_0', 'no_of_children_1', 'no_of_children_2',
       'no_of_children_3', 'no_of_children_9', 'no_of_children_10',
       'no_of_special_requests_0', 'no_of_special_requests_1',
       'no_of_special_requests_2', 'no_of_special_requests_3',
       'no_of_special_requests_4', 'no_of_special_requests_5'],
      dtype='object')

In [127]:
df.shape

(36275, 27)

### Dropando colunas não numéricas e com pouca correlação com a coluna alvo do treinamento

### Separação dos dados de treino e teste

In [90]:
data_treinamento = df.iloc[0:30000,:]
data_treinamento.shape

(30000, 29)

In [91]:
data_teste = df.iloc[30000:,:]
data_teste.shape

(6275, 29)

In [116]:
x_teste = data_teste.iloc[:,:28].values


In [117]:
x_teste.shape

(6275, 28)

In [119]:
y_teste = data_teste.iloc[:,28].values
y_teste

array([0, 0, 0, ..., 0, 0, 1])

In [101]:
data_treinamento.to_csv('hotel-reservations-treino.csv', header = False, index = False)

In [102]:
data_teste.to_csv('hotel-reservations-teste.csv', header = False, index = False)

### Definição das pastas onde serão salvos os dados de treino, teste e o modelo treinado

In [103]:
session = sagemaker.Session()
bucket = 'modelo-treinado-grupo4'
pasta_modelo = 'modelos/xgboost'
pasta_dataset = 'datasets'
key_train = 'hotel-reservations-treino'
key_test = 'hotel-reservations-teste'
role = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/treino/{}'.format(bucket, pasta_dataset, key_train)
s3_test_data = 's3://{}/{}/teste/{}'.format(bucket, pasta_dataset, key_test)
output_location = 's3://{}/{}/output'.format(bucket, pasta_modelo)
print('Role: {}'.format(role))
print('Dados de treinamento: {}'.format(s3_train_data))
print('Dados de teste: {}'.format(s3_test_data))
print('Modelo será salvo em: {}'.format(output_location))

Role: arn:aws:iam::528260343431:role/service-role/AmazonSageMaker-ExecutionRole-20230125T090598
Dados de treinamento: s3://modelo-treinado-grupo4/datasets/treino/hotel-reservations-treino
Dados de teste: s3://modelo-treinado-grupo4/datasets/teste/hotel-reservations-teste
Modelo será salvo em: s3://modelo-treinado-grupo4/modelos/xgboost/output


In [104]:
import os
with open('hotel-reservations-treino.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(pasta_dataset, 'treino', key_train)).upload_fileobj(f)

In [105]:
import os
with open('hotel-reservations-teste.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(pasta_dataset, 'teste', key_test)).upload_fileobj(f)

In [106]:
from sagemaker import image_uris
container = image_uris.retrieve(framework = 'xgboost', region=boto3.Session().region_name, version='latest')

### Definição da instância, número de épocas e dados que serão utilizados para o treinamento:

In [107]:
xgboost = sagemaker.estimator.Estimator(image_uri = container,
                                        role = role,
                                        instance_count = 1,
                                        instance_type = 'ml.m5.xlarge',
                                        output_path = output_location,
                                        sagemaker_session = session)

In [108]:
xgboost.set_hyperparameters(num_round = 100)

In [109]:
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv', s3_data_type = 'S3Prefix')
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, content_type='csv', s3_data_type = 'S3Prefix')
data_channels = {'train': train_input, 'validation': validation_input}

### Treinamento do modelo

In [110]:
xgboost.fit(data_channels)

2023-01-30 14:41:31 Starting - Starting the training job...
2023-01-30 14:41:55 Starting - Preparing the instances for trainingProfilerReport-1675089691: InProgress
......
2023-01-30 14:42:55 Downloading - Downloading input data...
2023-01-30 14:43:30 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2023-01-30:14:43:42:INFO] Running standalone xgboost training.[0m
[34m[2023-01-30:14:43:42:INFO] File size need to be processed in the node: 2.01mb. Available memory size in the node: 8301.41mb[0m
[34m[2023-01-30:14:43:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:43:42] S3DistributionType set as FullyReplicated[0m
[34m[14:43:42] 30000x28 matrix with 840000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-01-30:14:43:42:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:43:42] S3DistributionType set as FullyReplicated[0m
[34m[14:43:42] 6275x28 matrix with 17

### Deploy do modelo e previsões

In [111]:
xgboost_regressor = xgboost.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

---------!

In [112]:
from sagemaker.serializers import CSVSerializer
xgboost_regressor.serializer = CSVSerializer()

In [120]:
previsoes = np.array(xgboost_regressor.predict(x_teste).decode('utf-8').split(',')).astype(np.float32)

In [131]:
previsoes = (previsoes >= 0.5)
previsoes

array([False, False, False, ..., False, False, False])

### Testes de acurácia do modelo

In [132]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, confusion_matrix, classification_report, accuracy_score
import math
mae = mean_absolute_error(y_teste, previsoes)
mse = mean_squared_error(y_teste, previsoes)
rmse = math.sqrt(mean_squared_error(y_teste, previsoes))
print('MAE = ', mae, '\nMSE = ', mse, '\nRMSE = ', rmse)

MAE =  0.32812749003984065 
MSE =  0.32812749003984065 
RMSE =  0.5728241353503191


In [133]:
accuracy_score(y_teste,previsoes)

0.6718725099601593

In [134]:
mtx = confusion_matrix(y_teste, previsoes)
mtx

array([[4199,    8],
       [2051,   17]])

ValueError: Classification metrics can't handle a mix of multiclass and continuous targets

In [135]:
print(classification_report(y_teste, previsoes))

              precision    recall  f1-score   support

           0       0.67      1.00      0.80      4207
           1       0.68      0.01      0.02      2068

    accuracy                           0.67      6275
   macro avg       0.68      0.50      0.41      6275
weighted avg       0.67      0.67      0.54      6275

