In [2]:
#-- Código Samara --#

In [3]:
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline
import numpy as np
import seaborn as sns
import sagemaker
import boto3
from sagemaker import Session

# Read the hotel reservations CSV into the working DataFrame `df`.
dataset_path = "dataframe/Hotel Reservations tratado.csv"
df = pd.read_csv(dataset_path)

In [4]:
# Show the column labels of df (rendered as this cell's output).
df.columns

Index(['Booking_ID', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'no_of_special_requests', 'booking_status', 'label_avg_price_per_room'],
      dtype='object')

In [5]:
# Keep only the columns used for modelling, in a fixed order.
# Later cells slice by position (features = first 7 columns, label = 8th), so the
# order is pinned explicitly here. Unlike an in-place drop, this selection can be
# re-run safely: selecting these columns from the already-reduced df is a no-op
# rather than a KeyError.
colunas_modelo = [
    'no_of_adults',
    'no_of_children',
    'required_car_parking_space',
    'arrival_year',
    'arrival_month',
    'arrival_date',
    'no_of_special_requests',
    'label_avg_price_per_room',  # target; must stay last for the positional slices
]
df = df[colunas_modelo]

In [6]:
# Training split: the first 30000 rows of df (by position), all columns.
data_treinamento = df.iloc[:30000]
data_treinamento.shape

(30000, 8)

In [7]:
# Test split: every row of df from position 30000 onward, all columns.
data_teste = df.iloc[30000:]
data_teste.shape

(6275, 8)

In [8]:
# Test features as a NumPy array: the first 7 columns of data_teste.
x_teste = data_teste.iloc[:, 0:7].to_numpy()


In [9]:
# Peek at the first test feature row.
x_teste[0, :]

array([   2,    0,    0, 2017,   10,   16,    0])

In [10]:
# Test targets as a NumPy array: the column at position 7 of data_teste.
y_teste = data_teste.iloc[:, 7].to_numpy()
y_teste

array([2, 1, 2, ..., 2, 2, 3])

In [11]:
# Write the training split as a header-less CSV with the label in the FIRST column.
# The training container parses CSV input with label_column=0 (see the training log),
# while data_treinamento keeps the label as its last column. Without this reordering
# the first feature column (no_of_adults) is silently used as the target.
coluna_alvo = 'label_avg_price_per_room'
colunas_features = [c for c in data_treinamento.columns if c != coluna_alvo]
data_treinamento[[coluna_alvo] + colunas_features].to_csv('hotel-reservations-treino.csv', header = False, index = False)

In [12]:
# Write the test split as a header-less CSV with the label in the FIRST column.
# This file is uploaded and used as the 'validation' channel, which the training
# container parses with label_column=0 (see the training log). data_teste keeps the
# label as its last column, so it is moved to the front before writing.
coluna_alvo = 'label_avg_price_per_room'
colunas_features = [c for c in data_teste.columns if c != coluna_alvo]
data_teste[[coluna_alvo] + colunas_features].to_csv('hotel-reservations-teste.csv', header = False, index = False)

In [13]:
# SageMaker session and the execution role used by the training job.
session = sagemaker.Session()
role = sagemaker.get_execution_role()

# S3 layout: bucket, key prefixes and object names for datasets and model output.
bucket = 'modelo-treinado-grupo4'
pasta_modelo = 'modelos/xgboost'
pasta_dataset = 'datasets'
key_train = 'hotel-reservations-treino'
key_test = 'hotel-reservations-teste'

# Full S3 URIs derived from the pieces above.
s3_train_data = f's3://{bucket}/{pasta_dataset}/treino/{key_train}'
s3_test_data = f's3://{bucket}/{pasta_dataset}/teste/{key_test}'
output_location = f's3://{bucket}/{pasta_modelo}/output'

# Echo the resolved configuration so it is visible in the cell output.
print(f'Role: {role}')
print(f'Dados de treinamento: {s3_train_data}')
print(f'Dados de teste: {s3_test_data}')
print(f'Modelo será salvo em: {output_location}')

Role: arn:aws:iam::528260343431:role/service-role/AmazonSageMaker-ExecutionRole-20230125T090598
Dados de treinamento: s3://modelo-treinado-grupo4/datasets/treino/hotel-reservations-treino
Dados de teste: s3://modelo-treinado-grupo4/datasets/teste/hotel-reservations-teste
Modelo será salvo em: s3://modelo-treinado-grupo4/modelos/xgboost/output


In [14]:
import os
# Upload the local training CSV to S3.
# The object key is joined with '/' explicitly: os.path.join uses the local
# platform's separator, which would not match the '/'-separated S3 URI in
# s3_train_data when run on a platform whose separator is not '/'.
chave_treino = '/'.join([pasta_dataset, 'treino', key_train])
with open('hotel-reservations-treino.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(chave_treino).upload_fileobj(f)

In [15]:
import os
# Upload the local test CSV to S3.
# The object key is joined with '/' explicitly: os.path.join uses the local
# platform's separator, which would not match the '/'-separated S3 URI in
# s3_test_data when run on a platform whose separator is not '/'.
chave_teste = '/'.join([pasta_dataset, 'teste', key_test])
with open('hotel-reservations-teste.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(chave_teste).upload_fileobj(f)

In [16]:
from sagemaker import image_uris
# Resolve the XGBoost training image URI for the region of the current boto3 session.
# NOTE(review): version='latest' is not pinned; the response parsing further down
# splits predictions on ',' and may depend on this image's output format. Confirm
# before changing the version.
aws_region = boto3.Session().region_name
container = image_uris.retrieve(
    framework = 'xgboost',
    region = aws_region,
    version = 'latest',
)

In [17]:
# Estimator describing the training job: image, role, a single ml.m5.xlarge
# instance, and the S3 location where the output is written.
estimator_config = {
    'image_uri': container,
    'role': role,
    'instance_count': 1,
    'instance_type': 'ml.m5.xlarge',
    'output_path': output_location,
    'sagemaker_session': session,
}
xgboost = sagemaker.estimator.Estimator(**estimator_config)

In [18]:
# Only num_round is set explicitly; every other hyperparameter keeps the image default.
hiperparametros = {'num_round': 100}
xgboost.set_hyperparameters(**hiperparametros)

In [19]:
# Both channels are CSV objects addressed by S3 prefix; only the S3 URI differs.
opcoes_csv = {'content_type': 'csv', 's3_data_type': 'S3Prefix'}
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, **opcoes_csv)
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, **opcoes_csv)

# Channel names passed to the estimator's fit call.
data_channels = dict(train = train_input, validation = validation_input)

In [20]:
# Launch the training job on the train/validation channels; logs stream into the cell output.
xgboost.fit(inputs = data_channels)

2023-01-27 14:30:50 Starting - Starting the training job...
2023-01-27 14:31:14 Starting - Preparing the instances for trainingProfilerReport-1674829850: InProgress
......
2023-01-27 14:32:15 Downloading - Downloading input data...
2023-01-27 14:32:40 Training - Training image download completed. Training in progress..[34mArguments: train[0m
[34m[2023-01-27:14:32:53:INFO] Running standalone xgboost training.[0m
[34m[2023-01-27:14:32:53:INFO] File size need to be processed in the node: 0.69mb. Available memory size in the node: 8282.27mb[0m
[34m[2023-01-27:14:32:53:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:32:53] S3DistributionType set as FullyReplicated[0m
[34m[14:32:53] 30000x7 matrix with 210000 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2023-01-27:14:32:53:INFO] Determined delimiter of CSV input is ','[0m
[34m[14:32:53] S3DistributionType set as FullyReplicated[0m
[34m[14:32:53] 6275x7 matrix with 43925

In [None]:
# Deploy the trained estimator to one ml.m4.xlarge instance; the returned object
# is used by the following cells to request predictions.
opcoes_endpoint = {
    'initial_instance_count': 1,
    'instance_type': 'ml.m4.xlarge',
}
xgboost_regressor = xgboost.deploy(**opcoes_endpoint)

--------

In [None]:
from sagemaker.serializers import CSVSerializer
# Attach a CSV serializer to the deployed predictor for the prediction requests below.
serializador_csv = CSVSerializer()
xgboost_regressor.serializer = serializador_csv

In [None]:
# Request predictions for every test row in one call. The response is decoded as
# UTF-8 text, split on commas, and converted to a float32 array.
resposta_bruta = xgboost_regressor.predict(x_teste)
valores_texto = resposta_bruta.decode('utf-8').split(',')
previsoes = np.array(valores_texto).astype(np.float32)

In [None]:
# Spot-check: predicted vs. actual value for the test row at position 2.
(previsoes[2], y_teste[2])

In [None]:
from sklearn.metrics import mean_absolute_error, mean_squared_error
import math
# Error metrics of the predictions against the held-out targets.
mae = mean_absolute_error(y_teste, previsoes)
mse = mean_squared_error(y_teste, previsoes)
# RMSE is the square root of the MSE computed on the line above.
rmse = math.sqrt(mse)
print('MAE = ', mae, '\nMSE = ', mse, '\nRMSE = ', rmse)