# Objetivo do projeto

O objetivo deste projeto é classificar reservas de hotel em três faixas de preço usando um modelo de aprendizado de máquina. As faixas são determinadas com base no preço médio por quarto das reservas. Utilizamos o dataset de reservas de hotel para treinar um modelo de classificação e avaliar seu desempenho.

# Bibliotecas utilizadas

In [293]:
!pip install pandas matplotlib scikit-learn



In [294]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split

In [295]:
import sagemaker
import boto3
from sagemaker import Session
import sagemaker.amazon.common as smac # sagemaker commom library
from sagemaker.estimator import Estimator
from sagemaker import get_execution_role
from sagemaker.xgboost import XGBoost
import io
import os



In [296]:
from sagemaker import image_uris
from sagemaker.serializers import CSVSerializer
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score

# 3. Carregamanto do Dataset
O primeiro passo foi carregar o dataset a partir de um arquivo CSV. Este dataset contém informações sobre reservas de hotel, incluindo o preço médio por quarto (avg_price_per_room), entre outras variáveis.


In [297]:
# 1. Carregar o dataset
hotel_data = pd.read_csv('Hotel Reservations.csv')
hotel_data


Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.00,0,Not_Canceled
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.00,0,Canceled
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.00,0,Canceled
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.50,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,INN36271,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,167.80,1,Not_Canceled
36271,INN36272,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,90.95,2,Canceled
36272,INN36273,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,98.39,2,Not_Canceled
36273,INN36274,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,94.50,0,Canceled


# 4. Criação da Coluna de Rótulo
Baseado na coluna avg_price_per_room, criamos uma nova coluna chamada label_avg_price_per_room, que classifica as reservas em três categorias:

1: Preços menores ou iguais a 85

2: Preços entre 85 e 115

3: Preços maiores ou iguais a 115

Depois disso, removemos a coluna original avg_price_per_room, pois não é mais necessária

In [298]:
# 2. Criar a coluna 'label_avg_price_per_room' com base nas regras definidas
hotel_data['label_avg_price_per_room'] = pd.cut(
    hotel_data['avg_price_per_room'],
    bins=[-float('inf'), 85, 115, float('inf')],
    labels=[1, 2, 3]
)
hotel_data.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,avg_price_per_room,no_of_special_requests,booking_status,label_avg_price_per_room
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,65.0,0,Not_Canceled,1
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,106.68,1,Not_Canceled,2
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,60.0,0,Canceled,1
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,100.0,0,Canceled,2
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,94.5,0,Canceled,2


In [299]:
# Excluir a coluna 'avg_price_per_room'
hotel_data = hotel_data.drop(columns=['avg_price_per_room'])

In [300]:
hotel_data.head()

Unnamed: 0,Booking_ID,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,booking_status,label_avg_price_per_room
0,INN00001,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,Not_Canceled,1
1,INN00002,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,1,Not_Canceled,2
2,INN00003,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,0,Canceled,1
3,INN00004,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,0,Canceled,2
4,INN00005,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,0,Canceled,2


In [301]:
hotel_data.columns

Index(['Booking_ID', 'no_of_adults', 'no_of_children', 'no_of_weekend_nights',
       'no_of_week_nights', 'type_of_meal_plan', 'required_car_parking_space',
       'room_type_reserved', 'lead_time', 'arrival_year', 'arrival_month',
       'arrival_date', 'market_segment_type', 'repeated_guest',
       'no_of_previous_cancellations', 'no_of_previous_bookings_not_canceled',
       'no_of_special_requests', 'booking_status', 'label_avg_price_per_room'],
      dtype='object')

In [302]:
colunas = []
colunas.append('label_avg_price_per_room')
for i in range (len(hotel_data.columns[:-1])):
    if not hotel_data.columns[i] == 'Booking_ID':
      colunas.append(hotel_data.columns[i])

hotel_data = hotel_data[colunas]
hotel_data

Unnamed: 0,label_avg_price_per_room,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,booking_status
0,1,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,Not_Canceled
1,2,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,1,Not_Canceled
2,1,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,0,Canceled
3,2,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,0,Canceled
4,2,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,3,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,1,Not_Canceled
36271,2,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,2,Canceled
36272,2,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,2,Not_Canceled
36273,2,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,0,Canceled


In [303]:
hotel_data['label_avg_price_per_room'] = hotel_data['label_avg_price_per_room'].apply(lambda x: x - 1)
hotel_data

Unnamed: 0,label_avg_price_per_room,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,type_of_meal_plan,required_car_parking_space,room_type_reserved,lead_time,arrival_year,arrival_month,arrival_date,market_segment_type,repeated_guest,no_of_previous_cancellations,no_of_previous_bookings_not_canceled,no_of_special_requests,booking_status
0,0,2,0,1,2,Meal Plan 1,0,Room_Type 1,224,2017,10,2,Offline,0,0,0,0,Not_Canceled
1,1,2,0,2,3,Not Selected,0,Room_Type 1,5,2018,11,6,Online,0,0,0,1,Not_Canceled
2,0,1,0,2,1,Meal Plan 1,0,Room_Type 1,1,2018,2,28,Online,0,0,0,0,Canceled
3,1,2,0,0,2,Meal Plan 1,0,Room_Type 1,211,2018,5,20,Online,0,0,0,0,Canceled
4,1,2,0,1,1,Not Selected,0,Room_Type 1,48,2018,4,11,Online,0,0,0,0,Canceled
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,2,3,0,2,6,Meal Plan 1,0,Room_Type 4,85,2018,8,3,Online,0,0,0,1,Not_Canceled
36271,1,2,0,1,3,Meal Plan 1,0,Room_Type 1,228,2018,10,17,Online,0,0,0,2,Canceled
36272,1,2,0,2,6,Meal Plan 1,0,Room_Type 1,148,2018,7,1,Online,0,0,0,2,Not_Canceled
36273,1,2,0,0,3,Not Selected,0,Room_Type 1,63,2018,4,21,Online,0,0,0,0,Canceled


In [304]:
hotel_data = pd.get_dummies(hotel_data, prefix=['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_status'], dtype=float,
                   columns=['type_of_meal_plan', 'room_type_reserved', 'market_segment_type', 'booking_status'])
hotel_data

Unnamed: 0,label_avg_price_per_room,no_of_adults,no_of_children,no_of_weekend_nights,no_of_week_nights,required_car_parking_space,lead_time,arrival_year,arrival_month,arrival_date,...,room_type_reserved_Room_Type 5,room_type_reserved_Room_Type 6,room_type_reserved_Room_Type 7,market_segment_type_Aviation,market_segment_type_Complementary,market_segment_type_Corporate,market_segment_type_Offline,market_segment_type_Online,booking_status_Canceled,booking_status_Not_Canceled
0,0,2,0,1,2,0,224,2017,10,2,...,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,1.0
1,1,2,0,2,3,0,5,2018,11,6,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
2,0,1,0,2,1,0,1,2018,2,28,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
3,1,2,0,0,2,0,211,2018,5,20,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
4,1,2,0,1,1,0,48,2018,4,11,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
36270,2,3,0,2,6,0,85,2018,8,3,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
36271,1,2,0,1,3,0,228,2018,10,17,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0
36272,1,2,0,2,6,0,148,2018,7,1,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0
36273,1,2,0,0,3,0,63,2018,4,21,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,1.0,0.0


# 5. Divisão dos dados
Dividimos os dados em características (X) e rótulos (y). Em seguida, separámos os dados em conjuntos de treino e teste, utilizando 80% dos dados para treino e 20% para teste.

In [305]:
base_treinamento = hotel_data.iloc[0:25392,:]
base_treinamento.shape

(25392, 32)

In [306]:
base_teste = hotel_data.iloc[25392:,:]
base_teste.shape

(10883, 32)

In [307]:
X_teste = base_teste.iloc[:, 1:len(hotel_data)].values
X_teste

array([[3., 0., 2., ..., 1., 0., 1.],
       [1., 0., 1., ..., 0., 0., 1.],
       [1., 0., 0., ..., 1., 0., 1.],
       ...,
       [2., 0., 2., ..., 1., 0., 1.],
       [2., 0., 0., ..., 1., 1., 0.],
       [2., 0., 1., ..., 0., 0., 1.]])

In [308]:
base_treinamento.to_csv('reservas_train_xgboost.csv', header = False, index = False)
base_teste.to_csv('reservas_test_xgboost.csv', header = False, index = False)

# Configuração do SageMaker

In [337]:
session = Session()
bucket = 'sprint-4-5-aws'
subpasta_modelo = 'modelos/xgboost'
subpasta_dataset = 'dataset'
key_train = 'reservas-train-data-xgboost'
key_test = 'reservas-test-xgboost'
role = sagemaker.get_execution_role()
s3_train_data = 's3://{}/{}/train/{}'.format(bucket, subpasta_dataset, key_train)
s3_test_data = 's3://{}/{}/test/{}'.format(bucket, subpasta_dataset, key_test)
output_location = 's3://{}/{}/output'.format(bucket, subpasta_modelo)
print('Role: {}'.format(role))
print('Localização da base de treinamento: {}'.format(s3_train_data))
print('Localização da base de teste: {}'.format(s3_test_data))
print('Modelo final será salvo em: {}'.format(output_location))

Role: arn:aws:iam::767397691387:role/service-role/AmazonSageMaker-ExecutionRole-20240905T202363
Localização da base de treinamento: s3://sprint-4-5-aws/dataset/train/reservas-train-data-xgboost
Localização da base de teste: s3://sprint-4-5-aws/dataset/test/reservas-test-xgboost
Modelo final será salvo em: s3://sprint-4-5-aws/modelos/xgboost/output


In [338]:
with open('reservas_train_xgboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'train', key_train)).upload_fileobj(f)

In [339]:
with open('reservas_test_xgboost.csv', 'rb') as f:
    boto3.Session().resource('s3').Bucket(bucket).Object(os.path.join(subpasta_dataset, 'test', key_test)).upload_fileobj(f)

# 7. Treinamento do Modelo

In [340]:
container = image_uris.retrieve(framework = 'xgboost', region=boto3.Session().region_name, version='latest')


INFO:sagemaker.image_uris:Ignoring unnecessary instance type: None.


In [341]:
xgboost = sagemaker.estimator.Estimator(image_uri = container,
                                        role = role,
                                        instance_count = 1,
                                        instance_type = 'ml.m5.2xlarge',
                                        output_path = output_location,
                                        sagemaker_session = session)

In [342]:
xgboost.set_hyperparameters(num_round = 100, objective = 'multi:softmax', num_class=3)


In [343]:
train_input = sagemaker.inputs.TrainingInput(s3_data = s3_train_data, content_type='csv', s3_data_type = 'S3Prefix')
validation_input = sagemaker.inputs.TrainingInput(s3_data = s3_test_data, content_type='csv', s3_data_type = 'S3Prefix')
data_channels = {'train': train_input, 'validation': validation_input}

In [344]:
xgboost.fit(data_channels)


INFO:sagemaker:Creating training-job with name: xgboost-2024-09-14-03-53-13-224


2024-09-14 03:53:14 Starting - Starting the training job...
2024-09-14 03:53:29 Starting - Preparing the instances for training...
2024-09-14 03:54:08 Downloading - Downloading the training image...
2024-09-14 03:54:44 Training - Training image download completed. Training in progress...[34mArguments: train[0m
[34m[2024-09-14:03:55:02:INFO] Running standalone xgboost training.[0m
[34m[2024-09-14:03:55:02:INFO] File size need to be processed in the node: 3.64mb. Available memory size in the node: 23840.98mb[0m
[34m[2024-09-14:03:55:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[03:55:02] S3DistributionType set as FullyReplicated[0m
[34m[03:55:02] 25392x31 matrix with 787152 entries loaded from /opt/ml/input/data/train?format=csv&label_column=0&delimiter=,[0m
[34m[2024-09-14:03:55:02:INFO] Determined delimiter of CSV input is ','[0m
[34m[03:55:02] S3DistributionType set as FullyReplicated[0m
[34m[03:55:02] 10883x31 matrix with 337373 entries loaded from /opt/m

# Deploy , previsão e avaliação

In [345]:
xgboost_classifier = xgboost.deploy(initial_instance_count = 1, instance_type = 'ml.m4.xlarge')

INFO:sagemaker:Creating model with name: xgboost-2024-09-14-03-56-00-595
INFO:sagemaker:Creating endpoint-config with name xgboost-2024-09-14-03-56-00-595
INFO:sagemaker:Creating endpoint with name xgboost-2024-09-14-03-56-00-595


------!

In [348]:
xgboost_classifier.serializer = CSVSerializer()

In [349]:
X_teste.shape, type(X_teste)

((10883, 31), numpy.ndarray)

In [350]:
previsoes = np.array(xgboost_classifier.predict(X_teste).decode('utf-8').split(',')).astype(np.float32)
previsoes

array([2., 0., 0., ..., 1., 1., 2.], dtype=float32)

In [351]:
previsoes.shape, y_teste.shape

((10883,), (10883,))

In [352]:
cm = confusion_matrix(y_teste, previsoes)
cm

array([[2797,  499,   62],
       [ 360, 3500,  351],
       [  41,  407, 2866]])

In [353]:
accuracy_score(y_teste, previsoes)

0.84195534319581

In [354]:
print(classification_report(y_teste, previsoes))

              precision    recall  f1-score   support

           0       0.87      0.83      0.85      3358
           1       0.79      0.83      0.81      4211
           2       0.87      0.86      0.87      3314

    accuracy                           0.84     10883
   macro avg       0.85      0.84      0.85     10883
weighted avg       0.84      0.84      0.84     10883



# Tuning dos Parêmetros