# Tratamento dos dados

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import datetime

In [None]:
# https://www.kaggle.com/competitions/bike-sharing-demand/data
# First look at the file: read it with pandas' default parsing and display it.
base = pd.read_csv('./csv-files/train.csv')
base

In [None]:
# Read the file again, this time parsing the 'datetime' column as timestamps
# and using the first column of the file as the row index.
base = pd.read_csv(
    './csv-files/train.csv',
    index_col=0,
    parse_dates=['datetime'],
)
base

In [None]:
# Remove the listed columns in place; the cells below only read 'count'.
# `axis` is not passed because `columns=` already names the axis to drop from.
base.drop(
    columns=[
        'season', 'holiday', 'workingday', 'weather', 'temp', 'atemp',
        'humidity', 'windspeed', 'casual', 'registered',
    ],
    inplace=True,
)
base

In [None]:
# Collapse the rows into one total per calendar day.
#
# The grouping key is the index with the time-of-day set to midnight, so the
# result contains only the days that actually have rows.  Grouping with
# pd.Grouper(freq='D') would instead emit every calendar day between the first
# and last timestamp, and .sum() fills the empty ones with 0.  The Kaggle
# train.csv only covers days 1-19 of each month, so that would inject
# fabricated zero-rental days into the plot, the training data and the scores.
base = base.groupby(base.index.normalize()).sum()
base

In [None]:
# Line chart of the 'count' column over the frame's index.
plt.plot(base['count'])
plt.title('Aluguel de bicicletas')
# The trailing ';' keeps the notebook from echoing the returned object.
plt.ylabel('Quantidade');

# Configurações do SageMaker

In [None]:
import boto3
import sagemaker
from sagemaker import RandomCutForest

In [None]:
session = sagemaker.Session()

# Bucket, key prefixes and IAM role name used throughout the notebook.
bucket = 'sagemaker-curso-bucket'
subpasta_dataset = 'datasets/bike/random-cut'
subpasta_modelo = 'modelos/bike/random-cut'
key = 'bike-random-cut'
role = "AmazonSageMaker-ExecutionRole-20240702T173175"

# S3 URIs for the training data and for the model artifacts.
s3_data = f's3://{bucket}/{subpasta_dataset}/{key}'
output_location = f's3://{bucket}/{subpasta_modelo}'

# Probe the bucket before going further, so a missing or inaccessible bucket
# surfaces here rather than during training.
boto3.Session().client('s3').head_bucket(Bucket=bucket)

print('Localização da base de dados: ', s3_data)
print('Localização do modelo: ', output_location)

# Treinamento do Random Cut

In [None]:
# Shape of the 'count' values as a single-column 2-D array: (rows, 1).
base[['count']].to_numpy().shape

In [None]:
# https://docs.aws.amazon.com/sagemaker/latest/dg/randomcutforest.html
# https://docs.aws.amazon.com/sagemaker/latest/dg/rcf_hyperparameters.html
# https://sagemaker.readthedocs.io/en/stable/algorithms/sagemaker.amazon.amazon_estimator.html
# Random Cut Forest estimator on one ml.m4.xlarge instance, reading its data
# from `s3_data` and writing the model artifacts under `output_location`.
random_cut = RandomCutForest(
    role=role,
    instance_type='ml.m4.xlarge',
    instance_count=1,
    data_location=s3_data,
    output_path=output_location,
    # Spot training, with the same limit for run time and total wait time.
    use_spot_instances=True,
    max_run=3600,
    max_wait=3600,
)

# Train on the 'count' values, passed as a single-column 2-D array.
random_cut.fit(
    random_cut.record_set(base['count'].to_numpy().reshape(-1, 1))
)

# Deploy e inferências

In [None]:
# Deploy the trained estimator to an endpoint backed by a single
# ml.m4.xlarge instance; the returned predictor is used for inference below.
random_cut_bikes = random_cut.deploy(
    instance_type='ml.m4.xlarge',
    initial_instance_count=1,
)

In [None]:
from sagemaker.deserializers import JSONDeserializer
from sagemaker.serializers import CSVSerializer

# Requests to the endpoint are sent as CSV and responses are decoded from JSON.
random_cut_bikes.deserializer = JSONDeserializer()
random_cut_bikes.serializer = CSVSerializer()

In [None]:
# Same single-column 2-D layout that was used for training.
base_numpy = base['count'].to_numpy().reshape(-1, 1)
# Preview the first five rows together with the overall shape.
base_numpy[:5], base_numpy.shape

In [None]:
# Send every row to the endpoint in a single request and display the decoded
# response; the next cell reads its 'scores' entry.
previsoes = random_cut_bikes.predict(base_numpy)
previsoes

In [None]:
# Flatten the response into a plain list holding each entry's 'score' value.
# NOTE: this rebinds `previsoes`, so the cell cannot be re-run without
# re-running the prediction cell first.
previsoes = [registro['score'] for registro in previsoes['scores']]

In [None]:
# First five scores.
previsoes[:5]

In [None]:
# Add the scores returned by the algorithm as a new 'score' column, aligned
# with the existing index of the frame.
base['score'] = pd.Series(previsoes, index = base.index)
base

In [None]:
# Summary statistics of the frame, which now includes the 'score' column.
base.describe()

In [None]:
# Lowest score, highest score, and 1.4x the highest score (the value used as
# the upper y-limit of the score axis in the plot below).
min(previsoes), max(previsoes), 1.4 * max(previsoes)

In [None]:
# Daily count and score on one figure, each with its own y-axis.
fig, ax1 = plt.subplots(figsize=(20, 10))
# https://matplotlib.org/stable/api/_as_gen/matplotlib.axes.Axes.twinx.html
ax2 = ax1.twinx()

# Left axis, in blue: the 'count' column.
ax1.plot(base['count'], color='b')
ax1.set_ylabel('Quantidade', color='b')
ax1.tick_params(axis='y', colors='b')

# Right axis, in red: the 'score' column.
ax2.plot(base['score'], color='r')
ax2.set_ylabel('Score', color='r')
ax2.tick_params(axis='y', colors='r')

# Leave headroom above the highest score; the trailing ';' hides the echo.
ax2.set_ylim(min(previsoes), 1.4 * max(previsoes));

In [None]:
# Mean of the 'score' column.
media_score = base['score'].mean()
media_score

In [None]:
# Standard deviation of the 'score' column (pandas' default estimator).
desvio_padrao_score = base['score'].std()
desvio_padrao_score

In [None]:
# Cutoff: three standard deviations above the mean score. Rows scoring
# above this value are selected as outliers in the next cell.
corte_score = media_score + 3 * desvio_padrao_score
corte_score

In [None]:
# Keep only the rows whose score is strictly above the cutoff.
outliers = base.loc[base['score'] > corte_score]
outliers

In [None]:
# https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.plot.html
# Overlay the outliers on the score axis of the existing figure as black
# circle markers with no connecting line, then show the figure again.
ax2.plot(
    outliers.index,
    outliers['score'],
    color='k',
    marker='o',
    linestyle='None',
)
fig

In [None]:
# Tear down the endpoint created by deploy() once the analysis is finished.
random_cut_bikes.delete_endpoint()