# Aprendizado de Máquina

In [10]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip
import plotly.graph_objects as go
import datetime
from datetime import datetime
from datetime import timedelta
from calendar import month_abbr
from os import environ

In [11]:
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import smape_loss

In [31]:
from numpy import ravel
import sklearn.model_selection
from sktime.forecasting.compose import RecursiveRegressionForecaster
from skopt import dummy_minimize
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer

In [4]:
# Download the gzip-compressed CSV, store it locally as data.csv.gz and load it into df.
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
r = requests.get(url, allow_redirects=True, timeout=60)
# Fail loudly on an HTTP error instead of saving an error page as if it were data.
r.raise_for_status()
# Context managers guarantee both file handles are closed, even if reading fails.
with open('data.csv.gz', 'wb') as arquivo:
    arquivo.write(r.content)
with gzip.open('data.csv.gz') as gz:
    df = pd.read_csv(gz)

## Validação de Modelos

In [13]:
# Per-date totals for the rows whose state is 'AM'.
# numeric_only=True restricts the aggregation to numeric columns: on pandas >= 2.0 the
# default would also "sum" (concatenate) text columns, which the median imputation
# applied later cannot handle.
total_de_casos_amazonas = df.query("state == 'AM'").groupby('date').sum(numeric_only=True)
total_de_casos_amazonas.index = pd.to_datetime(total_de_casos_amazonas.index)
# Declare a daily frequency on the DatetimeIndex.
total_de_casos_amazonas.index.freq = 'D'

In [14]:
# Random seed passed to the hyperparameter search and to the LightGBM models.
SEED = 4

In [19]:
# English weekday name -> Portuguese weekday name.
dici = dict(
    Monday='Segunda',
    Tuesday='Terça',
    Wednesday='Quarta',
    Thursday='Quinta',
    Friday='Sexta',
    Saturday='Sábado',
    Sunday='Domingo',
)

In [21]:
# Add a column with the weekday name of each date in the index
# (day_name() output; the keys of dici suggest English names are expected).
total_de_casos_amazonas['dia_da_semana'] = total_de_casos_amazonas.index.day_name()

In [23]:
# Translate the weekday names to Portuguese via dici.
# The names are derived from the index rather than from the column's current contents,
# so the cell is safe to re-run: looking up already-translated values in dici would
# raise KeyError.
total_de_casos_amazonas['dia_da_semana'] = total_de_casos_amazonas.index.day_name().map(dici)

In [24]:
# Weekdays (Portuguese names) whose newCases values are replaced by NaN further down.
ignorar_dias = ['Segunda', 'Domingo']

In [25]:
# Will hold the newCases values with the ignored weekdays blanked out; populated in the next cell.
nova_coluna = []

In [26]:
# Build the filtered column in one vectorized step: keep newCases on regular days and
# put NaN on the weekdays listed in ignorar_dias.
# nova_coluna is rebuilt from scratch (not appended to), so re-running this cell cannot
# leave duplicated entries and a length mismatch with the DataFrame.
dias_ignorados = total_de_casos_amazonas['dia_da_semana'].isin(ignorar_dias)
nova_coluna = total_de_casos_amazonas['newCases'].mask(dias_ignorados).tolist()

In [30]:
# Overwrite newCases with the filtered values (NaN on the ignored weekdays).
total_de_casos_amazonas['newCases'] = nova_coluna

In [58]:
# Median-impute the NaNs in every column except the last one.
# Note: the name `imputer` ends up bound to the transformed values, not to the
# SimpleImputer object; the next cell writes those values back into the frame.
colunas_para_imputar = total_de_casos_amazonas.iloc[:, :-1]
imputer = SimpleImputer(missing_values=np.nan, strategy='median').fit_transform(colunas_para_imputar)

In [59]:
# Write the imputed values back into every column except the last one.
total_de_casos_amazonas.iloc[:,:-1] = imputer

In [66]:
# Round newCases to whole numbers (imputed medians may be fractional).
total_de_casos_amazonas['newCases'] = total_de_casos_amazonas['newCases'].round()

In [68]:
# Target series for forecasting: the newCases column.
y = total_de_casos_amazonas['newCases']

In [69]:
# Chronological split with test_size=7 (presumably the last 7 observations become
# y_test; confirm against the sktime version in use).
y_train, y_test = temporal_train_test_split(y, test_size= 7)

In [70]:
# Forecasting horizon: steps 1, 2, ..., len(y_test).
fh = np.arange(len(y_test)) + 1

In [76]:
def treinar_modelo(params):
    """Objective function for the hyperparameter search.

    Fits a recursive LightGBM forecaster (window_length=7) on the global
    ``y_train`` and scores its forecast against the global ``y_test``.

    Parameters
    ----------
    params : sequence
        Hyperparameter values, read by position in this order: learning_rate,
        num_leaves, min_child_samples, subsample, colsample_bytree,
        n_estimators.

    Returns
    -------
    The value of ``smape_loss(y_test, y_pred)`` for the fitted forecaster.
    """
    semente = 4

    # Map the positional search-space values onto LGBMRegressor keyword arguments.
    hiperparametros = {
        'learning_rate': params[0],
        'num_leaves': params[1],
        'min_child_samples': params[2],
        'subsample': params[3],
        'colsample_bytree': params[4],
        'n_estimators': params[5],
    }

    print(params, '\n')

    # One forecast step per held-out observation.
    horizonte = np.arange(1, len(y_test) + 1)
    regressor = LGBMRegressor(random_state=semente, subsample_freq=1, **hiperparametros)
    previsor = RecursiveRegressionForecaster(regressor=regressor, window_length=7)
    previsor.fit(y_train)
    previsao = previsor.predict(horizonte)

    return smape_loss(y_test, previsao)

# Search space: one dimension per hyperparameter, in the positional order that
# treinar_modelo reads them from params.
_limites = {
    'learning_rate': (1e-3, 1e-1, 'log-uniform'),
    'num_leaves': (2, 128),
    'min_child_samples': (1, 100),
    'subsample': (0.05, 1.0),
    'colsample_bytree': (0.1, 1.0),
    'n_estimators': (100, 1000),
}
space = list(_limites.values())

# Random search: evaluate treinar_modelo at 30 points sampled from space.
result = dummy_minimize(
    func=treinar_modelo,
    dimensions=space,
    n_calls=30,
    random_state=SEED,
    verbose=1,
)

Iteration No: 1 started. Evaluating function at random point.
[0.06327656730105531, 71, 2, 0.8628399009188652, 0.6481320413049753, 493] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5027
Function value obtained: 0.2977
Current minimum: 0.2977
Iteration No: 2 started. Evaluating function at random point.
[0.0027050730096210155, 105, 95, 0.7134992241139206, 0.9128606713660984, 776] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1187
Function value obtained: 0.3135
Current minimum: 0.2977
Iteration No: 3 started. Evaluating function at random point.
[0.03620477422893411, 40, 53, 0.9842306433165472, 0.24745801726422886, 249] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1137
Function value obtained: 0.4275
Current minimum: 0.2977
Iteration No: 4 started. Evaluating function at random point.
[0.042062540164342105, 127, 57, 0.0919520550349246, 0.9609876709428125, 748] 

Iteration No: 4 ended. Evaluation done at rand

In [103]:
# Show the hyperparameter values stored in result.x (same order as the search space).
result.x

[0.0023168120809679477, 32, 5, 0.4602309677741663, 0.8471692918885118, 643]

In [86]:
# Unpack result.x into one named variable per hyperparameter.
(
    learning_rate,
    num_leaves,
    min_child_samples,
    subsample,
    colsample_bytree,
    n_estimators,
) = result.x

In [90]:
# LightGBM regressor configured with the hyperparameters unpacked from result.x.
parametros_otimos = dict(
    learning_rate=learning_rate,
    num_leaves=num_leaves,
    min_child_samples=min_child_samples,
    subsample=subsample,
    colsample_bytree=colsample_bytree,
    n_estimators=n_estimators,
)
mdl = LGBMRegressor(random_state=SEED, subsample_freq=1, **parametros_otimos)

In [104]:
# Rebinds mdl with literal hyperparameter values; they equal the result.x output
# displayed earlier in the notebook, so this cell does not need the search to be re-run.
melhores_parametros = {
    'learning_rate': 0.0023168120809679477,
    'num_leaves': 32,
    'min_child_samples': 5,
    'subsample': 0.4602309677741663,
    'subsample_freq': 1,
    'colsample_bytree': 0.8471692918885118,
    'n_estimators': 643,
}
mdl = LGBMRegressor(random_state=SEED, **melhores_parametros)

In [105]:
# Final forecaster: fit on the complete series y (train + test) and predict the
# steps listed in fh.
reg = RecursiveRegressionForecaster(
    regressor=mdl,
    window_length=7,
)
reg.fit(y)
y_pred = reg.predict(fh=fh)

In [106]:
# Line chart of the forecast, rounded to whole numbers.
previsao_arredondada = y_pred.round()
px.line(previsao_arredondada)