# Aprendizado de Máquina

In [1]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip
import plotly.graph_objects as go
import datetime
from datetime import datetime
from datetime import timedelta
from calendar import month_abbr
from os import environ
from fbprophet.plot import plot_plotly

In [2]:
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import smape_loss

In [3]:
from numpy import ravel
import sklearn.model_selection
from sktime.forecasting.compose import RecursiveRegressionForecaster
from skopt import dummy_minimize
from lightgbm import LGBMRegressor
from sklearn.impute import SimpleImputer

In [4]:
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
r = requests.get(url, allow_redirects=True)
open('data.csv.gz','wb').write(r.content)
gz = gzip.open('data.csv.gz')
df = pd.read_csv(gz)

## Validação de Modelos

In [5]:
total_de_casos_amazonas = df.query("state == 'AM'").groupby('date').sum()
total_de_casos_amazonas.index = pd.to_datetime(total_de_casos_amazonas.index)
total_de_casos_amazonas.index.freq = 'D'

In [6]:
SEED = 4

In [7]:
dici = {'Monday': 'Segunda',
'Tuesday': 'Terça',
'Wednesday': 'Quarta',
'Thursday': 'Quinta',
'Friday': 'Sexta',
'Saturday': 'Sábado',
'Sunday': 'Domingo'}

In [8]:
total_de_casos_amazonas['dia_da_semana'] = total_de_casos_amazonas.index.day_name()

In [9]:
total_de_casos_amazonas['dia_da_semana'] = total_de_casos_amazonas['dia_da_semana'].apply(lambda x: dici[x])

In [10]:
ignorar_dias = ['Segunda', 'Domingo']

In [11]:
nova_coluna = []

In [12]:
for i in range(len(total_de_casos_amazonas)):
    if total_de_casos_amazonas['dia_da_semana'].iloc[i] in ignorar_dias:
        nova_coluna.append(np.nan)
    else:
        nova_coluna.append(total_de_casos_amazonas['newCases'].iloc[i])

In [13]:
total_de_casos_amazonas['newCases'] = nova_coluna

In [14]:
imputer = SimpleImputer(missing_values=np.nan,strategy='median')
imputer = imputer.fit_transform(total_de_casos_amazonas.iloc[:,:-1])

In [15]:
total_de_casos_amazonas.iloc[:,:-1] = imputer

In [16]:
total_de_casos_amazonas.newCases = total_de_casos_amazonas.newCases.round()

In [17]:
y = total_de_casos_amazonas.newCases

In [18]:
y_train, y_test = temporal_train_test_split(y, test_size= 7)

In [19]:
fh = np.arange(1, len(y_test) + 1)

In [20]:
def treinar_modelo(params):
    SEED = 4

    learning_rate = params[0]
    num_leaves = params[1]
    min_child_samples = params[2]
    subsample = params[3]
    colsample_bytree = params[4]
    n_estimators = params[5]

    print(params, '\n')
    
    fh = np.arange(1, len(y_test) + 1)
    mdl = LGBMRegressor(random_state=SEED,
    learning_rate = learning_rate, 
    num_leaves = num_leaves, 
    min_child_samples = min_child_samples, 
    subsample = subsample,
    colsample_bytree = colsample_bytree,
    subsample_freq=1,
    n_estimators=n_estimators)
    reg = RecursiveRegressionForecaster(regressor = mdl, window_length= 7)
    reg.fit(y_train)
    y_pred = reg.predict(fh)

    return smape_loss(y_test, y_pred)

space = [(1e-3, 1e-1, 'log-uniform'), #learning rate
(2, 128), #num_leaves
(1, 100), #min_child_samples
(0.05, 1.0), #subsample
(0.1, 1.0), #colsample_bytree
(100, 1000)] #n_estimators

result = dummy_minimize(treinar_modelo, 
space,
random_state=SEED, 
verbose=1,
n_calls = 30)

Iteration No: 1 started. Evaluating function at random point.
[0.06327656730105531, 71, 2, 0.8628399009188652, 0.6481320413049753, 493] 

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 0.5495
Function value obtained: 0.2825
Current minimum: 0.2825
Iteration No: 2 started. Evaluating function at random point.
[0.0027050730096210155, 105, 95, 0.7134992241139206, 0.9128606713660984, 776] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 0.1470
Function value obtained: 0.3686
Current minimum: 0.2825
Iteration No: 3 started. Evaluating function at random point.
[0.03620477422893411, 40, 53, 0.9842306433165472, 0.24745801726422886, 249] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 0.1250
Function value obtained: 0.3745
Current minimum: 0.2825
Iteration No: 4 started. Evaluating function at random point.
[0.042062540164342105, 127, 57, 0.0919520550349246, 0.9609876709428125, 748] 

Iteration No: 4 ended. Evaluation done at rand

In [21]:
result.x

[0.004377551755279261, 43, 4, 0.5681080692628621, 0.43399556156121155, 619]

In [22]:
learning_rate, num_leaves, min_child_samples, subsample, colsample_bytree, n_estimators = result.x

In [23]:
mdl = LGBMRegressor(random_state=SEED,
learning_rate=learning_rate,
num_leaves=num_leaves,
min_child_samples=min_child_samples,
subsample=subsample,
subsample_freq=1,
colsample_bytree=colsample_bytree,
n_estimators=n_estimators)

In [24]:
mdl = LGBMRegressor(random_state=SEED,
learning_rate=0.0023168120809679477,
num_leaves=32,
min_child_samples=5,
subsample=0.4602309677741663,
subsample_freq=1,
colsample_bytree=0.8471692918885118,
n_estimators=643)

In [25]:
reg = RecursiveRegressionForecaster(regressor=mdl ,window_length=7)
reg.fit(y_train)
y_pred = reg.predict(fh)

In [26]:
smape_loss(y_test, y_pred)

0.29303928931066425

In [27]:
px.line(y_pred.round())

In [28]:
from fbprophet import Prophet

In [29]:
df_prophet = df.query("state == 'AM'").groupby('date').sum()
df_prophet.index = pd.to_datetime(total_de_casos_amazonas.index)
df_prophet.index.freq = 'D'

In [30]:
df_prophet = df_prophet.reset_index().rename(columns=({'newCases': 'y', 'date': 'ds'}))

In [58]:
y_train, y_test = temporal_train_test_split(df_prophet, test_size=7)

In [59]:
ml_model = Prophet(interval_width= 0.95, changepoint_prior_scale=0.5, seasonality_prior_scale=10) #modelo RAW
ml_model.fit(y_train)

INFO:fbprophet:Disabling yearly seasonality. Run prophet with yearly_seasonality=True to override this.
INFO:fbprophet:Disabling daily seasonality. Run prophet with daily_seasonality=True to override this.


<fbprophet.forecaster.Prophet at 0x1d6ab76c1f0>

In [60]:
future_dates = ml_model.make_future_dataframe(periods=7, freq='D')

In [61]:
forecast = ml_model.predict(future_dates)

In [62]:
fig = plot_plotly(ml_model, forecast)
fig.add_trace(go.Scatter(x=y_test.ds,y=y_test.y, name='y_test'))

In [63]:
px.line(y=forecast.trend.round(), x=forecast.ds)