# Aprendizado de Máquina

### Testes paramétricos para elaboração de um modelo LightGBM. Ainda em fase de testes.

In [112]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip
from statsmodels.tsa.filters.hp_filter import hpfilter
from skopt import gp_minimize
from sklearn.preprocessing import MinMaxScaler
from sktime.performance_metrics.forecasting import smape_loss
from lightgbm import LGBMRegressor
from sktime.forecasting.model_selection import ExpandingWindowSplitter
from sktime.forecasting.model_evaluation import evaluate
from sktime.forecasting.compose import make_reduction

In [113]:
import warnings
# Ignore every warning for the rest of the session.
# NOTE(review): presumably done to keep cell output readable; be aware that it
# also hides deprecation warnings raised by the libraries used below.
warnings.filterwarnings("ignore")

In [114]:
# Global experiment settings shared by the cells below.
SEED = 4  # random_state handed to the LightGBM regressors
forecasting_horizon = 14  # forecast length in steps (same value as the 14-step `fh` used by the models)
# feature_range has to be a tuple: recent scikit-learn releases validate the
# parameter type when the scaler is fitted and reject a list such as [0, 1].
scaler = MinMaxScaler(feature_range=(0, 1))

In [115]:
# Download the gzip-compressed CSV, keep a local copy of the archive and load
# it into a DataFrame.
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
# The timeout stops the cell from hanging forever on a stalled connection.
r = requests.get(url, allow_redirects=True, timeout=60)
# Stop on an HTTP error instead of saving an error page and failing later
# with a confusing gzip/CSV parsing error.
r.raise_for_status()
# Context managers make sure both file handles are closed once the data has
# been written and read.
with open('data.csv.gz', 'wb') as archive:
    archive.write(r.content)
with gzip.open('data.csv.gz') as gz:
    df = pd.read_csv(gz)

In [116]:
def to_zero(x):
    """Replace a negative value with 0; return any other value unchanged."""
    return 0 if x < 0 else x

In [117]:
# Per-date totals of new deaths and new cases for the rows whose state is 'AM'.
# The two series are selected by name *before* aggregating: relying on
# `.sum()` to discard text columns and then slicing with `iloc[:, :2]` picks
# the wrong columns on pandas versions where `numeric_only` defaults to False.
data = df.query("state == 'AM'").groupby('date')[['newDeaths', 'newCases']].sum()
data.index = pd.to_datetime(data.index)
# Declare the index as daily; pandas raises if the dates do not form a
# gap-free daily sequence.
data.index.freq = 'D'

In [118]:
# Floor both daily count series at zero (negative entries become 0).
for col in ('newDeaths', 'newCases'):
    data[col] = data[col].apply(to_zero)

In [119]:
# Add a log(x + 1) companion column for every column present when the loop
# starts; the +1 keeps zero counts finite.
for col in list(data.columns):
    data[f"log_{col}"] = np.log(data[col] + 1)

In [120]:
def special_days(x):
    """Return 1 when ``x`` is 'Monday' or 'Sunday', otherwise 0."""
    return 1 if x in ('Monday', 'Sunday') else 0

In [121]:
# Decompose each of the first two columns with the Hodrick-Prescott filter
# (default settings) and store the rounded cycle and trend components.
for col in list(data.columns[:2]):
    cycle, trend = hpfilter(data[col])
    data[f"{col}_cycle"] = cycle.round()
    data[f"{col}_trend"] = trend.round()

In [122]:
# Floor the rounded trend series at zero in case the filter produced
# negative values.
for col in ('newDeaths_trend', 'newCases_trend'):
    data[col] = data[col].apply(to_zero)
data.index.freq = 'D'
# Move the dates out of the index into a regular 'date' column.
data.reset_index(inplace=True)
data['date'] = pd.to_datetime(data['date'])

In [123]:
# Modelling frame for new cases: date plus the smoothed trend.
data_from_newCases = data.loc[:, ['date', 'newCases_trend']]

In [124]:
# Modelling frame for new deaths: date plus the smoothed trend.
data_from_newDeaths = data.loc[:, ['date', 'newDeaths_trend']]

### Modelo para Novos Casos

In [158]:
# Relative forecasting horizon: steps 1..forecasting_horizon after the last
# observation. Built from the shared constant instead of repeating the
# literal 14.
fh = np.arange(1, forecasting_horizon + 1)

In [159]:
# Target series: the new-cases trend indexed by date, declared as daily.
y = data_from_newCases.set_index('date').loc[:, 'newCases_trend']
y.index.freq = 'D'

In [161]:
# LightGBM regressor for the new-cases series.
# NOTE(review): the hyperparameter values look like the result of a search
# (presumably skopt's gp_minimize, which is imported above) — confirm.
model = LGBMRegressor(
    n_estimators=685,
    learning_rate=0.04591301953670739,
    num_leaves=45,
    min_child_samples=1,
    # Row subsampling is re-drawn every iteration (subsample_freq=1).
    subsample=0.05,
    subsample_freq=1,
    colsample_bytree=0.9828905761860228,
    random_state=SEED,
)

In [162]:
# Wrap the regressor as a forecaster that uses the previous 14 observations
# as features.
reg = make_reduction(model, window_length=14)

In [163]:
# Expanding-window cross-validation: the first training window holds 60
# observations and the forecaster is refitted on every split.
# NOTE(review): no `fh` is passed to the splitter, so the score presumably
# covers only the splitter's default horizon rather than the 14-step `fh`
# used for the final forecast — confirm this is intended.
cv = ExpandingWindowSplitter(initial_window=60)
cross_val = evaluate(
    forecaster=reg,
    y=y,
    cv=cv,
    strategy="refit",
    return_data=True,
)
# Mean sMAPE over all splits (displayed as the cell result).
cross_val['test_sMAPE'].mean()

0.05235297082711649

In [164]:
# Fit the forecaster on the complete new-cases series.
reg.fit(y=y)

RecursiveTabularRegressionForecaster(estimator=LGBMRegressor(colsample_bytree=0.9828905761860228,
                                                             learning_rate=0.04591301953670739,
                                                             min_child_samples=1,
                                                             n_estimators=685,
                                                             num_leaves=45,
                                                             random_state=4,
                                                             subsample=0.05,
                                                             subsample_freq=1),
                                     window_length=14)

In [167]:
# Scatter plot of the rounded forecast for every step in `fh`.
px.scatter(reg.predict(fh=fh).round())

### Modelo para Óbitos

In [148]:
# Target series: the new-deaths trend indexed by date, declared as daily.
y = data_from_newDeaths.set_index('date').loc[:, 'newDeaths_trend']
y.index.freq = 'D'

In [149]:
# LightGBM regressor for the new-deaths series.
# NOTE(review): the hyperparameter values look like the result of a search
# (presumably skopt's gp_minimize, which is imported above) — confirm.
model = LGBMRegressor(
    n_estimators=646,
    learning_rate=0.07876350967431142,
    num_leaves=94,
    min_child_samples=1,
    # Row subsampling is re-drawn every iteration (subsample_freq=1).
    subsample=0.05181184025013618,
    subsample_freq=1,
    colsample_bytree=1.0,
    random_state=SEED,
)

In [150]:
# Wrap the regressor as a forecaster that uses the previous 14 observations
# as features.
reg = make_reduction(model, window_length=14)

In [151]:
# Expanding-window cross-validation: the first training window holds 60
# observations and the forecaster is refitted on every split.
# NOTE(review): no `fh` is passed to the splitter, so the score presumably
# covers only the splitter's default horizon rather than the 14-step `fh`
# used for the final forecast — confirm this is intended.
cv = ExpandingWindowSplitter(initial_window=60)
cross_val = evaluate(
    forecaster=reg,
    y=y,
    cv=cv,
    strategy="refit",
    return_data=True,
)
# Mean sMAPE over all splits (displayed as the cell result).
cross_val['test_sMAPE'].mean()

0.0776585857582303

In [153]:
# Fit the forecaster on the complete new-deaths series.
reg.fit(y=y)

RecursiveTabularRegressionForecaster(estimator=LGBMRegressor(learning_rate=0.07876350967431142,
                                                             min_child_samples=1,
                                                             n_estimators=646,
                                                             num_leaves=94,
                                                             random_state=4,
                                                             subsample=0.05181184025013618,
                                                             subsample_freq=1),
                                     window_length=14)

In [155]:
# Spline-smoothed line plot of the rounded forecast for every step in `fh`.
px.line(reg.predict(fh=fh).round(), line_shape='spline')