# Aprendizado de Máquina

### Este documento apenas tem objetivo para fins exploratórios. Ele não agrega a nada sobre o relatório principal. Nele, testei e explorei possibilidades acerca de utilizar algoritmos envolvendo Gradient Boosting, porém, devido a inconscitência dos dados, isto é, ser um Random Walk extremamente volátil, junto a um desbalanceamento de emissões de casos, tornou-se dificultoso um trabalho para se chegar em um modelo que apresentasse uma acurácia consistente, se baseando no erro percentual absoluto médio simétrico.

In [2]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip
import plotly.graph_objects as go

In [3]:
from sktime.forecasting.model_selection import temporal_train_test_split
from sktime.performance_metrics.forecasting import smape_loss

In [4]:
from numpy import ravel
import sklearn.model_selection
from sktime.forecasting.compose import make_reduction
from skopt import gp_minimize
from lightgbm import LGBMRegressor
from statsmodels.tsa.filters.hp_filter import hpfilter
from sktime.forecasting.model_evaluation import evaluate
from sktime.forecasting.model_selection import ExpandingWindowSplitter
from sktime.utils.plotting import plot_series
from sktime.forecasting.model_selection import ForecastingGridSearchCV

In [5]:
from sktime.forecasting.naive import NaiveForecaster
from sklearn.model_selection import train_test_split
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

In [6]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor

from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer

In [108]:
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters

In [7]:
SEED = 4

In [8]:
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
r = requests.get(url, allow_redirects=True)
open('data.csv.gz','wb').write(r.content)
gz = gzip.open('data.csv.gz')
df = pd.read_csv(gz)

## Validação de Modelos

In [9]:
total_de_casos_amazonas = df.query("state == 'AM'").groupby('date').sum()
total_de_casos_amazonas.index = pd.to_datetime(total_de_casos_amazonas.index)
total_de_casos_amazonas.index.freq = 'D'
total_de_casos_amazonas.drop(columns=['epi_week', 'ibgeID', 'cod_RegiaoDeSaude', 'deaths', 'totalCases'], inplace=True)
total_de_casos_amazonas = total_de_casos_amazonas.iloc[:,:2]

In [10]:
def to_zero(x):
    if x < 0:
        return 0
    return x

In [11]:
for col in total_de_casos_amazonas.columns:
    cycle, trend = hpfilter(total_de_casos_amazonas[col])
    total_de_casos_amazonas[col + '_' + 'cycle'] = cycle.round()
    total_de_casos_amazonas[col + '_' + 'trend'] = trend.round()

In [12]:
total_de_casos_amazonas

Unnamed: 0_level_0,newDeaths,newCases,newDeaths_cycle,newDeaths_trend,newCases_cycle,newCases_trend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-13,0,2,1.0,-1.0,10.0,-8.0
2020-03-14,0,0,1.0,-1.0,6.0,-6.0
2020-03-15,0,0,1.0,-1.0,5.0,-5.0
2020-03-16,0,0,1.0,-1.0,3.0,-3.0
2020-03-17,0,0,1.0,-1.0,2.0,-2.0
...,...,...,...,...,...,...
2021-04-14,25,820,8.0,17.0,64.0,756.0
2021-04-15,18,893,1.0,17.0,152.0,741.0
2021-04-16,15,773,-1.0,16.0,48.0,725.0
2021-04-17,10,701,-6.0,16.0,-7.0,708.0


In [13]:
total_de_casos_amazonas['newDeaths_trend'] = total_de_casos_amazonas['newDeaths_trend'].apply(to_zero)
total_de_casos_amazonas['newCases_trend'] = total_de_casos_amazonas['newCases_trend'].apply(to_zero)

In [14]:
total_de_casos_amazonas

Unnamed: 0_level_0,newDeaths,newCases,newDeaths_cycle,newDeaths_trend,newCases_cycle,newCases_trend
date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
2020-03-13,0,2,1.0,0.0,10.0,0.0
2020-03-14,0,0,1.0,0.0,6.0,0.0
2020-03-15,0,0,1.0,0.0,5.0,0.0
2020-03-16,0,0,1.0,0.0,3.0,0.0
2020-03-17,0,0,1.0,0.0,2.0,0.0
...,...,...,...,...,...,...
2021-04-14,25,820,8.0,17.0,64.0,756.0
2021-04-15,18,893,1.0,17.0,152.0,741.0
2021-04-16,15,773,-1.0,16.0,48.0,725.0
2021-04-17,10,701,-6.0,16.0,-7.0,708.0


In [17]:
total_de_casos_amazonas['dia_da_semana'] = total_de_casos_amazonas.index.day_name()

In [18]:
df_cases = total_de_casos_amazonas[['newCases', 'newCases_trend', 'newCases_cycle', 'dia_da_semana']]

In [19]:
df_cases.reset_index(inplace=True)

In [20]:
df['date'] = pd.to_datetime(df['date'], errors='coerce')

In [21]:
df_cases

Unnamed: 0,date,newCases,newCases_trend,newCases_cycle,dia_da_semana
0,2020-03-13,2,0.0,10.0,Friday
1,2020-03-14,0,0.0,6.0,Saturday
2,2020-03-15,0,0.0,5.0,Sunday
3,2020-03-16,0,0.0,3.0,Monday
4,2020-03-17,0,0.0,2.0,Tuesday
...,...,...,...,...,...
397,2021-04-14,820,756.0,64.0,Wednesday
398,2021-04-15,893,741.0,152.0,Thursday
399,2021-04-16,773,725.0,48.0,Friday
400,2021-04-17,701,708.0,-7.0,Saturday


In [97]:
max_prediction_length = 14
max_encoder_length = 24
training_cutoff = df_cases["time_idx"].max() - max_prediction_length

In [22]:
training = TimeSeriesDataSet(
    df_cases[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target="newCases",
    group_ids=["dia_da_semana"],
    min_encoder_length=max_encoder_length // 2,  # keep encoder length long (as it is in the validation set)
    max_encoder_length=max_encoder_length,
    min_prediction_length=1,
    max_prediction_length=max_prediction_length,
    static_categoricals=["dia_da_semana"],
    time_varying_unknown_reals=[
        "newCases",
        "newCases_trend",
        "newCases_cycle",
    ],
    target_normalizer=GroupNormalizer(
        groups=["dia_da_semana"], transformation="softplus"
    ),  # use softplus and normalize by group
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True
    )

AttributeError: 'DataFrame' object has no attribute 'time_idx'

In [102]:
df_cases

Unnamed: 0,date,newCases,newCases_trend,newCases_cycle,dia_da_semana,time_idx
0,2020-03-13,2,0.0,10.0,Friday,0
1,2020-03-14,0,0.0,6.0,Saturday,1
2,2020-03-15,0,0.0,5.0,Sunday,2
3,2020-03-16,0,0.0,3.0,Monday,3
4,2020-03-17,0,0.0,2.0,Tuesday,4
...,...,...,...,...,...,...
396,2021-04-13,995,797.0,198.0,Tuesday,396
397,2021-04-14,820,787.0,33.0,Wednesday,397
398,2021-04-15,893,776.0,117.0,Thursday,398
399,2021-04-16,773,765.0,8.0,Friday,399


In [85]:
df_cases["time_idx"] = [i for i in range(len(df_cases['date']))]

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_cases["time_idx"] = [i for i in range(len(df_cases['date']))]


In [86]:
df_cases

Unnamed: 0,date,newCases,newCases_trend,newCases_cycle,dia_da_semana,time_idx
0,2020-03-13,2,0.0,10.0,Friday,0
1,2020-03-14,0,0.0,6.0,Saturday,1
2,2020-03-15,0,0.0,5.0,Sunday,2
3,2020-03-16,0,0.0,3.0,Monday,3
4,2020-03-17,0,0.0,2.0,Tuesday,4
...,...,...,...,...,...,...
396,2021-04-13,995,797.0,198.0,Tuesday,396
397,2021-04-14,820,787.0,33.0,Wednesday,397
398,2021-04-15,893,776.0,117.0,Thursday,398
399,2021-04-16,773,765.0,8.0,Friday,399


Unnamed: 0,date,newCases,newCases_trend,newCases_cycle,dia_da_semana,time_idx
0,2020-03-13,2,0.0,10.0,Friday,0
1,2020-03-14,0,0.0,6.0,Saturday,1
2,2020-03-15,0,0.0,5.0,Sunday,2
3,2020-03-16,0,0.0,3.0,Monday,3
4,2020-03-17,0,0.0,2.0,Tuesday,4
...,...,...,...,...,...,...
382,2021-03-30,1490,946.0,544.0,Tuesday,382
383,2021-03-31,1313,936.0,377.0,Wednesday,383
384,2021-04-01,1165,926.0,239.0,Thursday,384
385,2021-04-02,830,915.0,-85.0,Friday,385
