# Aprendizado de Máquina

### Teste paramétricos para elaboração de Stacking de modelo de Gradient Boosting com Rede Neural. Ainda em fases de teste

In [1]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip

In [2]:
from skopt import gp_minimize
from statsmodels.tsa.filters.hp_filter import hpfilter
from yaml import safe_load
import torch

In [3]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping, LearningRateMonitor
from pytorch_forecasting import TimeSeriesDataSet, TemporalFusionTransformer

In [4]:
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from pytorch_forecasting.data import GroupNormalizer
from pytorch_forecasting.metrics import SMAPE, PoissonLoss, QuantileLoss
from pytorch_forecasting.models.temporal_fusion_transformer.tuning import optimize_hyperparameters
from pytorch_forecasting.data.encoders import TorchNormalizer

In [5]:
SEED = 4

In [6]:
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
r = requests.get(url, allow_redirects=True)
open('data.csv.gz','wb').write(r.content)
gz = gzip.open('data.csv.gz')
df = pd.read_csv(gz)

In [30]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [7]:
with open('config.yml') as f:
    config = safe_load(f)

## Validação de Modelos

In [8]:
def to_zero(x):
    if x < 0:
        return 0
    return x

In [9]:
data = df.query("state == 'AM'").groupby('date').sum()
data.index = pd.to_datetime(data.index)
data.index.freq = 'D'
data.drop(columns=['epi_week', 'ibgeID', 'cod_RegiaoDeSaude', 'deaths', 'totalCases'], inplace=True)
data = data.iloc[:,:2]

In [10]:
data['newDeaths'] = data['newDeaths'].apply(to_zero)
data['newCases'] = data['newCases'].apply(to_zero)

In [11]:
for col in data.columns:
    data['log' + "_" + col] = np.log(data[col] + 1)

In [12]:
for col in data.columns[:2]:
    cycle, trend = hpfilter(data[col])
    data[col + '_' + 'cycle'] = cycle.round()
    data[col + '_' + 'trend'] = trend.round()

In [13]:
data['newDeaths_trend'] = data['newDeaths_trend'].apply(to_zero)
data['newCases_trend'] = data['newCases_trend'].apply(to_zero)
data['dia_da_semana'] = data.index.day_name()
data.reset_index(inplace=True)
data['date'] = pd.to_datetime(data['date'])
data_from_newCases = data[['date', 'newCases', 'log_newCases', 'newCases_trend','newCases_cycle', 'dia_da_semana']]
data_from_newCases['MA_7'] = data_from_newCases['newCases'].rolling(7).mean()
data_from_newCases = data_from_newCases.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_from_newCases['MA_7'] = data_from_newCases['newCases'].rolling(7).mean()


In [14]:
data_from_newCases['group_ids'] = 1

In [15]:
data_from_newCases["time_idx"] = range(len(data_from_newCases['date'])) #time_idx
training_cutoff = data_from_newCases["time_idx"].max() - config['max_prediction_length']['value']

In [16]:
training = TimeSeriesDataSet(
    data_from_newCases[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target=config['target']['value'],
    group_ids=['group_ids'],
    min_encoder_length= config['max_encoder_length']['value'] // 2,
    max_encoder_length= config['max_encoder_length']['value'],
    max_prediction_length=config['max_prediction_length']['value'],
    min_prediction_length=1,
    time_varying_known_categoricals=['dia_da_semana'],
    time_varying_known_reals=["time_idx"],
    time_varying_unknown_reals=config['time_varying_unknown_reals']['value'],
    target_normalizer = GroupNormalizer(groups=['dia_da_semana'], transformation=config['transformation']['value']),
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    allow_missings=True
    )

In [17]:
validation = TimeSeriesDataSet.from_dataset(training, data_from_newCases, predict=True, stop_randomization=True)
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers = 0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers = 0)

In [34]:
pl.seed_everything(42)
trainer = pl.Trainer(
    gpus=1,
    # clipping gradients is a hyperparameter and important to prevent divergance
    # of the gradient for recurrent neural networks
    gradient_clip_val=config['gradient_clip_val']['value'],
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=config['learning_rate']['value'],
    hidden_size=config['hidden_size']['value'],  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=config['attention_head_size']['value'],
    dropout=config['dropout']['value'],  # between 0.1 and 0.3 are good values
    hidden_continuous_size=config['hidden_continuous_size']['value'],  # set to <= hidden_size
    output_size=config['output_size']['value'],  # 7 quantiles by default
    loss=SMAPE(),
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=config['reduce_on_plateau_patience']['value'],
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Number of parameters in network: 18.3k


In [35]:
res = trainer.tuner.lr_find(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
    max_lr=10.0,
    min_lr=1e-6,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | SMAPE                           | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 35    
3  | prescalers                         | ModuleDict                      | 72    
4  | static_variable_selection          | VariableSelectionNetwork        | 954   
5  | encoder_variable_selection         | VariableSelectionNetwork        | 2.3 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 765   
7  | static_context_variable_selection  | GatedResidualNetwork            | 1.1 K 
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 1.1 K 
9  | static_context_initial_cell_lstm 

AssertionError: Passed groups and fitted do not match

In [37]:
print(f"suggested learning rate: {res.suggestion()}")
fig = res.plot(show=True, suggest=True)
fig.show()

NameError: name 'res' is not defined