# Aprendizado de Máquina

### Teste paramétricos para elaboração de Stacking de modelo de Gradient Boosting com Rede Neural. Ainda em fases de teste

In [1]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip

In [2]:
from statsmodels.tsa.filters.hp_filter import hpfilter
from yaml import safe_load
import torch

In [3]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

In [4]:
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from sklearn.preprocessing import MinMaxScaler
from pytorch_forecasting.metrics import QuantileLoss
from sktime.performance_metrics.forecasting import smape_loss
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor

In [5]:
SEED = 4

In [6]:
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
r = requests.get(url, allow_redirects=True)
open('data.csv.gz','wb').write(r.content)
gz = gzip.open('data.csv.gz')
df = pd.read_csv(gz)

In [7]:
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')
print(device)

cuda


In [8]:
with open('config.yml') as f:
    config = safe_load(f)

## Validação de Modelos

In [9]:
def to_zero(x):
    if x < 0:
        return 0
    return x

In [10]:
data = df.query("state == 'AM'").groupby('date').sum()
data.index = pd.to_datetime(data.index)
data.index.freq = 'D'
data.drop(columns=['epi_week', 'ibgeID', 'cod_RegiaoDeSaude', 'deaths', 'totalCases'], inplace=True)
data = data.iloc[:,:2]

In [11]:
data['newDeaths'] = data['newDeaths'].apply(to_zero)
data['newCases'] = data['newCases'].apply(to_zero)

In [12]:
for col in data.columns:
    data['log' + "_" + col] = np.log(data[col] + 1)

In [13]:
for col in data.columns[:2]:
    cycle, trend = hpfilter(data[col])
    data[col + '_' + 'cycle'] = cycle.round()
    data[col + '_' + 'trend'] = trend.round()

In [14]:
data['newDeaths_trend'] = data['newDeaths_trend'].apply(to_zero)
data['newCases_trend'] = data['newCases_trend'].apply(to_zero)
data['dia_da_semana'] = data.index.day_name()
data.reset_index(inplace=True)
data['date'] = pd.to_datetime(data['date'])
data_from_newCases = data[['date', 'newCases', 'log_newCases', 'newCases_trend','newCases_cycle', 'dia_da_semana']]
data_from_newCases['MA_7'] = data_from_newCases['newCases'].rolling(7).mean()
data_from_newCases = data_from_newCases.fillna(0)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_from_newCases['MA_7'] = data_from_newCases['newCases'].rolling(7).mean()


In [15]:
data_from_newCases['group_ids'] = 1

In [16]:
scaler = MinMaxScaler([0,10])

In [17]:
data_from_newCases['newCases_cycle'] = scaler.fit_transform(data_from_newCases[['newCases_cycle']])
data_from_newCases['MA_7'] = scaler.fit_transform(data_from_newCases[['MA_7']])

In [18]:
data_from_newCases["time_idx"] = range(len(data_from_newCases['date'])) #time_idx
training_cutoff = data_from_newCases["time_idx"].max() - config['max_prediction_length']['value']

In [19]:
def customLegend(fig, nameSwap):
    for i, dat in enumerate(fig.data):
        for elem in dat:
            if elem == 'name':
                fig.data[i].name = nameSwap[fig.data[i].name]
    return(fig)

In [20]:
def special_days(x):
    if x == 'Monday' or x == 'Sunday':
        return '1'
    return '0'

In [21]:
data_from_newCases['dia_da_semana'] = data_from_newCases['dia_da_semana'].apply(special_days)
data_from_newCases.rename(columns={'dia_da_semana': 'is_special_day'}, inplace=True)

In [22]:
training = TimeSeriesDataSet(
    data_from_newCases[lambda x: x.time_idx <= training_cutoff],
    time_idx="time_idx",
    target=config['target']['value'],
    group_ids=['group_ids'],
    min_encoder_length= config['max_encoder_length']['value'] // 2,
    max_encoder_length= config['max_encoder_length']['value'],
    time_varying_known_categoricals=['is_special_day'],
    max_prediction_length=config['max_prediction_length']['value'],
    min_prediction_length=1,
    time_varying_unknown_reals=config['time_varying_unknown_reals']['value'],
    add_relative_time_idx=True,
    add_target_scales=True,
    add_encoder_length=True,
    )

In [23]:
validation = TimeSeriesDataSet.from_dataset(training, data_from_newCases, predict=True, stop_randomization=True)
batch_size = 128  # set this between 32 to 128
train_dataloader = training.to_dataloader(train=True, batch_size=batch_size, num_workers = 0)
val_dataloader = validation.to_dataloader(train=False, batch_size=batch_size * 10, num_workers = 0)

In [24]:
actuals = list(val_dataloader)[0][0]['encoder_target'][0][-14:]
baseline_predictions = Baseline().predict(val_dataloader)
(actuals - baseline_predictions).abs().mean().item()

81.35713958740234

In [25]:
pl.seed_everything(42)
early_stop_callback = EarlyStopping(monitor="val_loss", min_delta=1e-4, patience=10,      verbose=False, mode="min")

trainer = pl.Trainer(
    gpus= 1,
    # clipping gradients is a hyperparameter and important to prevent divergance
    # of the gradient for recurrent neural networks
    gradient_clip_val=config['gradient_clip_val']['value'],
    callbacks=[early_stop_callback]
)


tft = TemporalFusionTransformer.from_dataset(
    training,
    # not meaningful for finding the learning rate but otherwise very important
    learning_rate=config['learning_rate']['value'],
    hidden_size=config['hidden_size']['value'],  # most important hyperparameter apart from learning rate
    # number of attention heads. Set to up to 4 for large datasets
    attention_head_size=config['attention_head_size']['value'],
    dropout=config['dropout']['value'],  # between 0.1 and 0.3 are good values
    hidden_continuous_size=config['hidden_continuous_size']['value'],  # set to <= hidden_size
    output_size= config['output_size']['value'],  # 7 quantiles by default
    loss=QuantileLoss(),
    # reduce learning rate if no improvement in validation loss after x epochs
    reduce_on_plateau_patience=config['reduce_on_plateau_patience']['value'],
)
print(f"Number of parameters in network: {tft.size()/1e3:.1f}k")

Global seed set to 42
GPU available: True, used: True
TPU available: False, using: 0 TPU cores
Number of parameters in network: 8.8k


In [26]:
trainer.fit(
    tft,
    train_dataloader=train_dataloader,
    val_dataloaders=val_dataloader,
)

LOCAL_RANK: 0 - CUDA_VISIBLE_DEVICES: [0]

   | Name                               | Type                            | Params
----------------------------------------------------------------------------------------
0  | loss                               | QuantileLoss                    | 0     
1  | logging_metrics                    | ModuleList                      | 0     
2  | input_embeddings                   | MultiEmbedding                  | 2     
3  | prescalers                         | ModuleDict                      | 128   
4  | static_variable_selection          | VariableSelectionNetwork        | 1.2 K 
5  | encoder_variable_selection         | VariableSelectionNetwork        | 2.3 K 
6  | decoder_variable_selection         | VariableSelectionNetwork        | 460   
7  | static_context_variable_selection  | GatedResidualNetwork            | 378   
8  | static_context_initial_hidden_lstm | GatedResidualNetwork            | 378   
9  | static_context_initial_cell_lstm 

1

In [27]:
best_model_path = trainer.checkpoint_callback.best_model_path

In [28]:
best_tft = TemporalFusionTransformer.load_from_checkpoint(best_model_path)

In [29]:
actuals = list(val_dataloader)[0][0]['encoder_target'][0][-14:]
predictions = best_tft.predict(val_dataloader)
(actuals - predictions).abs().mean().item()

61.68909454345703

In [30]:
fig = px.line(y=[predictions[0].round(), actuals, data_from_newCases['newCases'][-14:]], line_shape='spline', labels={'variable': 'Variáveis', 'index': 'Índice', 'value': 'Quantidade'})

customLegend(fig=fig,nameSwap = {'wide_variable_0': 'NN', 'wide_variable_1': 'H&W', 'wide_variable_2': 'Número de Casos'})

In [31]:
X_data = data_from_newCases.iloc[:,1:].drop(columns=['newCases_trend'])

In [32]:
y_data = data_from_newCases['newCases_trend']

In [33]:
X_train, X_test = X_data[:-14], X_data[-14:]
y_train, y_test = y_data[:-14], y_data[-14:]

In [34]:
smape_loss(y_test, pd.Series(predictions[0], index=y_test.index))

0.3854660966028666

In [35]:
X_train['is_special_day'] = data_from_newCases['is_special_day'].apply(lambda x: int(x))
X_test['is_special_day'] = data_from_newCases['is_special_day'].apply(lambda x: int(x))



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [36]:
model_lgbm = LGBMRegressor()

In [37]:
model_lgbm.fit(X_train, y_train)

LGBMRegressor()

In [38]:
y_pred_lgbm = pd.Series(model_lgbm.predict(X_test), index=y_test.index)

In [39]:
def customLegend(fig, nameSwap):
    for i, dat in enumerate(fig.data):
        for elem in dat:
            if elem == 'name':
                fig.data[i].name = nameSwap[fig.data[i].name]
    return(fig)

#customLegend(fig=fig,nameSwap = {'0': 'Sim', '1': 'Não'})

In [40]:
fig = px.line(y=[y_pred_lgbm.round(), y_test, data_from_newCases['newCases'][-14:]], line_shape='spline', labels={'variable': 'Variáveis', 'index': 'Índice', 'value': 'Quantidade'})

customLegend(fig=fig,nameSwap = {'wide_variable_0': 'Light GBM', 'wide_variable_1': 'H&W', 'wide_variable_2': 'Número de Casos'})

In [41]:
model_knn = KNeighborsRegressor(n_neighbors=5)

In [42]:
model_knn.fit(X_train, y_train)

KNeighborsRegressor()

In [44]:
y_pred_knn = model_knn.predict(X_test)

In [45]:
fig = px.line(y=[y_pred_knn.round(), y_test, data_from_newCases['newCases'][-14:]], line_shape='spline', labels={'variable': 'Variáveis', 'index': 'Índice', 'value': 'Quantidade'})

customLegend(fig=fig,nameSwap = {'wide_variable_0': 'KNN', 'wide_variable_1': 'H&W', 'wide_variable_2': 'Número de Casos'})

In [46]:
smape_loss(y_test, pd.Series(y_pred_knn, index=y_test.index))

0.4711062568616537