# Aprendizado de Máquina

### Teste paramétricos para elaboração de modelo de Light GBM. Ainda em fases de teste

In [1]:
import plotly.express as px
import pandas as pd
import numpy as np
import requests
import gzip

In [2]:
from statsmodels.tsa.filters.hp_filter import hpfilter
from yaml import safe_load
import torch
from skopt import gp_minimize

In [3]:
import pytorch_lightning as pl
from pytorch_lightning.callbacks import EarlyStopping

In [4]:
from pytorch_forecasting import Baseline, TemporalFusionTransformer, TimeSeriesDataSet
from sklearn.preprocessing import MinMaxScaler
from pytorch_forecasting.metrics import QuantileLoss
from sktime.performance_metrics.forecasting import smape_loss
from sklearn.neighbors import KNeighborsRegressor
from lightgbm import LGBMRegressor
from mlxtend.regressor import StackingRegressor

In [5]:
SEED = 4

In [6]:
url = 'https://github.com/wcota/covid19br/blob/master/cases-brazil-cities-time.csv.gz?raw=true'
r = requests.get(url, allow_redirects=True)
open('data.csv.gz','wb').write(r.content)
gz = gzip.open('data.csv.gz')
df = pd.read_csv(gz)

In [7]:
with open('config.yml') as f:
    config = safe_load(f)

## Validação de Modelos

In [8]:
def to_zero(x):
    if x < 0:
        return 0
    return x

In [9]:
data = df.query("state == 'AM'").groupby('date').sum()
data.index = pd.to_datetime(data.index)
data.index.freq = 'D'
data.drop(columns=['epi_week', 'ibgeID', 'cod_RegiaoDeSaude', 'deaths', 'totalCases'], inplace=True)
data = data.iloc[:,:2]

In [10]:
data = data[:-1]

In [11]:
data['newDeaths'] = data['newDeaths'].apply(to_zero)
data['newCases'] = data['newCases'].apply(to_zero)

In [12]:
for col in data.columns:
    data['log' + "_" + col] = np.log(data[col] + 1)

In [42]:
def special_days(x):
    if x == 'Monday' or x == 'Sunday':
        return 1
    return 0

In [13]:
for col in data.columns[:2]:
    cycle, trend = hpfilter(data[col])
    data[col + '_' + 'cycle'] = cycle.round()
    data[col + '_' + 'trend'] = trend.round()

In [14]:
data['newDeaths_trend'] = data['newDeaths_trend'].apply(to_zero)
data['newCases_trend'] = data['newCases_trend'].apply(to_zero)
data['dia_da_semana'] = data.index.day_name()
data.reset_index(inplace=True)
data['date'] = pd.to_datetime(data['date'])
data_from_newCases = data[['date', 'newCases', 'log_newCases', 'newCases_trend','newCases_cycle', 'dia_da_semana']]
data_from_newCases['MA_7'] = data_from_newCases['newCases'].rolling(7).mean()
data_from_newCases = data_from_newCases.fillna(0)
data_from_newCases["time_idx"] = range(len(data_from_newCases['date'])) #time_idx
data_from_newCases['dia_da_semana'] = data_from_newCases['dia_da_semana'].apply(special_days)
data_from_newCases.rename(columns={'dia_da_semana': 'is_special_day'}, inplace=True)
data_from_newCases.iloc[:,1:-1] = scaler.fit_transform(data_from_newCases.iloc[:,1:-1])

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  data_from_newCases['MA_7'] = data_from_newCases['newCases'].rolling(7).mean()


In [45]:
scaler = MinMaxScaler([0,1])

In [43]:
data_from_newDeaths = data[['date', 'newDeaths', 'log_newDeaths', 'newDeaths_cycle', 'newDeaths_trend', 'dia_da_semana']]
data_from_newDeaths['MA_7'] = data_from_newDeaths['newDeaths'].rolling(7).mean()
data_from_newDeaths = data_from_newDeaths.fillna(0)
data_from_newDeaths["time_idx"] = range(len(data_from_newDeaths['date'])) #time_idx
data_from_newDeaths['dia_da_semana'] = data_from_newDeaths['dia_da_semana'].apply(special_days)
data_from_newDeaths.rename(columns={'dia_da_semana': 'is_special_day'}, inplace=True)
data_from_newDeaths.iloc[:,1:-1] = scaler.fit_transform(data_from_newDeaths.iloc[:,1:-1])



A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [19]:
forecasting_horizon = 14

In [20]:
X_data = data_from_newCases.iloc[:,1:].drop(columns=['newCases_trend'])
y_data = data_from_newCases['newCases_trend']
X_train, X_test = X_data[:-forecasting_horizon], X_data[-forecasting_horizon:]
y_train, y_test = y_data[:-forecasting_horizon], y_data[-forecasting_horizon:]

In [21]:
model_lgbm = LGBMRegressor(random_state=SEED,
    learning_rate=0.1,
    colsample_bytree=0.19578750056100663,
    min_child_samples=78,
    n_estimators=666, num_leaves=104,
    subsample=1.0,
    subsample_freq=1)

In [22]:
model_lgbm.fit(X_train, y_train)

LGBMRegressor(colsample_bytree=0.19578750056100663, min_child_samples=78,
              n_estimators=666, num_leaves=104, random_state=4,
              subsample_freq=1)

In [23]:
y_pred_lgbm = pd.Series(model_lgbm.predict(X_test), index=y_test.index)

In [24]:
fig = px.line(y=[y_pred_lgbm, y_test, data_from_newCases['newCases'][-14:]], line_shape='spline', labels={'variable': 'Variáveis', 'index': 'Índice', 'value': 'Quantidade'})

fig

In [25]:
smape_loss(y_test, y_pred_lgbm)

0.04439156755278304

In [26]:
def ExpandingWindowSplitter(data_frame, target, model, fh=1, step_length=1, initial_window = 10, SEED = 4):
    data_frame = data_frame.copy()
    data_frame = data_frame.drop(columns=['date'])

    X_data = data_frame.drop(columns=[target])
    y_data = data_frame[target]
    
    data_frame['test_sMAPE'] = np.nan

    for i in range(initial_window, len(data_frame), step_length):

        X_train, X_test = X_data[:i], X_data[i:i + fh]
        y_train, y_test = y_data[:i], y_data[i:i + fh]

        model.fit(X_train, y_train)

        y_pred = pd.Series(model.predict(X_test), index=y_test.index)

        res = smape_loss(y_test, y_pred)

        data_frame['test_sMAPE'].loc[i:i + fh] = res

    return data_frame

### Bayesian Search: newCases

In [27]:
#def search_hyperparams(params):
#    SEED = 4
#
#    learning_rate = params[0]
#    num_leaves = params[1]
#    min_child_samples = params[2]
#    subsample = params[3]
#    colsample_bytree = params[4]
#    n_estimators = params[5]
#
#    print(params, '\n')
#    
#    model = LGBMRegressor(random_state=SEED,
#    learning_rate = learning_rate, 
#    num_leaves = num_leaves, 
#    min_child_samples = min_child_samples, 
#    subsample = subsample,
#    colsample_bytree = colsample_bytree,
#    subsample_freq=1,
#    n_estimators=n_estimators)
#
#
#    cross_val = ExpandingWindowSplitter(data_from_newCases, 'newCases_trend', model,initial_window=60, fh=14)
#    cross_val = cross_val['test_sMAPE'].mean()
#
#    return cross_val
#space = [(1e-3, 1e-1, 'log-uniform'), #learning rate
#(2, 128), #num_leaves
#(1, 100), #min_child_samples
#(0.05, 1.0), #subsample
#(0.1, 1.0), #colsample_bytree
#(100, 1000)] #n_estimators
#
#result = gp_minimize(search_hyperparams, 
#space,
#random_state=SEED, 
#verbose=1,
#n_calls = 30,
#n_random_starts=10)
#
#learning_rate, num_leaves, min_child_samples, subsample, colsample_bytree, n_estimators = result.x #[0.07894585841146558, 45, 1, #0.4726915572064738, 0.706265285031939, 190]

In [28]:
#result.x

NameError: name 'result' is not defined

In [29]:
model = LGBMRegressor(random_state=SEED,
    learning_rate = 0.04591301953670739, 
    num_leaves = 45, 
    min_child_samples = 1, 
    subsample = 0.05,
    colsample_bytree = 0.9828905761860228,
    subsample_freq=1,
    n_estimators=685)

In [30]:
cross_val = ExpandingWindowSplitter(data_from_newCases, 'newCases_trend', model, initial_window=60, fh=14)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [281]:
cross_val['test_sMAPE'].mean()

0.08653091463112592

In [31]:
cross_val['test_sMAPE'].median()

0.06697830466044706

In [33]:
cross_val.to_csv(r'C:\Users\heylu\Documents\github\HeyLucasLeao.github.io\ipynb\ML\resultados_de_validacao\cross_val_newCases.csv')

### Bayesian Search: newDeaths

In [52]:
X_data = data_from_newDeaths.iloc[:,1:].drop(columns=['newDeaths_trend'])
y_data = data_from_newDeaths['newDeaths_trend']
X_train, X_test = X_data[:-forecasting_horizon], X_data[-forecasting_horizon:]
y_train, y_test = y_data[:-forecasting_horizon], y_data[-forecasting_horizon:]

In [53]:
#def search_hyperparams(params):
#    SEED = 4
#
#    learning_rate = params[0]
#    num_leaves = params[1]
#    min_child_samples = params[2]
#    subsample = params[3]
#    colsample_bytree = params[4]
#    n_estimators = params[5]
#
#    print(params, '\n')
#    
#    model = LGBMRegressor(random_state=SEED,
#    learning_rate = learning_rate, 
#    num_leaves = num_leaves, 
#    min_child_samples = min_child_samples, 
#    subsample = subsample,
#    colsample_bytree = colsample_bytree,
#    subsample_freq=1,
#    n_estimators=n_estimators)
#
#
#    cross_val = ExpandingWindowSplitter(data_from_newDeaths, 'newDeaths_trend', model,initial_window=60, fh=14)
#    cross_val = cross_val['test_sMAPE'].mean()
#
#    return cross_val
#space = [(1e-3, 1e-1, 'log-uniform'), #learning rate
#(2, 128), #num_leaves
#(1, 100), #min_child_samples
#(0.05, 1.0), #subsample
#(0.1, 1.0), #colsample_bytree
#(100, 1000)] #n_estimators
#
#result = gp_minimize(search_hyperparams, 
#space,
#random_state=SEED, 
#verbose=1,
#n_calls = 30,
#n_random_starts=10)
#
#learning_rate, num_leaves, min_child_samples, subsample, colsample_bytree, n_estimators = result.x #[0.07894585841146558, 45, 1, #0.4726915572064738, 0.706265285031939, 190]

Iteration No: 1 started. Evaluating function at random point.
[0.06327656730105531, 24, 86, 0.628583821377474, 0.6378005846923631, 227] 



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

Iteration No: 1 ended. Evaluation done at random point.
Time taken: 4.5031
Function value obtained: 0.5980
Current minimum: 0.5980
Iteration No: 2 started. Evaluating function at random point.
[0.0028119775395572893, 90, 90, 0.6513149491728379, 0.10464323370096502, 615] 

Iteration No: 2 ended. Evaluation done at random point.
Time taken: 9.2786
Function value obtained: 0.4976
Current minimum: 0.4976
Iteration No: 3 started. Evaluating function at random point.
[0.005284363548873893, 78, 40, 0.8213503607253493, 0.6512728053921026, 340] 

Iteration No: 3 ended. Evaluation done at random point.
Time taken: 10.0399
Function value obtained: 0.3106
C

In [54]:
result.x

[0.07876350967431142, 94, 1, 0.05181184025013618, 1.0, 646]

In [55]:
model = LGBMRegressor(random_state=SEED,
    learning_rate = 0.07876350967431142, 
    num_leaves = 94, 
    min_child_samples = 1, 
    subsample = 0.05181184025013618,
    colsample_bytree = 1.0,
    subsample_freq=1,
    n_estimators=646)

In [56]:
cross_val = ExpandingWindowSplitter(data_from_newDeaths, 'newDeaths_trend', model, initial_window=60, fh=14)



A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy



In [57]:
cross_val['test_sMAPE'].mean()

0.12807755826327513

In [58]:
cross_val['test_sMAPE'].median()

0.09924936927210701

In [59]:
cross_val.to_csv(r'C:\Users\heylu\Documents\github\HeyLucasLeao.github.io\ipynb\ML\resultados_de_validacao\cross_val_newDeaths.csv')

In [60]:
y_pred = pd.Series(model.predict(X_test), index=y_test.index)

In [62]:
smape_loss(y_test, y_pred)

0.02239643329966805

In [61]:
fig = px.line(y=[y_pred, y_test, data_from_newDeaths['newDeaths'][-14:]], line_shape='spline', labels={'variable': 'Variáveis', 'index': 'Índice', 'value': 'Quantidade'})

fig