In [157]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

from statsforecast import StatsForecast
from statsforecast.models import AutoARIMA
from statsmodels.tsa.arima_model import ARIMA

from mlforecast import MLForecast
from mlforecast.lag_transforms import ExpandingMean, RollingMean
from mlforecast.target_transforms import Differences

from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error

In [158]:
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [159]:
path = "../data/parquet/dataSetFinalTratadoTop5Diseases.parquet"
df = pd.read_parquet(path)

### Funções Úteis

In [160]:
def verificar_periodo_completo(dataframe, coluna):
    data_mais_antiga = dataframe[coluna].min()
    data_mais_recente = dataframe[coluna].max()
    
    intervalo_completo = pd.period_range(start=data_mais_antiga, end=data_mais_recente, freq='M')
    
    todas_datas_presentes = all(data in dataframe[coluna].values for data in intervalo_completo)

    return data_mais_antiga, data_mais_recente, todas_datas_presentes

In [161]:
def recortar_dataframe(dataframe, coluna, data_inicio, data_fim):
    data_inicio = pd.Period(data_inicio, freq='M')
    data_fim = pd.Period(data_fim, freq='M')
    
    dataframe_recortado = dataframe[(dataframe[coluna] >= data_inicio) & (dataframe[coluna] <= data_fim)]
    
    return dataframe_recortado

In [162]:
def concatenar_datasets(*dataframes):
    dataframe_concatenado = pd.concat(dataframes, ignore_index=True)
    return dataframe_concatenado

In [163]:
def prever_arima(serie_temporal):
    teste_serie_regiao = serie_temporal.groupby('unique_id').tail(12)
    train_serie_regiao = serie_temporal.drop(teste_serie_regiao.index, axis = 0)

    models = [AutoARIMA(season_length = 12)]

    sf = StatsForecast(
    df=train_serie_regiao,
    models=models,
    freq='M'
    )

    df_forecast = sf.forecast(12, fitted=True).reset_index()

    df_fitted = sf.forecast_fitted_values().reset_index()

    return df_forecast, df_fitted

In [164]:
def modelar_residuos(serie_temporal, forecast, fitted):
    residuos = pd.DataFrame()

    for id in forecast['unique_id'].unique():
        treino = fitted[fitted['unique_id'] == id]
        teste = forecast[forecast['unique_id'] == id].reset_index(drop=True)

        residuos_treino = (treino['AutoARIMA'] - treino['y'])

        teste_serie = serie_temporal[serie_temporal['unique_id'] == id].tail(12).reset_index(drop=True)

        teste['ds'] = teste['ds'] + pd.DateOffset(months=1)

        residuos_teste = (forecast['AutoARIMA'] - teste_serie['y'])
        residuos_totais = pd.concat([residuos_treino, residuos_teste], ignore_index=True)

        coluna_nome = f'residuo_{id}'
        residuos[coluna_nome] = residuos_totais

    return residuos

In [165]:
def criar_lags(data, lag):
    X, y = [], []
    for i in range(lag, len(data)):
        X.append(data[i-lag:i])
        y.append(data[i])
    return np.array(X), np.array(y)

In [166]:
def prever_mlp(residuos, window_size):
    df_prev = pd.DataFrame()

    for id in residuos.columns:
        res = residuos[id].values

        X, y = criar_lags(res, window_size)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, shuffle=False)

        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)

        mlp = MLPRegressor(hidden_layer_sizes=(100,100), max_iter=500, random_state=42)
        mlp.fit(X_train, y_train)
        y_pred = mlp.predict(X_test)

        prev_full = np.concatenate((mlp.predict(X_train), y_pred), axis=0)

        coluna_nome = f'prev_{id}'
        df_prev[coluna_nome] = prev_full

    return df_prev

### Série Por Região (Sem distinguir doenças)

In [167]:
def pegar_dfs_por_regiao(df_full):
    df_regioes = df_full[['Data de Diagnostico', 'Regiao', 'categoria_doenca']]
    df_regioes['y'] = 1

    df_regioes.dropna(inplace=True)

    df_regioes['year_month'] = df_regioes['Data de Diagnostico'].dt.to_period('M')

    df_grouped = df_regioes.groupby(['year_month', 'Regiao', 'categoria_doenca'])['y'].sum().reset_index()

    df_nordeste = df_grouped[df_grouped['Regiao'] == 'NORDESTE'].drop('categoria_doenca', axis=1).groupby('year_month')['y'].sum().reset_index()
    df_sudeste = df_grouped[df_grouped['Regiao'] == 'SUDESTE'].drop('categoria_doenca', axis=1).groupby('year_month')['y'].sum().reset_index()
    df_norte = df_grouped[df_grouped['Regiao'] == 'NORTE'].drop('categoria_doenca', axis=1).groupby('year_month')['y'].sum().reset_index()
    df_centro_oeste = df_grouped[df_grouped['Regiao'] == 'CENTRO-OESTE'].drop('categoria_doenca', axis=1).groupby('year_month')['y'].sum().reset_index()
    df_sul = df_grouped[df_grouped['Regiao'] == 'SUL'].drop('categoria_doenca', axis=1).groupby('year_month')['y'].sum().reset_index()

    df_nordeste = recortar_dataframe(df_nordeste, 'year_month', '1999-01', '2018-12')
    df_sudeste = recortar_dataframe(df_sudeste, 'year_month', '1999-01', '2018-12')
    df_norte = recortar_dataframe(df_norte, 'year_month', '1999-01', '2018-12')
    df_centro_oeste = recortar_dataframe(df_centro_oeste, 'year_month', '1999-01', '2018-12')
    df_sul = recortar_dataframe(df_sul, 'year_month', '1999-01', '2018-12')

    df_nordeste['unique_id'] = 'NORDESTE'
    df_sudeste['unique_id'] = 'SUDESTE'
    df_norte['unique_id'] = 'NORTE'
    df_centro_oeste['unique_id'] = 'CENTRO-OESTE'
    df_sul['unique_id'] = 'SUL'

    serie_regiao = concatenar_datasets(df_nordeste, df_sudeste, df_norte, df_centro_oeste, df_sul)
    serie_regiao.rename(columns={'year_month': 'ds'}, inplace=True)
    serie_regiao['ds'] = serie_regiao['ds'].apply(lambda x: x.to_timestamp())

    return serie_regiao

In [133]:
serie_regiao = pegar_dfs_por_regiao(df)
serie_regiao.head(20)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_regioes['y'] = 1
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_regioes.dropna(inplace=True)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_regioes['year_month'] = df_regioes['Data de Diagnostico'].dt.to_period('M')


Unnamed: 0,ds,y,unique_id
0,1999-01-01,1323,NORDESTE
1,1999-02-01,1148,NORDESTE
2,1999-03-01,1616,NORDESTE
3,1999-04-01,1332,NORDESTE
4,1999-05-01,1648,NORDESTE
5,1999-06-01,1334,NORDESTE
6,1999-07-01,1396,NORDESTE
7,1999-08-01,1497,NORDESTE
8,1999-09-01,1263,NORDESTE
9,1999-10-01,1340,NORDESTE


In [134]:
forecast, fitted = prever_arima(serie_regiao)

  freq = pd.tseries.frequencies.to_offset(freq)
  freq = pd.tseries.frequencies.to_offset(freq)


In [135]:
serie_residuos = modelar_residuos(serie_regiao, forecast, fitted)

In [144]:
serie_residuos

Unnamed: 0,residuo_CENTRO-OESTE,residuo_NORDESTE,residuo_NORTE,residuo_SUDESTE,residuo_SUL
0,-0.60199,-1.322998,-0.332001,-3.379883,-0.945984
1,226.250061,141.573242,9.206909,223.921143,146.025146
2,13.911072,-351.286011,-19.410675,-657.337158,-0.337402
3,84.861237,85.784058,24.359528,271.568604,87.742188
4,-52.266541,-242.720459,-13.517456,42.359619,84.710144
5,-264.235535,163.25415,24.089264,101.45166,-229.518066
6,99.214783,24.980225,-22.422211,31.817383,95.171021
7,66.331757,-78.818237,-6.74353,169.344971,78.092957
8,68.036896,172.218384,-10.097168,281.906494,0.591553
9,50.144592,15.910767,27.838013,32.479004,38.561584


In [138]:
forecast[forecast['unique_id'] == 'SUL']

Unnamed: 0,unique_id,ds,AutoARIMA
48,SUL,2017-12-31,1050.595459
49,SUL,2018-01-31,1021.003662
50,SUL,2018-02-28,1125.982788
51,SUL,2018-03-31,1072.394165
52,SUL,2018-04-30,1164.366455
53,SUL,2018-05-31,1136.118896
54,SUL,2018-06-30,1078.309692
55,SUL,2018-07-31,1141.37207
56,SUL,2018-08-31,1040.138062
57,SUL,2018-09-30,1098.518433


In [141]:
fitted[fitted['unique_id'] == 'SUL']

Unnamed: 0,unique_id,ds,y,AutoARIMA
912,SUL,1999-01-01,946.0,945.054016
913,SUL,1999-02-01,739.0,885.025146
914,SUL,1999-03-01,854.0,853.662598
915,SUL,1999-04-01,736.0,823.742188
916,SUL,1999-05-01,723.0,807.710144
917,SUL,1999-06-01,1036.0,806.481934
918,SUL,1999-07-01,717.0,812.171021
919,SUL,1999-08-01,753.0,831.092957
920,SUL,1999-09-01,802.0,802.591553
921,SUL,1999-10-01,752.0,790.561584


### Série Por Região e Categoria de Doença