# Notebook to update data for the forecast

In [1]:
import numpy as np
import pandas as pd
from epiweeks import Week
import matplotlib.pyplot as plt
import os
os.getcwd()

'/Users/eduardoaraujo/Documents/Github/transfer-learning-forecast/forecast'

## LSTM models for the states and "Macroregionais de saúde"

### Path where the cases and climate data are saved:  

In [2]:
PATH = '../data'

### The dataframe below will be used in the functions to get the link between the geocodes and the health macroregion code: 
    

In [3]:
dfs = pd.read_csv('../macro_saude.csv')

dfs.head()

Unnamed: 0.1,Unnamed: 0,geocode,name_muni,name_region,code_region,name_macro,code_macro,state
0,0,1100015,Alta Floresta D'Oeste,Zona da Mata,11005,Cacoal,1101,RO
1,1,1100023,Ariquemes,Vale do Jamari,11001,Porto Velho,1102,RO
2,2,1100031,Cabixi,Cone Sul,11006,Cacoal,1101,RO
3,3,1100049,Cacoal,Café,11002,Cacoal,1101,RO
4,4,1100056,Cerejeiras,Cone Sul,11006,Cacoal,1101,RO


In [4]:
dfs.loc[dfs.state == 'GO'].code_macro.unique().shape

(5,)

In [5]:
dfs.loc[dfs.geocode == 3106200]

Unnamed: 0.1,Unnamed: 0,geocode,name_muni,name_region,code_region,name_macro,code_macro,state
2397,2397,3106200,Belo Horizonte,Belo Horizonte/ Nova Lima/ Caeté,31016,Centro,3103,MG


In [6]:
dfs.loc[dfs.state == 'MG'].code_macro.unique().shape

(14,)

In [7]:
def add_new_columns(df):
    '''
    Add calendar and first-difference columns to a table indexed by date.

    The returned table contains every column of `df` plus:
    - `month`: the month number taken from the index;
    - `SE`: the week number given by `Week.fromdate(date).weektuple()[1]` for each index date;
    - `diff_<col>`: the first difference of every column whose name starts with `casos_est`
      (the first row has no previous value, so it is NaN after the outer join).

    The input table is left untouched; the new columns are added to a copy.

    :param df: pd.DataFrame. Must be indexed by dates (the code reads `df.index.month`).
    :returns: pd.DataFrame. Copy of `df` with the new columns appended.
    '''

    # work on a copy so the caller's table is not modified as a side effect
    df = df.copy()

    df['month'] = df.index.month

    df['SE'] = [Week.fromdate(date).weektuple()[1] for date in df.index]

    # manual override for this specific date, kept exactly as in the original code
    df.loc[df.index == '2018-04-04', 'SE'] = 15

    case_cols = df.columns[df.columns.str.startswith('casos_est')]

    diff_series = [df]

    for col in case_cols:
        # np.diff returns n-1 values, so they are aligned with the index from the second row on
        diff_series.append(pd.DataFrame(data = np.diff(df[col], 1), index = df.index[1:], columns = [f'diff_{col}']))

    return pd.concat(diff_series, axis = 1, join = 'outer')
    
    
    

In [8]:
def get_geocodes_and_state(macro): 
    '''
    This function is used to get the geocodes and state that refer to a specific health macro region code.
    The link between geocodes and macroregions is read from `../macro_saude.csv`.

    :param macro:int. A four-digit number
    :returns: tuple (geocodes, state). `geocodes` is an array with the unique geocodes of the
              macroregion and `state` is the value of the `state` column of its first row.
    :raises IndexError: if no row of the table has this macroregion code.
    '''

    macro_table = pd.read_csv('../macro_saude.csv')

    # filter once and reuse the selection for both outputs
    rows = macro_table.loc[macro_table.code_macro == macro]

    if rows.empty:
        # IndexError is what `.values[0]` raised before; keep the type, but say what went wrong
        raise IndexError(f'Health macroregion code {macro} not found in ../macro_saude.csv')

    geocodes = rows.geocode.unique()
    state = rows.state.values[0]

    return geocodes, state

In [9]:
def split_geocodes(geocodes):
    
    '''
    This function splits the geocodes between the cities with population above and
    up to 30k in 2022, using the `POP22` column of `poptcu2010-2022_rgi.csv`.
    
    :param geocodes: list or array of int. Seven-digit ibge codes for brazilian cities
    :returns: tuple (g_up, g_low) of arrays. `g_low` holds the codes found in the population
              table with `POP22 <= 30000`; `g_up` holds every other code in `geocodes`
              (including codes that are absent from the population table).
    '''
    
    # accept plain lists as well as numpy arrays (`.shape` is used below)
    geocodes = np.asarray(geocodes)

    dfpop = pd.read_csv('poptcu2010-2022_rgi.csv')

    is_small = dfpop.CODMUN7.isin(geocodes) & (dfpop.POP22 <= 30000)

    g_low = dfpop.loc[is_small].CODMUN7.unique()
    
    # np.setdiff1d returns the sorted unique codes of `geocodes` that are not in `g_low`
    g_up = np.setdiff1d(geocodes, g_low)
    
    # sanity check: the two groups together must account for every input code
    if geocodes.shape[0] != g_low.shape[0] + g_up.shape[0]:
    
        print('Error subtracting geocodes')
    
    return g_up, g_low

In [10]:
def transform_data(df, geocode, geo_col = 'municipio_geocodigo'): 
    '''
    Extract the rows of a single region and return them as an independent dataframe.

    The column used for filtering is removed from the result and every remaining
    column name receives the suffix `_<geocode>`.

    :param df: pd.DataFrame.
    :param geocode:. Value to look for; must have the same type as the values of geo_col
    :param geo_col: str. Name of the column of df compared against geocode
    :returns: pd.DataFrame. The selected rows, without geo_col and with suffixed column names.
    '''

    region_rows = df.loc[df[geo_col] == geocode].drop(columns = geo_col)

    region_rows.columns = region_rows.columns + f'_{geocode}'

    return region_rows

In [11]:
# Names of the climate features; this list is referenced by predictors_clim_macro and
# predictors_clim_state when they read the climate parquet files.
predictors_clim = ['temp_min', 'temp_max', 'umid_min', 'umid_max',
                   'pressao_min', 'pressao_max', 'precip_tot', 'rainy_days',
                   'temp_mean', 'temp_amp','umid_mean','umid_amp',
                   'pressao_mean']

def predictors_ep_macro(macro): 
    '''
    This function is used to organize in a table the epidemiological predictors related to a specific health macroregion.

    The resulting table puts side by side:
    - the epidemiological columns of each city placed in the "up" group by `split_geocodes`,
      suffixed with the city geocode;
    - the weekly total of `casos_est` over all cities of the macroregion, suffixed with the macroregion code;
    - the weekly aggregate of the cities in the "low" group (sum of `casos_est`, mean of `p_rt1` and `Rt`),
      suffixed with `_small`.

    :params macro: int. A four digit number
    :returns: pd.DataFrame. Columns that are entirely NaN are removed.
    '''
    
    geocodes, state = get_geocodes_and_state(macro)

    # get epidemiological factors 
    df_ep = pd.read_parquet(f'{PATH}/cases/{state}_dengue.parquet',
                           columns = ['data_iniSE', 'casos_est', 'municipio_geocodigo', 'p_rt1', 'Rt', 'p_inc100k'])
    
    # select only the geocodes include in the health macroregion
    df_ep = df_ep.loc[df_ep.municipio_geocodigo.isin(geocodes)]
    
    df_ep = df_ep.sort_index()
    
    # split the geocodes between cities with population up and below 30k in 2022
    g_up, g_low = split_geocodes(geocodes)    

    # get the data of each city with population above 30k
    list_data_ep = [transform_data(df_ep, g) for g in g_up]
    
    # get the total weekly cases of this health macroregion 
    data_macro_ep = df_ep[['casos_est']].resample('W-SUN').sum()

    data_macro_ep.columns = data_macro_ep.columns + f'_{macro}'

    list_data_ep.append(data_macro_ep)
    
    # aggregate the data from small cities
    # (string names instead of np.sum / np.mean: passing NumPy callables to agg is deprecated in pandas)
    small_rows = df_ep.loc[df_ep.municipio_geocodigo.isin(g_low)][['casos_est', 'p_rt1', 'Rt']]

    data_small_cities = small_rows.resample('W-SUN').agg({'casos_est': 'sum',
                                                          'p_rt1': 'mean',
                                                          'Rt': 'mean'})

    data_small_cities.columns = data_small_cities.columns + '_small'
    
    list_data_ep.append(data_small_cities)
    
    data_ep = pd.concat(list_data_ep, axis=1, join='outer')
    
    #remove columns with all values nan 
    data_ep = data_ep.dropna(axis =1, how = 'all')
    
    return data_ep 


def predictors_clim_macro(macro):
    '''
    This function is used to organize in a table the climate predictors related to a specific health macroregion.

    The resulting table puts side by side:
    - the weekly climate features of each city placed in the "up" group by `split_geocodes`,
      suffixed with the city geocode;
    - the weekly mean of the same features over the cities in the "low" group, suffixed with `_small`.

    Missing values are forward filled and columns that are entirely NaN are removed.

    :params macro: int. A four digit number
    :returns: pd.DataFrame.
    '''
    geocodes, state = get_geocodes_and_state(macro)
    
    # get climate factors 
    # Every column is read on purpose: besides the predictors, the code below needs
    # `geocodigo`, `precip_max` and the `index` column. Note that
    # `columns = predictors_clim.append('geocodigo')` must not be used here:
    # list.append returns None and grows the shared list on every call.
    df_clim = pd.read_parquet(f'{PATH}/climate/{state}_climate.parquet')
    
    # select only the geocodes include in the health macroregion
    df_clim = df_clim.loc[df_clim.geocodigo.isin(geocodes)]

    df_clim = df_clim.loc[df_clim.index.year >= 2010]

    # drop() returns a new frame, so the assignments below do not write into a slice
    df_clim = df_clim.drop(columns = 'index')
    
    # compute other climate features 
    df_clim['temp_mean'] = (df_clim.temp_max+df_clim.temp_min)/2

    df_clim['pressao_mean'] = (df_clim.pressao_max+df_clim.pressao_min)/2

    df_clim['umid_mean'] = (df_clim.umid_max+df_clim.umid_min)/2

    df_clim['temp_amp'] = df_clim.temp_max-df_clim.temp_min

    # Rainy days (boolean flag per record; summed per week below)
    df_clim['rainy_days'] = df_clim.precip_max > 0

    # Humidity amplitude
    df_clim['umid_amp'] = df_clim.umid_max - df_clim.umid_min

    # weekly aggregation applied to each feature
    # (string names instead of np.mean / np.sum: passing NumPy callables to agg is deprecated in pandas)
    weekly_agg = {'temp_min': 'mean', 'temp_max': 'mean',
                  'umid_min': 'mean', 'umid_max': 'mean',
                  'pressao_min': 'mean', 'pressao_max': 'mean',
                  'precip_tot': 'sum', 'rainy_days': 'sum',
                  'temp_mean': 'mean', 'temp_amp': 'mean',
                  'umid_mean': 'mean', 'umid_amp': 'mean',
                  'pressao_mean': 'mean'}

    # agg data by weekly since that's the time scale of the cases 
    df_clim = (df_clim.groupby('geocodigo')
                      .resample('W-SUN')
                      .agg(weekly_agg)
                      .reset_index()
                      .set_index('date'))
    
    # split the geocodes between cities with population up and below 30k in 2022
    g_up, g_low = split_geocodes(geocodes)
    
    # get the predictors of each city with population above 30k
    list_data_clim = [transform_data(df_clim, g, 'geocodigo') for g in g_up]
    
    # aggregate the data from small cities and save the mean as predictor
    small_rows = df_clim.loc[df_clim.geocodigo.isin(g_low)][list(weekly_agg)]

    data_small_cities = small_rows.resample('W-SUN').mean()

    data_small_cities.columns = data_small_cities.columns + '_small'
    
    list_data_clim.append(data_small_cities)
    
    # .ffill() replaces the deprecated fillna(method='ffill')
    data_clim = pd.concat(list_data_clim, axis=1, join='outer').ffill()
    
    #remove columns with all values nan 
    data_clim = data_clim.dropna(axis =1, how = 'all')
    
    return data_clim 


def get_data_macro(macro):
    '''
    Assemble the full table of predictors of one health macroregion.

    The epidemiological table (`predictors_ep_macro`) and the climate table
    (`predictors_clim_macro`) are joined side by side with an outer join on the index,
    and the result is passed through `add_new_columns`.

    :params macro: int. A four-digit number
    :returns: the table returned by `add_new_columns`.
    '''

    # epidemiological predictors first, climate predictors second (same order as the columns)
    tables = [predictors_ep_macro(macro), predictors_clim_macro(macro)]

    joined = pd.concat(tables, axis = 1, join = 'outer')

    return add_new_columns(joined)


def predictors_ep_state(state): 
    
    '''
    This function is used to organize in a table the epidemiological predictors related to a specific state.

    The resulting table puts side by side:
    - for each health macroregion of the state, the weekly sum of `casos_est` and the weekly
      mean of `p_rt1` and `Rt`, suffixed with the macroregion code;
    - the weekly total of `casos_est` over the whole state, suffixed with the state code.

    The link between geocodes and macroregions comes from the notebook-level dataframe `dfs`.

    :params state: str. Two leters code 
    :returns: pd.DataFrame. Columns that are entirely NaN are removed.
    '''
    
    # get epidemiological factors 
    df_ep = pd.read_parquet(f'{PATH}/cases/{state}_dengue.parquet',
                           columns = ['data_iniSE', 'casos_est', 'municipio_geocodigo', 'p_rt1', 'Rt'])

    df_ep = df_ep.sort_index()
    
    # this copy will be used to compute the target for all the state later 
    df_ep_copy = df_ep.copy()
    
    # link the geocode and the health macroregion code 
    # (default inner merge: rows whose geocode is not listed in `dfs` are dropped)
    macro_link = dfs[['code_macro', 'geocode']].rename(columns = {'geocode':'municipio_geocodigo'})

    df_ep = df_ep.reset_index().merge(macro_link, on = 'municipio_geocodigo').set_index('data_iniSE')
    
    del df_ep['municipio_geocodigo']
    
    # resample the data based of the macroregion 
    # (string names instead of np.sum / np.mean: passing NumPy callables to agg is deprecated in pandas)
    df_ep = (df_ep.groupby('code_macro')
                  .resample('W-SUN')
                  .agg({'casos_est': 'sum', 'p_rt1': 'mean', 'Rt': 'mean'})
                  .reset_index()
                  .set_index('data_iniSE'))
    
    df_ep.index = pd.to_datetime(df_ep.index)
    
    # transform in column the data of each predictor by macroregion
    list_data_ep = [transform_data(df_ep, m, 'code_macro') for m in df_ep.code_macro.unique()]
    
    # get the total weekly cases of the state (it will be used as target)
    data_state_ep = df_ep_copy[['casos_est']].resample('W-SUN').sum()

    data_state_ep.columns = data_state_ep.columns + f'_{state}'

    list_data_ep.append(data_state_ep)
    
    # final dataframe
    data_ep = pd.concat(list_data_ep, axis=1, join='outer')
    
    #remove columns with all values nan 
    data_ep = data_ep.dropna(axis =1, how = 'all')
    
    return data_ep 

def predictors_clim_state(state):
    '''
    This function is used to organize in a table the climate predictors related to a specific state.

    For each health macroregion of the state, the climate features are aggregated by week
    and stored in columns suffixed with the macroregion code. Missing values are forward
    filled and columns that are entirely NaN are removed.

    The link between geocodes and macroregions comes from the notebook-level dataframe `dfs`.

    :params state: str. Two leters code 
    :returns: pd.DataFrame.
    '''
    
    # get climate factors 
    # Every column is read on purpose: besides the predictors, the code below needs
    # `geocodigo`, `precip_max` and the `index` column. Note that
    # `columns = predictors_clim.append('geocodigo')` must not be used here:
    # list.append returns None and grows the shared list on every call.
    df_clim = pd.read_parquet(f'{PATH}/climate/{state}_climate.parquet')

    df_clim = df_clim.loc[df_clim.index.year >= 2010]
    
    # drop() returns a new frame, so the assignments below do not write into a slice
    df_clim = df_clim.drop(columns = 'index')

    df_clim['temp_mean'] = (df_clim.temp_max+df_clim.temp_min)/2

    df_clim['pressao_mean'] = (df_clim.pressao_max+df_clim.pressao_min)/2

    df_clim['umid_mean'] = (df_clim.umid_max+df_clim.umid_min)/2

    df_clim['temp_amp'] = df_clim.temp_max-df_clim.temp_min

    # Rainy days (boolean flag per record; summed per week below)
    df_clim['rainy_days'] = df_clim.precip_max > 0

    # Humidity amplitude
    df_clim['umid_amp'] = df_clim.umid_max - df_clim.umid_min

    # link the geocode and the health macroregion code 
    # (default inner merge: rows whose geocode is not listed in `dfs` are dropped)
    macro_link = dfs[['code_macro', 'geocode']].rename(columns = {'geocode':'geocodigo'})

    df_clim = df_clim.reset_index().merge(macro_link, on = 'geocodigo').set_index('date')

    del df_clim['geocodigo']

    # weekly aggregation applied to each feature
    # (string names instead of np.mean / np.sum: passing NumPy callables to agg is deprecated in pandas)
    weekly_agg = {'temp_min': 'mean', 'temp_max': 'mean',
                  'umid_min': 'mean', 'umid_max': 'mean',
                  'pressao_min': 'mean', 'pressao_max': 'mean',
                  'precip_tot': 'sum', 'rainy_days': 'sum',
                  'temp_mean': 'mean', 'temp_amp': 'mean',
                  'umid_mean': 'mean', 'umid_amp': 'mean',
                  'pressao_mean': 'mean'}

    # resample the data based of the macroregion 
    df_clim = (df_clim.groupby('code_macro')
                      .resample('W-SUN')
                      .agg(weekly_agg)
                      .reset_index()
                      .set_index('date'))

    # transform in column the data of each predictor by macroregion
    list_data_clim = [transform_data(df_clim, m, 'code_macro') for m in df_clim.code_macro.unique()]

    # final dataframe
    data_clim = pd.concat(list_data_clim, axis=1, join='outer').ffill()
    
    #remove columns with all values nan 
    data_clim = data_clim.dropna(axis =1, how = 'all')
    
    return data_clim 


def get_data_state(state):
    '''
    Assemble the full table of predictors of one state.

    The epidemiological table (`predictors_ep_state`) and the climate table
    (`predictors_clim_state`) are joined side by side with an outer join on the index,
    the result is passed through `add_new_columns`, and every row that still
    contains a missing value is dropped.

    :params state: str. Two letters code
    :returns: the table without rows containing missing values.
    '''

    # epidemiological predictors first, climate predictors second (same order as the columns)
    tables = [predictors_ep_state(state), predictors_clim_state(state)]

    joined = pd.concat(tables, axis = 1, join = 'outer')

    return add_new_columns(joined).dropna()


Get the data for a single health macroregion (example: 3524) and inspect the result:

In [12]:
# Example: build the table of a single health macroregion, drop the rows that
# contain missing values and show the last rows.
macro = 3524
df1 = get_data_macro(macro)
df1 = df1.dropna()
#df1 = df1.loc[df1.index <= '2024-03-17']
df1.tail()

Unnamed: 0,casos_est_3509007,p_rt1_3509007,Rt_3509007,p_inc100k_3509007,casos_est_3509205,p_rt1_3509205,Rt_3509205,p_inc100k_3509205,casos_est_3516309,p_rt1_3516309,...,umid_amp_3528502,pressao_mean_3528502,month,SE,diff_casos_est_3509007,diff_casos_est_3509205,diff_casos_est_3516309,diff_casos_est_3516408,diff_casos_est_3528502,diff_casos_est_3524
2024-03-31,72.0,1.10606e-11,0.410354,73.022316,333.0,0.0004690294,0.785637,328.07883,234.0,0.999994,...,29.386656,1.003913,3,14,-110.0,50.5,58.0,-204.0,-382.0,-587.5
2024-04-07,18.0,0.0,0.124052,18.255579,210.0,8.830114e-11,0.580902,206.89655,169.0,0.313499,...,44.837135,1.002509,4,15,-54.0,-123.0,-65.0,-875.0,-210.0,-1327.0
2024-04-14,8.0,0.0,0.090794,8.11359,162.5,3.787275e-10,0.555066,160.09853,249.0,0.991411,...,33.817332,0.859546,4,16,-10.0,-47.5,80.0,-143.0,-39.0,-159.5
2024-04-21,9.0,5.453374e-06,0.238877,9.12779,229.5,0.4196924,0.981361,226.10837,176.0,0.07541,...,37.442228,1.004527,4,17,1.0,67.0,-73.0,4.0,-2.0,-3.0
2024-04-28,4.0,0.004510253,0.272867,4.056795,265.5,0.9974812,1.296287,261.57635,208.0,0.510182,...,48.120652,1.0028,4,18,-5.0,36.0,32.0,-210.5,0.0,-147.5


In [13]:
df1.isnull().sum()[df1.isnull().sum()>0]

Series([], dtype: int64)

In [14]:

# Build and save the table of every health macroregion listed in `dfs`
for macro in dfs.code_macro.unique():

    df1 = get_data_macro(macro)
    
    # keep only the rows without missing values
    df1 = df1.dropna()
    
    # optional cutoff date:
    #df1 = df1.loc[df1.index <= '2024-03-17']
    
    df1.to_csv(f'../data/dengue_{macro}.csv.gz')

# last rows of the last macroregion processed
df1.tail()

In [15]:

# Build and save the table of every state listed in `dfs`
# (get_data_state already drops the rows with missing values)
for state in dfs.state.unique():
    df2 = get_data_state(state)
    
    # optional cutoff date:
    #df2 = df2.loc[df2.index <= '2024-03-17']
    
    df2.to_csv(f'../data/dengue_{state}.csv.gz')

In [16]:
df2.tail()

Unnamed: 0,casos_est_5302,p_rt1_5302,Rt_5302,casos_est_DF,temp_min_5302,temp_max_5302,umid_min_5302,umid_max_5302,pressao_min_5302,pressao_max_5302,...,rainy_days_5302,temp_mean_5302,temp_amp_5302,umid_mean_5302,umid_amp_5302,pressao_mean_5302,month,SE,diff_casos_est_5302,diff_casos_est_DF
2024-03-31,11249.5,0.0,0.71089,11249.5,19.376583,26.794983,62.869103,96.655234,0.999868,1.004094,...,7,23.085783,7.4184,79.762169,33.786131,1.001981,3,14,-32.5,-32.5
2024-04-07,9892.0,0.0,0.764559,9892.0,19.583313,26.363325,67.464338,96.19828,0.999298,1.00363,...,7,22.973319,6.780012,81.831309,28.733942,1.001464,4,15,-1357.5,-1357.5
2024-04-14,7139.0,0.0,0.649511,7139.0,-22.468049,26.765054,67.464338,96.19828,0.856338,0.860453,...,7,2.148502,49.233103,81.831309,36.499378,0.858395,4,16,-2753.0,-2753.0
2024-04-21,4857.5,0.0,0.519835,4857.5,19.251735,28.108525,53.106008,90.370735,1.000383,1.004554,...,7,23.68013,8.85679,71.738371,37.264727,1.002468,4,17,-2281.5,-2281.5
2024-04-28,3506.5,0.0,0.482592,3506.5,17.855324,28.442318,47.858158,90.142849,0.999863,1.004413,...,6,23.148821,10.586994,69.000503,42.284691,1.002138,4,18,-1351.0,-1351.0


In [17]:
# Reload one of the saved tables to check what was written to disk.
macro = 1101 

filename_data = f'../data/dengue_{macro}.csv.gz'

# the first column of the file is used as the index; it is read back under the name 'Unnamed: 0'
df = pd.read_csv(filename_data, index_col='Unnamed: 0')

df

Unnamed: 0,casos_est_1100049,p_rt1_1100049,Rt_1100049,p_inc100k_1100049,casos_est_1100189,p_rt1_1100189,Rt_1100189,p_inc100k_1100189,casos_est_1100288,p_rt1_1100288,...,umid_amp_small,pressao_mean_small,month,SE,diff_casos_est_1100049,diff_casos_est_1100189,diff_casos_est_1100288,diff_casos_est_1100304,diff_casos_est_1101,diff_casos_est_small
2010-01-10,161.0,0.000000,0.000000,187.442520,118.0,0.000000,0.000000,319.947940,210.0,0.000000e+00,...,25.645254,0.996053,1,2,-3.0,66.0,24.0,28.0,171.0,56.0
2010-01-17,136.0,0.000000,0.000000,158.336530,192.0,0.000000,0.000000,520.593260,171.0,0.000000e+00,...,23.866842,0.997954,1,3,-25.0,74.0,-39.0,11.0,22.0,1.0
2010-01-24,75.0,0.000000,0.000000,87.317940,166.0,0.000000,0.000000,450.096250,183.0,0.000000e+00,...,25.236053,0.996823,1,4,-61.0,-26.0,12.0,3.0,-9.0,63.0
2010-01-31,71.0,0.000000,0.000000,82.660990,129.0,0.000000,0.000000,349.773600,142.0,0.000000e+00,...,22.315170,0.995985,1,5,-4.0,-37.0,-41.0,-17.0,-147.0,-48.0
2010-02-07,48.0,0.000084,0.521628,55.883484,107.0,0.001141,0.684689,290.122280,84.0,9.276549e-08,...,25.328183,0.996109,2,6,-23.0,-22.0,-58.0,9.0,-78.0,16.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2024-03-31,15.0,0.295563,0.831158,16.268627,8.0,0.390129,0.876598,21.353832,23.0,3.920109e-11,...,19.096171,0.997667,3,14,0.0,0.0,5.0,24.0,61.0,32.0
2024-04-07,9.0,0.110442,0.606424,9.761176,15.0,0.915243,1.763980,40.038437,20.0,1.170331e-05,...,25.938097,0.998181,4,15,-6.0,7.0,-3.0,-6.0,-42.0,-34.0
2024-04-14,9.0,0.186146,0.686872,9.761176,6.0,0.175743,0.627631,16.015375,35.0,7.669364e-01,...,24.843869,0.854865,4,16,0.0,-9.0,15.0,-11.0,-66.0,-61.0
2024-04-21,4.0,0.036716,0.381502,4.338301,7.0,0.212633,0.685038,18.684605,18.0,1.503779e-01,...,22.587642,0.998396,4,17,-5.0,1.0,-17.0,10.0,-9.0,2.0


In [18]:
df.shape[0]

745

In [19]:
s = ['AC', 'AL', 'AP', 'DF', 'RN', 'RO', 'RR', 'SE', 'TO']

In [20]:
'AC' in s

True

In [21]:
pd.read_csv('forecast_tables/forecast_5302.csv.gz')

Unnamed: 0.1,Unnamed: 0,date,lower_2_5,lower_25,forecast,upper_75,upper_97_5,macroregion,prob_high,prob_low,HT,LT,HTinc,LTinc
0,0,2024-04-07,5657.600075,7606.643046,8233.700783,8890.458068,10447.026068,5302,100.0,0.0,1978.927462,715.63191,64.773517,23.423797
1,1,2024-04-14,6379.484543,7820.17685,8635.663031,9447.791633,10803.509727,5302,100.0,0.0,1891.155129,707.439027,61.900586,23.155631
2,2,2024-04-21,6756.601471,8414.903584,8968.985846,9930.696929,11484.160177,5302,100.0,0.0,2074.783635,827.720473,67.911046,27.092638
3,3,2024-04-28,7029.390347,8763.471027,9717.037411,10608.805778,12336.874154,5302,100.0,0.0,2282.023925,848.273208,74.694358,27.765363
