## Web extraction from info.dengue data

> author: BragatteMAS

> updated: 20231211
> used: 20240509

#### Refs

https://info.dengue.mat.br/

https://info.dengue.mat.br/services/api

In [1]:
import pandas as pd ## for dataframes
from datetime import datetime, timedelta ## for dealing with time

In [2]:
## List of geocodes for Brazilian states
geocodes = [
    1200401, # Acre
    2704302, # Alagoas
    1302603, # Amazonas
    1600303, # Amapá
    2927408, # Bahia
    2304400, # Ceará
    5300108, # Distrito Federal
    3205309, # Espírito Santo
    5208707, # Goiás
    2111300, # Maranhão
    5103403, # Mato Grosso
    5002704, # Mato Grosso do Sul
    3106200, # Minas Gerais
    1501402, # Pará
    2507507, # Paraíba
    4115200, # Paraná
    2611606, # Pernambuco
    2207702, # Piauí
    3304557, # Rio de Janeiro
    2408102, # Rio Grande do Norte
    4314902, # Rio Grande do Sul
    1100023, # Rondônia
    1400100, # Roraima
    4205407, # Santa Catarina
    3550308, # São Paulo
    2800308, # Sergipe
    1721000  # Tocantins
]

## Mapping of geocodes to state names
geocode_to_state = {
    1200401: "Acre",
    2704302: "Alagoas",
    1302603: "Amazonas",
    1600303: "Amapá",
    2927408: "Bahia",
    2304400: "Ceará",
    5300108: "Distrito Federal",
    3205309: "Espírito Santo",
    5208707: "Goiás",
    2111300: "Maranhão",
    5103403: "Mato Grosso",
    5002704: "Mato Grosso do Sul",
    3106200: "Minas Gerais",
    1501402: "Pará",
    2507507: "Paraíba",
    4115200: "Paraná",
    2611606: "Pernambuco",
    2207702: "Piauí",
    3304557: "Rio de Janeiro",
    2408102: "Rio Grande do Norte",
    4314902: "Rio Grande do Sul",
    1100023: "Rondônia",
    1400100: "Roraima",
    4205407: "Santa Catarina",
    3550308: "São Paulo",
    2800308: "Sergipe",
    1721000: "Tocantins"
}

## Mapping of geocodes to state codes names
geocode_to_uf = {
    1200401: "AC",
    2704302: "AL",
    1302603: "AM",
    1600303: "AP",
    2927408: "BA",
    2304400: "CE",
    5300108: "DF",
    3205309: "ES",
    5208707: "GO",
    2111300: "MA",
    5103403: "MT",
    5002704: "MS",
    3106200: "MG",
    1501402: "PA",
    2507507: "PB",
    4115200: "PR",
    2611606: "PE",
    2207702: "PI",
    3304557: "RJ",
    2408102: "RN",
    4314902: "RS",
    1100023: "RO",
    1400100: "RR",
    4205407: "SC",
    3550308: "SP",
    2800308: "SE",
    1721000: "TO"
}

## Mapping of state codes to geocodes
uf_to_region = {
    "AC": "Norte", "AP": "Norte", "AM": "Norte", "PA": "Norte", "RO": "Norte", "RR": "Norte", "TO": "Norte",
    "AL": "Nordeste", "BA": "Nordeste", "CE": "Nordeste", "MA": "Nordeste", "PB": "Nordeste",
    "PE": "Nordeste", "PI": "Nordeste", "RN": "Nordeste", "SE": "Nordeste",
    "DF": "Centro-Oeste", "GO": "Centro-Oeste", "MT": "Centro-Oeste", "MS": "Centro-Oeste",
    "ES": "Sudeste", "MG": "Sudeste", "RJ": "Sudeste", "SP": "Sudeste",
    "PR": "Sul", "RS": "Sul", "SC": "Sul"
}

In [3]:
## Columns to be selected from infodengue to compose infodenv_cases (ITpS filter)
new_columns = ['data_iniSE', 'SE', 'casos_est', 'casos_est_min', 'casos_est_max', 'casos']

## Initialize an empty DataFrame to store the selected data
infodenv_cases = pd.DataFrame()

## Loop over each geocode, fetch data, select columns, add state column and concatenate
for geocode in geocodes:
    url = f'https://info.dengue.mat.br/api/alertcity?geocode={geocode}&disease=dengue&format=csv&ew_start=1&ew_end=52&ey_start=2022&ey_end=2024'
    infodengue = pd.read_csv(url)
    # selected_data = infodengue[new_columns]
    selected_data = infodengue[new_columns].copy()  ## Make a copy of the slice to avoid SettingWithCopyWarning
    selected_data.loc[:, 'state_code'] = geocode_to_uf[geocode]  ## Add a column for the state_code
    selected_data.loc[:, 'state'] = geocode_to_state[geocode]  ## Add a column for the state name
    selected_data.loc[:, 'region'] = uf_to_region[geocode_to_uf[geocode]]  ## Add a column for the region
    infodenv_cases = pd.concat([infodenv_cases, selected_data]) ## Concatenate the selected data

## Reset index after concatenating
infodenv_cases.reset_index(drop=True, inplace=True)

## Display the first few rows of the new DataFrame
infodenv_cases.head()

Unnamed: 0,data_iniSE,SE,casos_est,casos_est_min,casos_est_max,casos,state_code,state,region
0,2024-04-28,202418,131.0,24,624.0,0,AC,Acre,Norte
1,2024-04-21,202417,196.0,140,418.0,127,AC,Acre,Norte
2,2024-04-14,202416,233.5,211,304.0,205,AC,Acre,Norte
3,2024-04-07,202415,259.0,249,289.0,247,AC,Acre,Norte
4,2024-03-31,202414,227.0,223,243.0,222,AC,Acre,Norte


In [4]:
## Display the columns of the filtered DataFrame (infodenv_cases).
## The commented-out line below would instead show the columns of `infodengue`,
## the unfiltered response left over from the download loop.
# print(infodengue.columns)
print(infodenv_cases.columns)

Index(['data_iniSE', 'SE', 'casos_est', 'casos_est_min', 'casos_est_max',
       'casos', 'state_code', 'state', 'region'],
      dtype='object')


In [5]:
## Distinct state names present in infodenv_cases, in order of first appearance
infodenv_cases['state'].unique()

array(['Acre', 'Alagoas', 'Amazonas', 'Amapá', 'Bahia', 'Ceará',
       'Distrito Federal', 'Espírito Santo', 'Goiás', 'Maranhão',
       'Mato Grosso', 'Mato Grosso do Sul', 'Minas Gerais', 'Pará',
       'Paraíba', 'Paraná', 'Pernambuco', 'Piauí', 'Rio de Janeiro',
       'Rio Grande do Norte', 'Rio Grande do Sul', 'Rondônia', 'Roraima',
       'Santa Catarina', 'São Paulo', 'Sergipe', 'Tocantins'],
      dtype=object)

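#### Fixing the week date: replace each week's start date (`data_iniSE`) with its end date (`data_fimSE`, start date + 6 days)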

In [6]:
## Function to add 6 days to a date string
def add_days_to_date(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    end_date_obj = date_obj + timedelta(days=6)
    return end_date_obj.strftime('%Y-%m-%d')

## Apply the function to the column 'data_iniSE' and store the result in a new column 'data_fimSE'
infodenv_cases['data_fimSE'] = infodenv_cases['data_iniSE'].apply(add_days_to_date)

## Remove the column 'data_iniSE'
infodenv_cases.drop(columns=['data_iniSE'], inplace=True)

## Rename columns 'casos_est' and 'casos' FOR casos estimados' and 'casos confirmados'
infodenv_cases.rename(columns={'casos_est': 'casos_estimados', 'casos': 'casos_confirmados'}, inplace=True)

## Change the order of the columns, moving 'data_fimSE' to the first position and second position casos confirmados
infodenv_cases = infodenv_cases[['data_fimSE', 'casos_confirmados', 'casos_estimados', 'casos_est_min', 'casos_est_max', 'state_code', 'state', 'region']]

## Display the first few rows of the new DataFrame
infodenv_cases.head()

Unnamed: 0,data_fimSE,casos_confirmados,casos_estimados,casos_est_min,casos_est_max,state_code,state,region
0,2024-05-04,0,131.0,24,624.0,AC,Acre,Norte
1,2024-04-27,127,196.0,140,418.0,AC,Acre,Norte
2,2024-04-20,205,233.5,211,304.0,AC,Acre,Norte
3,2024-04-13,247,259.0,249,289.0,AC,Acre,Norte
4,2024-04-06,222,227.0,223,243.0,AC,Acre,Norte


In [7]:
## Pivot the DataFrame cases
pivoted_df_cases = infodenv_cases.pivot_table(
    index='state_code',             # Set 'Estado' as the index
    columns='data_fimSE',       # Set 'data_iniSE' as the columns
    values='casos_confirmados',         # Choose the values to display, e.g., 'casos'
    aggfunc='sum'               # Aggregate function if there are multiple entries for the same index/column pair
)

## Pivot the DataFrame cases_est
pivoted_df_cases_est = infodenv_cases.pivot_table(
    index='state_code',             # Set 'Estado' as the index
    columns='data_fimSE',       # Set 'data_iniSE' as the columns
    values='casos_estimados',         # Choose the values to display, e.g., 'casos_est'
    aggfunc='sum'               # Aggregate function if there are multiple entries for the same index/column pair
)

In [8]:
## Display the last rows of the confirmed-cases pivot
## (one row per state code, one column per week-end date)
pivoted_df_cases.tail()

data_fimSE,2022-01-08,2022-01-15,2022-01-22,2022-01-29,2022-02-05,2022-02-12,2022-02-19,2022-02-26,2022-03-05,2022-03-12,...,2024-03-02,2024-03-09,2024-03-16,2024-03-23,2024-03-30,2024-04-06,2024-04-13,2024-04-20,2024-04-27,2024-05-04
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RS,5,3,6,2,3,3,14,26,61,227,...,211,150,58,25,46,139,137,52,13,6
SC,44,32,22,22,16,32,21,50,50,112,...,1648,1520,1622,1729,1696,2095,2048,2027,1890,877
SE,2,12,7,2,9,7,19,7,18,10,...,328,299,299,305,170,255,226,185,106,5
SP,327,253,366,325,289,310,322,342,504,627,...,38998,45109,52679,63105,53183,55826,55396,42182,23298,4321
TO,1248,1030,946,901,892,798,695,542,545,592,...,364,346,286,252,226,329,362,472,412,308


In [9]:
## Display the last rows of the estimated-cases pivot
## (one row per state code, one column per week-end date)
pivoted_df_cases_est.tail()

data_fimSE,2022-01-08,2022-01-15,2022-01-22,2022-01-29,2022-02-05,2022-02-12,2022-02-19,2022-02-26,2022-03-05,2022-03-12,...,2024-03-02,2024-03-09,2024-03-16,2024-03-23,2024-03-30,2024-04-06,2024-04-13,2024-04-20,2024-04-27,2024-05-04
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RS,5.0,3.0,6.0,2.0,3.0,3.0,14.0,26.0,61.0,227.0,...,214.0,158.0,71.0,43.0,72.0,176.0,181.0,105.0,78.0,88.5
SC,44.0,32.0,22.0,22.0,16.0,32.0,21.0,50.0,50.0,112.0,...,1648.0,1521.0,1624.0,1732.0,1702.0,2115.0,2092.0,2111.0,2091.0,2074.0
SE,2.0,12.0,7.0,2.0,9.0,7.0,19.0,7.0,18.0,10.0,...,328.0,299.0,300.0,306.0,172.0,259.0,236.0,222.5,215.0,220.5
SP,327.0,253.0,366.0,325.0,289.0,310.0,322.0,342.0,504.0,627.0,...,39080.0,45370.0,53259.5,64265.5,55250.0,59117.0,60218.5,49039.5,33348.0,23779.0
TO,1248.0,1030.0,946.0,901.0,892.0,798.0,695.0,542.0,545.0,592.0,...,364.0,346.0,286.0,252.0,227.0,333.0,372.0,493.0,472.0,637.5


In [10]:
# Save the pivoted DataFrame from cases to a TSV file
pivoted_df_cases.to_csv('casos_denv.tsv', sep='\t')

# Save the pivoted DataFrame from cases estimated to a TSV file
pivoted_df_cases_est.to_csv('casosest_denv.tsv', sep='\t')

## Save the infodenv_cases fiile to a CSV file
infodenv_cases.to_csv('infodengue_casos_filtro.csv', index=False)

## Example request for a single city (see the API documentation)

https://info.dengue.mat.br/services/api/doc

In [11]:
# 3304557 = Rio de Janeiro city geocode
url = 'https://info.dengue.mat.br/api/alertcity?geocode=3304557&disease=dengue&format=csv&ew_start=1&ew_end=50&ey_start=2022&ey_end=2023'
search_filter = (
    'geocode=3304557&disease=dengue&format=csv&' +
    'ew_start=1&ew_end=50&ey_start=2022&ey_end=2023'
)
df = pd.read_csv('%s?%s' % (url, search_filter))
print(url)
df.head()

https://info.dengue.mat.br/api/alertcity?geocode=3304557&disease=dengue&format=csv&ew_start=1&ew_end=50&ey_start=2022&ey_end=2023


Unnamed: 0,data_iniSE,SE,casos_est,casos_est_min,casos_est_max,casos,p_rt1,p_inc100k,Localidade_id,nivel,...,umidmed,umidmin,tempmed,tempmax,casprov,casprov_est,casprov_est_min,casprov_est_max,casconf,notif_accum_year
0,2023-12-10,202350,1411.0,1411,1411,1411,1.0,21.295383,0,4,...,76.277792,66.529891,24.60215,26.322581,,,,,,51479
1,2023-12-03,202349,1149.0,1149,1149,1149,1.0,17.341173,0,4,...,78.48408,61.85701,26.673612,31.0,,,,,,51479
2,2023-11-26,202348,983.0,983,983,983,1.0,14.835835,0,4,...,,,,,,,,,,51479
3,2023-11-19,202347,900.0,900,900,900,1.0,13.583165,0,4,...,,,,,,,,,,51479
4,2023-11-12,202346,571.0,571,571,571,0.898765,8.617764,0,4,...,77.486542,75.15213,27.1,27.4,,,,,,51479


In [12]:
## extract data from RJ
new_columns = ['data_iniSE', 'SE', 'casos_est', 'casos_est_min', 'casos_est_max', 'casos']
new_df = infodengue[new_columns]
new_df.head()


Unnamed: 0,data_iniSE,SE,casos_est,casos_est_min,casos_est_max,casos
0,2024-04-28,202418,637.5,358,2131,308
1,2024-04-21,202417,472.0,428,626,412
2,2024-04-14,202416,493.0,477,536,472
3,2024-04-07,202415,372.0,364,394,362
4,2024-03-31,202414,333.0,329,344,329
