## Web extraction from info.dengue data

> autor: BragatteMAS

> updated: 20240129 

#### Refs

https://info.dengue.mat.br/

https://info.dengue.mat.br/services/api - dicionário de dados aqui!



In [1]:
import pandas as pd ## for dataframes
from datetime import datetime, timedelta ## for dealing with time

In [2]:
## List of geocodes for Brazilian states
geocodes = [
    1200401, # Acre
    2704302, # Alagoas
    1302603, # Amazonas
    1600303, # Amapá
    2927408, # Bahia
    2304400, # Ceará
    5300108, # Distrito Federal
    3205309, # Espírito Santo
    5208707, # Goiás
    2111300, # Maranhão
    5103403, # Mato Grosso
    5002704, # Mato Grosso do Sul
    3106200, # Minas Gerais
    1501402, # Pará
    2507507, # Paraíba
    4115200, # Paraná
    2611606, # Pernambuco
    2207702, # Piauí
    3304557, # Rio de Janeiro
    2408102, # Rio Grande do Norte
    4314902, # Rio Grande do Sul
    1100023, # Rondônia
    1400100, # Roraima
    4205407, # Santa Catarina
    3550308, # São Paulo
    2800308, # Sergipe
    1721000  # Tocantins
]

## Mapping of geocodes to state names
geocode_to_state = {
    1200401: "Acre",
    2704302: "Alagoas",
    1302603: "Amazonas",
    1600303: "Amapá",
    2927408: "Bahia",
    2304400: "Ceará",
    5300108: "Distrito Federal",
    3205309: "Espírito Santo",
    5208707: "Goiás",
    2111300: "Maranhão",
    5103403: "Mato Grosso",
    5002704: "Mato Grosso do Sul",
    3106200: "Minas Gerais",
    1501402: "Pará",
    2507507: "Paraíba",
    4115200: "Paraná",
    2611606: "Pernambuco",
    2207702: "Piauí",
    3304557: "Rio de Janeiro",
    2408102: "Rio Grande do Norte",
    4314902: "Rio Grande do Sul",
    1100023: "Rondônia",
    1400100: "Roraima",
    4205407: "Santa Catarina",
    3550308: "São Paulo",
    2800308: "Sergipe",
    1721000: "Tocantins"
}

## Mapping of geocodes to state codes names
geocode_to_uf = {
    1200401: "AC",
    2704302: "AL",
    1302603: "AM",
    1600303: "AP",
    2927408: "BA",
    2304400: "CE",
    5300108: "DF",
    3205309: "ES",
    5208707: "GO",
    2111300: "MA",
    5103403: "MT",
    5002704: "MS",
    3106200: "MG",
    1501402: "PA",
    2507507: "PB",
    4115200: "PR",
    2611606: "PE",
    2207702: "PI",
    3304557: "RJ",
    2408102: "RN",
    4314902: "RS",
    1100023: "RO",
    1400100: "RR",
    4205407: "SC",
    3550308: "SP",
    2800308: "SE",
    1721000: "TO"
}

## Mapping of state codes to geocodes
uf_to_region = {
    "AC": "Norte", "AP": "Norte", "AM": "Norte", "PA": "Norte", "RO": "Norte", "RR": "Norte", "TO": "Norte",
    "AL": "Nordeste", "BA": "Nordeste", "CE": "Nordeste", "MA": "Nordeste", "PB": "Nordeste",
    "PE": "Nordeste", "PI": "Nordeste", "RN": "Nordeste", "SE": "Nordeste",
    "DF": "Centro-Oeste", "GO": "Centro-Oeste", "MT": "Centro-Oeste", "MS": "Centro-Oeste",
    "ES": "Sudeste", "MG": "Sudeste", "RJ": "Sudeste", "SP": "Sudeste",
    "PR": "Sul", "RS": "Sul", "SC": "Sul"
}

In [3]:
## Columns to be selected from infodengue to compose infodenv_cases (ITpS filter)
new_columns = ['data_iniSE', 'SE', 'casos','casos_est', 'casos_est_min', 'casos_est_max', 'casprov','casconf', 'notif_accum_year','p_inc100k', 'Rt','pop',]

## Initialize an empty DataFrame to store the selected data
infodenv_cases = pd.DataFrame()

## Loop over each geocode, fetch data, select columns, add state column and concatenate
for geocode in geocodes:
    url = f'https://info.dengue.mat.br/api/alertcity?geocode={geocode}&disease=dengue&format=csv&ew_start=1&ew_end=52&ey_start=2022&ey_end=2024'
    infodengue = pd.read_csv(url)
    selected_data = infodengue[new_columns].copy()
    selected_data['state_code'] = geocode_to_uf[geocode]  # Add a column for the state_code
    selected_data['state'] = geocode_to_state[geocode]  # Add a column for the state name
    selected_data['region'] = uf_to_region[geocode_to_uf[geocode]]  # Add a column for the region
    infodenv_cases = pd.concat([infodenv_cases, selected_data])

## Reset index after concatenating
infodenv_cases.reset_index(drop=True, inplace=True)

## Display the first few rows of the new DataFrame
infodenv_cases.head()

Unnamed: 0,data_iniSE,SE,casos,casos_est,casos_est_min,casos_est_max,casprov,casconf,notif_accum_year,p_inc100k,Rt,pop,state_code,state,region
0,2024-01-07,202402,2,386.0,99,1252.0,,,13790,93.36797,1.119636,413418.0,AC,Acre,Norte
1,2023-12-31,202401,247,459.0,296,987.0,,,13790,111.02564,1.170213,413418.0,AC,Acre,Norte
2,2023-12-24,202352,223,291.0,242,430.0,,,13790,70.38881,0.544741,413418.0,AC,Acre,Norte
3,2023-12-17,202351,209,233.0,215,270.0,,,13790,56.359425,0.379563,413418.0,AC,Acre,Norte
4,2023-12-10,202350,686,697.0,688,717.0,,,13790,168.5945,1.164719,413418.0,AC,Acre,Norte


In [18]:
## checar coluna casprov
infodenv_cases['casos'].value_counts()

0       36
8       36
14      35
3       31
24      30
        ..
272      1
288      1
276      1
334      1
1248     1
Name: casos, Length: 840, dtype: int64

Corrigindo data da semana

In [4]:
## Function to add 6 days to a date string
def add_days_to_date(date_str):
    date_obj = datetime.strptime(date_str, '%Y-%m-%d')
    end_date_obj = date_obj + timedelta(days=6)
    return end_date_obj.strftime('%Y-%m-%d')

## Apply the function to the column 'data_iniSE' and store the result in a new column 'data_fimSE'
infodenv_cases['data_fimSE'] = infodenv_cases['data_iniSE'].apply(add_days_to_date)

## Display the first few rows of the new DataFrame
infodenv_cases.tail()

Unnamed: 0,data_iniSE,SE,casos,casos_est,casos_est_min,casos_est_max,casprov,casconf,notif_accum_year,p_inc100k,Rt,pop,state_code,state,region,data_fimSE
2857,2022-01-30,202205,892,892.0,892,892.0,,,32782,291.22156,0.928555,306296.0,TO,Tocantins,Norte,2022-02-05
2858,2022-01-23,202204,901,901.0,901,901.0,,,32782,294.1599,0.903501,306296.0,TO,Tocantins,Norte,2022-01-29
2859,2022-01-16,202203,946,946.0,946,946.0,,,32782,308.85156,1.116532,306296.0,TO,Tocantins,Norte,2022-01-22
2860,2022-01-09,202202,1030,1030.0,1030,1030.0,,,32782,336.27603,1.872523,306296.0,TO,Tocantins,Norte,2022-01-15
2861,2022-01-02,202201,1248,1248.0,1248,1248.0,,,32782,407.449,2.437746,306296.0,TO,Tocantins,Norte,2022-01-08


In [5]:
## Pivot the DataFrame cases
pivoted_df_cases = infodenv_cases.pivot_table(
    index='state_code',             # Set 'Estado' as the index
    columns='data_fimSE',       # Set 'data_iniSE' as the columns
    values='casos',         # Choose the values to display, e.g., 'casos'
    aggfunc='sum'               # Aggregate function if there are multiple entries for the same index/column pair
)

## Pivot the DataFrame cases_est
pivoted_df_cases_est = infodenv_cases.pivot_table(
    index='state_code',             # Set 'Estado' as the index
    columns='data_fimSE',       # Set 'data_iniSE' as the columns
    values='casos_est',         # Choose the values to display, e.g., 'casos_est'
    aggfunc='sum'               # Aggregate function if there are multiple entries for the same index/column pair
)

In [6]:
## Display the pivoted DataFrame from cases
pivoted_df_cases.tail()

data_fimSE,2022-01-08,2022-01-15,2022-01-22,2022-01-29,2022-02-05,2022-02-12,2022-02-19,2022-02-26,2022-03-05,2022-03-12,...,2023-11-11,2023-11-18,2023-11-25,2023-12-02,2023-12-09,2023-12-16,2023-12-23,2023-12-30,2024-01-06,2024-01-13
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RS,5,3,6,2,3,3,14,26,61,227,...,21,20,24,16,20,16,17,13,10,1
SC,44,32,22,22,16,32,21,50,50,112,...,194,156,181,256,313,366,358,293,377,267
SE,2,12,7,2,9,7,19,7,18,10,...,58,72,67,57,55,50,17,15,26,1
SP,327,253,366,325,289,310,322,342,504,627,...,746,698,788,918,912,916,783,803,1567,647
TO,1248,1030,946,901,892,798,695,542,545,592,...,70,84,82,96,119,84,87,50,96,70


In [12]:
## Display the pivoted DataFrame from cases estimated
pivoted_df_cases_est.tail()

data_fimSE,2022-01-08,2022-01-15,2022-01-22,2022-01-29,2022-02-05,2022-02-12,2022-02-19,2022-02-26,2022-03-05,2022-03-12,...,2023-11-11,2023-11-18,2023-11-25,2023-12-02,2023-12-09,2023-12-16,2023-12-23,2023-12-30,2024-01-06,2024-01-13
state_code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
RS,5.0,3.0,6.0,2.0,3.0,3.0,14.0,26.0,61.0,227.0,...,21.0,21.0,26.0,18.0,23.0,19.0,21.0,18.0,18.0,13.0
SC,44.0,32.0,22.0,22.0,16.0,32.0,21.0,50.0,50.0,112.0,...,194.0,156.0,182.0,257.0,315.0,370.0,366.0,311.0,425.0,520.0
SE,2.0,12.0,7.0,2.0,9.0,7.0,19.0,7.0,18.0,10.0,...,58.0,72.0,67.0,57.0,55.0,51.0,19.0,20.0,41.0,27.0
SP,327.0,253.0,366.0,325.0,289.0,310.0,322.0,342.0,504.0,627.0,...,747.0,700.0,791.0,924.0,922.0,935.0,823.0,896.0,1857.0,1905.5
TO,1248.0,1030.0,946.0,901.0,892.0,798.0,695.0,542.0,545.0,592.0,...,70.0,84.0,82.0,96.0,119.0,85.0,89.0,53.0,106.0,118.0


In [None]:
# Save the pivoted DataFrame from cases to a TSV file
pivoted_df_cases.to_csv('casos_denv.tsv', sep='\t')

# Save the pivoted DataFrame from cases estimated to a TSV file
pivoted_df_cases_est.to_csv('casosest_denv.tsv', sep='\t')

In [13]:
## Save the infodenv_cases fiile to a CSV file
infodenv_cases.to_csv('infodengue_casos_filtro.csv', index=False)

## Site e.g.

https://info.dengue.mat.br/services/api/doc

In [9]:
# 3304557 = Rio de Janeiro city geocode
url = 'https://info.dengue.mat.br/api/alertcity?geocode=3304557&disease=dengue&format=csv&ew_start=1&ew_end=50&ey_start=2022&ey_end=2024'
search_filter = (
    'geocode=3304557&disease=dengue&format=csv&' +
    'ew_start=1&ew_end=50&ey_start=2022&ey_end=2024'
)
df = pd.read_csv('%s?%s' % (url, search_filter))
print(url)
df.head()

https://info.dengue.mat.br/api/alertcity?geocode=3304557&disease=dengue&format=csv&ew_start=1&ew_end=50&ey_start=2022&ey_end=2024


Unnamed: 0,data_iniSE,SE,casos_est,casos_est_min,casos_est_max,casos,p_rt1,p_inc100k,Localidade_id,nivel,...,umidmed,umidmin,tempmed,tempmax,casprov,casprov_est,casprov_est_min,casprov_est_max,casconf,notif_accum_year
0,2024-01-07,202402,2533.0,1735,3998,918,1.0,37.53808,0,4,...,78.220254,76.564083,27.071429,27.142857,,,,,,56355
1,2023-12-31,202401,2280.0,2093,2596,1846,1.0,33.788715,0,4,...,77.966467,77.847939,24.904762,25.0,,,,,,56355
2,2023-12-24,202352,1234.0,1171,1323,1073,0.258404,18.287401,0,4,...,74.152162,68.894631,26.637681,28.347826,,,,,,56355
3,2023-12-17,202351,1158.0,1129,1201,1078,0.373716,17.16111,0,4,...,77.95962,73.573996,26.369565,27.391304,,,,,,56355
4,2023-12-10,202350,1449.0,1428,1476,1403,1.0,21.473618,0,4,...,76.277792,66.529891,24.60215,26.322581,,,,,,56355


In [10]:
## extract data from RJ
new_columns = ['data_iniSE', 'SE', 'casos_est', 'casos_est_min', 'casos_est_max', 'casos']
new_df = infodengue[new_columns]
new_df.head()


Unnamed: 0,data_iniSE,SE,casos_est,casos_est_min,casos_est_max,casos
0,2024-01-07,202402,118.0,76,342,70
1,2023-12-31,202401,106.0,97,137,96
2,2023-12-24,202352,53.0,50,62,50
3,2023-12-17,202351,89.0,87,95,87
4,2023-12-10,202350,85.0,84,88,84
