In [1]:
import pandas as pd
from pysus import SIH

# 1) Extraction parameters
uf       = "SP"        # Federative unit (state), e.g. 'SP', 'RJ', etc.
year     = 2020        # Year to fetch
group    = "RD"        # File group ('RD' = reduced AIH, 'RJ' = rejected AIH, etc.)
months   = [1,2,3]     # Months to download (1 to 12). Use None for all months.
local_dir = "./sih_data"  # Folder where the Parquet files will be saved

# 2) Instantiate the SIH object and list the files available for the parameters
sih = SIH().load()  # loads the SIH metadata
files = sih.get_files(group, uf=uf, year=year, month=months)

# Fail early with a readable message: with no matching files, pd.concat below
# would otherwise raise the opaque "No objects to concatenate".
if not files:
    raise ValueError(
        f"Nenhum arquivo SIH encontrado para group={group!r}, uf={uf!r}, "
        f"year={year!r}, month={months!r}"
    )

# 3) Download the files (DBC -> Parquet conversion) into local_dir
parquet_files = sih.download(files, local_dir=local_dir)

# NOTE(review): some PySUS releases appear to return a single object rather
# than a list when only one file is downloaded -- confirm against the installed
# version. Normalising here is harmless when a list is already returned.
if not isinstance(parquet_files, (list, tuple)):
    parquet_files = [parquet_files]

# 4) Read each downloaded Parquet into a DataFrame and concatenate them into one
df_list = [pf.to_dataframe() for pf in parquet_files]
df = pd.concat(df_list, ignore_index=True)

# 5) Show the first rows and the total row count as a sanity check
print(df.head())
print(f"Total de registros: {len(df)}")


RDSP2003.parquet: 100%|██████████| 732k/732k [00:14<00:00, 49.1kB/s] 


    UF_ZI ANO_CMPT MES_CMPT ESPEC        CGC_HOSP          N_AIH IDENT  \
0  350000     2020       01    03  46230439000101  3519102439358     1   
1  350000     2020       01    03  46230439000101  3519102439402     1   
2  350000     2020       01    03  46230439000101  3519102439611     1   
3  350000     2020       01    03  46374500011390  3520101387980     1   
4  350000     2020       01    03  46374500011390  3520101388001     1   

        CEP MUNIC_RES      NASC  ...  DIAGSEC9 TPDISEC1 TPDISEC2 TPDISEC3  \
0  18730970    352180  19861111  ...                  0        0        0   
1  18601420    350750  19591031  ...                  0        0        0   
2  18740000    355380  19580616  ...                  0        0        0   
3  02929030    355030  19870923  ...                  0        0        0   
4  02961020    355030  19241219  ...                  1        1        0   

  TPDISEC4 TPDISEC5 TPDISEC6 TPDISEC7 TPDISEC8 TPDISEC9  
0        0        0        0      

In [19]:
import os, sys

# Make the local "scripts" folder (relative to the current working directory)
# importable. Only prepend it when it is not already the first entry, so that
# re-running this cell does not keep growing sys.path with duplicates.
scripts_dir = os.path.join(os.getcwd(), "scripts")
if sys.path[:1] != [scripts_dir]:
    sys.path.insert(0, scripts_dir)

# Set the MinIO endpoint BEFORE importing the loader.
# NOTE(review): minio_loader is not visible here; if it reads MINIO_ENDPOINT at
# import time, setting the variable after the import would have no effect.
os.environ["MINIO_ENDPOINT"] = "http://localhost:9000"

from minio_loader import load_sih_dataframe

In [18]:
# Load the SIH data through load_sih_dataframe (imported from minio_loader in
# the setup cell) and rebind `df`, replacing the frame built in the first cell.
# NOTE(review): this cell's execution count (18) is lower than the setup cell's
# (19), i.e. the cells were last run out of order; on a fresh kernel the setup
# cell must run first or load_sih_dataframe will be undefined.
df = load_sih_dataframe()
# Bare final expression so the notebook displays the first rows.
df.head()

🔄 Carregando testes/sih/RDSP2001.parquet...
🔄 Carregando testes/sih/RDSP2002.parquet...
🔄 Carregando testes/sih/RDSP2003.parquet...


Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,...,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,353030,2020,1,3,53221255001970,3520106202130,1,15130000,353030,19510815,...,,0,0,0,0,0,0,0,0,0
1,353030,2020,1,3,53221255001970,3520106202140,1,15130000,353030,19410708,...,,0,0,0,0,0,0,0,0,0
2,353030,2020,1,3,53221255001970,3520106202162,1,15130000,353030,19380130,...,,0,0,0,0,0,0,0,0,0
3,353030,2020,1,3,53221255001970,3520106203735,1,15155000,352450,19690423,...,,0,0,0,0,0,0,0,0,0
4,353030,2020,1,3,53221255001970,3520106203746,1,15130000,353030,19560228,...,,0,0,0,0,0,0,0,0,0
