In [1]:
import os
from glob import glob

import pandas as pd
from sqlalchemy import create_engine
from IPython.display import display
from pysus.utilities.readdbc import read_dbc

# Flat (option, value, option, value) sequence meant to be unpacked into
# pd.option_context(...): lifts the row and column display limits so that
# display() shows frames without truncation.
CONTEXT_MAX_ROWS_COLS = ('display.max_rows', None, 'display.max_columns', None)
# Relative path prefix of the datasets directory; the helpers and cells below
# append dataset-specific sub-paths to it by plain string concatenation.
DATASET_PATH_BASE = '../../../../datasets/'

In [2]:
def date_parser_dmy(x):
    """Parse ``x`` as dates written day-month-year without separators (``%d%m%Y``).

    Values that do not match the format raise instead of being coerced.
    """
    return pd.to_datetime(x, format='%d%m%Y', errors='raise', cache=True)


def date_parser_ymd(x):
    """Parse ``x`` as dates written year-month-day without separators (``%Y%m%d``).

    Values that do not match the format raise instead of being coerced.
    """
    return pd.to_datetime(x, format='%Y%m%d', errors='raise', cache=True)


def load_resource_df_pf(resource_path, dtype, parse_dates,
                        date_parser=date_parser_dmy,
                        format='csv', encoding='utf-8'):
    """Load a delimited text resource and build a profiling report for it.

    Parameters
    ----------
    resource_path : path (or buffer) handed to ``pd.read_csv``.
    dtype : column dtype specification forwarded to ``pd.read_csv``.
    parse_dates : date-column specification forwarded to ``pd.read_csv``.
    date_parser : callable used to parse the date columns
        (defaults to ``date_parser_dmy``).
    format : ``'csv'`` (comma separated) or ``'tsv'`` (tab separated).
    encoding : text encoding of the resource.

    Returns
    -------
    (df, pf) : the loaded frame with its columns sorted by name, and the
        ``pandas_profiling.ProfileReport`` built from it.

    Raises
    ------
    NotImplementedError
        If ``format`` is neither ``'csv'`` nor ``'tsv'``.
    """
    separators = {'csv': ',', 'tsv': '\t'}
    if format not in separators:
        raise NotImplementedError('unsupported format: %r' % (format,))

    # Both formats go through the same call so that the separator, the
    # encoding and the dtype/date options are honoured consistently
    # (previously 'tsv' was read comma-separated and 'csv' ignored `encoding`).
    df = pd.read_csv(resource_path,
                     sep=separators[format],
                     encoding=encoding,
                     dtype=dtype,
                     parse_dates=parse_dates,
                     date_parser=date_parser)

    # Sort the columns alphabetically so frames loaded from different files
    # share the same column order.
    df = df.reindex(sorted(df.columns), axis=1)

    # Imported here because no visible cell of the notebook imports
    # pandas_profiling; without it this line raised NameError on a fresh kernel.
    import pandas_profiling
    pf = pandas_profiling.ProfileReport(df)

    return df, pf


def get_files_path(p):
    """Return the sorted list of paths matching glob pattern ``p`` under ``DATASET_PATH_BASE``."""
    matches = glob(DATASET_PATH_BASE + p)
    matches.sort()
    return matches


def file_base_name(file_name):
    """Return the part of ``file_name`` that precedes its first ``.``.

    A name without any dot is returned unchanged. Everything from the first
    dot onwards is dropped, so ``'a.tar.gz'`` yields ``'a'`` and a name that
    starts with a dot yields ``''``.
    """
    stem, _dot, _rest = file_name.partition('.')
    return stem


def path_base_name(path):
    """Return the last component of ``path`` with everything after its first dot removed."""
    return file_base_name(os.path.basename(path))


def get_resources(path_pattern, prefix_pattern='%s'):
    """Map a generated key to every file matching ``path_pattern``.

    Each key is ``prefix_pattern`` %-formatted with the lower-cased base name
    of the matched file (see ``path_base_name``); the value is the matched
    path. Files are visited in the sorted order given by ``get_files_path``,
    so when two files produce the same key the later one wins.
    """
    return {
        prefix_pattern % path_base_name(matched_path).lower(): matched_path
        for matched_path in get_files_path(path_pattern)
    }


def _read_dbc(path, cols_numeric, dict_dates, encoding='iso-8859-1'):
    """Read a file with ``read_dbc`` and coerce selected columns to numbers and dates.

    Parameters
    ----------
    path : path handed to ``read_dbc``.
    cols_numeric : list of column labels converted with ``pd.to_numeric``.
    dict_dates : mapping of a ``strftime``-style date format to the list of
        column labels that must be parsed with that format.
    encoding : text encoding forwarded to ``read_dbc``.

    Returns
    -------
    The frame returned by ``read_dbc`` with the listed columns converted.
    Values that cannot be converted become NaN / NaT (``errors='coerce'``)
    instead of raising.
    """
    df = read_dbc(path, encoding=encoding)

    df[cols_numeric] = df[cols_numeric].apply(pd.to_numeric,
                                              errors='coerce')

    # Iterate over the `dict_dates` argument. The previous code read the name
    # `dict_dates_sia`, which is not defined in this function, so the argument
    # was ignored and the call depended on a global that may not exist.
    for date_format, cols in dict_dates.items():
        df[cols] = df[cols].apply(pd.to_datetime,
                                  format=date_format,
                                  errors='coerce')

    return df

In [3]:
# Index the raw RDES files whose names carry the two-digit years 17, 18 and 19,
# keyed as 'sia-<lower-cased base name>-raw'.
datasus_siaam_es_resources = {}
for year in range(17, 20):
    datasus_siaam_es_resources.update(
        get_resources('datasus/sihsus/data-raw/RDES%s*' % year, 'sia-%s-raw')
    )
display(len(datasus_siaam_es_resources), datasus_siaam_es_resources)

27

{'sia-rdes1701-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1701.csv',
 'sia-rdes1702-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1702.csv',
 'sia-rdes1703-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1703.csv',
 'sia-rdes1704-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1704.csv',
 'sia-rdes1705-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1705.csv',
 'sia-rdes1706-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1706.csv',
 'sia-rdes1707-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1707.csv',
 'sia-rdes1708-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1708.csv',
 'sia-rdes1709-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1709.csv',
 'sia-rdes1710-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1710.csv',
 'sia-rdes1711-raw': '../../../../datasets/datasus/sihsus/data-trauma-

In [6]:
# Diagnosis codes treated as trauma in this notebook: records whose DIAG_PRINC
# value appears in this collection are kept by the filtering cell further down.
# Despite the `list_` prefix this is a tuple of 4-character strings.
# NOTE(review): presumably ICD-10 (CID-10) codes written without the dot —
# confirm the provenance of this list.
list_cid_trauma = ('S681', 'W019', 'S601', 'S602', 'S800', 'S400', 'S700', 'S900', 'S836', 'S934', 'S835', 'S314', 'S610', 'W198', 'S818', 'S913', 'S618', 'S315', 'T141', 'T111', 'S312', 'S619', 'S219', 'S910', 'T091', 'T018', 'T012', 'S617', 'S420', 'S822', 'Y048', 'S522', 'S723', 'V298', 'S423', 'S523', 'S823', 'S724', 'T022', 'S525', 'W190', 'W199', 'S526', 'S424', 'W189', 'W069', 'S318', 'S821', 'V299', 'S118', 'S520', 'W011', 'S521', 'S422', 'S429', 'W010', 'W180', 'W100', 'S211', 'S789', 'S711', 'S829', 'T013', 'V092', 'V290', 'V229', 'V899', 'S611', 'S820', 'W139', 'S021', 'V892', 'S817', 'V191', 'S026', 'V174', 'Y044', 'Y838', 'S524', 'S500', 'S923', 'V292', 'V189', 'V123', 'W226', 'S828', 'S060', 'S202', 'W208', 'W328', 'W188', 'V049', 'S626', 'S122', 'Y609', 'W239', 'Y289', 'W529', 'W318', 'S981', 'Y043', 'Y338', 'W110', 'W448', 'W398', 'W549', 'S110', 'S623', 'W109', 'T142', 'S320', 'W518', 'S421', 'S317', 'S220', 'S324', 'S529', 'S920', 'S682', 'S680', 'S720', 'S328', 'T068', 'W080', 'S729', 'S924', 'W228', 'V249', 'S826', 'S029', 'W138', 'W218', 'S411', 'S600', 'T797', 'T136', 'W018', 'V029', 'S825', 'S323', 'S620', 'W193', 'S929', 'S625', 'Y049', 'S519', 'Y042', 'S022', 'V731', 'W213', 'W108', 'S721', 'S628', 'W088', 'S728', 'S824', 'T023', 'V239', 'W159', 'W129', 'W171', 'W278', 'S024', 'W268', 'W169', 'W160', 'S325', 'S061', 'S827', 'S624', 'V800', 'V093', 'S727', 'S027', 'W276', 'S589', 'T029', 'W060', 'S064', 'S925', 'S327', 'S065', 'S066', 'V021', 'S627', 'S271', 'W170', 'S570', 'W316', 'S431', 'S430', 'S730', 'S930', 'W179', 'S530', 'S830', 'S131', 'S531', 'S631', 'S831', 'S630', 'S221', 'S069', 'V230', 'S068', 'V493', 'W178', 'W040', 'S722', 'V485', 'V220', 'V039', 'Y899', 'W068', 'W200', 'W072', 'W030', 'V040', 'W220', 'W250', 'T983', 'T902', 'T932', 'T905', 'S063', 'W349', 'S275', 'S669', 'Y298', 'W259', 'S369', 'S819', 'S562', 'Y282', 'W299', 'S368', 'S969', 'V240', 'S360', 'S365', 'S761', 'S398', 'S462', 'S661', 'S922', 'W258', 'S640', 
'W440', 'V489', 'S370', 'S860', 'W458', 'S270', 'S129', 'S099', 'S134', 'W149', 'Y290', 'V020', 'T131', 'S399', 'S799', 'S527', 'X709', 'T140', 'S797')

In [48]:
# Process one monthly file: read it, keep the rows whose DIAG_PRINC is one of
# the trauma codes and store that subset as a CSV named after the source file.
key = 'sia-rdes1701-raw'

file_path = datasus_siaam_es_resources[key]
_file_base_name = path_base_name(file_path)
df = read_dbc(file_path, 'latin-1')

df_part_trauma = df.loc[df['DIAG_PRINC'].isin(list_cid_trauma)]

# Nothing is displayed or written when the file holds no matching row.
if len(df_part_trauma) > 0:
    display(df_part_trauma.shape)
    df_part_trauma.to_csv(
        '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/%s.csv' % _file_base_name,
        index=False,
    )

(1670, 113)

In [52]:
# Inspect the filtered subset with the display limits lifted: first rows,
# then how many records fall under each MORTE value.
with pd.option_context(*CONTEXT_MAX_ROWS_COLS):
    display(
        df_part_trauma.head(),
        df_part_trauma['MORTE'].value_counts(),
    )

Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,SEXO,UTI_MES_IN,UTI_MES_AN,UTI_MES_AL,UTI_MES_TO,MARCA_UTI,UTI_INT_IN,UTI_INT_AN,UTI_INT_AL,UTI_INT_TO,DIAR_ACOM,QT_DIARIAS,PROC_SOLIC,PROC_REA,VAL_SH,VAL_SP,VAL_SADT,VAL_RN,VAL_ACOMP,VAL_ORTP,VAL_SANGUE,VAL_SADTSR,VAL_TRANSP,VAL_OBSANG,VAL_PED1AC,VAL_TOT,VAL_UTI,US_TOT,DT_INTER,DT_SAIDA,DIAG_PRINC,DIAG_SECUN,COBRANCA,NATUREZA,NAT_JUR,GESTAO,RUBRICA,IND_VDRL,MUNIC_MOV,COD_IDADE,IDADE,DIAS_PERM,MORTE,NACIONAL,NUM_PROC,CAR_INT,TOT_PT_SP,CPF_AUT,HOMONIMO,NUM_FILHOS,INSTRU,CID_NOTIF,CONTRACEP1,CONTRACEP2,GESTRISCO,INSC_PN,SEQ_AIH5,CBOR,CNAER,VINCPREV,GESTOR_COD,GESTOR_TP,GESTOR_CPF,GESTOR_DT,CNES,CNPJ_MANT,INFEHOSP,CID_ASSO,CID_MORTE,COMPLEX,FINANC,FAEC_TP,REGCT,RACA_COR,ETNIA,SEQUENCIA,REMESSA,AUD_JUST,SIS_JUST,VAL_SH_FED,VAL_SP_FED,VAL_SH_GES,VAL_SP_GES,VAL_UCI,MARCA_UCI,DIAGSEC1,DIAGSEC2,DIAGSEC3,DIAGSEC4,DIAGSEC5,DIAGSEC6,DIAGSEC7,DIAGSEC8,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,320000,2017,1,1,27080605002300,3216102907666,1,29173076,320500,19840123,1,0,0,0,0,0,0,0,0,0,0,28,408020350,408020350,611.16,195.29,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,806.45,0.0,260.98,20161031,20161128,S424,0,18,0,1023,2,0,0,320500,4,32,28,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080605000196,,0,0,2,6,,0,4,0,16632,HE32000001N201701.DTS,,,0.0,0.0,0.0,0.0,0.0,0,V99,,,,,,,,,1,0,0,0,0,0,0,0,0
1,320000,2017,1,1,27080605002300,3216102907754,1,29185000,320220,19910908,1,0,0,0,1,75,0,0,0,0,0,1,415010012,415010012,1841.07,676.58,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2517.65,478.72,814.77,20161128,20161129,S723,0,28,0,1023,2,0,0,320500,4,25,1,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,1,56008554787,,7257406,27080605000196,,0,0,2,6,,0,3,0,16633,HE32000001N201701.DTS,,,0.0,0.0,0.0,0.0,0.0,0,Y86,,,,,,,,,2,0,0,0,0,0,0,0,0
4,320000,2017,1,1,27080605002300,3216102982895,1,29172015,320500,19521012,1,0,0,0,0,0,0,0,0,0,0,1,408050543,415010012,341.98,258.91,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,600.89,0.0,194.46,20161214,20161215,S827,0,18,0,1023,2,0,0,320500,4,64,1,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,7,1,56008554787,,7257406,27080605000196,,0,0,2,6,,0,3,0,16742,HE32000001N201701.DTS,,,0.0,0.0,0.0,0.0,0.0,0,W100,,,,,,,,,1,0,0,0,0,0,0,0,0
5,320000,2017,1,1,27080605002300,3216102982906,1,29164792,320500,19591216,1,0,0,0,2,75,0,0,0,0,0,11,403010314,403010314,1836.2,817.68,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,2653.88,957.44,858.86,20161217,20161230,S065,0,12,0,1023,2,0,0,320500,4,57,13,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080605000196,,0,0,2,6,,0,3,0,16743,HE32000001N201701.DTS,,,0.0,0.0,0.0,0.0,0.0,0,W174,,,,,,,,,2,0,0,0,0,0,0,0,0
6,320000,2017,1,1,27080605002300,3216102982917,1,29161549,320500,19580811,1,0,0,0,0,0,0,0,0,0,0,14,408050489,408050489,1576.7,269.85,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1846.55,0.0,597.58,20161208,20161222,S721,0,12,0,1023,2,0,0,320500,4,58,14,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080605000196,,0,0,2,6,,0,3,0,16744,HE32000001N201701.DTS,,,0.0,0.0,0.0,0.0,0.0,0,W199,,,,,,,,,2,0,0,0,0,0,0,0,0


0    1638
1      32
Name: MORTE, dtype: int64

In [9]:
# Index the per-month trauma CSV files (two-digit years 17, 18 and 19) ...
datasus_trauma_resources = {}
for year in range(17, 20):
    datasus_trauma_resources.update(
        get_resources('datasus/sihsus/data-trauma-raw-csv/RDES%s*' % year, 'sia-%s-raw')
    )
display(len(datasus_trauma_resources), datasus_trauma_resources)

# ... then load every file and stack them into a single frame. Each file keeps
# its own row index, so index values repeat across the stacked frame.
list_df_trauma = [pd.read_csv(csv_path) for csv_path in datasus_trauma_resources.values()]

df_trauma = pd.concat(list_df_trauma)
df_trauma.head()

27

{'sia-rdes1701-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1701.csv',
 'sia-rdes1702-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1702.csv',
 'sia-rdes1703-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1703.csv',
 'sia-rdes1704-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1704.csv',
 'sia-rdes1705-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1705.csv',
 'sia-rdes1706-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1706.csv',
 'sia-rdes1707-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1707.csv',
 'sia-rdes1708-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1708.csv',
 'sia-rdes1709-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1709.csv',
 'sia-rdes1710-raw': '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/RDES1710.csv',
 'sia-rdes1711-raw': '../../../../datasets/datasus/sihsus/data-trauma-

Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,...,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,320000,2017,1,1,27080610000000.0,3216102907666,1,29173076,320500,19840123,...,,1,0,0,0,0,0,0,0,0
1,320000,2017,1,1,27080610000000.0,3216102907754,1,29185000,320220,19910908,...,,2,0,0,0,0,0,0,0,0
2,320000,2017,1,1,27080610000000.0,3216102982895,1,29172015,320500,19521012,...,,1,0,0,0,0,0,0,0,0
3,320000,2017,1,1,27080610000000.0,3216102982906,1,29164792,320500,19591216,...,,2,0,0,0,0,0,0,0,0
4,320000,2017,1,1,27080610000000.0,3216102982917,1,29161549,320500,19580811,...,,2,0,0,0,0,0,0,0,0


In [11]:
# Number of stacked records per MORTE value.
df_trauma['MORTE'].value_counts()

0    41615
1      717
Name: MORTE, dtype: int64

In [10]:

# Persist the stacked trauma records as one CSV file; the row index is omitted.
df_trauma.to_csv(
    '../../../../datasets/datasus/sihsus/data-trauma-raw-csv/rdes-1701-1903.csv',
    index=False,
)

In [12]:
# Reload the combined trauma file written by the previous step.
# low_memory=False makes pandas infer every column's dtype from the whole file
# instead of chunk by chunk, so a column holding both numbers and text is not
# left with mixed Python types.
# NOTE(review): the recorded output of the plain read shows the tail of a
# warning — presumably pandas' DtypeWarning about mixed-type columns; confirm.
df_trauma_csv = pd.read_csv('../../../../datasets/datasus/sihsus/data-trauma-raw-csv/rdes-1701-1903.csv',
                            low_memory=False)
df_trauma_csv.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,...,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,320000,2017,1,1,27080610000000.0,3216102907666,1,29173076,320500,19840123,...,,1,0,0,0,0,0,0,0,0
1,320000,2017,1,1,27080610000000.0,3216102907754,1,29185000,320220,19910908,...,,2,0,0,0,0,0,0,0,0
2,320000,2017,1,1,27080610000000.0,3216102982895,1,29172015,320500,19521012,...,,1,0,0,0,0,0,0,0,0
3,320000,2017,1,1,27080610000000.0,3216102982906,1,29164792,320500,19591216,...,,2,0,0,0,0,0,0,0,0
4,320000,2017,1,1,27080610000000.0,3216102982917,1,29161549,320500,19580811,...,,2,0,0,0,0,0,0,0,0


In [13]:
# Number of reloaded records per MORTE value (to compare with the in-memory frame).
df_trauma_csv['MORTE'].value_counts()

0    41615
1      717
Name: MORTE, dtype: int64

In [13]:
def _convert_to(x):
    """Convert ``x`` to a numeric type, downcast to the smallest float dtype that holds it.

    ``x`` may be a scalar, a list/array or a Series, as accepted by
    ``pd.to_numeric``. Values that cannot be parsed raise ``ValueError``
    (``pd.to_numeric``'s default ``errors='raise'``).
    """
    # The value must be passed explicitly: the previous body called
    # pd.to_numeric(downcast='float') without it and always raised TypeError.
    return pd.to_numeric(x, downcast='float')

# Load the Correios CEP table from the ';'-separated archive, keeping six
# columns. The first of them is named 'correios-ceplog-2017-06-utf8.txt' in the
# parsed data; it is read as int32 and renamed to CEP right after loading.
# error_bad_lines=False makes pandas skip malformed lines instead of raising.
df_correrios_ceplog = pd.read_csv(
    DATASET_PATH_BASE+'geocoordinates/geocoordinates-br-correios-ceplog-2017-06.tar.gz',
    sep=';',
    error_bad_lines=False,
    dtype={'correios-ceplog-2017-06-utf8.txt': 'int32'},
    usecols=['correios-ceplog-2017-06-utf8.txt','UF', 'Tipo_Acento', 'Bairro1_Acento','latitude', 'longitude'],
)

df_correrios_ceplog.rename(columns={'correios-ceplog-2017-06-utf8.txt':'CEP'}, inplace=True)

# Coordinates become floats downcast to the smallest float dtype; values that
# cannot be parsed turn into NaN.
for coord in ('latitude', 'longitude'):
    df_correrios_ceplog[coord] = pd.to_numeric(df_correrios_ceplog[coord], downcast='float', errors='coerce')

  interactivity=interactivity, compiler=compiler, result=result)


In [15]:
# Preview the first rows of the CEP / coordinates table.
df_correrios_ceplog.head()

Unnamed: 0,CEP,UF,Tipo_Acento,Bairro1_Acento,latitude,longitude
0,13568693,SP,TRAVESSA,RESIDENCIAL PARQUE DOURADINHO,-22.018085,-47.848595
1,13568695,SP,TRAVESSA,RESIDENCIAL PARQUE DOURADINHO,-22.017389,-47.850056
2,13568698,SP,TRAVESSA,RESIDENCIAL PARQUE DOURADINHO,-22.016893,-47.8503
3,13568700,SP,TRAVESSA,RESIDENCIAL PARQUE DOURADINHO,-22.016867,-47.850681
4,13568703,SP,TRAVESSA,RESIDENCIAL PARQUE DOURADINHO,-22.051586,-47.919071


In [14]:
# Column dtypes, non-null counts and memory footprint of the CEP table.
df_correrios_ceplog.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1021723 entries, 0 to 1021722
Data columns (total 6 columns):
CEP               1021723 non-null int32
UF                1021723 non-null object
Tipo_Acento       992639 non-null object
Bairro1_Acento    1019465 non-null object
latitude          1008893 non-null float32
longitude         1008893 non-null float32
dtypes: float32(2), int32(1), object(3)
memory usage: 35.1+ MB
