# Desafio GSK - Asma: KDD - Etapa de Seleção

## Dependências e Inicialização

In [45]:
%matplotlib inline
import os
from glob import glob
import pandas as pd
import pandas_profiling
import matplotlib.pyplot as plt
import seaborn as sns
from pandas.plotting import register_matplotlib_converters
from IPython.display import display, display_png

# Register pandas' date/time converters with matplotlib (imported above from
# pandas.plotting).
register_matplotlib_converters()


# Flat (option, value, ...) sequence meant to be unpacked into
# pd.option_context(); None removes the row and column display limits.
CONTEXT_MAX_ROWS_COLS = ('display.max_rows', None, 'display.max_columns', None)
# Prefix prepended to the glob patterns used to locate dataset files; it is
# relative, so it resolves against the notebook's working directory.
DATASET_PATH_BASE = '../../../../datasets/'

## Funções

In [63]:
def date_parser_dmy(x):
    """Parse day-month-year values written without separators (``ddmmYYYY``).

    ``x`` is handed to ``pd.to_datetime`` with the fixed format ``'%d%m%Y'``.
    Values that do not match the format raise instead of being coerced, and
    repeated values are parsed once thanks to ``cache=True``.
    """
    to_datetime_options = {
        'format': '%d%m%Y',
        'errors': 'raise',
        'cache': True,
    }
    return pd.to_datetime(x, **to_datetime_options)


def load_resource_df_pf(resource_path, dtype, parse_dates,
                        date_parser=date_parser_dmy,
                        format='csv', encoding='utf-8'):
    """Load a delimited text resource and build its pandas-profiling report.

    Parameters
    ----------
    resource_path : str
        Path (or anything else ``pd.read_csv`` accepts) of the file to read.
    dtype : dict or type or None
        Column dtypes forwarded to ``pd.read_csv``.
    parse_dates : list, dict, bool or None
        Date-column specification forwarded to ``pd.read_csv``.
    date_parser : callable, optional
        Converter applied to the columns selected by ``parse_dates``.
    format : {'csv', 'tsv'}, optional
        ``'csv'`` reads comma-separated data, ``'tsv'`` tab-separated data.
    encoding : str, optional
        Text encoding of the file.

    Returns
    -------
    tuple
        ``(df, pf)``: the DataFrame with its columns sorted by name, and the
        ``pandas_profiling.ProfileReport`` built from it.

    Raises
    ------
    NotImplementedError
        If ``format`` is neither ``'csv'`` nor ``'tsv'``.
    """
    separators = {'csv': ',', 'tsv': '\t'}
    if format not in separators:
        raise NotImplementedError('unsupported format: %r' % (format,))

    # Both formats go through the same reader call so that the delimiter,
    # dtype and encoding are honoured consistently. Previously the 'tsv'
    # branch omitted sep='\t' and the 'csv' branch ignored `encoding`.
    read_options = {
        'sep': separators[format],
        'dtype': dtype,
        'encoding': encoding,
    }
    if parse_dates:
        # The parser is only relevant when date parsing is requested; leaving
        # it out otherwise avoids the deprecation warning that recent pandas
        # versions emit whenever `date_parser` is passed.
        read_options['parse_dates'] = parse_dates
        read_options['date_parser'] = date_parser

    df = pd.read_csv(resource_path, **read_options)

    # Alphabetical column order keeps the frame and its report easy to scan.
    df = df.reindex(sorted(df.columns), axis=1)
    pf = pandas_profiling.ProfileReport(df)

    return df, pf


def get_files_path(p):
    """Return, in sorted order, the paths matching glob pattern ``p``.

    The pattern is appended to ``DATASET_PATH_BASE`` before globbing.
    """
    matches = glob(DATASET_PATH_BASE + p)
    matches.sort()
    return matches


def file_base_name(file_name):
    """Return the part of ``file_name`` that precedes its first dot.

    Every extension is dropped at once (``'a.tsv.gz'`` gives ``'a'``); a name
    without any dot is returned unchanged.
    """
    stem, _dot, _extensions = file_name.partition('.')
    return stem


def path_base_name(path):
    """Return the extension-less file name found at the end of ``path``.

    The directory part is removed with ``os.path.basename`` and the remaining
    name is reduced by ``file_base_name``.
    """
    return file_base_name(os.path.basename(path))


def get_resources(path_pattern, prefix_pattern='%s'):
    """Map a generated key to each file matching ``path_pattern``.

    A key is the file's lowercased base name (directories and extensions
    removed) substituted into ``prefix_pattern`` with ``%`` formatting; the
    value is the matched path. Files are visited in the order returned by
    ``get_files_path``, so a later file overwrites an earlier one that yields
    the same key.
    """
    return {
        prefix_pattern % path_base_name(matched_path).lower(): matched_path
        for matched_path in get_files_path(path_pattern)
    }

# Conjuntos de Dados

**[SIHSUS - Sistema de Informações Hospitalares do SUS](http://datasus.saude.gov.br/sistemas-e-aplicativos/hospitalares/sihsus)**

A finalidade do AIH (Sistema SIHSUS) é registrar todos os atendimentos provenientes de internações hospitalares que foram financiadas pelo SUS, e a partir deste processamento, gerar relatórios para que os gestores possam fazer os pagamentos dos estabelecimentos de saúde. Além disso, o nível Federal recebe mensalmente uma base de dados de todas as internações autorizadas (aprovadas ou não para pagamento) para que possam ser repassados às Secretarias de Saúde os valores de Produção de Média e Alta complexidade, além dos valores de [CNRAC](http://datasus.saude.gov.br/sistemas-e-aplicativos/regulacao/cnrac-central-nacional-de-regulacao-de-alta-complexidade), FAEC e de Hospitais Universitários – em suas variadas formas de contrato de gestão.

| Acrônimo | Título |
|----------|--------|
| RD | <abbr title="Autorização de Internação Hospitalar">AIH</abbr> Reduzida |
| RJ | <abbr title="Autorização de Internação Hospitalar">AIH</abbr> Rejeitadas |
| SP | Serviços Profissionais |
| ER | <abbr title="Autorização de Internação Hospitalar">AIH</abbr> Rejeitadas com código de erro |

In [59]:
# Collect the gzipped TSV files whose name matches '*ES*' into a dict keyed as
# 'sih-<lowercased base name>-raw-tsv' and valued with the file path.
datasus_sih_es_resources = get_resources('datasus/sihsus/data-raw-tsv/*ES*.tsv.gz', 'sih-%s-raw-tsv')

# IPython magic: persist the dict so it can be restored later with `%store -r`.
%store datasus_sih_es_resources
datasus_sih_es_resources

Stored 'datasus_sih_es_resources' (dict)


{'sih-rdes0802-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0802.tsv.gz',
 'sih-rdes0803-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0803.tsv.gz',
 'sih-rdes0804-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0804.tsv.gz',
 'sih-rdes0805-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0805.tsv.gz',
 'sih-rdes0806-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0806.tsv.gz',
 'sih-rdes0807-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0807.tsv.gz',
 'sih-rdes0808-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0808.tsv.gz',
 'sih-rdes0809-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0809.tsv.gz',
 'sih-rdes0810-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0810.tsv.gz',
 'sih-rdes0811-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw-tsv/RDES0811.tsv.gz',
 'sih-rdes0812-raw-tsv': '../../../../datasets/datasus/sihsus/data-raw

# SIH RDES

In [69]:
# Earlier attempt that also produced a profiling report through
# load_resource_df_pf; kept commented out for reference.
# df_sih_rdes1903_raw, pf_sih_rdes1903_raw = load_resource_df_pf(datasus_sih_es_resources['sih-rdes1903-raw-tsv'],
#                                                                None,
#                                                                None,
#                                                                format='tsv',
#                                                                encoding='windows-1252')
# The file is read directly instead, stating the tab separator and the
# windows-1252 encoding explicitly.
df_sih_rdes1903_raw = pd.read_csv(datasus_sih_es_resources['sih-rdes1903-raw-tsv'], 
                                  sep='\t',
                                  encoding='windows-1252')

# Lift the row/column display limits only for this preview of the first rows.
with pd.option_context(*CONTEXT_MAX_ROWS_COLS):
    display(df_sih_rdes1903_raw.head())

Unnamed: 0.1,Unnamed: 0,UF_ZI,ANO_CMPT,MES_CMPT,ESPEC,CGC_HOSP,N_AIH,IDENT,CEP,MUNIC_RES,NASC,SEXO,UTI_MES_IN,UTI_MES_AN,UTI_MES_AL,UTI_MES_TO,MARCA_UTI,UTI_INT_IN,UTI_INT_AN,UTI_INT_AL,UTI_INT_TO,DIAR_ACOM,QT_DIARIAS,PROC_SOLIC,PROC_REA,VAL_SH,VAL_SP,VAL_SADT,VAL_RN,VAL_ACOMP,VAL_ORTP,VAL_SANGUE,VAL_SADTSR,VAL_TRANSP,VAL_OBSANG,VAL_PED1AC,VAL_TOT,VAL_UTI,US_TOT,DT_INTER,DT_SAIDA,DIAG_PRINC,DIAG_SECUN,COBRANCA,NATUREZA,NAT_JUR,GESTAO,RUBRICA,IND_VDRL,MUNIC_MOV,COD_IDADE,IDADE,DIAS_PERM,MORTE,NACIONAL,NUM_PROC,CAR_INT,TOT_PT_SP,CPF_AUT,HOMONIMO,NUM_FILHOS,INSTRU,CID_NOTIF,CONTRACEP1,CONTRACEP2,GESTRISCO,INSC_PN,SEQ_AIH5,CBOR,CNAER,VINCPREV,GESTOR_COD,GESTOR_TP,GESTOR_CPF,GESTOR_DT,CNES,CNPJ_MANT,INFEHOSP,CID_ASSO,CID_MORTE,COMPLEX,FINANC,FAEC_TP,REGCT,RACA_COR,ETNIA,SEQUENCIA,REMESSA,AUD_JUST,SIS_JUST,VAL_SH_FED,VAL_SP_FED,VAL_SH_GES,VAL_SP_GES,VAL_UCI,MARCA_UCI,DIAGSEC1,DIAGSEC2,DIAGSEC3,DIAGSEC4,DIAGSEC5,DIAGSEC6,DIAGSEC7,DIAGSEC8,DIAGSEC9,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,1,320000,2019,3,3,27080610000000.0,3219101523462,1,29171518,320500,19820724,3,0,0,0,0,0,0,0,0,0,0,11,303070129,303070129,244.62,45.82,0,0,0,0,0,0,0,0,0,290.44,0.0,73.71,20190108,20190118,K805,0,28,0,1023,2,0,0,320500,4,36,10,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080610000000.0,,0,0,2,6,,0,1,0,10147,HE32000001N201903.DTS,,,0,0,0,0,0,0,K851,,,,,,,,,2,0,0,0,0,0,0,0,0
1,2,320000,2019,3,3,27080610000000.0,3219101523473,1,29172766,320500,19600118,1,0,0,0,0,0,0,0,0,0,0,7,303010037,303010037,1206.43,72.22,0,0,0,0,0,0,0,0,0,1278.65,0.0,324.53,20190109,20190115,A499,0,43,0,1023,2,0,0,320500,4,58,6,1,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080610000000.0,,0,0,2,6,,0,3,0,10148,HE32000001N201903.DTS,,,0,0,0,0,0,0,A418,,,,,,,,,2,0,0,0,0,0,0,0,0
2,3,320000,2019,3,3,27080610000000.0,3219101523484,1,29143821,320130,19660216,1,0,0,0,6,75,0,0,0,0,0,2,303010037,303010037,3957.42,479.02,0,0,0,0,0,0,0,0,0,4436.44,2872.32,1126.0,20190109,20190116,A418,0,28,0,1023,2,0,0,320500,4,52,7,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080610000000.0,,0,0,2,6,,0,3,0,10149,HE32000001N201903.DTS,,,0,0,0,0,0,0,,,,,,,,,,0,0,0,0,0,0,0,0,0
3,4,320000,2019,3,3,27080610000000.0,3219101523495,1,29176776,320500,19851115,3,0,0,0,4,75,0,0,0,0,0,10,303030046,303030046,2405.32,310.18,0,0,0,0,0,0,0,0,0,2715.5,1914.88,689.21,20190108,20190122,E889,0,12,0,1023,2,0,0,320500,4,33,14,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,7257406,27080610000000.0,,0,0,2,6,,0,3,0,10150,HE32000001N201903.DTS,,,0,0,0,0,0,0,,,,,,,,,,0,0,0,0,0,0,0,0,0
4,5,320000,2019,3,1,27192590000000.0,3219102765406,1,29330000,320280,20111116,1,0,0,0,0,0,0,0,0,0,1,1,413040240,408020342,117.69,82.91,0,0,0,0,0,0,0,0,0,200.6,0.0,50.91,20190219,20190220,S618,0,12,0,3999,2,0,0,320120,4,7,1,0,10,,2,0,,0,0,0,,0,0,1,0,0,0,0,0,0,0,0,,2485729,,,0,0,2,6,,0,1,0,10256,HE32000001N201903.DTS,,,0,0,0,0,0,0,W238,,,,,,,,,1,0,0,0,0,0,0,0,0


In [80]:
# The nine DIAGSEC1..DIAGSEC9 columns of the RD frame.
cols_sih_rd_diagnostico = ['DIAGSEC1', 'DIAGSEC2', 'DIAGSEC3', 'DIAGSEC4',
                           'DIAGSEC5', 'DIAGSEC6', 'DIAGSEC7', 'DIAGSEC8', 'DIAGSEC9']

# Count the occurrences of each value per column. The result is indexed by
# value, with NaN where a value never appears in that column.
# pd.Series.value_counts replaces the top-level pd.value_counts, which is
# deprecated in recent pandas releases; the counts are the same.
df_sih_rd_cols_diagnostico_vc = (
    df_sih_rdes1903_raw[cols_sih_rd_diagnostico].apply(pd.Series.value_counts)
)

# Show the complete table, without pandas' row/column truncation.
with pd.option_context(*CONTEXT_MAX_ROWS_COLS):
    display(df_sih_rd_cols_diagnostico_vc)

Unnamed: 0,DIAGSEC1,DIAGSEC2,DIAGSEC3,DIAGSEC4,DIAGSEC5,DIAGSEC6,DIAGSEC7,DIAGSEC8,DIAGSEC9
A049,1.0,,,,,,,,
A09,3.0,1.0,,,,,,,
A154,1.0,,,,,,,,
A279,1.0,,,,,,,,
A418,13.0,,,,,,,,
A419,24.0,1.0,1.0,,,,,,
A46,4.0,,,,,,,,
A488,8.0,,,,,,,,
A491,1.0,1.0,,,,,,,
A498,1.0,,,,,,,,


In [82]:
# The nine TPDISEC1..TPDISEC9 columns of the RD frame.
cols_sih_rd_tp_diagnostico = ['TPDISEC1', 'TPDISEC2', 'TPDISEC3', 'TPDISEC4',
                              'TPDISEC5', 'TPDISEC6', 'TPDISEC7', 'TPDISEC8', 'TPDISEC9']

# Count the occurrences of each value per column. The result is indexed by
# value, with NaN where a value never appears in that column.
# pd.Series.value_counts replaces the top-level pd.value_counts, which is
# deprecated in recent pandas releases; the counts are the same.
df_sih_rd_cols_tp_diagnostico_vc = (
    df_sih_rdes1903_raw[cols_sih_rd_tp_diagnostico].apply(pd.Series.value_counts)
)

# Show the complete table, without pandas' row/column truncation.
with pd.option_context(*CONTEXT_MAX_ROWS_COLS):
    display(df_sih_rd_cols_tp_diagnostico_vc)

Unnamed: 0,TPDISEC1,TPDISEC2,TPDISEC3,TPDISEC4,TPDISEC5,TPDISEC6,TPDISEC7,TPDISEC8,TPDISEC9
0,15468,18031,18124,18139.0,18139.0,18139.0,18139.0,18139.0,18139.0
1,1674,65,14,,,,,,
2,997,43,1,,,,,,
