# Exploratory Data Analysis

### Import Libraries

In [1]:
import pandas as pd
import seaborn as sns
import matplotlib as plt
import numpy as np

### Authorization for Hospital Admission Data

The AIH datasets contains data on hospital production and services in Brazil. The data that will be used here is the Authorization for Hospital Admission. This dataset is part of `Brazil’s SIHSUS Hospital Information System`. This system manages the coordination and payment by Brazil’s public healthcare system (covers around 34% of Brazil’s population). In this application, I will be using data from 2015 – 2018. This represents 3.5 years’ of information.

A record in the AIH database is created when a hospital or healthcare unit generates a request for hospitalization. Providers submit demographic and health information about the patient. This request is ultimately approved or rejected. While the patient is in the hospital, the record is updated to also contain information about procedures performed and discharge. 

More information about this data can be found below: 

* [DataSUS Website](http://datasus.saude.gov.br/informacoes-de-saude)
* [AIH Data Fields](https://github.com/IvetteMTapia/Capstone-2_Deep_Learning/blob/master/IT_SIHSUS_1603_DataDict.pdf)

### Upload Data

Upload random sample created from the AIH 2015 - 2018 files.

In [3]:
%%time

AIH_sample_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/Random Sample File/AIH_random_sample_full.csv')

AIH_sample = pd.read_csv(AIH_sample_path, 
                         encoding = 'UTF-8', 
                         na_values= ['NaN',' ',''], 
                         low_memory=True)



CPU times: user 1min 55s, sys: 27.5 s, total: 2min 22s
Wall time: 2min 28s


In [4]:
AIH_sample.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8329642 entries, 0 to 8329641
Data columns (total 73 columns):
UF_ZI         8329642 non-null int64
ANO_CMPT      8329642 non-null int64
MES_CMPT      8329642 non-null int64
ESPEC         8329642 non-null int64
CGC_HOSP      6282752 non-null float64
N_AIH         8329642 non-null int64
IDENT         8329642 non-null int64
CEP           8329642 non-null int64
MUNIC_RES     8329642 non-null int64
NASC          8329642 non-null int64
SEXO          8329642 non-null int64
UTI_MES_TO    8329642 non-null int64
MARCA_UTI     8329642 non-null int64
UTI_INT_IN    8329642 non-null int64
UTI_INT_AN    8329642 non-null int64
UTI_INT_AL    8329642 non-null int64
UTI_INT_TO    8329642 non-null int64
DIAR_ACOM     8329642 non-null int64
QT_DIARIAS    8329642 non-null int64
PROC_SOLIC    8329642 non-null int64
PROC_REA      8329642 non-null int64
VAL_SH        8329642 non-null float64
VAL_SP        8329642 non-null float64
VAL_TOT       8329642 non-null

### Create column blocks

In [5]:
ids = ['UF_ZI','ANO_CMPT','MES_CMPT','CGC_HOSP','N_AIH'
       'IDENT','CEP','NAT_JUR','NATUREZA','DESTAO',
       'MUNIC_MOV','CID_NOTIF','SEQ_AIH5',
       'CNAER','VINCPREV','GESTOR_COD','GESTOR_TP',
       'GESTOR_CPF','GESTOR_DT','CNES','CNPJ_MANT',
       'SEQUENCIA','HOMONIMO','GESTAO']

demo = ['MUNIC_RES','NASC','SEXO','COD_IDADE',
        'IDADE','MORTE','NACIONAL','NUM_FILHOS',
        'INSTRU','GESTRISCO','CBOR',
        'RACA_COR','ETNIA']

diag = ['DIAG_PRINC','DIAG_SECUN','COMPLEX']

serv = ['ESPEC', 'MARCA_UTI','PROC_SOLIC',
        'PROC_REA','IND_VDRL','CONTRACEP1',
        'CONTRACEP2','MARCA_UCI']

hospi = ['UTI_MES_TO','UTI_INT_TO','DIAR_ACOM',
         'QT_DIARIAS','DI_INTER','DT_INTER','DT_SAIDA','COBRANCA',
         'DIAS_PERM','CAR_INT','INSC_PN','INFEHOSP',
         'CID_ASSO','CID_MORTE']

finan = ['VAL_SH','VAL_SP','VAL_TOT','VAL_UTI','US_TOT',
         'FINANC','FAEC_TP','REGCT','REMESSA','AUD_JUST',
         'SIS_JUST','VAL_SH_FED','VAL_SP_FED','VAL_SH_GES',
         'VAL_SP_GES','VAL_UCI']

In [19]:
cols = AIH_sample.columns.values.tolist()

print(len(cols))

cols

73


['UF_ZI',
 'ANO_CMPT',
 'MES_CMPT',
 'ESPEC',
 'CGC_HOSP',
 'N_AIH',
 'IDENT',
 'CEP',
 'MUNIC_RES',
 'NASC',
 'SEXO',
 'UTI_MES_TO',
 'MARCA_UTI',
 'UTI_INT_IN',
 'UTI_INT_AN',
 'UTI_INT_AL',
 'UTI_INT_TO',
 'DIAR_ACOM',
 'QT_DIARIAS',
 'PROC_SOLIC',
 'PROC_REA',
 'VAL_SH',
 'VAL_SP',
 'VAL_TOT',
 'VAL_UTI',
 'US_TOT',
 'DT_INTER',
 'DT_SAIDA',
 'DIAG_PRINC',
 'DIAG_SECUN',
 'COBRANCA',
 'NATUREZA',
 'NAT_JUR',
 'GESTAO',
 'RUBRICA',
 'IND_VDRL',
 'MUNIC_MOV',
 'COD_IDADE',
 'IDADE',
 'DIAS_PERM',
 'MORTE',
 'NACIONAL',
 'CAR_INT',
 'HOMONIMO',
 'NUM_FILHOS',
 'INSTRU',
 'CONTRACEP1',
 'CONTRACEP2',
 'GESTRISCO',
 'INSC_PN',
 'SEQ_AIH5',
 'CBOR',
 'CNAER',
 'VINCPREV',
 'GESTOR_COD',
 'GESTOR_TP',
 'GESTOR_CPF',
 'CNES',
 'CID_ASSO',
 'CID_MORTE',
 'COMPLEX',
 'FINANC',
 'REGCT',
 'RACA_COR',
 'ETNIA',
 'SEQUENCIA',
 'REMESSA',
 'VAL_SH_FED',
 'VAL_SP_FED',
 'VAL_SH_GES',
 'VAL_SP_GES',
 'VAL_UCI',
 'MARCA_UCI']

In [24]:
list(set(cols) - set(ids) - set(demo) - set(diag) - set(serv) - set(hospi) - set(finan))

['RUBRICA',
 'DT_INTER',
 'N_AIH',
 'UTI_INT_IN',
 'UTI_INT_AN',
 'UTI_INT_AL',
 'GESTAO']

In [13]:
# Sanity check. Make sure I have all the columns accounted for.

col_check = len(ids) + len(demo) + len(diag) + len(serv) + len(hospi) + len(finan)

col_check

76

### Create Dictionary of Variable Definitions for Reference

In [9]:
var_spread_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/References/IT_SIHSUS_1603_DataDict.xlsx')

var_df = pd.read_excel(var_spread_path, index_col = 'Field_Name')

var_def_dict = var_df.to_dict(orient = 'index')

var_def_dict

{'UF_ZI': {'Type of Field': 'char(6)', 'Description': 'Municipality Manager'},
 'ANO_CMPT': {'Type of Field': 'char(4)',
  'Description': 'Year of AIH processing, in yyyy format.'},
 'MÊS_CMPT': {'Type of Field': 'char(2)',
  'Description': 'Month of AIH processing, in mm format.'},
 'ESPEC': {'Type of Field': 'char(2)', 'Description': 'Specialty of Bed'},
 'CGC_HOSP': {'Type of Field': 'char(14)',
  'Description': 'CNPJ of the Establishment'},
 'N_AIH': {'Type of Field': 'char(13)', 'Description': 'Number of AIH'},
 'IDENT': {'Type of Field': 'char(1)',
  'Description': 'Identification of the type of AIH'},
 'CEP': {'Type of Field': 'char(8)', 'Description': 'CEP of the patient'},
 'MUNIC_RES': {'Type of Field': 'char(6)',
  'Description': "Municipality of Patient's Residence"},
 'NASC': {'Type of Field': 'char(8)',
  'Description': 'Date of birth of the patient (yyyammdd)'},
 'SEXO': {'Type of Field': 'char(1)', 'Description': 'Sex of patient'},
 'UTI_MES_IN': {'Type of Field': 'nume

### Demographics Bloc 

> Create demographics dataframe

+ Descriptives
+ Histogram/Grids
+ CDF
+ Unique Values
+ Heatmaps