## Import Needed Libraries

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Upload Cleaned Numerical Data

**Demographic Columns**

In [2]:
%%time

demo_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_sample_demo.csv')

demo_cols = ['MUNIC_RES_CAT','SEXO_CAT','RACA_COR_CAT','ETNIA_CAT','IDADE',"MORTE_CAT"]


demo_data = pd.read_csv(demo_path, 
                        encoding = 'UTF-8', 
                        na_values= ['NaN',' ',''],
                        usecols = demo_cols)

CPU times: user 16.1 s, sys: 2 s, total: 18.1 s
Wall time: 18.3 s


In [3]:
demo_data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 6 columns):
MUNIC_RES_CAT    16462037 non-null float64
SEXO_CAT         16614830 non-null int64
RACA_COR_CAT     12299845 non-null float64
ETNIA_CAT        35012 non-null float64
IDADE            16614830 non-null int64
MORTE_CAT        16614830 non-null int64
dtypes: float64(3), int64(3)
memory usage: 760.6 MB


**Diagnosis Features**

In [3]:
%%time

diag_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_sample_diag.csv')

diag_cols = ["DIAG_PRINC_CAT","CAT_CAT","CAP_CAT","GRP_CAT"]


diag_data = pd.read_csv(diag_path, 
                        encoding = 'UTF-8', 
                        na_values= ['NaN',' ',''],
                        usecols = diag_cols)

CPU times: user 28.7 s, sys: 2.95 s, total: 31.7 s
Wall time: 32.9 s


In [5]:
diag_data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 4 columns):
DIAG_PRINC_CAT    16614830 non-null int64
CAT_CAT           16614830 non-null int64
CAP_CAT           16614830 non-null int64
GRP_CAT           16614830 non-null int64
dtypes: int64(4)
memory usage: 507.0 MB


**Hospitalization Columns**

In [4]:
%%time

hospi_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_sample_hospi.csv')

hospi_cols = ['ESPEC_CAT', 'CGC_HOSP_CAT', 'UTI_MES_TO',
              'MARCA_UTI_CAT','UTI_INT_TO','DIAR_ACOM','DIAS_PERM',
              'PROC_REA_CAT','proc_group_CAT','proc_subgroup_CAT','COBRANCA_CAT',
              'cobranca_group_CAT','IND_VDRL_CAT','CAR_INT_CAT','CONTRACEP1_CAT',
              'CONTRACEP2_CAT','COMPLEX_CAT']


hospi_data = pd.read_csv(hospi_path, 
                         encoding = 'UTF-8', 
                         na_values= ['NaN',' ',''],
                         usecols = hospi_cols)

CPU times: user 57.8 s, sys: 10.4 s, total: 1min 8s
Wall time: 1min 7s


In [6]:
hospi_data.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 17 columns):
ESPEC_CAT             16614830 non-null int64
CGC_HOSP_CAT          16614830 non-null int64
UTI_MES_TO            16614830 non-null int64
MARCA_UTI_CAT         16614830 non-null int64
UTI_INT_TO            16614830 non-null int64
DIAR_ACOM             16614830 non-null int64
DIAS_PERM             16614830 non-null int64
PROC_REA_CAT          16614830 non-null int64
proc_group_CAT        16614830 non-null int64
proc_subgroup_CAT     16614830 non-null int64
COBRANCA_CAT          16614830 non-null int64
cobranca_group_CAT    16614830 non-null int64
IND_VDRL_CAT          16614830 non-null int64
CAR_INT_CAT           16614830 non-null int64
CONTRACEP1_CAT        16614830 non-null int64
CONTRACEP2_CAT        16614830 non-null int64
COMPLEX_CAT           16614830 non-null int64
dtypes: int64(17)
memory usage: 2.1 GB


## Join Data

In [5]:
#Concatenate demographics and diagnosis data

join_1 = demo_data.join(diag_data, lsuffix='l_', rsuffix='r',  sort = False)

In [6]:
# Concatenate demographics, diagnosis and hospitalization data

join_2 = join_1.join(hospi_data, lsuffix='l_', rsuffix='r',  sort = False)

In [7]:
final_join = join_2

In [11]:
final_join.info(verbose=True, null_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 27 columns):
MUNIC_RES_CAT         16462037 non-null float64
SEXO_CAT              16614830 non-null int64
RACA_COR_CAT          12299845 non-null float64
ETNIA_CAT             35012 non-null float64
IDADE                 16614830 non-null int64
MORTE_CAT             16614830 non-null int64
DIAG_PRINC_CAT        16614830 non-null int64
CAT_CAT               16614830 non-null int64
CAP_CAT               16614830 non-null int64
GRP_CAT               16614830 non-null int64
ESPEC_CAT             16614830 non-null int64
CGC_HOSP_CAT          16614830 non-null int64
UTI_MES_TO            16614830 non-null int64
MARCA_UTI_CAT         16614830 non-null int64
UTI_INT_TO            16614830 non-null int64
DIAR_ACOM             16614830 non-null int64
DIAS_PERM             16614830 non-null int64
PROC_REA_CAT          16614830 non-null int64
proc_group_CAT        16614830 non-null int64
proc_sub

## Re-scale continuous values (0,1)

In [8]:
scaler = MinMaxScaler()

final_join[['IDADE','UTI_MES_TO','UTI_INT_TO','DIAR_ACOM','DIAS_PERM']] = scaler.fit_transform(final_join[['IDADE','UTI_MES_TO','UTI_INT_TO','DIAR_ACOM','DIAS_PERM']])

In [9]:
final_join[['IDADE','UTI_MES_TO','UTI_INT_TO','DIAR_ACOM','DIAS_PERM']].describe()

Unnamed: 0,IDADE,UTI_MES_TO,UTI_INT_TO,DIAR_ACOM,DIAS_PERM
count,16614830.0,16614830.0,16614830.0,16614830.0,16614830.0
mean,0.4016778,0.001527763,0.0002167705,0.005723171,0.01481481
std,0.243914,0.0095804,0.004061263,0.01289579,0.02204041
min,0.0,0.0,0.0,0.0,0.0
25%,0.2121212,0.0,0.0,0.0,0.005494505
50%,0.3636364,0.0,0.0,0.0,0.008241758
75%,0.5959596,0.0,0.0,0.005882353,0.01648352
max,1.0,1.0,1.0,1.0,1.0


## Output as CSV

Output cleaned concanetaned file as CSV.

In [15]:
# Output Concatenated to CSV

final_join.to_csv('AIH_clean_concantenated.csv', index = False,
                  na_rep= 'NaN', encoding='utf-8', chunksize = 50000)

## Prepare Data for DNN - With Principal Diagnosis

This section will prepare the training, validation and test data for the neural network. The data will be outputted as Numpy arrays to decrease computational cost in the GPU.

**Load Concatenated and Cleaned Data**

In [4]:
#Load Cleaned Concatenated Data

data_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_clean_concantenated.csv')

cols = ['SEXO_CAT','IDADE','MORTE_CAT','UTI_MES_TO','UTI_INT_TO','DIAR_ACOM',
        'DIAS_PERM','ESPEC_CAT','DIAG_PRINC_CAT','proc_subgroup_CAT',
        'COBRANCA_CAT','IND_VDRL_CAT','CAR_INT_CAT','COMPLEX_CAT','MARCA_UTI_CAT']

data = pd.read_csv(data_path, 
                   encoding = 'UTF-8', 
                   na_values= ['NaN',' ',''],
                   usecols = cols)

**Create Feature Arrays**

In [5]:
num = data[['SEXO_CAT','MORTE_CAT','IDADE','UTI_MES_TO','IND_VDRL_CAT',
              'UTI_INT_TO','DIAR_ACOM','DIAS_PERM','ESPEC_CAT']].values


cat = data[['DIAG_PRINC_CAT','ESPEC_CAT','COBRANCA_CAT','CAR_INT_CAT','COMPLEX_CAT','MARCA_UTI_CAT']].values

out = pd.get_dummies(data.proc_subgroup_CAT, prefix = 'proc').values

In [6]:
# Size of Unique Categorical Values (i.e. size of vocab)

dim = data['DIAG_PRINC_CAT'].nunique() + data['ESPEC_CAT'].nunique() + data['COBRANCA_CAT'].nunique() + data['COMPLEX_CAT'].nunique() + data['MARCA_UTI_CAT'].nunique() + data['CAR_INT_CAT'].nunique()

dim

9085

**Create Training, Validation and Test Splits**

In [7]:
# First split into training and validation

num_train, num_valid, cat_train, cat_valid, out_train, out_valid = train_test_split(num, cat, out, 
                                                                                    test_size=.12, train_size = 0.88, 
                                                                                    random_state = 42, shuffle = True)

In [8]:
# Breakdown further into valid and testing

num_valid, num_test, cat_valid, cat_test, out_valid, out_test = train_test_split(num_valid, cat_valid, out_valid, 
                                                                                 test_size=.16, train_size = 0.84, 
                                                                                 random_state = 42, shuffle = True)

In [9]:
print('num_input shape |', 'Train:',num_train.shape,'Valid:',num_valid.shape,'Test:',num_test.shape)

print('cat_input shape |', 'Train:',cat_train.shape,'Valid:',cat_valid.shape,'Test:',cat_test.shape)

print('out_input shape |', 'Train:',out_train.shape,'Valid:',out_valid.shape,'Test:',out_test.shape)

num_input shape | Train: (14621050, 9) Valid: (1674775, 9) Test: (319005, 9)
cat_input shape | Train: (14621050, 6) Valid: (1674775, 6) Test: (319005, 6)
out_input shape | Train: (14621050, 38) Valid: (1674775, 38) Test: (319005, 38)


## Save as Numpy Arrays

Training Data

In [12]:
np.save('num_train.npy', num_train)

In [13]:
np.save('cat_train.npy', cat_train)

In [14]:
np.save('out_train.npy', out_train)

Validation Data

In [15]:
np.save('num_valid.npy', num_valid)

In [16]:
np.save('cat_valid.npy', cat_valid)

In [17]:
np.save('out_valid.npy', out_valid)

Testing Data

In [18]:
np.save('num_test.npy', num_test)

In [19]:
np.save('cat_test.npy', cat_test)

In [20]:
np.save('out_test.npy', out_test)

## Prepare Data for DNN - With Principal Diagnosis Groups

This section will prepare the training, validation and test data for the neural network. The data will be outputted as Numpy arrays to decrease computational cost in the GPU.

**Load Concatenated and Cleaned Data**

In [21]:
#Load Cleaned Concatenated Data

data_path = ('/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_clean_concantenated.csv')

cols = ['SEXO_CAT','IDADE','MORTE_CAT','UTI_MES_TO','UTI_INT_TO','DIAR_ACOM',
        'DIAS_PERM','ESPEC_CAT','GRP_CAT','proc_subgroup_CAT',
        'COBRANCA_CAT','IND_VDRL_CAT','CAR_INT_CAT','COMPLEX_CAT','MARCA_UTI_CAT']

data = pd.read_csv(data_path, 
                   encoding = 'UTF-8', 
                   na_values= ['NaN',' ',''],
                   usecols = cols)

**Create Feature Arrays**

In [22]:
num = data[['SEXO_CAT','MORTE_CAT','IDADE','UTI_MES_TO','IND_VDRL_CAT',
              'UTI_INT_TO','DIAR_ACOM','DIAS_PERM','ESPEC_CAT']].values


cat = data[['GRP_CAT','ESPEC_CAT','COBRANCA_CAT','CAR_INT_CAT','COMPLEX_CAT','MARCA_UTI_CAT']].values

out = pd.get_dummies(data.proc_subgroup_CAT, prefix = 'proc').values

In [23]:
# Size of Unique Categorical Values (i.e. size of vocab)

dim = data['GRP_CAT'].nunique() + data['ESPEC_CAT'].nunique() + data['COBRANCA_CAT'].nunique() + data['COMPLEX_CAT'].nunique() + data['MARCA_UTI_CAT'].nunique() + data['CAR_INT_CAT'].nunique()

dim

331

**Create Training, Validation and Test Splits**

In [30]:
# First split into training and validation

num_train_grp, num_valid_grp, cat_train_grp, cat_valid_grp, out_train_grp, out_valid_grp = train_test_split(num, cat, out, 
                                                                                    test_size=.12, train_size = 0.88, 
                                                                                    random_state = 42, shuffle = True)

In [31]:
# Breakdown further into valid and testing

num_valid_grp, num_test_grp, cat_valid_grp, cat_test_grp, out_valid_grp, out_test_grp = train_test_split(num_valid, cat_valid, out_valid, 
                                                                                 test_size=.16, train_size = 0.84, 
                                                                                 random_state = 42, shuffle = True)

In [33]:
print('num_input shape |', 'Train:',num_train_grp.shape,'Valid:',num_valid_grp.shape,'Test:',num_test_grp.shape)

print('cat_input shape |', 'Train:',cat_train_grp.shape,'Valid:',cat_valid_grp.shape,'Test:',cat_test_grp.shape)

print('out_input shape |', 'Train:',out_train_grp.shape,'Valid:',out_valid_grp.shape,'Test:',out_test_grp.shape)

num_input shape | Train: (14621050, 9) Valid: (1406811, 9) Test: (267964, 9)
cat_input shape | Train: (14621050, 6) Valid: (1406811, 6) Test: (267964, 6)
out_input shape | Train: (14621050, 38) Valid: (1406811, 38) Test: (267964, 38)


## Save as Numpy Arrays

Training Data

In [34]:
np.save('num_train_grp.npy', num_train_grp)

In [35]:
np.save('cat_train_grp.npy', cat_train_grp)

In [36]:
np.save('out_train_grp.npy', out_train_grp)

Validation Data

In [37]:
np.save('num_valid_grp.npy', num_valid_grp)

In [38]:
np.save('cat_valid_grp.npy', cat_valid_grp)

In [39]:
np.save('out_valid_grp.npy', out_valid_grp)

Testing Data

In [40]:
np.save('num_test_grp.npy', num_test_grp)

In [41]:
np.save('cat_test_grp.npy', cat_test_grp)

In [42]:
np.save('out_test_grp.npy', out_test_grp)