## Import Needed Libraries

In [12]:
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MinMaxScaler

## Load Cleaned Numerical Data

**Demographic Columns**

In [2]:
%%time

# Location of the demographic extract.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
demo_path = '/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_sample_demo.csv'

# Only these columns are parsed from the CSV.
demo_cols = [
    'MUNIC_RES_CAT',
    'SEXO_CAT',
    'RACA_COR_CAT',
    'ETNIA_CAT',
    'IDADE',
    'MORTE_CAT',
]

# Read the selected columns; 'NaN', a single space and the empty string are
# all interpreted as missing values.
demo_data = pd.read_csv(
    demo_path,
    usecols=demo_cols,
    na_values=['NaN', ' ', ''],
    encoding='UTF-8',
)

CPU times: user 15.5 s, sys: 1.76 s, total: 17.3 s
Wall time: 17.2 s


In [3]:
# Print dtype and non-null count for every column.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0 (where it raises a TypeError).
demo_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 6 columns):
MUNIC_RES_CAT    16462037 non-null float64
SEXO_CAT         16614830 non-null int64
RACA_COR_CAT     12299845 non-null float64
ETNIA_CAT        35012 non-null float64
IDADE            16614830 non-null int64
MORTE_CAT        16614830 non-null int64
dtypes: float64(3), int64(3)
memory usage: 760.6 MB


**Diagnosis Features**

In [4]:
%%time

# Location of the diagnosis extract.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
diag_path = '/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_sample_diag.csv'

# Only these columns are parsed from the CSV.
diag_cols = [
    'DIAG_PRINC_CAT',
    'CAT_CAT',
    'CAP_CAT',
    'GRP_CAT',
]

# Read the selected columns; 'NaN', a single space and the empty string are
# all interpreted as missing values.
diag_data = pd.read_csv(
    diag_path,
    usecols=diag_cols,
    na_values=['NaN', ' ', ''],
    encoding='UTF-8',
)

CPU times: user 29.1 s, sys: 3.62 s, total: 32.7 s
Wall time: 34 s


In [5]:
# Print dtype and non-null count for every column.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0 (where it raises a TypeError).
diag_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 4 columns):
DIAG_PRINC_CAT    16614830 non-null int64
CAT_CAT           16614830 non-null int64
CAP_CAT           16614830 non-null int64
GRP_CAT           16614830 non-null int64
dtypes: int64(4)
memory usage: 507.0 MB


**Hospitalization Columns**

In [6]:
%%time

# Location of the hospitalization extract.
# NOTE(review): absolute, machine-specific path — consider a configurable data directory.
hospi_path = '/Users/ivettetapia 1/Symbolic Link Seagate Drive/Springboard/Capstone 2_Deep_Learning/Data/AIH_sample_hospi.csv'

# Only these columns are parsed from the CSV.
hospi_cols = [
    'ESPEC_CAT',
    'CGC_HOSP_CAT',
    'UTI_MES_TO',
    'MARCA_UTI_CAT',
    'UTI_INT_TO',
    'DIAR_ACOM',
    'DIAS_PERM',
    'PROC_REA_CAT',
    'proc_group_CAT',
    'COBRANCA_CAT',
    'cobranca_group_CAT',
    'IND_VDRL_CAT',
    'CAR_INT_CAT',
    'CONTRACEP1_CAT',
    'CONTRACEP2_CAT',
    'COMPLEX_CAT',
]

# Read the selected columns; 'NaN', a single space and the empty string are
# all interpreted as missing values.
hospi_data = pd.read_csv(
    hospi_path,
    usecols=hospi_cols,
    na_values=['NaN', ' ', ''],
    encoding='UTF-8',
)

CPU times: user 1min 4s, sys: 12.6 s, total: 1min 16s
Wall time: 1min 17s


In [7]:
# Print dtype and non-null count for every column.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0 (where it raises a TypeError).
hospi_data.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 16 columns):
UTI_MES_TO            16614830 non-null int64
UTI_INT_TO            16614830 non-null int64
DIAR_ACOM             16614830 non-null int64
DIAS_PERM             16614830 non-null int64
ESPEC_CAT             16614830 non-null int64
CGC_HOSP_CAT          12629682 non-null float64
PROC_REA_CAT          16614830 non-null int64
COBRANCA_CAT          16614830 non-null int64
IND_VDRL_CAT          16614830 non-null int64
CAR_INT_CAT           16614830 non-null int64
CONTRACEP1_CAT        16614830 non-null int64
CONTRACEP2_CAT        16614830 non-null int64
COMPLEX_CAT           16614830 non-null int64
cobranca_group_CAT    16582615 non-null float64
proc_group_CAT        16614830 non-null int64
MARCA_UTI_CAT         16614830 non-null int64
dtypes: float64(2), int64(14)
memory usage: 2.0 GB


## Join Data

In [8]:
# Attach the diagnosis columns to the demographic columns. DataFrame.join
# aligns rows on the index (left join by default); the suffixes would only be
# applied to column names present in both frames.
join_1 = demo_data.join(
    diag_data,
    lsuffix='l_',
    rsuffix='r',
    sort=False,
)

In [9]:
# Attach the hospitalization columns to the demographic + diagnosis frame,
# again aligning rows on the index.
join_2 = join_1.join(
    hospi_data,
    lsuffix='l_',
    rsuffix='r',
    sort=False,
)

In [10]:
# Alias only (no copy): final_join and join_2 refer to the same DataFrame object,
# so column assignments made through final_join are also visible through join_2.
final_join = join_2

In [11]:
# Print dtype and non-null count for every column of the joined frame.
# `show_counts` replaces the `null_counts` keyword, which was deprecated in
# pandas 1.2 and removed in pandas 2.0 (where it raises a TypeError).
final_join.info(verbose=True, show_counts=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 16614830 entries, 0 to 16614829
Data columns (total 26 columns):
MUNIC_RES_CAT         16462037 non-null float64
SEXO_CAT              16614830 non-null int64
RACA_COR_CAT          12299845 non-null float64
ETNIA_CAT             35012 non-null float64
IDADE                 16614830 non-null int64
MORTE_CAT             16614830 non-null int64
DIAG_PRINC_CAT        16614830 non-null int64
CAT_CAT               16614830 non-null int64
CAP_CAT               16614830 non-null int64
GRP_CAT               16614830 non-null int64
UTI_MES_TO            16614830 non-null int64
UTI_INT_TO            16614830 non-null int64
DIAR_ACOM             16614830 non-null int64
DIAS_PERM             16614830 non-null int64
ESPEC_CAT             16614830 non-null int64
CGC_HOSP_CAT          12629682 non-null float64
PROC_REA_CAT          16614830 non-null int64
COBRANCA_CAT          16614830 non-null int64
IND_VDRL_CAT          16614830 non-null int64
CAR_IN

## Rescale Continuous Values to the [0, 1] Range

In [13]:
# Columns that get min-max scaled; with MinMaxScaler's default settings each
# one is mapped onto the [0, 1] interval.
continuous_cols = ['IDADE', 'UTI_MES_TO', 'UTI_INT_TO', 'DIAR_ACOM', 'DIAS_PERM']

scaler = MinMaxScaler()

# NOTE(review): the scaler is fitted on the full frame, i.e. before the
# train/test split further down — the test rows therefore influence the
# min/max used for scaling. Consider fitting on the training rows only.
# The scaled values overwrite the original columns of final_join in place.
scaled_values = scaler.fit_transform(final_join[continuous_cols])
final_join[continuous_cols] = scaled_values

In [15]:
# Summary statistics of the rescaled columns (a quick check that each column's
# min/max now sit at the ends of the scaling range).
scaled_cols = ['IDADE', 'UTI_MES_TO', 'UTI_INT_TO', 'DIAR_ACOM', 'DIAS_PERM']
final_join[scaled_cols].describe()

Unnamed: 0,IDADE,UTI_MES_TO,UTI_INT_TO,DIAR_ACOM,DIAS_PERM
count,16614830.0,16614830.0,16614830.0,16614830.0,16614830.0
mean,0.4016778,0.001527763,0.0002167705,0.005723171,0.01481481
std,0.243914,0.0095804,0.004061263,0.01289579,0.02204041
min,0.0,0.0,0.0,0.0,0.0
25%,0.2121212,0.0,0.0,0.0,0.005494505
50%,0.3636364,0.0,0.0,0.0,0.008241758
75%,0.5959596,0.0,0.0,0.005882353,0.01648352
max,1.0,1.0,1.0,1.0,1.0


## Train - Test Split

In [17]:
%%time

# Shuffle the joined frame and cut it 70/30 into train and test partitions.
# The fixed random_state makes the partition repeatable across runs.
split_options = dict(train_size=0.7, test_size=0.3, shuffle=True, random_state=42)
AIH_train, AIH_test = train_test_split(final_join, **split_options)

CPU times: user 9.86 s, sys: 4.18 s, total: 14 s
Wall time: 15.8 s


In [20]:
# Report how many rows ended up in each partition.
for label, partition in (('Rows in Train: ', AIH_train), ('Rows in Test: ', AIH_test)):
    print(label, len(partition))

Rows in Train:  11630381
Rows in Test:  4984449


## Output as CSV

In [21]:
# Write the training partition to the working directory. The index is dropped,
# missing values are written as the literal 'NaN', and rows are written in
# chunks of 50,000.
AIH_train.to_csv(
    'AIH_train.csv',
    index=False,
    na_rep='NaN',
    encoding='utf-8',
    chunksize=50000,
)

In [22]:
# Write the test partition to the working directory. The index is dropped,
# missing values are written as the literal 'NaN', and rows are written in
# chunks of 50,000.
AIH_test.to_csv(
    'AIH_test.csv',
    index=False,
    na_rep='NaN',
    encoding='utf-8',
    chunksize=50000,
)