<a href="https://colab.research.google.com/github/IvetteMTapia/Capstone-2_Deep_Learning/blob/master/Feature_Engineering_and_Deep_Learning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## Import Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib as plt
import keras
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from keras.layers import Dense, Activation, Dropout, Concatenate, Input
from keras.models import Sequential
from keras.callbacks import EarlyStopping, ModelCheckpoint
from sklearn.model_selection import StratifiedKFold


In [2]:
# Create connection to Google Drive to get files
from google.colab import drive
drive.mount('/content/gdrive')

Go to this URL in a browser: https://accounts.google.com/o/oauth2/auth?client_id=947318989803-6bn6qk8qdgf4n4g3pfee6491hc0brc4i.apps.googleusercontent.com&redirect_uri=urn%3Aietf%3Awg%3Aoauth%3A2.0%3Aoob&scope=email%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdocs.test%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fdrive.photos.readonly%20https%3A%2F%2Fwww.googleapis.com%2Fauth%2Fpeopleapi.readonly&response_type=code

Enter your authorization code:
··········
Mounted at /content/gdrive


## Upload Cleaned Data

Upload 2015 - 2018 AIH cleaned features: demographics, diagnosis and hospitalization. 

**Proportional Random Sample **

In [0]:
#Path to train data


data_path = ('/content/gdrive/My Drive/Deep Learning/AIH_clean_concantenated.csv')

cols = ['SEXO_CAT','IDADE','MORTE_CAT','CAP_CAT','UTI_MES_TO','UTI_INT_TO',
        'DIAR_ACOM','DIAS_PERM','ESPEC_CAT','PROC_REA_CAT','GRP_CAT',
        'COBRANCA_CAT','IND_VDRL_CAT','CAR_INT_CAT','COMPLEX_CAT','MARCA_UTI_CAT',
        'DIAG_PRINC_CAT']

cols_not_used = ['CAT_CAT','RACA_COR_CAT','ETNIA_CAT','cobranca_group_CAT',
                 'CONTRACEP1_CAT','CONTRACEP2_CAT','proc_group_CAT','MUNIC_RES_CAT',"CAT_CAT"]

#Read to pandas df

num_rows = 1000000

data = pd.read_csv(data_path, usecols = cols, nrows = num_rows)

## Prepare Training Data for DNN

**One-Hot Encode Categorical Predictors - Create Input (X) dataset**

In [0]:

cols_encode = ['SEXO_CAT','MORTE_CAT','CAP_CAT','ESPEC_CAT',
               'COBRANCA_CAT','IND_VDRL_CAT','CAR_INT_CAT',
               'COMPLEX_CAT','MARCA_UTI_CAT','GRP_CAT'] # columns to one-hot encode

pre = ['c1','c2','c3','c4','c5','c6','c7','c8','c9','c10'] # prefixes of one-hot encoded columns

one_hot = pd.get_dummies(data, columns= cols_encode, prefix = pre)

#Remove Procedure Realized(Y) & Diagnosis Features from input features

one_hot.drop(['PROC_REA_CAT', 'DIAG_PRINC_CAT'], axis = 1, inplace=True)

X = one_hot.values

**Create Output Feature - Y**

In [0]:
# Create Output Feature

Y = pd.get_dummies(data.PROC_REA_CAT, prefix = 'proc').values

**Split Data into Train, Valid and Testing**

In [0]:
# First split into training and validation

x_train, x_valid, y_train, y_valid = train_test_split(X, Y, test_size=.15, train_size = 0.85, 
                                                      random_state = 42, shuffle = True)

In [0]:
# Breakdown further into valid and testing

x_valid, x_test, y_valid, y_test = train_test_split(x_valid, y_valid, test_size=.35, train_size = 0.65, 
                                                    random_state = 42, shuffle = True)

** Shape for Each Sample**

In [13]:
print('X shape |', 'Train:', x_train.shape, 'Valid:', x_valid.shape, 'Test:',x_test.shape)

print('Y shape |', 'Train:', y_train.shape, 'Valid:', y_valid.shape, 'Test:',y_test.shape)


X shape | Train: (850000, 354) Valid: (97500, 354) Test: (52500, 354)
Y shape | Train: (850000, 1612) Valid: (97500, 1612) Test: (52500, 1612)


In [0]:
#Input Columns

input_cols = x_train.shape[1]

#Output Columns

output_cols = y_train.shape[1]

## Neural Network - Baseline Model

Model with no regularization

In [0]:
# Set Up Model

baseline = Sequential()

#Input Layers

baseline.add(Dense(100, activation='relu', input_shape = (input_cols,)))

# Second Layer
baseline.add(Dense(100, activation='relu'))

# Third Layer
baseline.add(Dense(100, activation='relu'))

# Fourth Layer
baseline.add(Dense(100, activation='relu'))

# Fifth Layer
baseline.add(Dense(100, activation='relu'))

#Outupt Layer

baseline.add(Dense(output_cols, activation='softmax'))

# Optimizer: Adam Gradient Descent

adamgrad = keras.optimizers.Adagrad(lr=0.001, epsilon=None, decay=0.0)

#Compile Function
baseline.compile(optimizer=adamgrad, 
                 loss='categorical_crossentropy', 
                 metrics=['accuracy'])

callbacks = [EarlyStopping(monitor='loss', patience=5)]

In [17]:
# Fit Model Function

%%time

baseline.fit(x = x_train, 
             y = y_train, epochs= 25,
             shuffle = True,
             batch_size=128,
             verbose=2,
             callbacks=callbacks)

Epoch 1/2
 - 134s - loss: 2.9497 - acc: 0.4218
Epoch 2/2
 - 135s - loss: 1.9632 - acc: 0.5770
CPU times: user 2min 3s, sys: 2min 52s, total: 4min 55s
Wall time: 4min 29s


<keras.callbacks.History at 0x7fa92b9650f0>

## Neural Network - Model with Dropout Regularization

Model with dropout regularization

In [0]:
# Set Up Model

model_reg = Sequential()

#Input Layers

model_reg.add(Dense(100, activation='relu', input_shape = (input_cols,)))
model_reg.add(Dropout(0.3))

# Second Layer
model_reg.add(Dense(100, activation='relu'))
model_reg.add(Dropout(0.3))

# Third Layer
model_reg.add(Dense(100, activation='relu'))
model_reg.add(Dropout(0.3))

# Fourth Layer
model_reg.add(Dense(100, activation='relu'))
model_reg.add(Dropout(0.3))

# Fifth Layer
model_reg.add(Dense(100, activation='relu'))
model_reg.add(Dropout(0.3))

#Outupt Layer

model_reg.add(Dense(output_cols, activation='softmax'))

# Optimizer: Adam Gradient Descent

adamgrad = keras.optimizers.Adagrad(lr=0.001, epsilon=None, decay=0.0)

#Compile Function
model_reg.compile(optimizer=adamgrad, 
                 loss='categorical_crossentropy', 
                 metrics=['accuracy'])

callbacks = [EarlyStopping(monitor='loss', patience=5)]


In [0]:
# Fit Model Function

%%time

baseline.fit(x = x_train, 
             y = y_train, epochs= 25,
             shuffle = True,
             batch_size=128,
             verbose=2,
             callbacks=callbacks)

## Model with Embedding Layer

Prepare Data for Model

**One-Hot Encode Categorical Predictors - Create Auxiliary Dataset**

In [0]:
cols_encode = ['SEXO_CAT','MORTE_CAT','CAP_CAT','ESPEC_CAT',
               'COBRANCA_CAT','IND_VDRL_CAT','CAR_INT_CAT',
               'COMPLEX_CAT','MARCA_UTI_CAT'] # columns to one-hot encode, GRP_CAT not included

pre = ['c1','c2','c3','c4','c5','c6','c7','c8','c9'] # prefixes of one-hot encoded columns

one_hot2 = pd.get_dummies(train, columns= cols_encode, prefix = pre)

#Remove Procedure Realized(Y) & Diagnosis Features from input features

one_hot2.drop('PROC_REA_CAT', 'DIAG_PRINC_CAT', axis = 1,inplace=True)

aux = one_hot2.values

**Create Categorical Inputs**

In [0]:
cat_input = train[['PROC_REA_CAT','GRP_CAT']].values

**Split Data into Train, Valid and Testing**

In [0]:
# First split into training and validation

aux_train, aux_valid, cat_input_train, cat_input_valid = train_test_split(aux, cat_input, 
                                                                          test_size=.15, train_size = 0.85, 
                                                                          random_state = 42, shuffle = True)

In [0]:
# Breakdown further into valid and testing

aux_valid, aux_test, cat_input_valid, cat_input_test = train_test_split(aux_valid, cat_input_valid, 
                                                                        test_size=.34, train_size = 0.66, 
                                                                        random_state = 42, shuffle = True)

**Shape of Arrays**

In [0]:
print('aux shape |', 'Train:', aux_train.shape, 'Valid:', aux_valid.shape, 'Test:',aux_test.shape)

print('cat_input shape |', 'Train:', cat_input_train.shape, 'Valid:', cat_input_valid.shape, 'Test:',cat_input_test.shape)

In [0]:
#Auxiliary Cols

aux_cols = aux_train.shape[1]

# Columns

cat_input_cols = y_train.shape[1]

In [0]:
# Size of Unique Categorical Values (i.e. size of vocab)

dim = data['PROC_REA_CAT'].nunique() + data['GRP_CAT'].nunique()

> Note: Output stays the same. Will use y_train, y_test and y_valid created above.

**Embedding Layer Model**

In [25]:
# Categorical Inputs

cat_input = Input(shape=(cat_input_cols,), dtype='int32', name='cat_input')

# Embedding Layer

x = Embedding(output_dim=80, input_dim=dim, input_length=cat_input_cols)(main_input)

# A LSTM will transform the vector sequence into a single vector,
# containing information about the entire sequence

lstm_out = LSTM(32)(x)

auxiliary_output = Dense(1, activation='sigmoid', name='aux_output')(lstm_out)

#Define auxiliary input layer

auxiliary_input = Input(shape=(aux_cols,), name='aux_input')

# Add numerical data & One - Hot Encoded Data

x = keras.layers.concatenate([lstm_out, auxiliary_input])

# Stack a deep densely-connected network 

x = Dense(100, activation='relu')(x)
x = Dense(100, activation='relu')(x)
x = Dense(100, activation='relu')(x)

# Output Layer

main_output = Dense(output_cols, activation='softmax', name='main_output')(x)

# Model

embbed_model = Model(inputs=[main_input, auxiliary_input], 
              outputs=[main_output, auxiliary_output])

# Optimizer: Adam Gradient Descent

adamgrad = keras.optimizers.Adagrad(lr=0.001, epsilon=None, decay=0.0)

#Compile

embbed_model.compile(optimizer=adamgrad, 
                 loss={'main_output':'categorical_crossentropy', 
                       'aux_output':'binary_crossentropy'},
              metrics=['accuracy'])

NameError: ignored

In [0]:
model.fit({'main_input':cat_input_train , 'aux_input':aux_train },
          {'main_output':y_train , 'aux_output':auxiliary_output},
          epochs=25, batch_size=128,verbose = 2, callbacks=callbacks)

## Fit Trainined Models to Test Data

In [0]:
score_baseline = baseline.evaluate(x_test, y_test, batch_size=128)

In [0]:
score_model_reg = model_reg.evaluate(x_test, y_test, batch_size=128)

## K-Fold Cross Validation (5-Fold)

In [0]:
# Turn this into a function??

# Define 5-fold cross validation

random_seed = 42

kfold = StratifiedKFold(n_splits=5, shuffle=True, random_state=random_seed )

cvscores = []

for train, test in kfold.split(x_valid, y_valid):
  
	# Fit the model
	
  model.fit(x_valid[train], y_valid[train], epochs=150, batch_size=128, verbose=0)
  
	# evaluate the model
  
  scores = model.evaluate(x_valid[test], y_valid[test], batch_size=128, verbose=0)
  
	print("%s: %.2f%%" % (model.metrics_names[1], scores[1]*100))
  
	cvscores.append(scores[1] * 100)
  
print("%.2f%% (+/- %.2f%%)" % (numpy.mean(cvscores), numpy.std(cvscores)))

**Notes**



*   Test different learning rates
*   Modify Architecture
*   Other types of regularization?
*   Feed more data
*   Testing
*   K- Fold Cross Validate

