## Add Drive

In [1]:
from google.colab import drive
import os
# Mount Google Drive into the Colab filesystem.
MOUNT_POINT = "/content/drive/"
drive.mount(MOUNT_POINT)

# Build the project path from the mount point so it is absolute: the previous
# relative form ("drive/My Drive/...") only resolved while the working
# directory was /content and would break after any directory change.
mypath = os.path.join(MOUNT_POINT, "My Drive/CS598DLHProject")
os.listdir(mypath)

Mounted at /content/drive/


['data',
 'BOTKeras.ipynb',
 'CNN3Layer.ipynb',
 'BOTModel.ipynb',
 'CNNModel.ipynb']

## Pull data from Drive

In [2]:
import gzip
import pickle as pkl
import gc


DATA_DIR = mypath + "/data/"

X_NOTES_INDEXED_FILE = f'{DATA_DIR}X_NOTES_INDEXED.gz'
# Path of the un-rolled ICD9 label file; it is defined here but not loaded in this cell.
Y_ICD9_FILE = f'{DATA_DIR}Y_ICD9.gz'
Y_ICD9_ROLLED_FILE = f'{DATA_DIR}Y_ICD9_ROLLED.gz'

# Number of leading rows kept from each array; the rest is discarded to save RAM.
SPLIT_CONSTANT = 100000


def _read_pickled_gz(path):
    """Return the object stored in a gzip-compressed pickle file.

    NOTE(security): unpickling can execute arbitrary code, so this must only
    be used on files that come from a trusted source.
    """
    with gzip.open(path, "rb") as f:
        return pkl.load(f)


# X dataset
print('reading from saved file X_NOTES_INDEXED_FILE: ', X_NOTES_INDEXED_FILE)
X = _read_pickled_gz(X_NOTES_INDEXED_FILE)
print('X.type: ', type(X))
print('X.shape: ', X.shape)

# A plain slice is only a view that keeps the whole parent buffer alive, so
# `del X` would release nothing. Copy the rows we keep, then drop the parent.
X_small = X[:SPLIT_CONSTANT].copy()
del X
gc.collect()
print('X_small.type: ', type(X_small))
print('X_small.shape: ', X_small.shape)

# Y_ICD9_ROLLED dataset
print('reading from saved file Y_ICD9_ROLLED_FILE: ', Y_ICD9_ROLLED_FILE)
Y_ICD9_ROLLED = _read_pickled_gz(Y_ICD9_ROLLED_FILE)
print('Y_ICD9_ROLLED.type: ', type(Y_ICD9_ROLLED))
print('Y_ICD9_ROLLED.shape: ', Y_ICD9_ROLLED.shape)

# Same reasoning as for X: copy the kept rows so the full array can be freed.
Y_small = Y_ICD9_ROLLED[:SPLIT_CONSTANT].copy()

del Y_ICD9_ROLLED
gc.collect()
print('Y_small.type: ', type(Y_small))
print('Y_small.shape: ', Y_small.shape)


reading from saved file X_NOTES_INDEXED_FILE:  drive/My Drive/CS598DLHProject/data/X_NOTES_INDEXED.gz
X.type:  <class 'numpy.ndarray'>
X.shape:  (399631, 2200)
X_small.type:  <class 'numpy.ndarray'>
X_small.shape:  (100000, 2200)
reading from saved file Y_ICD9_ROLLED_FILE:  drive/My Drive/CS598DLHProject/data/Y_ICD9_ROLLED.gz
Y_ICD9_ROLLED.type:  <class 'numpy.ndarray'>
Y_ICD9_ROLLED.shape:  (399631, 781)
Y_small.type:  <class 'numpy.ndarray'>
Y_small.shape:  (100000, 781)


In [3]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

In [7]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Conv1D, GlobalMaxPool1D, concatenate

class CNN3Layer(Model):
    """Text classifier built from three parallel 1-D convolution branches.

    The token ids are embedded once; the embedding is fed to three Conv1D
    layers (250 filters each, kernel sizes 2, 3 and 4). Each branch is
    reduced with global max pooling, the three results are concatenated and
    passed to a sigmoid Dense layer with ``class_num`` units.

    Args:
        maxlen: Required length of the second input dimension.
        max_features: First argument of the Embedding layer (its input
            dimension).
        embedding_dims: Size of each embedding vector.
        class_num: Number of output units of the final Dense layer.
        last_activation: Activation used by the three convolution layers.
            Despite its name it does NOT control the output layer, which
            always uses ``'sigmoid'``. The name is kept for backward
            compatibility with existing callers.
    """

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='relu'):
        super().__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation

        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)

        # Three parallel branches that differ only in kernel size.
        self.conv1 = Conv1D(250, 2, activation=last_activation)
        self.conv2 = Conv1D(250, 3, activation=last_activation)
        self.conv3 = Conv1D(250, 4, activation=last_activation)

        self.max_pooling1 = GlobalMaxPool1D()
        self.max_pooling2 = GlobalMaxPool1D()
        self.max_pooling3 = GlobalMaxPool1D()

        self.classifier = Dense(self.class_num, activation='sigmoid')

    def call(self, inputs):
        """Run the forward pass.

        Args:
            inputs: Rank-2 tensor whose second dimension equals ``maxlen``.

        Returns:
            The output of the sigmoid classifier layer.

        Raises:
            ValueError: If ``inputs`` is not rank 2 or its second dimension
                is not ``maxlen``.
        """
        shape = inputs.shape
        rank = len(shape)
        if rank != 2:
            raise ValueError('The rank of inputs of CNN3Layer must be 2, but now is %d' % rank)
        seq_len = shape[1]
        if seq_len != self.maxlen:
            # %s rather than %d: the dimension can be None when it is not
            # statically known, and %d would then raise TypeError instead of
            # the intended ValueError.
            raise ValueError('The maxlen of inputs of CNN3Layer must be %d, but now is %s' % (self.maxlen, seq_len))

        embedding = self.embedding(inputs)

        # Convolve and pool each branch, then join the pooled features.
        x1 = self.max_pooling1(self.conv1(embedding))
        x2 = self.max_pooling2(self.conv2(embedding))
        x3 = self.max_pooling3(self.conv3(embedding))

        x = concatenate([x1, x2, x3])
        return self.classifier(x)

In [8]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows as a test set; the fixed seed keeps the split reproducible.
split_arrays = train_test_split(X_small, Y_small, test_size=0.2, random_state=42)
x_train, x_test, y_train, y_test = split_arrays

# Show the feature/label shapes of each partition (train first, then test).
for features, labels in ((x_train, y_train), (x_test, y_test)):
    print(features.shape, labels.shape)

(80000, 2200) (80000, 781)
(20000, 2200) (20000, 781)


In [9]:
import keras

# Take the metric classes from tensorflow.keras, the same package the model
# and its layers come from, instead of mixing in the standalone `keras`
# package (the two are not guaranteed to be interchangeable in every install).
from tensorflow.keras.metrics import Precision, Recall

# Passed to the Embedding layer as its input dimension, so it must be larger
# than every token index in X.
# NOTE(review): presumably the vocabulary size used when X was indexed - confirm.
max_features = 54000
maxlen = 2200        # second dimension of X
batch_size = 32
embedding_dims = 50
epochs = 10
class_num = 781      # second dimension of the rolled ICD9 label matrix


print('Build model...')
model = CNN3Layer(maxlen, max_features, embedding_dims, class_num)

# Sigmoid outputs with binary cross-entropy: each of the class_num labels is
# scored independently.
model.compile('adam', 'binary_crossentropy', metrics=[Precision(), Recall()])

print('Train...')
# validation_split holds out part of x_train/y_train for validation;
# x_test/y_test are not used in this cell.
history = model.fit(x_train, y_train,
                    batch_size=batch_size,
                    epochs=epochs,
                    validation_split=0.2)

Build model...
Train...
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x7ff9f1343b50>