In [14]:
from google.colab import drive
import os
drive.mount("/content/drive/")

mypath = "drive/My Drive/CS598DLHProject"
os.listdir(mypath)

Drive already mounted at /content/drive/; to attempt to forcibly remount, call drive.mount("/content/drive/", force_remount=True).


['data', 'BOTKeras.ipynb']

In [None]:
import gzip
import pickle as pkl


DATA_DIR = mypath + "/data/"

X_NOTES_INDEXED_FILE = f'{DATA_DIR}X_NOTES_INDEXED.gz'
Y_ICD9_FILE = f'{DATA_DIR}Y_ICD9.gz'
Y_ICD9_ROLLED_FILE = f'{DATA_DIR}Y_ICD9_ROLLED.gz'

# X dataset
if os.path.exists(X_NOTES_INDEXED_FILE):
  print ('reading from saved file X_NOTES_INDEXED_FILE: ', X_NOTES_INDEXED_FILE)
  with gzip.open(X_NOTES_INDEXED_FILE, "rb") as f:
      X = pkl.load(f)
  print('X.type: ',type(X))
  print('X.shape: ', X.shape)
# else:
#   # save data and label to file
#   with gzip.open(X_NOTES_INDEXED_FILE, "wb") as f:
#       pkl.dump(X, f)
#   print('X saved')

# Y_ICD9 dataset
if os.path.exists(Y_ICD9_FILE):
  print ('reading from saved file Y_ICD9_FILE: ', Y_ICD9_FILE)
  with gzip.open(Y_ICD9_FILE, "rb") as f:
      Y_ICD9 = pkl.load(f)
  print('Y_ICD9.type: ',type(Y_ICD9))
  print('Y_ICD9.shape: ', Y_ICD9.shape)
# else:
#   # save data and label to file
#   with gzip.open(Y_ICD9_FILE, "wb") as f:
#       pkl.dump(Y_ICD9, f)
#   print('Y_ICD9 saved')

# Y_ICD9_ROLLED dataset
if os.path.exists(Y_ICD9_ROLLED_FILE):
  print ('reading from saved file Y_ICD9_ROLLED_FILE: ', Y_ICD9_ROLLED_FILE)
  with gzip.open(Y_ICD9_ROLLED_FILE, "rb") as f:
      Y_ICD9_ROLLED = pkl.load(f)
  print('Y_ICD9_ROLLED.type: ',type(Y_ICD9_ROLLED))
  print('Y_ICD9_ROLLED.shape: ', Y_ICD9_ROLLED.shape)
# else:
#   # save data and label to file
#   with gzip.open(Y_ICD9_ROLLED_FILE, "wb") as f:
#       pkl.dump(Y_ICD9_ROLLED, f)
#   print('Y_ICD9_ROLLED saved')

reading from saved file X_NOTES_INDEXED_FILE:  drive/My Drive/CS598DLHProject/data/X_NOTES_INDEXED.gz
X.type:  <class 'numpy.ndarray'>
X.shape:  (399631, 2200)
reading from saved file Y_ICD9_FILE:  drive/My Drive/CS598DLHProject/data/Y_ICD9.gz


In [None]:
import numpy as np
from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.datasets import imdb
from tensorflow.keras.preprocessing import sequence

In [None]:
ngram_range = 1
max_features = 5000
maxlen = 400
batch_size = 32
embedding_dims = 50
epochs = 1

print('Loading data...')
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=max_features)
print(len(x_train), 'train sequences')
print(len(x_test), 'test sequences')
print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

Loading data...
Downloading data from https://storage.googleapis.com/tensorflow/tf-keras-datasets/imdb.npz
25000 train sequences
25000 test sequences
Average train sequence length: 238
Average test sequence length: 230


In [None]:
def create_ngram_set(input_list, ngram_value=2):
    """
    Extract a set of n-grams from a list of integers.
    # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=2)
    {(4, 9), (4, 1), (1, 4), (9, 4)}
    # >>> create_ngram_set([1, 4, 9, 4, 1, 4], ngram_value=3)
    [(1, 4, 9), (4, 9, 4), (9, 4, 1), (4, 1, 4)]
    """
    return set(zip(*[input_list[i:] for i in range(ngram_value)]))


def add_ngram(sequences, token_indice, ngram_range=2):
    """
    Augment the input list of list (sequences) by appending n-grams values.
    Example: adding bi-gram
    # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017}
    # >>> add_ngram(sequences, token_indice, ngram_range=2)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42]]
    Example: adding tri-gram
    # >>> sequences = [[1, 3, 4, 5], [1, 3, 7, 9, 2]]
    # >>> token_indice = {(1, 3): 1337, (9, 2): 42, (4, 5): 2017, (7, 9, 2): 2018}
    # >>> add_ngram(sequences, token_indice, ngram_range=3)
    [[1, 3, 4, 5, 1337, 2017], [1, 3, 7, 9, 2, 1337, 42, 2018]]
    """
    new_sequences = []
    for input_list in sequences:
        new_list = input_list[:]
        for ngram_value in range(2, ngram_range + 1):
            for i in range(len(new_list) - ngram_value + 1):
                ngram = tuple(new_list[i:i + ngram_value])
                if ngram in token_indice:
                    new_list.append(token_indice[ngram])
        new_sequences.append(new_list)

    return new_sequences

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense

class FastText(Model):

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='sigmoid'):
        super(FastText, self).__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.avg_pooling = GlobalAveragePooling1D()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        if len(inputs.get_shape()) != 2:
            raise ValueError('The rank of inputs of FastText must be 2, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of FastText must be %d, but now is %d' % (self.maxlen, inputs.get_shape()[1]))
        embedding = self.embedding(inputs)
        x = self.avg_pooling(embedding)
        output = self.classifier(x)
        return output

## CNN model

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Conv1D, GlobalMaxPool1D

class CNN(Model):

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='relu'):
        super(CNN, self).__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.conv = Conv1D(250, 3, activation=last_activation)
        self.max_pooling = GlobalMaxPool1D()
        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        if len(inputs.get_shape()) != 2:
            raise ValueError('The rank of inputs of FastText must be 2, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of FastText must be %d, but now is %d' % (self.maxlen, inputs.get_shape()[1]))
        embedding = self.embedding(inputs)
        conv = self.conv(embedding)
        x = self.max_pooling(conv)
        output = self.classifier(x)
        return output

## CNN 3 Layer model

In [None]:
from tensorflow.keras import Model
from tensorflow.keras.layers import Embedding, GlobalAveragePooling1D, Dense, Conv1D, GlobalMaxPool1D, concatenate

class CNN3Layer(Model):

    def __init__(self,
                 maxlen,
                 max_features,
                 embedding_dims,
                 class_num=1,
                 last_activation='relu'):
        super(CNN3Layer, self).__init__()
        self.maxlen = maxlen
        self.max_features = max_features
        self.embedding_dims = embedding_dims
        self.class_num = class_num
        self.last_activation = last_activation
        self.embedding = Embedding(self.max_features, self.embedding_dims, input_length=self.maxlen)
        self.conv1 = Conv1D(250, 2, activation=last_activation)
        self.conv2 = Conv1D(250, 3, activation=last_activation)
        self.conv3 = Conv1D(250, 4, activation=last_activation)

        self.max_pooling1 = GlobalMaxPool1D()
        self.max_pooling2 = GlobalMaxPool1D()
        self.max_pooling3 = GlobalMaxPool1D()

        self.classifier = Dense(self.class_num, activation=self.last_activation)

    def call(self, inputs):
        if len(inputs.get_shape()) != 2:
            raise ValueError('The rank of inputs of FastText must be 2, but now is %d' % len(inputs.get_shape()))
        if inputs.get_shape()[1] != self.maxlen:
            raise ValueError('The maxlen of inputs of FastText must be %d, but now is %d' % (self.maxlen, inputs.get_shape()[1]))
        embedding = self.embedding(inputs)
        conv1 = self.conv1(embedding)
        conv2 = self.conv2(embedding)
        conv3 = self.conv3(embedding)

        x1 = self.max_pooling1(conv1)
        x2 = self.max_pooling2(conv2)
        x3 = self.max_pooling3(conv3)
        
        x = concatenate([x1, x2, x3])
        output = self.classifier(x)
        return output

In [None]:


if ngram_range > 1:
    print('Adding {}-gram features'.format(ngram_range))
    # Create set of unique n-gram from the training set.
    ngram_set = set()
    for input_list in x_train:
        for i in range(2, ngram_range + 1):
            set_of_ngram = create_ngram_set(input_list, ngram_value=i)
            ngram_set.update(set_of_ngram)

    # Dictionary mapping n-gram token to a unique integer.
    # Integer values are greater than max_features in order
    # to avoid collision with existing features.
    start_index = max_features + 1
    token_indice = {v: k + start_index for k, v in enumerate(ngram_set)}
    indice_token = {token_indice[k]: k for k in token_indice}

    # max_features is the highest integer that could be found in the dataset.
    max_features = np.max(list(indice_token.keys())) + 1

    # Augmenting x_train and x_test with n-grams features
    x_train = add_ngram(x_train, token_indice, ngram_range)
    x_test = add_ngram(x_test, token_indice, ngram_range)
    print('Average train sequence length: {}'.format(np.mean(list(map(len, x_train)), dtype=int)))
    print('Average test sequence length: {}'.format(np.mean(list(map(len, x_test)), dtype=int)))

print('Pad sequences (samples x time)...')
x_train = sequence.pad_sequences(x_train, maxlen=maxlen)
x_test = sequence.pad_sequences(x_test, maxlen=maxlen)
print('x_train shape:', x_train.shape)
print('x_test shape:', x_test.shape)

print('Build model...')
# model = FastText(maxlen, max_features, embedding_dims)

model = CNN3Layer(maxlen, max_features, embedding_dims)

model.compile('adam', 'binary_crossentropy', metrics=['accuracy'])

print('Train...')
early_stopping = EarlyStopping(monitor='val_accuracy', patience=3, mode='max')
model.fit(x_train, y_train,
          batch_size=batch_size,
          epochs=epochs,
          callbacks=[early_stopping],
          validation_data=(x_test, y_test))



Pad sequences (samples x time)...
x_train shape: (25000, 400)
x_test shape: (25000, 400)
Build model...
Train...


<keras.callbacks.History at 0x7fa87d203310>

In [None]:
print('Test...')
result = model.predict(x_test)
print(result)

Test...
[[0.        ]
 [1.1173838 ]
 [0.53399014]
 ...
 [0.        ]
 [0.11631893]
 [0.17010495]]
