Custom training of the embeddings for the summary text dataset using Deep Learning based library Keras

In [1]:
# load cleaned data file (adm_ds2.csv) which is obtained after preprocessing

In [2]:
import os, sys, math, csv, datetime, time, pickle, json
import numpy as np
import pandas as pd

import matplotlib.pyplot as plt
import seaborn as sns

In [3]:
import gc

In [4]:
# load the file cleaned and processed in the 1st step
adm_ds = pd.read_csv('adm_ds2.csv')

In [5]:
import keras

In [6]:
# load the keras library functions and classes used to process the text dataset
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [7]:
# preprocess cleaned text to convert the text into a form that could be given to the embedding layer
def process_text_keras(text, max_length):
  tokenizer = Tokenizer()
  tokenizer.fit_on_texts(text)
  vocab_size = len(tokenizer.word_index) + 1

  encoded_docs = tokenizer.texts_to_sequences(text)

  padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

  return padded_docs, tokenizer, vocab_size

In [8]:
# process the test dataset given the tokenizer and the max_length at which the summary text needs to be capped.
def process_test_data(test_data, tokenizer, max_length):
  encoded_docs = tokenizer.texts_to_sequences(test_data)
  padded_docs = pad_sequences(encoded_docs, maxlen=max_length, padding='post')

  return padded_docs

In [9]:
 # create the keras model based on the vocabulary size obtained from the keras tokenization process
 # create a custom embedding suitable for the summary text of size 300
 # input that embedding onto the Keras LSTM layer.
 # pass the LSTM layer feature to Dense layer for classification
 
 def create_model(vocab_size, max_length):
   input_tensor = keras.layers.Input(shape=(max_length,), name='input_tensor')
   embedding_layer = keras.layers.Embedding(vocab_size, 300, name='embedding_layer')(input_tensor)
   lstm_layer = keras.layers.LSTM(128, name='lstm_layer')(embedding_layer)
   dense_layer = keras.layers.Dense(1, activation='sigmoid', name='output_layer')(lstm_layer)

   model = keras.models.Model(inputs=[input_tensor], outputs=[dense_layer])

   opt = keras.optimizers.Adam(learning_rate=3e-4)

   model.compile(loss='binary_crossentropy', metrics=['acc'], optimizer=opt)

   return model, tokenizer

In [10]:
# get the cleaed text values
input_text = adm_ds.cleaned_text.values

In [11]:
# get the target values
target = adm_ds.TARGET.values

In [12]:
input_text

array(['   HISTORY OF PRESENT ILLNESS:  This is an 81-year-old female with a history of emphysema (not on home O2), who presents with three days of shortness of breath thought by her primary care doctor to be a COPD flare.  Two days prior to admission, she was started on a prednisone taper and one day prior to admission she required oxygen at home in order to maintain oxygen saturation greater than 90%.  She has also been on levofloxacin and nebulizers, and was not getting better, and presented to the  Emergency Room. In the  Emergency Room, her oxygen saturation was 100% on CPAP.  She was not able to be weaned off of this despite nebulizer treatment and Solu-Medrol 125 mg IV x2. Review of systems is negative for the following:  Fevers, chills, nausea, vomiting, night sweats, change in weight, gastrointestinal complaints, neurologic changes, rashes, palpitations, orthopnea.  Is positive for the following: Chest pressure occasionally with shortness of breath with exertion, some shortnes

In [13]:
from sklearn.model_selection import train_test_split

In [14]:
# get 20% of total data into test dataset (train test split)
train_x, test_x, train_y, test_y = train_test_split(input_text, target, random_state=42, test_size=0.2)

In [15]:
train_x.shape, test_x.shape, train_y.shape, test_y.shape

((42180,), (10546,), (42180,), (10546,))

In [16]:
# get the maximum length of the summary text
train_x_max_length = max(list(map(lambda x: len(x), train_x)))

In [17]:
del adm_ds

In [18]:
# create vocabulary, get the padded_docs, and the tokenizer object for training the model
# use the first 3500 words only from the summary text which is average of all the lengths analyzed in step 1 of the project
padded_docs, tokenizer, vocab_size = process_text_keras(train_x, 3500)

In [19]:
# get the model and the tokenizer for the vocab size and the max_length
model, tokenizer = create_model(vocab_size, 3500)

In [20]:
gc.collect()

20

In [21]:
# check the model architecture
model.summary()

Model: "functional_1"
_________________________________________________________________
Layer (type)                 Output Shape              Param #   
input_tensor (InputLayer)    [(None, 3500)]            0         
_________________________________________________________________
embedding_layer (Embedding)  (None, 3500, 300)         41396100  
_________________________________________________________________
lstm_layer (LSTM)            (None, 128)               219648    
_________________________________________________________________
output_layer (Dense)         (None, 1)                 129       
Total params: 41,615,877
Trainable params: 41,615,877
Non-trainable params: 0
_________________________________________________________________


In [22]:
# check the feature document shape
padded_docs.shape

(42180, 3500)

In [23]:
# train the keras model with batch size of 64
model.fit(padded_docs, train_y, batch_size=64)



<tensorflow.python.keras.callbacks.History at 0x1444cbd50>

In [24]:
from sklearn.metrics import roc_auc_score

In [25]:
# process the test dataset to get the correct features to input it to the keras model
test_padded_docs = process_test_data(test_x, tokenizer, 3500)

In [26]:
# predict the probabilities of the keras model on the test dataset
test_values = model.predict(test_padded_docs)

In [27]:
test_values.shape

(10546, 1)

In [28]:
# calculate the auroc score of the test dtaa probability values obtained
roc_auc_score(test_y, test_values[:, 0])

0.501960551523712