In [1]:
import pandas as pd
import numpy as np
import os
import sys
import numpy as np
import keras
from keras.preprocessing.text import Tokenizer
from keras.utils import to_categorical
from keras.preprocessing.sequence import pad_sequences
from keras.layers import Activation, Conv2D, Input, Embedding, Reshape, MaxPool2D, Concatenate, Flatten, Dropout, Dense, Conv1D
from keras.layers import MaxPool1D
from keras.models import Model
from keras.layers import Embedding
from keras.callbacks import ModelCheckpoint
from keras.optimizers import Adam
import nltk
from nltk.corpus import stopwords
nltk.download('stopwords')

Using TensorFlow backend.
  _np_qint8 = np.dtype([("qint8", np.int8, 1)])
  _np_quint8 = np.dtype([("quint8", np.uint8, 1)])
  _np_qint16 = np.dtype([("qint16", np.int16, 1)])
  _np_quint16 = np.dtype([("quint16", np.uint16, 1)])
  _np_qint32 = np.dtype([("qint32", np.int32, 1)])
  np_resource = np.dtype([("resource", np.ubyte, 1)])
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\goond\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [2]:
#Import the dataset - This notebook only tests a Convolutional Neural Network on the Subjective Notes column
#Subsequent tests will include the Medical History column
df = pd.read_csv('BCH_Test3.csv', error_bad_lines=False)
df = df.reindex(np.random.permutation(df.index))  
df = df[['DischargeDispositionDesc', 'SubjectiveNotes']]

#Create a separate column called Disposition as a binary outcome of the patient's stay in the hospital
df['Disposition'] = np.where(df['DischargeDispositionDesc'].str[:5]=="Admit", 'Admit', 'Discharge')
df.head()

  interactivity=interactivity, compiler=compiler, result=result)


Unnamed: 0,DischargeDispositionDesc,SubjectiveNotes,Disposition
83758,"Discharge to private home, condo, apt without ...",^c^^^^ctasSUN=Pt has in the Er this morning fo...,Discharge
16100,"Discharge to private home, condo, apt without ...",^c^^^^ctasSUN=As per mom pt has abcess to uppe...,Discharge
8952,"Discharge to private home, condo, apt without ...",^c^^^^ctasSUN=pt feel last week Wednesday on f...,Discharge
39841,"Discharge to private home, condo, apt without ...","^c^^^^ctasSUN=As per parent, fever since yeste...",Discharge
15010,"Discharge to private home, condo, apt without ...",,Discharge


In [3]:
#Count the total number of records of each major outcome type
pd.value_counts(df['Disposition'].values, sort=False)

Discharge    120080
Admit         16913
dtype: int64

In [4]:
#Remove any Subjective Notes with empty values and count the number of records
filtered_df = df[df['SubjectiveNotes'].notnull()]
filtered_df.dropna(subset=['SubjectiveNotes'])
pd.value_counts(filtered_df['Disposition'].values, sort=False)

Discharge    100170
Admit         10704
dtype: int64

In [5]:
#Create a temporary subset of only the admitted patients
admit_df = filtered_df.loc[filtered_df.Disposition == "Admit"]
num_admits = len(admit_df)
#admit_df.head()
print(num_admits)

10704


In [6]:
#Create a subset of discharged patients of length equal to the number of admits and save as a temporary subset
discharge_df = filtered_df.loc[filtered_df.Disposition == "Discharge"]
#discharge_df.head()
balanced_discharge_df = discharge_df.sample(n = num_admits) 

In [7]:
#Combine the subset of admitted patients and the random discharged patients back into a single dataframe
balanced_set = admit_df.append(balanced_discharge_df)
balanced_set.head()

Unnamed: 0,DischargeDispositionDesc,SubjectiveNotes,Disposition
1638,Admit to reporting facility as inpatient to an...,^c^^^^ctasSUN=Pt is 16 weeks pregnant c/o naus...,Admit
39669,Admit to reporting facility as inpatient to an...,^c^^^^ctasSUN=Pt 8 weeks pregnant. Pt in ER c/...,Admit
128761,Admit to reporting facility as inpatient to an...,^c^^^^ctasSUN=Pt c/o shortness of breath since...,Admit
39866,Admit to reporting facility as inpatient to an...,^c^^^^ctasSUN=c/o intermittent generalized abd...,Admit
120489,Admit to reporting facility as inpatient to SC...,"^c^^^^ctasSUN=alert , lethargic, 8 mg lorazepa...",Admit


In [8]:
#Changing the text to lowercase:
balanced_set['SubjectiveNotes'] = balanced_set['SubjectiveNotes'].str.lower()

In [9]:
#Create a function to remove stopwords and apply this to the dataset.
#A whitelist of words is created and can be expanded to include a broader set of terms
def remove_stopwords(input_text):
        stop_words = stopwords.words('english')
        whitelist = ["n't", "not", "no"]
        words = input_text.split() 
        clean_words = [word for word in words if (word not in stop_words or word in whitelist) and len(word) > 1] 
        return " ".join(clean_words) 

balanced_set.SubjectiveNotes = balanced_set.SubjectiveNotes.apply(remove_stopwords)

In [10]:
#Removing the following string from the SubjectiveNotes:
#String to be removed: ^c^^^^ctasSUN=
substring = '^c^^^^ctassun='
balanced_set['SubjectiveNotes'] = balanced_set['SubjectiveNotes'].str.strip(substring)
balanced_set.head()

Unnamed: 0,DischargeDispositionDesc,SubjectiveNotes,Disposition
1638,Admit to reporting facility as inpatient to an...,pt 16 weeks pregnant c/o nausea vomiting since...,Admit
39669,Admit to reporting facility as inpatient to an...,pt weeks pregnant. pt er c/o worsening diarrhe...,Admit
128761,Admit to reporting facility as inpatient to an...,pt c/o shortness breath since yesterday. last ...,Admit
39866,Admit to reporting facility as inpatient to an...,"/o intermittent generalized abdo pain, sharp, ...",Admit
120489,Admit to reporting facility as inpatient to SC...,"lert lethargic, mg lorazepam iv given oxygen 2...",Admit


In [11]:
#Verifying that the new dataset has an equal number of Admits and Discharges
#Future iterations will include additional performance metrics, however since the current tests involve accuracy only,
#The datasets are stricly balanced
pd.value_counts(balanced_set['Disposition'].values, sort=False)

Discharge    10704
Admit        10704
dtype: int64

In [12]:
#Setting the parameters for the CNN
MAX_WORDS = 5000
MAX_SEQUENCE_LENGTH = 250
VALIDATION_SPLIT = 0.2
EMBEDDING_DIM = 50
filter_sizes = [3,4,5]
num_filters = 512
embedding_dim = 50
drop = 0.5
batch_size = 50
epochs = 25

In [13]:
#Converting the binary outcome value to numeric values
Y = balanced_set['Disposition']
Y = np.where(balanced_set['Disposition'].str[:5]=="Admit", 1, 0)

In [14]:
#Tokenizing text
tokenizer = Tokenizer(num_words = MAX_WORDS)
tokenizer.fit_on_texts(balanced_set.SubjectiveNotes)
sequences = tokenizer.texts_to_sequences(balanced_set.SubjectiveNotes)

word_index = tokenizer.word_index
print("unique words : {}".format(len(word_index)))

#Padding records with shorter notes up to the max sequence length (of ~250 words)
data = pad_sequences(sequences, maxlen = MAX_SEQUENCE_LENGTH)

Y = to_categorical(np.asarray(Y))
print('Shape of data tensor: ', data.shape)
print('Shape of label tensor: ', Y.shape)

unique words : 14532
Shape of data tensor:  (21408, 250)
Shape of label tensor:  (21408, 2)


In [15]:
#Splitting the data randomly into training and validation sets
indices = np.arange(data.shape[0])
np.random.shuffle(indices)
data = data[indices]
Y = Y[indices]
nb_validation_samples = int(VALIDATION_SPLIT * data.shape[0])

x_train = data[:-nb_validation_samples]
y_train = Y[:-nb_validation_samples]
x_val = data[-nb_validation_samples:]
y_val = Y[-nb_validation_samples:]

In [16]:
#Creating a matrix of word embeddings using Glove - the 50 dimensional representation is used.
#Higher dimensional representations are also available
embeddings_index = {}
f= open("glove.6B.50d.txt", encoding="utf8")
for line in f:
    values = line.split()
    word = values[0]
    coefs = np.asarray(values[1:],dtype = 'float32')
    embeddings_index[word] = coefs
f.close()

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [17]:
#Creating a word embedding matrix of the tokenized words
embedding_matrix = np.zeros((len(word_index)+1, EMBEDDING_DIM))
for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is not None:
        embedding_matrix[i] = embedding_vector

In [18]:
#Creating an embedding layer to be fed into the CNN
embedding_layer = Embedding(len(word_index)+1,
                           EMBEDDING_DIM,
                           weights = [embedding_matrix],
                           input_length=MAX_SEQUENCE_LENGTH,
                           trainable=False)

In [19]:
#Setting the CNN architecture
inputs = Input(shape=(MAX_SEQUENCE_LENGTH,), dtype='int32')
embedding = embedding_layer(inputs)

print(embedding.shape)
reshape = Reshape((MAX_SEQUENCE_LENGTH,EMBEDDING_DIM,1))(embedding)
print(reshape.shape)

conv_0 = Conv2D(num_filters, kernel_size=(filter_sizes[0], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_1 = Conv2D(num_filters, kernel_size=(filter_sizes[1], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)
conv_2 = Conv2D(num_filters, kernel_size=(filter_sizes[2], embedding_dim), padding='valid', kernel_initializer='normal', activation='relu')(reshape)

maxpool_0 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[0] + 1, 1), strides=(1,1), padding='valid')(conv_0)
maxpool_1 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[1] + 1, 1), strides=(1,1), padding='valid')(conv_1)
maxpool_2 = MaxPool2D(pool_size=(MAX_SEQUENCE_LENGTH - filter_sizes[2] + 1, 1), strides=(1,1), padding='valid')(conv_2)

concatenated_tensor = Concatenate(axis=1)([maxpool_0, maxpool_1, maxpool_2])
flatten = Flatten()(concatenated_tensor)
dropout = Dropout(drop)(flatten)
output = Dense(units=2, activation='softmax')(dropout)

model = Model(inputs=inputs, outputs=output)

checkpoint = ModelCheckpoint('weights_cnn_sentece.hdf5', monitor='val_acc', verbose=1, save_best_only=True, mode='auto')
adam = Adam(lr=1e-4, beta_1=0.9, beta_2=0.999, epsilon=1e-08, decay=0.0)

model.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])
model.summary()

Instructions for updating:
Colocations handled automatically by placer.
(?, 250, 50)
(?, 250, 50, 1)
Model: "model_1"
__________________________________________________________________________________________________
Layer (type)                    Output Shape         Param #     Connected to                     
input_1 (InputLayer)            (None, 250)          0                                            
__________________________________________________________________________________________________
embedding_1 (Embedding)         (None, 250, 50)      726650      input_1[0][0]                    
__________________________________________________________________________________________________
reshape_1 (Reshape)             (None, 250, 50, 1)   0           embedding_1[0][0]                
__________________________________________________________________________________________________
conv2d_1 (Conv2D)               (None, 248, 1, 512)  77312       reshape_1[0][0]          

In [None]:
#Running the model & printing testing and validation accuracies and losses
print("Training model...")
model.fit(x_train, y_train, batch_size = batch_size, epochs = epochs, verbose = 1, callbacks = [checkpoint],\
         validation_data = (x_val, y_val))

Training model...
Instructions for updating:
Use tf.cast instead.
Train on 17127 samples, validate on 4281 samples
Epoch 1/25
Epoch 2/25




Epoch 3/25
Epoch 4/25
Epoch 5/25
Epoch 6/25
Epoch 7/25
Epoch 8/25
Epoch 9/25
Epoch 10/25
Epoch 11/25
Epoch 12/25
Epoch 13/25
Epoch 14/25
Epoch 15/25
Epoch 16/25
Epoch 17/25
Epoch 18/25
Epoch 19/25
Epoch 20/25
Epoch 21/25
Epoch 22/25
Epoch 23/25