In [48]:
import pathlib
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seed NumPy's legacy global RNG so NumPy-driven randomness is repeatable.
# NOTE(review): whether this also makes Keras weight initialisation and dropout
# deterministic depends on the backend in use — confirm before relying on it.
np.random.seed(42)

In [49]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

In [50]:
# Load the combined dataset; the first CSV column is used as the row index.
df_train = pd.read_csv("output_files/combined.csv", index_col=0)

In [51]:
# Preview the first rows to sanity-check the loaded columns.
df_train.head()

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE,len
0,0,Hyponatraemia,ELECTROLYTE AND FLUID BALANCE CONDITIONS,SODIUM IMBALANCE,10021038.0,10021036,HYPONATREMIA,HYPONATRAEMIA,HYPONATREMIA,10027433.0,1
1,1,Subacute cutaneous lupus erythematosus,EPIDERMAL AND DERMAL CONDITIONS,CONNECTIVE TISSUE DISORDERS,10057903.0,10057903,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,OMEPRAZOLE INDUCED SUBACUTE CUTANEOUS LUPUS ER...,10040785.0,4
2,2,Blood bilirubin unconjugated increased,HEPATOBILIARY INVESTIGATIONS,LIVER FUNCTION ANALYSES,10021709.0,10021709,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,INDIRECT BILIRUBIN (74.7 MICROMOL/L),10022891.0,4
3,3,toxic epidermal necrolysis,EPIDERMAL AND DERMAL CONDITIONS,BULLOUS CONDITIONS,10044223.0,10044223,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,10040785.0,3
4,4,Bradycardia,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10006093.0,10006093,BRADYCARDIA,BRADYCARDIA,BRADYCARDIA,10007541.0,1


In [52]:
# Hyperparameters of the text pipeline, grouped so they can be tuned in one place.

# Width of each word vector; matches the 200-dimensional GloVe file loaded below.
EMBEDDING_DIM = 200

# Every encoded term is padded/truncated to this many token ids.
MAX_LEN = 40

# Passed to the tokenizer as `num_words` (vocabulary cap).
MAX_WORDS = 26000

In [53]:
# Word-level tokenizer: input text is lowercased, and `num_words` caps how many
# of the most frequent words are kept when texts are turned into id sequences.
tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True)

In [54]:
# Learn the vocabulary from the reported terms, then encode every term as a
# list of integer word ids using that vocabulary.
tokenizer.fit_on_texts(df_train["REPORTED_TERM"])
sequences = tokenizer.texts_to_sequences(df_train["REPORTED_TERM"])

# Bring all sequences to a common length of MAX_LEN ids so they stack into a
# single 2-D array (pad_sequences' default padding/truncation sides apply).
x = pad_sequences(
    sequences,
    maxlen=MAX_LEN,
)

In [55]:
# Mapping word -> integer id learned by the tokenizer during fit_on_texts.
word_index = tokenizer.word_index

In [56]:
# Number of distinct words in the fitted vocabulary.
len(word_index)

16834

In [36]:
# Directory (relative to the working directory) holding the GloVe vector files.
GLOVE_DIR = pathlib.Path("glove/")

In [37]:
# Parse the GloVe text file into a {word: vector} lookup. Each line holds a
# token followed by its whitespace-separated float coefficients.
embeddings_index = {}

# The context manager guarantees the file is closed even if a line fails to
# parse. The explicit encoding avoids relying on the platform default, which
# can raise UnicodeDecodeError on the non-ASCII tokens in the GloVe vocabulary.
with open(GLOVE_DIR / 'glove.6B.200d.txt', encoding='utf-8') as f:
    for line in f:
        values = line.split()
        word = values[0]
        coefs = np.asarray(values[1:], dtype='float32')
        embeddings_index[word] = coefs

print('Found %s word vectors.' % len(embeddings_index))

Found 400000 word vectors.


In [38]:
# Build the initial embedding table: row r holds the GloVe vector of the word
# whose tokenizer id is r. One extra row is allocated because Keras tokenizer
# ids start at 1, leaving index 0 unused by any word.
embedding_matrix = np.zeros(shape=(len(word_index) + 1, EMBEDDING_DIM))

for word, i in word_index.items():
    embedding_vector = embeddings_index.get(word)
    if embedding_vector is None:
        # No pre-trained vector for this word: its row stays all zeros.
        continue
    embedding_matrix[i] = embedding_vector

In [39]:
# Sanity check: expect (len(word_index) + 1, EMBEDDING_DIM).
embedding_matrix.shape

(16835, 200)

In [40]:
# Classification target: the ART_CODE column of the dataset.
y = df_train.ART_CODE

In [41]:
# One-hot encode the target: one indicator column per distinct ART_CODE value.
dummies = pd.get_dummies(y.values)

In [42]:
# Convert the indicator frame into a plain NumPy array for Keras.
# NOTE(review): `y` is rebound here from the raw label Series to the one-hot
# matrix, so the meaning of `y` depends on which cells have been executed.
y = np.array(dummies)

In [43]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows for evaluation; the fixed random_state keeps the
# split identical across runs.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=42)

In [44]:
# Width of the output layer: the number of one-hot columns in y.
NUM_CLASSES = y.shape[1]

In [45]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout, SpatialDropout1D, LSTM
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [58]:
# Architecture: embedding initialised from embedding_matrix -> spatial dropout
# -> single LSTM -> softmax over the NUM_CLASSES target classes.
model = Sequential([
    # Embedding table sized to the vocabulary, with initial weights taken from
    # the GloVe-derived matrix; input length equals the padded sequence length.
    Embedding(
        len(word_index) + 1,
        EMBEDDING_DIM,
        input_length=x.shape[1],
        weights=[embedding_matrix],
    ),
    SpatialDropout1D(0.2),
    LSTM(32, dropout=0.2, recurrent_dropout=0.2),
    Dense(NUM_CLASSES, activation='softmax'),
])

# Targets are one-hot encoded, hence categorical cross-entropy and accuracy.
model.compile(
    optimizer='adam',
    loss='categorical_crossentropy',
    metrics=['categorical_accuracy'],
)

# Training-loop settings consumed by the model.fit call in the next cell.
batch_size = 64
epochs = 30

In [None]:
# Train the model, tracking the held-out split after every epoch.
#
# `validation_data` is passed as a tuple, which is the form Keras documents;
# some Keras/tf.keras versions interpret a list as a multi-input `x` with no
# targets instead of an (inputs, targets) pair.
#
# NOTE(review): the same split drives LR scheduling, early stopping and
# checkpoint selection, so metrics on x_test/y_test are not an unbiased
# estimate — consider carving out a separate validation split.
# NOTE(review): ReduceLROnPlateau is used with its defaults; if its default
# patience equals the EarlyStopping patience below, the learning rate is only
# reduced on the epoch training stops — confirm the intended patience values.
# NOTE(review): the checkpoint name says "word2vec" although the embedding
# weights are loaded from GloVe; kept as-is in case other code loads this path.
model.fit(
    x_train, y_train,
    epochs=epochs,
    batch_size=batch_size,
    validation_data=(x_test, y_test),
    callbacks=[
        ReduceLROnPlateau(),
        EarlyStopping(monitor='val_loss', patience=10, min_delta=0.0001),
        ModelCheckpoint(filepath='model-LSTM-word2vec.h5', save_best_only=True),
    ],
)

Train on 50586 samples, validate on 12647 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30