In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
# Seed NumPy's global RNG so NumPy-based randomness is repeatable.
# NOTE(review): nothing in the visible cells seeds the Keras/TensorFlow backend,
# so training results may still vary between runs — confirm whether that matters.
np.random.seed(42)

In [2]:
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

Using TensorFlow backend.


In [3]:
# Load the combined training table; the first CSV column becomes the index.
# NOTE(review): the preview of this frame still shows an "Unnamed: 0" column,
# so the file appears to carry a second saved index column — confirm and drop
# it if it is unused.
df_train = pd.read_csv(
    "output_files/combined.csv",
    index_col=0,
)

In [4]:
# Preview the first rows to check the loaded columns.
df_train.head()

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE,len
0,0,Hyponatraemia,ELECTROLYTE AND FLUID BALANCE CONDITIONS,SODIUM IMBALANCE,10021038.0,10021036,HYPONATREMIA,HYPONATRAEMIA,HYPONATREMIA,10027433.0,1
1,1,Subacute cutaneous lupus erythematosus,EPIDERMAL AND DERMAL CONDITIONS,CONNECTIVE TISSUE DISORDERS,10057903.0,10057903,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,OMEPRAZOLE INDUCED SUBACUTE CUTANEOUS LUPUS ER...,10040785.0,4
2,2,Blood bilirubin unconjugated increased,HEPATOBILIARY INVESTIGATIONS,LIVER FUNCTION ANALYSES,10021709.0,10021709,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,INDIRECT BILIRUBIN (74.7 MICROMOL/L),10022891.0,4
3,3,toxic epidermal necrolysis,EPIDERMAL AND DERMAL CONDITIONS,BULLOUS CONDITIONS,10044223.0,10044223,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,10040785.0,3
4,4,Bradycardia,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10006093.0,10006093,BRADYCARDIA,BRADYCARDIA,BRADYCARDIA,10007541.0,1


In [5]:
# Vocabulary cap: passed as num_words to the Tokenizer and as the input size
# of the Embedding layer.
MAX_WORDS = 26000
# Length every token sequence is brought to by pad_sequences.
MAX_LEN = 40
# Width of each embedding vector (second argument of the Embedding layer).
EMBEDDING_DIM = 100

In [6]:
# Word-level tokenizer: lower=True lowercases the text, num_words caps the
# vocabulary used when texts are converted to index sequences.
tokenizer = Tokenizer(num_words=MAX_WORDS, lower=True)

In [7]:
# Learn the vocabulary from the reported terms, then encode every term as a
# fixed-length sequence of word indices.
# NOTE(review): the vocabulary is fitted on all rows, i.e. before the
# train/test split performed further down.
tokenizer.fit_on_texts(df_train["REPORTED_TERM"])
sequences = tokenizer.texts_to_sequences(df_train["REPORTED_TERM"])
x = pad_sequences(sequences, maxlen=MAX_LEN)

In [8]:
# Target labels: the ART_CODE column of the training table.
y = df_train["ART_CODE"]

In [9]:
# One-hot encode the labels (one column per distinct ART_CODE, in sorted order).
# Read the column straight from df_train rather than from `y`: a later cell
# rebinds `y` to a NumPy array, after which `y.values` would raise and this
# cell could not be re-run.
dummies = pd.get_dummies(df_train.ART_CODE)

In [10]:
# Convert the one-hot frame into a plain array to use as the training target.
# NOTE(review): this rebinds `y` from the label Series to the one-hot matrix,
# so cells that expect the Series cannot be re-run after this one.
y = np.array(dummies)

In [12]:
from sklearn.model_selection import train_test_split

# Hold out 20% of the rows; the fixed random_state keeps the split repeatable.
x_train, x_test, y_train, y_test = train_test_split(
    x,
    y,
    test_size=0.2,
    random_state=42,
)

In [16]:
# Number of output classes = number of one-hot columns in y.
NUM_CLASSES = y.shape[1]

In [14]:
from keras.models import Sequential
from keras.layers import Dense, Embedding, GlobalMaxPool1D, Dropout, SpatialDropout1D, LSTM
from keras.optimizers import Adam
from keras.callbacks import ReduceLROnPlateau, EarlyStopping, ModelCheckpoint

In [17]:
# Architecture: token embedding -> spatial dropout -> single LSTM -> softmax
# over the one-hot label columns.
model = Sequential([
    Embedding(MAX_WORDS, EMBEDDING_DIM, input_length=x.shape[1]),
    SpatialDropout1D(0.2),
    LSTM(100, dropout=0.2, recurrent_dropout=0.2),
    Dense(NUM_CLASSES, activation='softmax'),
])
model.compile(
    loss='categorical_crossentropy',
    optimizer='adam',
    metrics=['categorical_accuracy'],
)

# Training hyper-parameters consumed by the model.fit cell.
epochs = 30
batch_size = 64

Instructions for updating:
Colocations handled automatically by placer.
Instructions for updating:
Please use `rate` instead of `keep_prob`. Rate should be set to `rate = 1 - keep_prob`.


In [18]:
# Train on the training split and validate on the 20% hold-out split.
# NOTE(review): the hold-out split drives early stopping and checkpoint
# selection, so no untouched test set remains for a final estimate.
# NOTE(review): ReduceLROnPlateau() runs with its defaults; its documented
# default patience is 10, the same as the EarlyStopping patience below, so the
# learning-rate drop would land on the epoch where training stops — confirm and
# consider a smaller patience for the scheduler.
model.fit(
    x_train, y_train,
    epochs=epochs, batch_size=batch_size,
    # Keras documents validation_data as a tuple (x_val, y_val); a list can be
    # interpreted as a multi-input x by newer Keras versions.
    validation_data=(x_test, y_test),
    callbacks=[
        ReduceLROnPlateau(),
        EarlyStopping(monitor='val_loss', patience=10, min_delta=0.0001),
        # Only the best epoch (by the checkpoint's monitored metric) is written
        # to disk; the in-memory model keeps the weights of the last epoch run.
        ModelCheckpoint(filepath='model-LSTM.h5', save_best_only=True)])

Instructions for updating:
Use tf.cast instead.
Train on 50586 samples, validate on 12647 samples
Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30


<keras.callbacks.History at 0x7f10a482b6d8>

In [22]:
def predict(reported_term):
    """Encode raw reported terms into padded index sequences for the model.

    Despite its name, this helper does not run the model: it only applies the
    already-fitted module-level ``tokenizer`` and pads each sequence to
    ``MAX_LEN``. Pass the result to ``model.predict``.

    Parameters
    ----------
    reported_term : str or iterable of str
        A single term, or a collection of terms (list, ``pd.Series``, ...).
        A bare string is wrapped in a list first, because
        ``texts_to_sequences`` iterates over its argument and would otherwise
        treat every character as a separate text.

    Returns
    -------
    The output of ``pad_sequences``: one row per term, ``MAX_LEN`` columns.
    """
    if isinstance(reported_term, str):
        reported_term = [reported_term]
    # The tokenizer is deliberately NOT re-fitted here: doing so would change
    # the word -> index mapping the model was trained with.
    encoded = tokenizer.texts_to_sequences(reported_term)
    return pad_sequences(encoded, maxlen=MAX_LEN)

In [29]:
# Encode one example term. Holding it in a one-element Series makes it a
# single text rather than a sequence of characters.
input_text = predict(pd.Series(["INSOMNIA"]))

In [31]:
# Softmax output of the network for the encoded term: one row per input,
# one column per class.
ypred = model.predict(input_text)

In [32]:
# argmax over the softmax row is a column position in the one-hot matrix, not
# an ART_CODE. Map it back through the get_dummies column labels so the result
# stays correct even if the codes are not exactly 0..N-1.
dummies.columns[int(np.argmax(ypred[0]))]

283

In [33]:
# Show the training rows that carry the predicted code. The code is derived
# from the prediction instead of being hard-coded from an earlier run's output,
# so the cell stays valid after retraining or with a different input term.
predicted_code = dummies.columns[int(np.argmax(ypred[0]))]
df_train[df_train.ART_CODE == predicted_code]

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE,len
419,283,Insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10022437.0,10022437,INSOMNIA,INSOMNIA,INITIAL INSOMNIA AND INTERMEDIATE.,10037175.0,1
619,283,Sleeplessness,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10041017.0,10041017,SLEEPLESSNESS,INSOMNIA,SLEEPLESSNESS,10037175.0,1
1047,283,Insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10012804.0,10012804,DIFFICULTY SLEEPING,INSOMNIA,DIFFICULTY SLEEPING,10037175.0,1
1302,283,Insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10012804.0,10012804,DIFFICULTY SLEEPING,INSOMNIA,DIFFICULTY SLEEPING,10037175.0,1
1482,283,insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10041017.0,10041017,SLEEPLESSNESS,INSOMNIA,I TRIED TO STOP IT LAST WEEK AND JUST FELT HOR...,10037175.0,1
1789,283,Difficulty sleeping,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10012804.0,10012804,DIFFICULTY SLEEPING,INSOMNIA,LACK OF SLEEP\ DIFFICULTY IN FALLING ASLEEP,10037175.0,1
1844,283,Insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10022437.0,10022437,INSOMNIA,INSOMNIA,INSOMNIA,10037175.0,1
3428,283,insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10022437.0,10022437,INSOMNIA,INSOMNIA,INSOMNIA,10037175.0,1
4160,283,Insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10012804.0,10012804,DIFFICULTY SLEEPING,INSOMNIA,DIFFICULTY IN FALLING ASLEEP/ LACK OF SLEEP,10037175.0,1
5867,283,Insomnia,SLEEP DISORDERS AND DISTURBANCES,DISTURBANCES IN INITIATING AND MAINTAINING SLEEP,10022437.0,10022437,INSOMNIA,INSOMNIA,INSOMNIA,10037175.0,1
