<a href="https://colab.research.google.com/github/MedleyHealth/TypeAssist/blob/master/TypeAssist_Training.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Important: Do not save the output from code cells in this notebook to GitHub (or any other public location). Access to the dataset is restricted and we cannot leak any information about individual samples.**

To suppress the output in Google Colab:

1. Go to Edit > Notebook Settings
2. Make sure the checkbox is ticked for "Omit code cell output when saving this notebook"

# **If you have any doubts about what this means, message me first before committing.**

### Modified from [code](https://nbviewer.jupyter.org/github/PrithivirajDamodaran/NLP-Experiments/blob/master/Gmail_style_smart_compose_with_char_ngram_based_language_model.ipynb) created by [Prithivi Da](https://github.com/PrithivirajDamodaran)

# Data Loading

### Import libraries and set seeds (must use Tensorflow 1.x)

In [None]:
%tensorflow_version 1.x
%matplotlib inline

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import tensorflow as tf

from tensorflow.keras.preprocessing.sequence import pad_sequences
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, LSTM, Dense, CuDNNLSTM, Embedding, Flatten, TimeDistributed, Dropout, LSTMCell, RNN, Bidirectional, Concatenate, Layer
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.python.keras.utils import tf_utils
from tensorflow.keras import backend as K
from tensorflow.keras.models import model_from_json
from tensorflow.keras.models import load_model
from datetime import datetime

import random
import unicodedata
import re
import os
import time
import shutil
import string
import os 

# One seed shared by every random number generator this notebook relies on.
seed = 23

random.seed(seed)
np.random.seed(seed)
# TensorFlow keeps its own graph-level RNG (weight initialisation, Dropout);
# it must be seeded separately, before any layer is created.
tf.set_random_seed(seed)

# Displayed so the reader can confirm a TensorFlow 1.x runtime is active.
tf.__version__

### Mount Google Drive where dataset is saved

In [None]:
# Colab-only helper: exposes the user's Google Drive inside the runtime.
from google.colab import drive
# Mount point used as the prefix of the dataset path and the model save path.
drive.mount('/content/drive')

### Load dataset from path in Google Drive (change path to your location)

In [None]:
# Location of the notes CSV on the mounted Drive (edit for your own layout).
path = '/content/drive/My Drive/4 Archive/MIMIC/NOTEEVENTS.csv'

df = pd.read_csv(filepath_or_buffer=path)

# Preview the first five rows.
df.head(5)

# Data Preprocessing

### Select notes that are less than 100 characters long

In [None]:
# Keep only the notes whose text is shorter than 100 characters.
corpus = []
for note in df['TEXT']:
    if len(note) < 100:
        corpus.append(note)

print('Number of Notes with Length < 100:', len(corpus), '\n')
corpus[:10]

### Split notes on newline characters

In [None]:
# Break every note into its individual lines: after this cell each element
# of `corpus` is a list of line strings rather than a single string.
corpus = [note.split('\n') for note in corpus]

corpus[:10]

### Collapse the nested list structure from splitting on newline characters, keeping only lines longer than 10 characters

In [None]:
# Flatten the list of per-note line lists into one flat list of lines,
# discarding any line of 10 characters or fewer.
flattened = []
for note in corpus:
    for split_note in note:
        if len(split_note) > 10:
            flattened.append(split_note)
corpus = flattened

print('Number of notes after merging sublists:', len(corpus), '\n')
corpus[:10]

### Drop any notes that contain PHI tags

In [None]:
# Matches a de-identification placeholder of the form [** ... **].
# Written as a raw string so that `\[` and `\*` reach the regex engine
# unchanged instead of being treated as (invalid) string escapes.
phi_pattern = r'(\[\*\*(.*)\*\*\])'

# Drop every line in which the placeholder pattern occurs anywhere.
corpus = [note for note in corpus if re.search(phi_pattern, note) is None]

print('Number of notes after removing any note that contains a PHI tag:', len(corpus), '\n')
corpus[:10]

### Convert all notes to lower case

In [None]:
# Normalise case so the vocabulary does not distinguish "No" from "no".
corpus = list(map(str.lower, corpus))

corpus[:10]

### Preprocessing methods

In [None]:
class LanguageIndex():
    """Two-way lookup between tokens and integer ids built from phrases.

    Each phrase is split on single space characters. Id 0 is assigned to
    the "<pad>" token; the distinct tokens receive ids 1..N in sorted order.

    Attributes:
        lang: the iterable of phrases passed to the constructor.
        vocab: sorted list of the distinct tokens found in `lang`.
        word2idx: dict mapping token -> id.
        idx2word: dict mapping id -> token.
    """

    def __init__(self, lang):
        # `lang` is iterated exactly once, inside create_index().
        self.lang = lang
        self.word2idx = {}
        self.idx2word = {}
        self.vocab = set()
        self.create_index()

    def create_index(self):
        """Fill `vocab`, `word2idx` and `idx2word` from `self.lang`."""
        for phrase in self.lang:
            for token in phrase.split(' '):
                self.vocab.add(token)
        self.vocab = sorted(self.vocab)

        # Id 0 is set aside for the padding token.
        self.word2idx["<pad>"] = 0
        self.idx2word[0] = "<pad>"

        for idx, token in enumerate(self.vocab, start=1):
            self.word2idx[token] = idx
            self.idx2word[idx] = token


def max_length(t):
    """Return the length of the longest element of `t`."""
    return max(map(len, t))


def clean_special_chars(text):
    """Return `text` with every character listed in `punct` removed.

    The removed set covers symbols, brackets, tab and newline; letters,
    digits, spaces and the remaining punctuation are left untouched.
    """
    punct='#$%&*+-/<=>@[\\]^_`{|}~\t\n'
    # A deletion-only translation table strips all listed characters at once.
    return text.translate(str.maketrans('', '', punct))


def generate_ngrams(corpus):
    """Build (prefix, remainder) training pairs from every line of `corpus`.

    Each line is first passed through clean_special_chars(). The cleaned
    line is then cut at every character position from 2 up to its full
    length: the text before the cut becomes the input, the text after it
    the output. Both halves are wrapped as '<start> ... <end>'.

    Returns:
        A list of [input, output] two-element lists. Lines with fewer than
        two characters after cleaning contribute no pairs.
    """
    pairs = []
    for raw_line in corpus:
        line = clean_special_chars(raw_line)
        for cut in range(2, len(line) + 1):
            prefix = '<start> ' + line[:cut] + ' <end>'
            remainder = '<start> ' + line[cut:] + ' <end>'
            pairs.append([prefix, remainder])
    return pairs

### Generate n-gram pairs with prefixes and suffixes for teacher forcing technique

In [None]:
pairs = generate_ngrams(corpus)

# Tabular view of the pairs, used only for inspection in this cell.
dummy_df = pd.DataFrame(data=pairs, columns=['input (i)', 'output (o)'])
print('Shape of n-gram pairs: {}\n'.format(dummy_df.shape))
dummy_df.head(5)

### Build word-to-index vocabularies for input / output

In [None]:
# Separate vocabularies: one over the output halves of the pairs, one over
# the input halves.
out_lang = LanguageIndex(target for _, target in pairs)
in_lang = LanguageIndex(source for source, _ in pairs)

### Convert input / output phrases to sequences of word indices

In [None]:
# Replace every space-separated token with its integer id from the
# matching vocabulary.
input_data = [[in_lang.word2idx[token] for token in source.split(' ')] for source, _ in pairs]
output_data = [[out_lang.word2idx[token] for token in target.split(' ')] for _, target in pairs]

print('input_data:', input_data[0])
print('output_data:', output_data[0])

### Calculate the max length of tokens for input and output

In [None]:
# Longest token sequence on each side; used as the padded width.
maxlen_in, maxlen_out = max_length(input_data), max_length(output_data)

print('maxlen_in:', maxlen_in)
print('maxlen_out:', maxlen_out)

### Add padding to the input and output

In [None]:
# Right-pad every sequence with zeros (the "<pad>" id) up to the longest
# sequence on its side.
input_data = pad_sequences(sequences=input_data, maxlen=maxlen_in, padding='post')
output_data = pad_sequences(sequences=output_data, maxlen=maxlen_out, padding='post')

print('input_data (padded):', input_data[0], '\n')
print('output_data (padded):', output_data[0])

### Create target data

In [None]:
# Teacher forcing: the target at step t is the decoder input at step t+1,
# so each target row is the matching output row with its first id dropped.
target_data = [row[1:].tolist() for row in output_data]
print('target_data:', target_data[:3])

# Dropping the first id leaves each row one short; pad back to maxlen_out.
target_data = pad_sequences(target_data, maxlen=maxlen_out, padding="post")
print('target_data (padded):', target_data[:3])

### Reshape target_data

In [None]:
# Append a trailing axis of size 1 to the 2-D target array.
n_samples = target_data.shape[0]
n_steps = target_data.shape[1]
target_shape = (n_samples, n_steps, 1)
print('Using target shape:', target_shape)

target_data = target_data.reshape(target_shape)

### Shuffle the data

In [None]:
# One shared permutation keeps the three arrays aligned row-for-row.
p = np.random.permutation(len(input_data))

input_data, output_data, target_data = (
    array[p] for array in (input_data, output_data, target_data)
)

print('input_data:', input_data)
print('output_data:', output_data)
print('target_data:', target_data)

### Configuration parameters

In [None]:
# Number of training samples. NOTE(review): not referenced by any other
# cell visible in this notebook.
BUFFER_SIZE = len(input_data)
# Mini-batch size passed to model.fit().
BATCH_SIZE = 128
# Width of the encoder and decoder Embedding layers.
embedding_dim = 300
# LSTM units per encoder direction; the decoder LSTM uses units*2.
units = 128
# Vocabulary sizes, including the "<pad>" entry at id 0.
vocab_in_size = len(in_lang.word2idx)
vocab_out_size = len(out_lang.word2idx)
# Sparse variants take integer class ids as targets instead of one-hot rows.
loss = 'sparse_categorical_crossentropy'
metrics = ['sparse_categorical_accuracy']

### Build model

In [None]:
# Create the Encoder layers first.
encoder_inputs = Input(shape=(maxlen_in,))
encoder_emb = Embedding(input_dim=vocab_in_size, output_dim=embedding_dim)

# Use this if you dont need Bidirectional LSTM
# encoder_lstm = CuDNNLSTM(units=units, return_sequences=True, return_state=True)
# encoder_out, state_h, state_c = encoder_lstm(encoder_emb(encoder_inputs))

encoder_lstm = Bidirectional(CuDNNLSTM(units=units, return_sequences=True, return_state=True))
encoder_out, fstate_h, fstate_c, bstate_h, bstate_c = encoder_lstm(encoder_emb(encoder_inputs))
state_h = Concatenate()([fstate_h,bstate_h])
state_c = Concatenate()([bstate_h,bstate_c])
encoder_states = [state_h, state_c]

# Now create the Decoder layers.
decoder_inputs = Input(shape=(None,))
decoder_emb = Embedding(input_dim=vocab_out_size, output_dim=embedding_dim)
decoder_lstm = CuDNNLSTM(units=units*2, return_sequences=True, return_state=True)
decoder_lstm_out, _, _ = decoder_lstm(decoder_emb(decoder_inputs), initial_state=encoder_states)

# Two dense layers added to this model to improve inference capabilities.
decoder_d1 = Dense(units, activation="relu")
decoder_d2 = Dense(vocab_out_size, activation="softmax")
decoder_out = decoder_d2(Dropout(rate=.2)(decoder_d1(Dropout(rate=.2)(decoder_lstm_out))))

# Finally, create a training model which combines the encoder and the decoder.
# Note that this model has three inputs:
model = Model(inputs=[encoder_inputs, decoder_inputs], outputs=decoder_out)

opt = tf.train.AdamOptimizer()

# We'll use sparse_categorical_crossentropy so we don't have to expand decoder_out into a massive one-hot array.
model.compile(optimizer=opt, loss=loss, metrics=metrics)
model.summary()

### Train model

In [None]:
epochs = 10
# Only the first N (already shuffled) samples are used for fitting.
N = 100000

# The last 20% of the selected samples are held out for validation.
history = model.fit(
    x=[input_data[:N], output_data[:N]],
    y=target_data[:N],
    batch_size=BATCH_SIZE,
    epochs=epochs,
    validation_split=0.2,
)

### Plot training vs. validation loss for signs of overfitting

In [None]:
# One curve per recorded loss series, in the order training -> validation.
for key, label in (('loss', "Training loss"), ('val_loss', "Validation loss")):
    plt.plot(history.history[key], label=label)
plt.legend()
plt.show()

### Create encoder and decoder models for inference

In [None]:
# Create the encoder model from the tensors we previously declared.
encoder_model = Model(encoder_inputs, [encoder_out, state_h, state_c])

# Generate a new set of tensors for our new inference decoder. Note that we are using new tensors, 
# this does not preclude using the same underlying layers that we trained on. (e.g. weights/biases).
inf_decoder_inputs = Input(shape=(None,), name="inf_decoder_inputs")

# We'll need to force feed the two state variables into the decoder each step.
state_input_h = Input(shape=(units*2,), name="state_input_h")
state_input_c = Input(shape=(units*2,), name="state_input_c")

decoder_res, decoder_h, decoder_c = decoder_lstm(
    decoder_emb(inf_decoder_inputs), 
    initial_state=[state_input_h, state_input_c])

inf_decoder_out = decoder_d2(decoder_d1(decoder_res))

inf_model = Model(inputs=[inf_decoder_inputs, state_input_h, state_input_c], 
                  outputs=[inf_decoder_out, decoder_h, decoder_c])

### Methods for inference

In [None]:
def sentence_to_vector(sentence, lang):
    """
    Encode `sentence` as a fixed-width vector of word ids.

    The sentence is split on single spaces and every token is looked up in
    `lang.word2idx`. The ids are written to the front of a zero vector of
    length `maxlen_in` (a notebook-level value), so unused positions stay 0.
    Output is 1-D: [timesteps/words]

    Raises KeyError for a token missing from `lang.word2idx` and IndexError
    when the sentence has more than `maxlen_in` tokens.
    """

    vec = np.zeros(maxlen_in)
    word_ids = [lang.word2idx[token] for token in sentence.split(' ')]
    for position, word_id in enumerate(word_ids):
        vec[position] = word_id
    return vec

def translate(input_sentence, infenc_model, infmodel):
    """
    Greedily decode a completion for `input_sentence`.

    Args:
        input_sentence: the text typed so far, as a space-separated string.
        infenc_model: encoder model returning [sequence output, h, c].
        infmodel: decoder model taking [token ids, h, c] and returning
            [token probabilities, new h, new c].

    Returns:
        The predicted words, each preceded by a single space. Decoding stops
        when "<end>" is predicted or after maxlen_out - 1 decoder steps.
    """

    # Every encoder input seen during training was wrapped as
    # '<start> ... <end>' by generate_ngrams(); wrap bare input the same way
    # so the encoder is fed what it was trained on.
    if not input_sentence.startswith('<start> '):
        input_sentence = '<start> ' + input_sentence + ' <end>'

    sv = sentence_to_vector(input_sentence, in_lang)
    sv = sv.reshape(1, len(sv))
    # The encoder's sequence output is not needed; only its states seed the decoder.
    _, sh, sc = infenc_model.predict(x=sv)

    # The decoder is fed one token at a time, starting from "<start>".
    cur_vec = np.zeros((1, 1))
    cur_vec[0, 0] = out_lang.word2idx["<start>"]
    cur_word = "<start>"
    output_sentence = ""

    steps = 0
    while cur_word != "<end>" and steps < (maxlen_out - 1):
        steps += 1
        # A word is appended one iteration after it was predicted, so that
        # "<start>" and the terminating "<end>" never reach the output.
        if cur_word != "<start>":
            output_sentence = output_sentence + " " + cur_word
        nvec, sh, sc = infmodel.predict(x=[cur_vec, sh, sc])
        # Greedy choice: the most probable id becomes the next decoder input.
        next_id = np.argmax(nvec[0, 0])
        cur_vec[0, 0] = next_id
        cur_word = out_lang.idx2word[next_id]

    return output_sentence

### Run tests to see how the model performs (we want inference < 100ms)

In [None]:
# Note that only words that we've trained the model on will be available, otherwise you'll get an error.

# Each entry is a typed prefix; the trailing comment gives the full phrase
# the model is hoped to complete it to.
test = [
    'discha',  # discharge summary
    'left v',  # left ventricular hypertrophy
    'no ch',   # no change from previous
    'ventr',   # ventricular paced
    'no sig',  # no significant change
    'previ',   # previous tracing
    'no ma',   # no major change
    'sinu',    # sinus rhythm
    'R wav',   # R wave progression
    'hydroc',  # hydrochlorothiazide
]

output = []
for t in test:
    # The corpus was lower-cased before training, so do the same here.
    input_seq = t.lower()

    # Time only the translate() call itself.
    t0 = datetime.now()
    pred_seq = translate(input_seq, encoder_model, inf_model)
    t1 = datetime.now()
    print('Inference time:', (t1-t0).total_seconds())

    output.append({"Input Sequence": input_seq, "Predicted Sequence": pred_seq})

results_df = pd.DataFrame(output)
results_df.head(len(test))

### Save the model to JSON and weights to H5 (change save_path to your location)

In [None]:
save_path = '/content/drive/My Drive/3 Reference/TypeAssist/model_1'
json_path = '{}.json'.format(save_path)
weights_path = '{}.h5'.format(save_path)

# Refuse to overwrite a previously saved model: both output files are checked
# so that a stale weights file cannot be silently replaced either.
if os.path.exists(json_path) or os.path.exists(weights_path):
    raise FileExistsError('WARNING. Save path exists. Please increment the model number.')

# Serialize the architecture of the inference decoder (`inf_model`) to JSON.
# NOTE(review): `encoder_model` is not written by this cell.
model_json = inf_model.to_json()

with open(json_path, 'w') as f:
    f.write(model_json)

# Serialize the inference decoder's weights to HDF5.
inf_model.save_weights(weights_path)