In [None]:
!pip install pyhealth inflect autocorrect torchtext gensim==3.6.0

In [None]:
import numpy as np
import pandas as pd
from pyhealth.medcode import InnerMap
from pyhealth.datasets import MIMIC4Dataset

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import os
import csv
import pickle
import inflect
from autocorrect import spell
from collections import OrderedDict


import gensim
from gensim.models import Word2Vec
import pickle

import torch
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import statistics
# for progress bar
from tqdm import tqdm_notebook
import random
import json
import tqdm
from sklearn.metrics import *

import torch
import torch.nn as nn
import torch.nn.functional as F

# Seed every random number generator this notebook relies on.
seed = 24
# NOTE(review): hash randomisation is normally fixed at interpreter start-up, so
# this assignment likely only affects child processes started later - confirm.
os.environ["PYTHONHASHSEED"] = str(seed)
for _seed_fn in (random.seed, np.random.seed, torch.manual_seed):
    _seed_fn(seed)
del _seed_fn

In [None]:
def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``.

    NOTE(security): ``pickle.load`` can execute arbitrary code while
    unpickling, so only use this on files produced by this project.
    """
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Tokenized notes (reversed and original order, going by the file names) and their masks.
rev_input_ids = _load_pickle('data/embeddings/rev_tokenized_notes.pckl')
input_ids = _load_pickle('data/embeddings/tokenized_notes.pckl')
masks = _load_pickle('data/embeddings/masks.pckl')

# Pretrained embedding matrices, converted to tensors right after loading.
embedding_matrix_GNV = torch.tensor(_load_pickle('data/embeddings/embedding_matrix_GNV.pckl'))
embedding_matrix_w2v = torch.tensor(_load_pickle('data/embeddings/embedding_matrix_w2v.pckl'))

# Vocabulary index, maximum note length, labels and the notes dataframe.
word2idx = _load_pickle('data/embeddings/word_index_eff.pckl')
normal_max_len = _load_pickle('data/embeddings/max_len_eff.pckl')
labels = _load_pickle('data/labels.pckl')
df_notes_discharge = _load_pickle('data/df_notes_discharge.pckl')

# Optional, currently unused:
# pretrain = _load_pickle('data/embeddings/pretrain.pckl')

In [None]:
# Sanity-check the sizes of the loaded artifacts.
print('pretrained GNV num embeddings ', len(embedding_matrix_GNV))
print('GNV embedding dimensions ', len(embedding_matrix_GNV[0]))

print('pretrained w2v num embeddings ', len(embedding_matrix_w2v))
print('w2v embedding dimensions ', len(embedding_matrix_w2v[0]))

print('len encoded notes, or total notes is ', len(input_ids))
print('len of first note is ', len(input_ids[0]))
print('max len is ', normal_max_len)
# Message fixed: it previously read "len or word index, or total bique words".
print('len of word index, or total unique words is ', len(word2idx))

In [None]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split


def data_loader(x_train, x_test, rev_x_train, rev_x_test, masks_train, masks_test, y_train, y_test, batch_size=8):
    """Build the training and validation DataLoaders.

    Every argument is converted with ``torch.tensor``. Each dataset yields
    ``(x, rev_x, masks, y)`` tuples. Training batches are drawn with a
    ``RandomSampler``; validation batches are read in order with a
    ``SequentialSampler``.

    Returns:
        (train_dataloader, val_dataloader)
    """

    def _as_dataset(*columns):
        # One tensor per column, bundled so rows stay aligned.
        return TensorDataset(*(torch.tensor(column) for column in columns))

    # The reversed inputs are passed in by the caller (padding at their end).
    training_set = _as_dataset(x_train, rev_x_train, masks_train, y_train)
    validation_set = _as_dataset(x_test, rev_x_test, masks_test, y_test)

    training_loader = DataLoader(
        training_set,
        sampler=RandomSampler(training_set),
        batch_size=batch_size,
    )
    validation_loader = DataLoader(
        validation_set,
        sampler=SequentialSampler(validation_set),
        batch_size=batch_size,
    )
    return training_loader, validation_loader

    
 # Train Test Split
# Split the four parallel arrays in one call, holding out 33% for validation.
(
    x_train, x_test,
    rev_x_train, rev_x_test,
    masks_train, masks_test,
    y_train, y_test,
) = train_test_split(
    input_ids, rev_input_ids, masks, labels,
    test_size=0.33,
    random_state=seed,
)

# Wrap the splits in PyTorch DataLoaders.
batch_size = 8
train_dataloader, val_dataloader = data_loader(
    x_train, x_test,
    rev_x_train, rev_x_test,
    masks_train, masks_test,
    y_train, y_test,
    batch_size=batch_size,
)

In [None]:
def find_last_word_before_pad(output, masks):
    """
    Select, for each sequence in the batch, the hidden state of its last non-pad step.

    Arguments:
        output: the hidden states of each encoded note sequence of shape (batch_size, notes sequence size, hidden_size)
        masks: the padding masks of shape (batch_size, notes sequence size); non-zero marks a real
               token, zero marks padding. Real tokens are expected first, padding at the end.

    Outputs:
        last_hidden_state: the hidden state for the last word before padding of shape (batch_size, hidden_size)

    The mask is reduced to a vector of shape (batch_size,) holding the true note lengths, and
    ``length - 1`` is used as the index of the last word.

    The previous implementation used ``argmin`` on the mask. That returns the position of the
    FIRST pad token (one step too late) and returns 0 for a row with no padding at all.
    """
    # True length of every note = number of non-zero mask entries.
    lengths = (masks != 0).sum(dim=1)

    # Index of the last real token. Clamped so that an all-padding row falls back to step 0
    # and a mask longer than ``output`` cannot index past the end.
    last_step = output.shape[1] - 1
    lv_idx = (lengths - 1).clamp(min=0, max=last_step).long()

    batch_idx = torch.arange(output.shape[0], device=lv_idx.device)
    return output[batch_idx, lv_idx, :]
        

In [None]:
class BidirectionalRNN(nn.Module):
    """Note classifier that encodes a note in forward and reversed token order.

    Both orders go through the same embedding, the optional Conv1d + MaxPool1d
    reduction and the same unidirectional LSTM (weights are shared between the
    two directions). The hidden state at the last non-pad step of each
    direction is concatenated and fed to a linear output layer.
    """

    def __init__(self
                 ,pretrained_embedding
                 ,num_classes=2
                 ,dropout=0.2
                 ,freeze_embedding=False
                 ,embedding_dim=300
                 ,conv_reduce=True
                 ,conv_pool=2
                 ,num_words=None):
        """
        Arguments:
            pretrained_embedding: 2-D tensor (vocab size, embedding dim), or None to train an
                embedding from scratch
            num_classes: 2 gives a single sigmoid output, otherwise a softmax over num_classes
            dropout: dropout probability applied to the embedded tokens
            freeze_embedding: passed to nn.Embedding.from_pretrained as ``freeze``
            embedding_dim: embedding size, used only when pretrained_embedding is None
            conv_reduce: shorten the sequence with Conv1d + MaxPool1d before the LSTM
            conv_pool: kernel size of the max pooling
            num_words: vocabulary size, required when pretrained_embedding is None
        """
        super().__init__()

        self.name = "bi_rnn_model"

        # With a pretrained matrix, the vocabulary size and embedding dim come from its shape.
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding)
        else:
            if num_words is None:
                raise ValueError("num_words is required when pretrained_embedding is None")
            self.vocab_size = num_words
            # Fixed: this branch referenced an undefined name ``embed_dim``.
            self.embed_dim = embedding_dim
            self.embedding = nn.Embedding(num_embeddings=num_words, embedding_dim=self.embed_dim, padding_idx=0)

        self.dropout = nn.Dropout(p=dropout)

        self.conv_reduce = conv_reduce
        self.input_size = self.embed_dim
        self.hidden_size = self.embed_dim
        if conv_reduce:
            self.conv1 = nn.Conv1d(self.input_size, 32, kernel_size=3)
            self.pool = nn.MaxPool1d(conv_pool)
            self.input_size = 32
            self.hidden_size = 128

        # Unidirectional on purpose: the reverse direction is run explicitly in forward() so that
        # the padding mask can be applied to each direction.
        # batch_first=True -> tensors are (batch, seq, feature).
        self.rnn = nn.LSTM(input_size=self.input_size, hidden_size=self.hidden_size, batch_first=True, bidirectional=False)

        # Input is hidden_size * 2 because both directions are concatenated.
        if num_classes == 2:
            # binary
            self.fc = nn.Linear(self.hidden_size * 2, 1)
            self.out = nn.Sigmoid()
        else:
            self.fc = nn.Linear(self.hidden_size * 2, num_classes)
            # Explicit class dimension; the implicit-dim form is deprecated.
            self.out = nn.Softmax(dim=1)

        nn.init.kaiming_normal_(self.fc.weight, nonlinearity='relu')

    def _pool_masks(self, masks, seq_len):
        """Pool the padding mask like the features and cut it to ``seq_len`` steps."""
        pooled = self.pool(masks.float().unsqueeze(1)).squeeze(1).int()
        # The convolution (kernel 3, no padding) shortens the features before pooling, so the
        # pooled mask can be longer than the feature sequence.
        return pooled[:, :seq_len]

    def _encode(self, tokens, masks):
        """Encode one direction and return the hidden state at its last non-pad step."""
        h = self.embedding(tokens).float()
        h = self.dropout(h)

        if self.conv_reduce:
            # nn.Conv1d expects (batch, channels, length).
            h = h.permute(0, 2, 1)
            h = F.relu(self.conv1(h))
            h = self.pool(h)
            # Back to (batch, length, channels) for the LSTM.
            h = h.permute(0, 2, 1)
            masks = self._pool_masks(masks, h.shape[1])

        output, _ = self.rnn(h)
        last_state = find_last_word_before_pad(output, masks)
        if not self.conv_reduce:
            # No activation was applied by a conv layer, so apply it here.
            last_state = F.relu(last_state)
        return last_state

    def forward(self, x, rev_x, masks):
        '''
        Arguments:
            x: the tokenized notes or input_ids of shape (batch_size, max_len)
            rev_x: the same notes in reversed token order, padding at the end
            masks: the padding masks of shape (batch_size, max_len)

        Outputs:
            probs: probabilities of shape (batch_size) for the binary model,
                   (batch_size, num_classes) otherwise
        '''
        batch_size = x.shape[0]

        # Each direction pools its own copy of the ORIGINAL mask. Previously the mask pooled for
        # the forward pass was pooled a second time before the reverse pass.
        true_h_n = self._encode(x, masks)
        rev_true_h_n = self._encode(rev_x, masks)

        # Concatenate the hidden states of both directions.
        both = torch.cat([true_h_n, rev_true_h_n], 1)
        probs = self.out(self.fc(both))
        if probs.shape[1] == 1:
            # Only the single-logit binary output is flattened to (batch_size,).
            probs = probs.view(batch_size)
        return probs


In [None]:
def save_checkpoint(model, optimizer, epoch, loss, filename='models/bi_rnn_model_checkpoint.torch'):
    """Write a training checkpoint to ``filename``.

    The checkpoint is a dict with keys 'epoch', 'model_state_dict',
    'optimizer_state_dict' and 'loss'. Missing parent directories are created.
    """
    target_dir = os.path.dirname(filename)
    if target_dir:
        os.makedirs(target_dir, exist_ok=True)

    # Store scalar losses (numpy scalars, 0-d tensors) as plain Python numbers so the
    # checkpoint contains no numpy objects, which torch.load's weights-only mode can reject.
    if hasattr(loss, 'item') and getattr(loss, 'ndim', 0) == 0:
        loss = loss.item()

    torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, filename)

def load_checkpoint(model, optimizer, filename='models/bi_rnn_model_checkpoint.torch'):
    """Restore model and optimizer state from ``filename`` if that file exists.

    ``model`` and ``optimizer`` must already be constructed; only their states
    are updated, in place. Returns ``(model, optimizer, epoch)``, where epoch
    is the value stored in the checkpoint, or 0 when no file was found.
    """
    # Nothing to resume from: report it and return the inputs unchanged.
    if not os.path.isfile(filename):
        print("=> no checkpoint found at '{}'".format(filename))
        return model, optimizer, 0

    print("=> loading checkpoint '{}'".format(filename))
    state = torch.load(filename)
    resumed_epoch = state['epoch']
    model.load_state_dict(state['model_state_dict'])
    optimizer.load_state_dict(state['optimizer_state_dict'])
    print("=> loaded checkpoint '{}' (epoch {}) loss {}"
              .format(filename, state['epoch'], state['loss']))
    return model, optimizer, resumed_epoch

In [None]:
class EarlyStopper:
    """Tracks a loss over successive calls and signals when to stop training.

    Two independent criteria share the same ``counter``:
    ``early_stop_min`` compares against the best (lowest) loss seen so far,
    ``early_stop`` compares against the loss from the previous call.
    """

    def __init__(self, patience=2, min_delta=0):
        # patience: number of counted degradations before stopping
        # min_delta: margin a loss must exceed the reference by to count
        self.patience, self.min_delta = patience, min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.prev_validation_loss = np.inf

    def early_stop_min(self, validation_loss):
        """Return True once the loss exceeded the best loss (+ min_delta) ``patience`` times
        since the last new best."""
        best_so_far = self.min_validation_loss
        if validation_loss < best_so_far:
            # New best: remember it and start counting again.
            self.min_validation_loss = validation_loss
            self.counter = 0
            return False

        if validation_loss > (best_so_far + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False

    def early_stop(self, validation_loss):
        """Return True once the loss rose above the previous call's loss (+ min_delta)
        ``patience`` times without an intervening decrease."""
        previous = self.prev_validation_loss

        if validation_loss < previous:
            # Any improvement over the previous call resets the count.
            self.counter = 0
        if validation_loss > (previous + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True

        self.prev_validation_loss = validation_loss
        return False

In [None]:
def eval_model(model, dataloader, loss_fn=None):
    """Evaluate ``model`` on every batch of ``dataloader``.

    Arguments:
        model: called as ``model(x, rev_x, masks)``; its output is thresholded at 0.5
        dataloader: yields ``(x, rev_x, masks, target)`` batches
        loss_fn: loss called as ``loss_fn(scores, target.float())``. When None, the
            notebook-level ``criterion`` is used (the original behaviour).

    Returns:
        (val_loss, precision, recall, f1, accuracy), where val_loss is the mean of the
        per-batch losses and precision/recall/f1 use weighted averaging.

    The model is switched to eval mode for the pass and then restored to whatever mode it
    was in before, so calling this in the middle of training does not leave dropout disabled.
    """
    if loss_fn is None:
        loss_fn = criterion

    was_training = model.training
    model.eval()
    Y_pred = []
    Y_true = []
    batch_loss = []
    try:
        with torch.no_grad():
            for x, rev_x, masks, target in dataloader:
                Y_scores = model(x, rev_x, masks)
                loss = loss_fn(Y_scores, target.float())
                batch_loss.append(loss.cpu().data.numpy())
                # Binary decision: scores above 0.5 are predicted as class 1.
                Y_pred.append((Y_scores > .5).int().cpu().numpy())
                Y_true.append(target.cpu().numpy())
    finally:
        # Give the model back in the mode the caller had it in.
        model.train(was_training)

    val_loss = np.mean(batch_loss)
    Y_pred = np.concatenate(Y_pred, axis=0)
    Y_true = np.concatenate(Y_true, axis=0)
    # precision, recall, f1 (weighted) and accuracy
    p, r, f, _ = precision_recall_fscore_support(Y_true, Y_pred, average='weighted')
    a = accuracy_score(Y_true, Y_pred)

    return val_loss, p, r, f, a

In [None]:
#TRAIN

n_epochs = 50


def _append_train_log(model, *lines):
    """Append each of ``lines`` to the model's training log file."""
    with open(model.name + "train_out", 'a') as f:
        for line in lines:
            f.write(line + '\n')


def _validation_summary(model, epoch_number):
    """Evaluate on the notebook-level ``val_dataloader`` and format the metrics line."""
    val_loss, p, r, f1, a = eval_model(model, val_dataloader)
    return 'Epoch: {} \t Validation loss: {:.2f}, precision: {:.2f}, recall:{:.2f}, f1_score: {:.2f}, accuracy: {:.2f}'.format(epoch_number, val_loss, p, r, f1, a)


def train_model(model, train_dataloader, current_epoch=0, n_epochs=n_epochs, optimizer=None, criterion=None):
    """Train ``model`` for up to ``n_epochs`` epochs, starting at ``current_epoch``.

    Arguments:
        model: called as ``model(x, rev_x, masks)``; ``model.name`` prefixes the log file name
        train_dataloader: yields ``(x, rev_x, masks, target)`` batches
        current_epoch: epoch number to resume counting from
        n_epochs: maximum number of epochs to run in this call
        optimizer, criterion: both required despite their None defaults

    Returns:
        (model, last_epoch, last_epoch_loss)

    Validation metrics come from the notebook-level ``val_dataloader``. They are logged every
    10th epoch and once more when early stopping triggers. Early stopping is driven by the
    mean TRAINING loss of the epoch, not by the validation loss.

    Raises:
        ValueError: if ``optimizer`` or ``criterion`` is None.
    """
    if optimizer is None or criterion is None:
        raise ValueError("train_model requires both an optimizer and a criterion")

    model.train() # prep model for training
    last_epoch = 0
    last_epoch_loss = 0
    early_stopper = EarlyStopper(patience=2, min_delta=0)
    for epoch in range(current_epoch, current_epoch + n_epochs):
        # Re-enter training mode every epoch: the validation pass of an earlier epoch may have
        # left the model in eval mode, which would silently disable dropout.
        model.train()
        last_epoch = epoch + 1
        curr_epoch_loss = []
        for x_train, rev_x_train, masks_train, target in train_dataloader:
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(x_train, rev_x_train, masks_train)
            loss = criterion(outputs, target.float())
            loss.backward()
            optimizer.step()
            curr_epoch_loss.append(loss.cpu().data.numpy())

        last_epoch_loss = np.mean(curr_epoch_loss)
        echo_out = f"Epoch {last_epoch}: curr_epoch_loss={last_epoch_loss}"
        print(echo_out)
        _append_train_log(model, echo_out)

        if (epoch + 1) % 10 == 0:
            val_echo_out = _validation_summary(model, epoch + 1)
            print(val_echo_out)
            _append_train_log(model, val_echo_out)

        if early_stopper.early_stop(last_epoch_loss):
            val_echo_out = _validation_summary(model, epoch + 1)
            print(val_echo_out)
            _append_train_log(model, val_echo_out, 'EARLY STOP')
            break

    return model, last_epoch, last_epoch_loss

In [None]:
# Binary cross-entropy on the model's sigmoid output.
criterion = torch.nn.BCELoss()

# Set to True to resume from the pickled model and the saved checkpoint.
continued = False

if continued:
    # NOTE(security): pickle.load can execute arbitrary code; only load files written by this notebook.
    with open('models/bi_rnn_model.pckl', 'rb') as f:
        bi_rnn_model = pickle.load(f)
    bi_rnn_optimizer = torch.optim.Adam(bi_rnn_model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.0, amsgrad=False)
    # Fixed: the results were previously bound to leftover ``cnn_model`` / ``cnn_optimizer`` names.
    bi_rnn_model, bi_rnn_optimizer, current_epoch = load_checkpoint(bi_rnn_model, bi_rnn_optimizer)
else:
    bi_rnn_model = BidirectionalRNN(pretrained_embedding = embedding_matrix_GNV, conv_reduce=False)
    bi_rnn_optimizer = torch.optim.Adam(bi_rnn_model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=0.0, amsgrad=False)
    current_epoch = 0

bi_rnn_model, last_epoch, last_epoch_loss = train_model(bi_rnn_model, train_dataloader, current_epoch=current_epoch, optimizer=bi_rnn_optimizer, criterion=criterion)

# Make sure the output directory exists so a long training run is not lost at save time.
os.makedirs('models', exist_ok=True)
with open('models/bi_rnn_model.pckl', 'wb') as f:
    pickle.dump(bi_rnn_model, f)

save_checkpoint(bi_rnn_model, bi_rnn_optimizer, last_epoch, last_epoch_loss)