In [1]:
!pip install pyhealth inflect autocorrect torchtext gensim==3.6.0

Looking in indexes: https://pypi.org/simple, https://pip.repos.neuron.amazonaws.com


In [2]:
import numpy as np
import pandas as pd
from pyhealth.medcode import InnerMap
from pyhealth.datasets import MIMIC4Dataset

import nltk
nltk.download('stopwords')
nltk.download('punkt')
import re
import string
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import RegexpTokenizer
import os
import csv
import pickle
import inflect
from autocorrect import spell
from collections import OrderedDict


import gensim
from gensim.models import Word2Vec
import pickle

import torch
import torchtext
from torchtext.data import get_tokenizer
import numpy as np
import statistics
# for progress bar
from tqdm import tqdm_notebook
import random
import json
import tqdm
from sklearn.metrics import *

import torch
import torch.nn as nn
import torch.nn.functional as F

# set seed
seed = 24
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)
os.environ["PYTHONHASHSEED"] = str(seed)

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/ec2-user/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package punkt to /home/ec2-user/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [3]:
# data prep
with open('data/8000/labels.pckl', 'rb') as f:
    labels = pickle.load(f)

with open('data/8000/df_notes_discharge.pckl', 'rb') as f:
    df_notes_discharge = pickle.load(f)
    
with open('data/8000/cleaned_notes.pckl', 'rb') as f:
    notes = pickle.load(f)

# embeddings 
with open('data/8000/embeddings/tokenized_notes.pckl', 'rb') as f:
    input_ids = pickle.load(f)

with open('data/8000/embeddings/embedding_matrix_GNV.pckl', 'rb') as f:
    embedding_matrix_GNV = pickle.load(f)
    embedding_matrix_GNV = torch.tensor(embedding_matrix_GNV)

with open('data/8000/embeddings/embedding_matrix_w2v.pckl', 'rb') as f:
    embedding_matrix_w2v = pickle.load(f)
    embedding_matrix_w2v = torch.tensor(embedding_matrix_w2v)

with open('data/8000/embeddings/word_index_eff.pckl', 'rb') as f:
    word2idx = pickle.load(f)

with open('data/8000/embeddings/max_len_eff.pckl', 'rb') as f:
    normal_max_len = pickle.load(f)


In [4]:
print('pretrained GNV num embeddings ', len(embedding_matrix_GNV))
print('GNV embedding dimensions ', len(embedding_matrix_GNV[0]))

print('pretrained w2v num embeddings ', len(embedding_matrix_w2v))
print('w2v embedding dimensions ', len(embedding_matrix_w2v[0]))

print('len encoded notes, or total notes is ', len(input_ids))
print('len of first note is ', len(input_ids[0]))
print('max len is ', normal_max_len)
print('len or word index, or total unique words is ', len(word2idx))

pretrained GNV num embeddings  61122
GNV embedding dimensions  300
pretrained w2v num embeddings  61122
w2v embedding dimensions  300
len encoded notes, or total notes is  8000
len of first note is  2629
max len is  2629
len or word index, or total unique words is  61122


In [5]:
from torch.utils.data import (TensorDataset, DataLoader, RandomSampler, SequentialSampler)
from sklearn.model_selection import train_test_split


def data_loader(x_train, x_test, y_train, y_test, batch_size=8):
    """Convert train and test sets to tensors and load them to a dataLoader
    """

    # Convert data type to torch.Tensor
    x_train, x_test, y_train, y_test = tuple(torch.tensor(data) for data in [x_train, x_test, y_train, y_test])

    # Create DataLoader for training data
    train_data = TensorDataset(x_train, y_train)
    train_sampler = RandomSampler(train_data)
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    # Create DataLoader for validation data
    val_data = TensorDataset(x_test, y_test)
    val_sampler = SequentialSampler(val_data)
    val_dataloader = DataLoader(val_data, sampler=val_sampler, batch_size=batch_size)

    return train_dataloader, val_dataloader

    
 # Train Test Split
x_train, x_test, y_train, y_test = train_test_split(input_ids, labels, test_size=0.33, random_state=seed)

batch_size=8
# Load data to PyTorch DataLoader
train_dataloader, val_dataloader = data_loader(x_train, x_test, y_train, y_test, batch_size=batch_size)

In [6]:
class CNN(nn.Module):
    def __init__(self,
                 pretrained_embedding=None,
                 freeze_embedding=True,
                 vocab_size=None,
                 embed_dim=300,
                 num_classes=2,
                 dropout=0.2):
        
        super(CNN, self).__init__()
        self.name = "cnn_model"
        # embeddings
        # we have 2 different pretrained, always using pretrained for this analysis
        if pretrained_embedding is not None:
            self.vocab_size, self.embed_dim = pretrained_embedding.shape
            self.embedding = nn.Embedding.from_pretrained(pretrained_embedding, freeze=freeze_embedding)
        else:
            self.embed_dim = embed_dim
            self.embedding = nn.Embedding(num_embeddings=vocab_size, embedding_dim=self.embed_dim, padding_idx=0)
            
        self.filters_out = 128
        # Conv Network
        self.conv1 =  nn.Conv1d(self.embed_dim, self.filters_out, kernel_size=5)
        self.pool = nn.AdaptiveMaxPool1d(1)
        self.fc1 = nn.Linear(self.filters_out,10)
        self.dropout = nn.Dropout(p=dropout)

        #output layer
        # we are always using binomial, ICD9 is more prevalent than ICD10, use either to find tobacco use
        if num_classes == 2:
            #binary
            self.fc2 = nn.Linear(10, 1)
            self.out = nn.Sigmoid()
        else:
            self.fc2 = nn.Linear(10, num_classes)
            self.out = nn.Softmax()         
                
        nn.init.kaiming_normal_(self.fc1.weight, nonlinearity='relu')
        nn.init.kaiming_normal_(self.fc2.weight, nonlinearity='relu')
        

    def forward(self, input_ids):
        
        # get embeddings. Output shape: (b, max_len, embed_dim)
        batch_size = input_ids.shape[0]
        x = self.embedding(input_ids).float()
        #print(x)
        # Permute to match input shape requirement of nn.Conv1d. Output shape: (b, embed_dim, max_len)
        x = x.permute(0, 2, 1)
        x = F.relu(self.conv1(x))

        #adaptive Max pooling. Output shape: (b, self.filters_out, 1)
        #input is a,b,in output is a,b,out
        x = self.pool(x)
        
        # Output shape: (b, self.filters_out)
        x = x.squeeze()
        #x = self.dropout(x)
        x = F.relu(self.fc1(x))
        x = self.dropout(x)
        x = self.fc2(x)
        # final output activation function
        x = self.out(x)
        x = x.view(batch_size)

        return x

In [7]:
class EarlyStopper:
    def __init__(self, patience=2, min_delta=0):
        self.patience = patience
        self.min_delta = min_delta
        self.counter = 0
        self.min_validation_loss = np.inf
        self.prev_validation_loss = np.inf
        self.f1 = 0

    def early_stop_min(self, validation_loss):
        if validation_loss < self.min_validation_loss:
            self.min_validation_loss = validation_loss
            self.counter = 0
        elif validation_loss > (self.min_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False
    
    def early_stop_min_f1(self, f1):
        if f1 >= self.f1:
            self.f1 = f1
            self.counter = 0
        elif f1 < (self.f1 - self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True
        return False
    
    def early_stop(self, validation_loss):
        if validation_loss < self.prev_validation_loss:
            self.counter = 0
        if validation_loss > (self.prev_validation_loss + self.min_delta):
            self.counter += 1
            if self.counter >= self.patience:
                return True 
        self.prev_validation_loss = validation_loss
        return False

In [8]:
def save_checkpoint(model, optimizer, epoch, loss, filename='models/cnn_model_checkpoint.torch'):
    torch.save({
                'epoch': epoch,
                'model_state_dict': model.state_dict(),
                'optimizer_state_dict': optimizer.state_dict(),
                'loss': loss,
                }, filename)

def load_checkpoint(model, optimizer, filename='models/cnn_model_checkpoint.torch'):
    # Note: Input model & optimizer should be pre-defined.  This routine only updates their states.
    epoch = 0
    if os.path.isfile(filename):
        print("=> loading checkpoint '{}'".format(filename))
        checkpoint = torch.load(filename)
        epoch = checkpoint['epoch']
        model.load_state_dict(checkpoint['model_state_dict'])
        optimizer.load_state_dict(checkpoint['optimizer_state_dict'])
        print("=> loaded checkpoint '{}' (epoch {}) loss {}"
                  .format(filename, checkpoint['epoch'], checkpoint['loss']))
    else:
        print("=> no checkpoint found at '{}'".format(filename))

    return model, optimizer, epoch

In [9]:
def eval_model(model, dataloader):
    model.eval()
    Y_pred = []
    Y_scores = []
    Y_true = []
    batch_loss = []
    all_target = []
    with torch.no_grad():
        for x, target in dataloader:
            # your code here
            target = target.float()
            Y_true.append(target)
            Y_score = model(x)
            #print('calcing loss in eval',Y_score,target)
            loss = criterion(Y_score, target)
            #print('loss is', loss)
            batch_loss.append(loss.item())
            # the class with the highest energy is what we choose as prediction
            predicted = (Y_score > .5).int()
            Y_pred.append(predicted)
            Y_scores.append(Y_score)
            all_target.append(target)
        val_loss = np.mean(batch_loss)
        #print('mean loss is', val_loss)
        Y_pred = np.concatenate(Y_pred, axis=0)
        Y_true = np.concatenate(Y_true, axis=0)
        all_target = np.concatenate(all_target, axis=0)
        #f1 precision recall accuracy
        #print('Y_true len',len(Y_true), 'Y_pred len', len(Y_pred))
        print('sum Y pred', sum(Y_pred), 'sum target', sum(all_target))
        #print('all Y_scores', np.concatenate(Y_scores, axis=0))
        p, r, f, _ = precision_recall_fscore_support(Y_true, Y_pred, average='weighted')
        a = accuracy_score(Y_true, Y_pred)
    
    return val_loss, p, r, f, a

In [10]:
#TRAIN

n_epochs = 50

def train_model(model, train_dataloader, current_epoch=0, n_epochs=n_epochs, optimizer=None, criterion=None):

    model.train() # prep model for training
    last_epoch = 0
    last_epoch_loss = 0
    early_stopper = None
    early_stopper = EarlyStopper(patience=2, min_delta=0.01)
    for epoch in range(current_epoch, current_epoch + n_epochs):
        last_epoch = epoch + 1
        curr_epoch_loss = []
        #count = 0
        
        for data, target in train_dataloader:
            #if count > 1:
            #    break
            #count = 1
            # zero the parameter gradients
            optimizer.zero_grad()
            # forward + backward + optimize
            
            outputs = model(data)
            loss = criterion(outputs, target.float())
            loss.backward()
            optimizer.step()
            curr_epoch_loss.append(loss.item())
            
        last_epoch_loss = np.mean(curr_epoch_loss)
        echo_out = f"Epoch {last_epoch}: curr_epoch_loss={last_epoch_loss}"
        print(echo_out)
        #if (epoch + 1) % 10 == 0:
        val_loss, p, r, f1, a = eval_model(model, val_dataloader)
        val_echo_out = 'Epoch: {} \t Validation loss: {:.2f}, precision: {:.2f}, recall:{:.2f}, f1_score: {:.2f}, accuracy: {:.2f}'.format(epoch + 1, val_loss, p, r, f1, a)
        print(val_echo_out)
        
        with open(model.name + "train_out", 'a') as f:
            f.write(echo_out + '\n')
            f.write(val_echo_out + '\n')
                 
        if early_stopper and early_stopper.early_stop_min_f1(f1):
            print('EARLY STOP')
            with open(model.name + "train_out", 'a') as f:
                f.write('EARLY STOP\n')
            break
            
    return model, last_epoch, last_epoch_loss

In [None]:
criterion = torch.nn.BCELoss()

continued = False

if continued:
    with open('models/cnn_model.pckl', 'rb') as f:
        cnn_model = pickle.load(f)
    #cnn_optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.001, betas=(0.9, 0.999), weight_decay=1e-5, amsgrad=False)
    cnn_model, cnn_optimizer, current_epoch = load_checkpoint(cnn_model, cnn_optimizer)
else:
    cnn_model = CNN(pretrained_embedding=embedding_matrix_GNV)
    #cnn_model = CNN(pretrained_embedding=embedding_matrix_w2v)
    cnn_optimizer = torch.optim.Adam(cnn_model.parameters(), lr=0.0002, betas=(0.9, 0.999), weight_decay=1e-5, amsgrad=False)
    current_epoch = 0
    
cnn_model, last_epoch, last_epoch_loss = train_model(cnn_model, train_dataloader, current_epoch=current_epoch, optimizer=cnn_optimizer, criterion=criterion)

with open('models/cnn_model.pckl', 'wb') as f:
    pickle.dump(cnn_model, f)

save_checkpoint(cnn_model, cnn_optimizer, last_epoch, last_epoch_loss)


#p, r, f, a = eval_model(cnn_model, val_dataloader)
#print('Epoch: {} \t Validation precision: {:.2f}, recall:{:.2f}, f1_score: {:.2f}, accuracy: {:.2f}'.format(n_epochs, p, r, f, a))



Epoch 1: curr_epoch_loss=0.5858439897200954
sum Y pred 0 sum target 675.0
Epoch: 1 	 Validation loss: 0.54, precision: 0.55, recall:0.74, f1_score: 0.64, accuracy: 0.74


  _warn_prf(average, modifier, msg_start, len(result))


Epoch 2: curr_epoch_loss=0.4849683018762674
sum Y pred 225 sum target 675.0
Epoch: 2 	 Validation loss: 0.49, precision: 0.76, recall:0.78, f1_score: 0.73, accuracy: 0.78
Epoch 3: curr_epoch_loss=0.42406491665030593
sum Y pred 285 sum target 675.0
Epoch: 3 	 Validation loss: 0.48, precision: 0.76, recall:0.78, f1_score: 0.74, accuracy: 0.78
Epoch 4: curr_epoch_loss=0.3752488709207791
sum Y pred 403 sum target 675.0
Epoch: 4 	 Validation loss: 0.48, precision: 0.76, recall:0.78, f1_score: 0.76, accuracy: 0.78
Epoch 5: curr_epoch_loss=0.3071693166979213
sum Y pred 430 sum target 675.0
Epoch: 5 	 Validation loss: 0.49, precision: 0.76, recall:0.78, f1_score: 0.76, accuracy: 0.78
Epoch 6: curr_epoch_loss=0.24042308904341797
sum Y pred 516 sum target 675.0
Epoch: 6 	 Validation loss: 0.50, precision: 0.77, recall:0.78, f1_score: 0.77, accuracy: 0.78
Epoch 7: curr_epoch_loss=0.18374538474730145
sum Y pred 385 sum target 675.0
Epoch: 7 	 Validation loss: 0.52, precision: 0.77, recall:0.78, f1