In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
#import seaborn as sns

In [2]:
df_train = pd.read_csv("output_files/combined.csv", index_col=0)

In [3]:
df_train['REPORTED_TERM'] = df_train['REPORTED_TERM'].apply(lambda x: x.lower())

In [4]:
df_train.head()

Unnamed: 0,ART_CODE,DESC_CODED,HLGT_NAME_COMPL,HLT_NAME_COMPL,INC_CODE,INC_CODE_J,LLT_NAME_COMPL,PT_NAME_COMPL,REPORTED_TERM,SOC_CODE,len
0,0,Hyponatraemia,ELECTROLYTE AND FLUID BALANCE CONDITIONS,SODIUM IMBALANCE,10021038.0,10021036,HYPONATREMIA,HYPONATRAEMIA,hyponatremia,10027433.0,1
1,1,Subacute cutaneous lupus erythematosus,EPIDERMAL AND DERMAL CONDITIONS,CONNECTIVE TISSUE DISORDERS,10057903.0,10057903,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,SUBACUTE CUTANEOUS LUPUS ERYTHEMATOSUS,omeprazole induced subacute cutaneous lupus er...,10040785.0,4
2,2,Blood bilirubin unconjugated increased,HEPATOBILIARY INVESTIGATIONS,LIVER FUNCTION ANALYSES,10021709.0,10021709,INDIRECT BILIRUBIN INCREASED,BLOOD BILIRUBIN UNCONJUGATED INCREASED,indirect bilirubin (74.7 micromol/l),10022891.0,4
3,3,toxic epidermal necrolysis,EPIDERMAL AND DERMAL CONDITIONS,BULLOUS CONDITIONS,10044223.0,10044223,TOXIC EPIDERMAL NECROLYSIS,TOXIC EPIDERMAL NECROLYSIS,toxic epidermal necrolysis,10040785.0,3
4,4,Bradycardia,CARDIAC ARRHYTHMIAS,RATE AND RHYTHM DISORDERS NEC,10006093.0,10006093,BRADYCARDIA,BRADYCARDIA,bradycardia,10007541.0,1


In [5]:
terms_list = list(df_train['REPORTED_TERM'])

In [6]:
all_text = ' '.join(terms_list)

In [7]:
words = all_text.split()

#### Encoding Words

In [8]:
from collections import Counter

In [9]:
counts = Counter(words)
vocab = sorted(counts, key=counts.get, reverse=True)
vocab_to_int = {word: ii for ii, word in enumerate(vocab, 1)}

In [10]:
reported_terms_ints = []
for term in terms_list:
    reported_terms_ints.append([vocab_to_int[word] for word in term.split()])

In [11]:
# outlier review stats
reported_term_lens = Counter([len(x) for x in reported_terms_ints])

In [12]:
print("Zero-length reviews: {}".format(reported_term_lens[0]))
print("Maximum review length: {}".format(max(reported_term_lens)))

Zero-length reviews: 0
Maximum review length: 43


In [13]:
len(reported_terms_ints)

63233

In [14]:
def pad_features(reported_terms_ints, seq_length):
    ''' Return features of review_ints, where each review is padded with 0's 
        or truncated to the input seq_length.
    '''
    
    # getting the correct rows x cols shape
    features = np.zeros((len(reported_terms_ints), seq_length), dtype=int)

    # for each review, I grab that review and 
    for i, row in enumerate(reported_terms_ints):
        features[i, -len(row):] = np.array(row)[:seq_length]
    
    return features

In [15]:
seq_length = 40

features = pad_features(reported_terms_ints, seq_length=seq_length)

In [16]:
y = df_train.ART_CODE

In [17]:
dummies = pd.get_dummies(y.values)

In [18]:
encoded_labels = np.array(dummies)

In [19]:
split_frac = 0.8

## split data into training, validation, and test data (features and labels, x and y)

split_idx = int(len(features)*0.8)
train_x, remaining_x = features[:split_idx], features[split_idx:]
train_y, remaining_y = encoded_labels[:split_idx], encoded_labels[split_idx:]

test_idx = int(len(remaining_x)*0.5)
val_x, test_x = remaining_x[:test_idx], remaining_x[test_idx:]
val_y, test_y = remaining_y[:test_idx], remaining_y[test_idx:]

## print out the shapes of your resultant feature data
print("\t\t\tFeature Shapes:")
print("Train set: \t\t{}".format(train_x.shape), 
      "\nValidation set: \t{}".format(val_x.shape),
      "\nTest set: \t\t{}".format(test_x.shape))

			Feature Shapes:
Train set: 		(50586, 40) 
Validation set: 	(6323, 40) 
Test set: 		(6324, 40)


In [20]:
import torch
from torch.utils.data import TensorDataset, DataLoader

# create Tensor datasets
train_data = TensorDataset(torch.from_numpy(train_x), torch.from_numpy(train_y))
valid_data = TensorDataset(torch.from_numpy(val_x), torch.from_numpy(val_y))
test_data = TensorDataset(torch.from_numpy(test_x), torch.from_numpy(test_y))

# dataloaders
batch_size = 50

# make sure the SHUFFLE your training data
train_loader = DataLoader(train_data, shuffle=True, batch_size=batch_size)
valid_loader = DataLoader(valid_data, shuffle=True, batch_size=batch_size)
test_loader = DataLoader(test_data, shuffle=True, batch_size=batch_size)

In [28]:
# obtain one batch of training data
dataiter = iter(valid_loader)
sample_x, sample_y = dataiter.next()

print('Sample input size: ', sample_x.size()) # batch_size, seq_length
print('Sample input: \n', sample_x)
print()
print('Sample label size: ', sample_y.size()) # batch_size
print('Sample label: \n', sample_y)

Sample input size:  torch.Size([50, 40])
Sample input: 
 tensor([[    0,     0,     0,  ...,     0, 25002,  1809],
        [    0,     0,     0,  ...,     4,  6939,   109],
        [    0,     0,     0,  ...,     0,     0,   115],
        ...,
        [    0,     0,     0,  ...,   548,  2038,  2303],
        [    0,     0,     0,  ...,   260,  5149,    75],
        [    0,     0,     0,  ...,     4,    16,   222]])

Sample label size:  torch.Size([50, 5016])
Sample label: 
 tensor([[0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        ...,
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0],
        [0, 0, 0,  ..., 0, 0, 0]], dtype=torch.uint8)


In [29]:
# First checking if GPU is available
train_on_gpu=torch.cuda.is_available()

if(train_on_gpu):
    print('Training on GPU.')
else:
    print('No GPU available, training on CPU.')

No GPU available, training on CPU.


In [30]:
import torch.nn as nn

class SentimentRNN(nn.Module):
    """
    The RNN model that will be used to perform Sentiment analysis.
    """

    def __init__(self, vocab_size, output_size, embedding_dim, hidden_dim, n_layers, drop_prob=0.5):
        """
        Initialize the model by setting up the layers.
        """
        super(SentimentRNN, self).__init__()

        self.output_size = output_size
        self.n_layers = n_layers
        self.hidden_dim = hidden_dim
        
        # embedding and LSTM layers
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, n_layers, 
                            dropout=drop_prob, batch_first=True)
        
        # dropout layer
        self.dropout = nn.Dropout(0.3)
        
        # linear and sigmoid layers
        self.fc = nn.Linear(hidden_dim, output_size)
        self.softmax = nn.Softmax(dim=1)
        

    def forward(self, x, hidden):
        """
        Perform a forward pass of our model on some input and hidden state.
        """
        batch_size = x.size(0)

        # embeddings and lstm_out
        embeds = self.embedding(x)
        lstm_out, hidden = self.lstm(embeds, hidden)
        # stack up lstm outputs
        #lstm_out = lstm_out.contiguous().view(-1, self.hidden_dim)
        # dropout and fully-connected layer
        out = self.dropout(lstm_out)
        out = self.fc(out)
        # sigmoid function
        sig_out = self.softmax(out)
        # reshape to be batch_size first
        sig_out = sig_out[:, -1] # get last batch of labels
        # return last sigmoid output and hidden state
        return sig_out, hidden
    
    
    def init_hidden(self, batch_size):
        ''' Initializes hidden state '''
        # Create two new tensors with sizes n_layers x batch_size x hidden_dim,
        # initialized to zero, for hidden state and cell state of LSTM
        weight = next(self.parameters()).data
        
        if (train_on_gpu):
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda(),
                  weight.new(self.n_layers, batch_size, self.hidden_dim).zero_().cuda())
        else:
            hidden = (weight.new(self.n_layers, batch_size, self.hidden_dim).zero_(),
                      weight.new(self.n_layers, batch_size, self.hidden_dim).zero_())
        
        return hidden
        

In [31]:
output_size = encoded_labels.shape[1]

In [32]:
# Instantiate the model w/ hyperparams
vocab_size = len(vocab_to_int)+1 # +1 for the 0 padding + our word tokens
output_size = output_size
embedding_dim = 400
hidden_dim = 256
n_layers = 2

net = SentimentRNN(vocab_size, output_size, embedding_dim, hidden_dim, n_layers)

print(net)

SentimentRNN(
  (embedding): Embedding(26778, 400)
  (lstm): LSTM(400, 256, num_layers=2, batch_first=True, dropout=0.5)
  (dropout): Dropout(p=0.3)
  (fc): Linear(in_features=256, out_features=5016, bias=True)
  (softmax): Softmax()
)


In [33]:
# loss and optimization functions
lr=0.001

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(net.parameters(), lr=lr)

In [None]:
# training params

epochs = 30 # 3-4 is approx where I noticed the validation loss stop decreasing

counter = 0
print_every = 100
clip=5 # gradient clipping

# move model to GPU, if available
if(train_on_gpu):
    net.cuda()

net.train()
# train for some number of epochs
for e in range(epochs):
    # initialize hidden state
    h = net.init_hidden(batch_size)

    # batch loop
    for inputs, labels in train_loader:
        counter += 1

        if(train_on_gpu):
            inputs, labels = inputs.cuda(), labels.cuda()

        # Creating new variables for the hidden state, otherwise
        # we'd backprop through the entire training history
        h = tuple([each.data for each in h])

        # zero accumulated gradients
        net.zero_grad()
        # get the output from the model
        if((inputs.shape[0],inputs.shape[1]) != (batch_size, seq_length)):
            print('Validation - Input Shape Issue:,inputs.shape')
            continue
        output, h = net(inputs, h)
        # calculate the loss and perform backprop
        labels = torch.tensor(labels, dtype=torch.long)
        loss = criterion(output, torch.max(labels, 1)[1])
        loss.backward()
        # `clip_grad_norm` helps prevent the exploding gradient problem in RNNs / LSTMs.
        nn.utils.clip_grad_norm_(net.parameters(), clip)
        optimizer.step()

        # loss stats
        if counter % print_every == 0:
            # Get validation loss
            val_h = net.init_hidden(batch_size)
            val_losses = []
            net.eval()
            for inputs, labels in valid_loader:

                # Creating new variables for the hidden state, otherwise
                # we'd backprop through the entire training history
                val_h = tuple([each.data for each in val_h])
                
                if(train_on_gpu):
                    inputs, labels = inputs.cuda(), labels.cuda()
                if((inputs.shape[0],inputs.shape[1]) != (batch_size, seq_length)):
                    print('Validation - Input Shape Issue:,inputs.shape')
                    continue
                output, val_h = net(inputs, val_h)
                labels = torch.tensor(labels, dtype=torch.long)
                #loss = criterion(output, torch.max(labels, 1)[1])
                val_loss = criterion(output, torch.max(labels, 1)[1])

                val_losses.append(val_loss.item())

            net.train()
            print("Epoch: {}/{}...".format(e+1, epochs),
                  "Step: {}...".format(counter),
                  "Loss: {:.6f}...".format(loss.item()),
                  "Val Loss: {:.6f}".format(np.mean(val_losses)))



Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 100... Loss: 7.891144... Val Loss: 7.964530
Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 200... Loss: 7.865648... Val Loss: 7.961387
Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 300... Loss: 7.998343... Val Loss: 7.958833
Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 400... Loss: 8.080306... Val Loss: 7.955067
Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 500... Loss: 7.929850... Val Loss: 7.952180
Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 600... Loss: 8.119182... Val Loss: 7.948170
Validation - Input Shape Issue:,inputs.shape
Epoch: 1/30... Step: 700... Loss: 8.060533... Val Loss: 7.944318
