In [44]:
import sys
import numpy as np
import pandas as pd
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
from torchnlp.datasets import imdb_dataset
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
from collections import Counter
from sklearn import model_selection

In [2]:
rn.seed(321)
np.random.seed(321)
torch.manual_seed(321)
torch.cuda.manual_seed(321)

## Prepare the Data

In [None]:
#use this cell when working online
path = '/floyd/home/ed-triage'
data_path = '/floyd/home/data'

In [3]:
#use this cell when working from home
path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage'
data_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/data/ED triage project'

In [36]:
df1 = pd.read_csv(data_path +'/targets.csv', index_col=0)
df2 = pd.read_csv(data_path + '/subj_data.csv', index_col=0)
data = pd.concat([df1,df2],axis = 1)

In [37]:
data.head()

Unnamed: 0,outcome,target,service,target2,discharge,target3,dispo,target4,SubjectiveNotes,MedicalHistory,pmhx,combo,combo_clean
0,discharge,1,discharge,1,discharge,1,,,,,,,
1,discharge,1,discharge,1,discharge,1,,,,,,,
2,discharge,1,discharge,1,discharge,1,,,,,,,
3,discharge,1,discharge,1,discharge,1,,,,,,,
4,discharge,1,discharge,1,discharge,1,,,,,,,


In [38]:
#lets just play with a limited dataset to see if we can make this work
data = data[['SubjectiveNotes', 'target3']]

In [39]:
data.head()

Unnamed: 0,SubjectiveNotes,target3
0,,1
1,,1
2,,1
3,,1
4,,1


In [40]:
data.dropna(inplace = True)
len(data)

110457

In [41]:
#I want a tiny dataset to play with at first
data = data.sample(n=1000,random_state=42)

In [42]:
texts = list(data.SubjectiveNotes)
labels = list(data.target3)
len(texts), len(labels)

(1000, 1000)

In [46]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(texts, labels, test_size=0.1, random_state=42)

In [48]:
Counter(y_train), Counter(y_test)

(Counter({0: 93, 1: 807}), Counter({0: 6, 1: 94}))

In [49]:
train_texts = X_train
train_labels = y_train
test_texts = X_test
test_labels = y_test

In [50]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [51]:
train_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], train_texts))
test_tokens = list(map(lambda t: ['[CLS]'] + tokenizer.tokenize(t)[:510] + ['[SEP]'], test_texts))

len(train_tokens), len(test_tokens)                   
                   

(900, 100)

In [52]:
train_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, train_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")
test_tokens_ids = pad_sequences(list(map(tokenizer.convert_tokens_to_ids, test_tokens)), maxlen=512, truncating="post", padding="post", dtype="int")

train_tokens_ids.shape, test_tokens_ids.shape

((900, 512), (100, 512))

In [55]:
train_y = np.array(train_labels)
test_y = np.array(test_labels)
train_y.shape, test_y.shape, np.mean(train_y), np.mean(test_y)

((900,), (100,), 0.8966666666666666, 0.94)

In [56]:
train_masks = [[float(i > 0) for i in ii] for ii in train_tokens_ids]
test_masks = [[float(i > 0) for i in ii] for ii in test_tokens_ids]

# Bert Model

In [57]:
class BertBinaryClassifier(nn.Module):
    def __init__(self, dropout=0.1):
        super(BertBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()
    
    def forward(self, tokens, masks=None):
        _, pooled_output = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        dropout_output = self.dropout(pooled_output)
        linear_output = self.linear(dropout_output)
        proba = self.sigmoid(linear_output)
        return proba
        

In [58]:
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
device

device(type='cpu')

In [59]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

ValueError: Expected a cuda device, but got: cpu

In [58]:
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.cuda()


In [59]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'438.881792M'

In [60]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([3, 512]), torch.Size([3, 512, 768]), torch.Size([3, 768]))

In [61]:
y = bert_clf(x)
y.cpu().detach().numpy()

array([[0.480183  ],
       [0.4748414 ],
       [0.40391952]], dtype=float32)

In [62]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'8010.245632M'

In [116]:
y, x, pooled = None, None, None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'11563.497984M'

# Fine-tune BERT

In [64]:
BATCH_SIZE = 4
EPOCHS = 10

In [73]:
train_y = train_y.astype(float)
test_y = test_y.astype(float)

In [74]:
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'438.881792M'

In [75]:
train_dataset = TensorDataset(train_tokens_tensor, train_masks_tensor, train_y_tensor)
train_sampler = RandomSampler(train_dataset)
train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)

test_dataset = TensorDataset(test_tokens_tensor, test_masks_tensor, test_y_tensor)
test_sampler = SequentialSampler(test_dataset)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)


In [76]:
param_optimizer = list(bert_clf.sigmoid.named_parameters()) 
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [79]:
optimizer = Adam(bert_clf.parameters(), lr=3e-6)

In [80]:
 torch.cuda.empty_cache()

In [81]:
for epoch_num in range(EPOCHS):
    bert_clf.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        logits = bert_clf(token_ids, masks)
        
        loss_func = nn.BCELoss()

        batch_loss = loss_func(logits, labels)
        train_loss += batch_loss.item()
        
        
        bert_clf.zero_grad()
        batch_loss.backward()
        

        clip_grad_norm_(parameters=bert_clf.parameters(), max_norm=1.0)
        optimizer.step()
        
        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        print("\r" + "{0}/{1} loss: {2} ".format(step_num, len(train_data) / BATCH_SIZE, train_loss / (step_num + 1)))
        

Epoch:  10
249/250.0 loss: 0.036980899441521616 


In [82]:
bert_clf.eval()
bert_predicted = []
all_logits = []
with torch.no_grad():
    for step_num, batch_data in enumerate(test_dataloader):

        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)

        logits = bert_clf(token_ids, masks)
        loss_func = nn.BCELoss()
        loss = loss_func(logits, labels)
        numpy_logits = logits.cpu().detach().numpy()
        
        bert_predicted += list(numpy_logits[:, 0] > 0.5)
        all_logits += list(numpy_logits[:, 0])
    

In [83]:
np.mean(bert_predicted)

0.47

In [84]:
print(classification_report(test_y, bert_predicted))

              precision    recall  f1-score   support

         0.0       0.91      0.96      0.93        50
         1.0       0.96      0.90      0.93        50

   micro avg       0.93      0.93      0.93       100
   macro avg       0.93      0.93      0.93       100
weighted avg       0.93      0.93      0.93       100

