In [1]:
!pip install pytorch_pretrained_bert 

Collecting pytorch_pretrained_bert
[?25l  Downloading https://files.pythonhosted.org/packages/d7/e0/c08d5553b89973d9a240605b9c12404bcf8227590de62bae27acbcfe076b/pytorch_pretrained_bert-0.6.2-py3-none-any.whl (123kB)
[K    100% |████████████████████████████████| 133kB 6.5MB/s 
[?25hCollecting boto3 (from pytorch_pretrained_bert)
[?25l  Downloading https://files.pythonhosted.org/packages/8f/8f/a40b9d2e1b479bda3d60badaa88626636d608db0723ac3ba0614fe57a4d4/boto3-1.9.244-py2.py3-none-any.whl (128kB)
[K    100% |████████████████████████████████| 133kB 33.1MB/s 
Collecting regex (from pytorch_pretrained_bert)
[?25l  Downloading https://files.pythonhosted.org/packages/6f/a6/99eeb5904ab763db87af4bd71d9b1dfdd9792681240657a4c0a599c10a81/regex-2019.08.19.tar.gz (654kB)
[K    100% |████████████████████████████████| 655kB 16.8MB/s 
Collecting jmespath<1.0.0,>=0.7.1 (from boto3->pytorch_pretrained_bert)
  Downloading https://files.pythonhosted.org/packages/83/94/7179c3832a6d45b266ddb2aac329e101

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import random as rn
import torch
from pytorch_pretrained_bert import BertModel
from torch import nn
#from torchnlp.datasets import imdb_dataset
from pytorch_pretrained_bert import BertTokenizer
from keras.preprocessing.sequence import pad_sequences
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
from torch.optim import Adam
from torch.nn.utils import clip_grad_norm_
from IPython.display import clear_output
from collections import Counter
from sklearn import model_selection
from sklearn.metrics import classification_report

Using TensorFlow backend.


In [2]:
# Seed every random number generator the notebook uses with one shared value.
SEED = 321
for seed_rng in (rn.seed, np.random.seed, torch.manual_seed, torch.cuda.manual_seed):
    seed_rng(SEED)

In [3]:
# Use the GPU when torch can see one; otherwise fall back to the CPU.
device_kind = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_kind)
device

device(type='cuda')

## Prepare the Data

In [4]:
# Project and data locations for the hosted environment
# (use this cell when working online).
path, data_path = '/floyd/home/ed-triage', '/floyd/home/data'

In [3]:
#use this cell when working from home
# Only switch to the local paths when the hosted data directory is absent, so a
# top-to-bottom run on the hosted machine keeps the paths set by the previous
# cell instead of silently overwriting them.
if not os.path.isdir('/floyd/home/data'):
    path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/ed-triage'
    data_path = '/Users/jjaskolkambp/Desktop/machine learning/my_projects/data/ED triage project'

In [5]:
# Read the targets table and the subjective-notes table (first CSV column is
# the index in both), then place them side by side column-wise.
targets_csv = data_path +'/targets.csv'
subj_csv = data_path + '/subj_data.csv'
df1 = pd.read_csv(targets_csv, index_col=0)
df2 = pd.read_csv(subj_csv, index_col=0)
data = pd.concat([df1, df2], axis=1)

In [6]:
data.head()

Unnamed: 0,outcome,target,service,target2,discharge,target3,dispo,target4,SubjectiveNotes,MedicalHistory,pmhx,combo,combo_clean
0,discharge,1,discharge,1,discharge,1,,,,,,,
1,discharge,1,discharge,1,discharge,1,,,,,,,
2,discharge,1,discharge,1,discharge,1,,,,,,,
3,discharge,1,discharge,1,discharge,1,,,,,,,
4,discharge,1,discharge,1,discharge,1,,,,,,,


In [7]:
#lets just play with a limited dataset to see if we can make this work
# Take an explicit copy: later cells drop rows from this frame and add new
# columns to it, which pandas would otherwise treat as writing into a slice
# of the original table (SettingWithCopyWarning).
data = data[['SubjectiveNotes', 'pmhx','target3']].copy()

In [8]:
# Drop every row that has a missing value. Rebinding (rather than
# inplace=True) avoids mutating a frame that came from a column selection.
data = data.dropna()
len(data)

102583

In [9]:
# Whitespace-separated word counts per row, to see how long the text fields run.
data['subjnotelen'] = data['SubjectiveNotes'].str.split().map(len)
data['pmhxlen'] = data['pmhx'].str.split().map(len)

In [10]:
data['subjnotelen'].describe()

count    102583.000000
mean         36.531004
std          18.003066
min           1.000000
25%          24.000000
50%          34.000000
75%          46.000000
max         235.000000
Name: subjnotelen, dtype: float64

In [11]:
data['pmhxlen'].describe()

count    102583.000000
mean          3.796925
std           2.185584
min           1.000000
25%           2.000000
50%           4.000000
75%           4.000000
max          34.000000
Name: pmhxlen, dtype: float64

In [12]:
#I want a tiny dataset to play with at first
data = data.sample(n=500,random_state=42)

In [13]:
# Pull the note text and the target3 column out of the frame as plain lists.
texts, labels = (list(data[col]) for col in ('SubjectiveNotes', 'target3'))
len(texts), len(labels)

(500, 500)

In [14]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(texts, labels, test_size=0.1, random_state=42)

In [15]:
Counter(y_train), Counter(y_test)

(Counter({1: 409, 0: 41}), Counter({1: 47, 0: 3}))

In [16]:
# Give the split the train_* / test_* names used by the tokenisation cells.
train_texts, train_labels = X_train, y_train
test_texts, test_labels = X_test, y_test

In [17]:
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

In [18]:
def bert_tokens(text, max_len=237):
    """Word-piece tokenise `text` and wrap it in BERT's [CLS] / [SEP] markers.

    The word pieces are cut to `max_len - 2` so the wrapped sequence is never
    longer than `max_len`. The default matches the `maxlen=237` used by the
    padding cell that follows: with a 510-piece cut, any note longer than 235
    pieces had its closing [SEP] removed by the post-truncation there.
    """
    return ['[CLS]'] + tokenizer.tokenize(text)[:max_len - 2] + ['[SEP]']


train_tokens = [bert_tokens(t) for t in train_texts]
test_tokens = [bert_tokens(t) for t in test_texts]

len(train_tokens), len(test_tokens)
                   

(450, 50)

In [19]:
# Map tokens to vocabulary ids, then pad / truncate every row at the end so
# each one is exactly 237 ids long.
pad_kwargs = dict(maxlen=237, truncating="post", padding="post", dtype="int")
train_tokens_ids = pad_sequences([tokenizer.convert_tokens_to_ids(toks) for toks in train_tokens], **pad_kwargs)
test_tokens_ids = pad_sequences([tokenizer.convert_tokens_to_ids(toks) for toks in test_tokens], **pad_kwargs)

train_tokens_ids.shape, test_tokens_ids.shape

((450, 237), (50, 237))

In [20]:
# Labels as numpy arrays; the means show the share of positive (1) labels.
train_y, test_y = np.array(train_labels), np.array(test_labels)
train_y.shape, test_y.shape, train_y.mean(), test_y.mean()

((450,), (50,), 0.9088888888888889, 0.94)

In [21]:
# 1.0 where the token id is positive, 0.0 elsewhere, as nested lists of floats.
train_masks = (train_tokens_ids > 0).astype(float).tolist()
test_masks = (test_tokens_ids > 0).astype(float).tolist()

# Bert Model

In [22]:
class BertBinaryClassifier(nn.Module):
    """Pre-trained BERT encoder followed by a one-unit sigmoid head.

    The pooled BERT output (768 features) goes through dropout, a linear
    layer with a single output, and a sigmoid.
    """

    def __init__(self, dropout=0.1):
        super().__init__()

        # Encoder weights are loaded from the 'bert-base-uncased' checkpoint.
        self.bert = BertModel.from_pretrained('bert-base-uncased')

        # Head: dropout -> Linear(768, 1) -> sigmoid.
        self.dropout = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None):
        """Return the head's sigmoid output for `tokens`.

        `masks` is handed to BERT as its `attention_mask`.
        """
        encoder_out = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        pooled = encoder_out[1]  # second element is the pooled output
        return self.sigmoid(self.linear(self.dropout(pooled)))
        

In [23]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'0.0M'

In [24]:
# Build the classifier and move it to the device selected earlier. Unlike
# `.cuda()`, `.to(device)` also works when `device` fell back to the CPU.
bert_clf = BertBinaryClassifier()
bert_clf = bert_clf.to(device)


In [25]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'438.016512M'

In [26]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
y, pooled = bert_clf.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([3, 237]), torch.Size([3, 237, 768]), torch.Size([3, 768]))

In [27]:
y = bert_clf(x)
y.cpu().detach().numpy()

array([[0.39436665],
       [0.40824613],
       [0.41374868]], dtype=float32)

In [28]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'3010.249216M'

In [45]:
# Drop the references to the probe tensors, ask torch to release its cached
# blocks, then report how much memory is still allocated on the device.
x = y = pooled = None
torch.cuda.empty_cache()
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'3737.261056M'

# Fine-tune BERT

In [30]:
BATCH_SIZE = 2
EPOCHS = 1

The next few cells are a first pass at creating a classifier that takes an NLP (text) input together with a second input.

In [46]:
class BertTwoInputBinaryClassifier(nn.Module):
    """BERT text encoder combined with a second, non-text feature vector.

    The pooled BERT output (768 features) is reduced to 50 features, joined
    with `num_features` extra values per sample, and mapped to a single
    sigmoid output.

    Args:
        dropout: Dropout probability applied after BERT and after the
            768 -> 50 reduction.
        num_features: Width of the second input. Defaults to 50, the width of
            the random placeholder used when no second input is supplied.
    """

    def __init__(self, dropout=0.1, num_features=50):
        super(BertTwoInputBinaryClassifier, self).__init__()

        self.bert = BertModel.from_pretrained('bert-base-uncased')

        self.dropout = nn.Dropout(dropout)
        self.dropout2 = nn.Dropout(dropout)
        self.linear = nn.Linear(768, 50)
        self.linear2 = nn.Linear(50 + num_features, 1)
        self.sigmoid = nn.Sigmoid()

    def forward(self, tokens, masks=None, nums=None):
        """Return the sigmoid output for a batch.

        Args:
            tokens: Token-id tensor handed to BERT.
            masks: Optional tensor handed to BERT as `attention_mask`.
            nums: Optional second input, one row per sample, as wide as the
                `num_features` the model was built with. When omitted, a
                random placeholder of that width is used instead.

        Returns:
            Tensor with one sigmoid output per sample.
        """
        _, pooled = self.bert(tokens, attention_mask=masks, output_all_encoded_layers=False)
        text_features = self.dropout2(self.linear(self.dropout(pooled)))

        if nums is None:
            # Size the placeholder from the actual batch and layer widths
            # rather than from notebook globals, so any batch size works.
            extra_width = self.linear2.in_features - self.linear.out_features
            nums = torch.rand(tokens.size(0), extra_width)
        # Bring the second input onto the same device and dtype as the text
        # features so the two can be concatenated and fed to linear2.
        nums = nums.to(device=text_features.device, dtype=text_features.dtype)

        merged = torch.cat((text_features, nums), 1)
        return self.sigmoid(self.linear2(merged))

In [47]:
# Build the two-input classifier and move it to the device selected earlier.
# Unlike `.cuda()`, `.to(device)` also works when `device` fell back to the CPU.
bert_clf2 = BertTwoInputBinaryClassifier()
bert_clf2 = bert_clf2.to(device)

RuntimeError: CUDA out of memory. Tried to allocate 89.50 MiB (GPU 0; 11.17 GiB total capacity; 3.48 GiB already allocated; 3.69 MiB free; 7.87 MiB cached)

In [None]:
# One batch of token ids, plus a random (BATCH_SIZE, 237) tensor, both on the device.
batch_rows = train_tokens_ids[:BATCH_SIZE]
x = torch.tensor(batch_rows).to(device)
y = torch.rand(BATCH_SIZE, 237).to(device)
x.shape, y.shape

In [None]:
# Call the module itself rather than .forward() so registered hooks also run.
bert_clf2(x)

In [26]:
y, pooled = bert_clf2.bert(x, output_all_encoded_layers=False)
x.shape, y.shape, pooled.shape

(torch.Size([2, 512]), torch.Size([2, 512, 768]), torch.Size([2, 768]))

In [33]:
# Collect the model's named parameters for the optimizer group. The previous
# source, `bert_clf2.sigmoid`, is an nn.Sigmoid and has no parameters, so the
# group built from it was always empty.
param_optimizer = list(bert_clf2.named_parameters())
optimizer_grouped_parameters = [{"params": [p for n, p in param_optimizer]}]

In [34]:
optimizer = Adam(bert_clf2.parameters(), lr=3e-6)

In [35]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'3439.535616M'

In [36]:
torch.cuda.empty_cache()

In [37]:
str(torch.cuda.memory_allocated(device)/1000000 ) + 'M'

'3439.535616M'

In [38]:
# Binary cross-entropy on the model's sigmoid outputs; built once instead of
# once per batch.
loss_func = nn.BCELoss()
# Number of batches per epoch, used as the denominator of the progress line.
total_steps = len(train_dataloader)

for epoch_num in range(EPOCHS):
    bert_clf2.train()
    train_loss = 0
    for step_num, batch_data in enumerate(train_dataloader):
        # NOTE(review): this expects three tensors per batch (token ids, masks,
        # labels); the loader built further down this notebook yields four —
        # confirm which loader this cell is meant to run against.
        token_ids, masks, labels = tuple(t.to(device) for t in batch_data)
        print(str(torch.cuda.memory_allocated(device)/1000000 ) + 'M')
        probs = bert_clf2(token_ids, masks)

        batch_loss = loss_func(probs, labels)
        train_loss += batch_loss.item()

        # Clear stale gradients, back-propagate, clip the gradient norm to 1.0,
        # then take the optimizer step.
        bert_clf2.zero_grad()
        batch_loss.backward()

        clip_grad_norm_(parameters=bert_clf2.parameters(), max_norm=1.0)
        optimizer.step()

        clear_output(wait=True)
        print('Epoch: ', epoch_num + 1)
        # 1-based step over the integer batch count; the loss is the running
        # mean over the batches seen so far in this epoch.
        print("\r" + "{0}/{1} loss: {2} ".format(step_num + 1, total_steps, train_loss / (step_num + 1)))
        

Epoch:  1
224/225.0 loss: 0.4440185919735167 


So now I have a way to merge the BERT NLP output with some other vector.
Let's see if I can put some real data into that vector.

In [41]:
os.listdir(data_path)

['.ipynb_checkpoints',
 'med_freq.csv',
 'inf_control_data.csv',
 'med_lab.csv',
 'date_data.csv',
 'tabular_data.csv',
 'sentiment_columns.csv',
 'med_hx.csv',
 'subj_data.csv',
 'targets.csv',
 'jj_modified_bch_data.csv',
 'ed_databunch',
 'jj_modified_bch_data.xlsx']

In [46]:
df3 = pd.read_csv(data_path + '/tabular_data.csv', index_col = 0)

In [47]:
df3.head()

Unnamed: 0,ID,TriageLevel,AgeInYrs,GenderDesc,DischargeDisposition,PresentingComplaint,PresentingComplaintDesc,AdmitLocation,PatientService,BloodPressure_LastEDReading,systolic,diastolic,temp,pulse,o2sat
0,1,3.0,12.0,Male,17,852,Fever,,,117/72,117.0,72.0,36.8,96.0,98.0
1,2,2.0,80.0,Female,17,401,Altered level of consciousness,,,102/55,102.0,55.0,36.5,63.0,93.0
2,3,3.0,3.0,Male,17,253,Constipation,,,,,,,178.0,98.0
3,4,3.0,56.0,Female,62,409,Extremity weakness/symptoms of CVA,,,153/73,153.0,73.0,36.8,81.0,97.0
4,5,2.0,69.0,Female,17,3,Chest pain - cardiac features,,,151/91,151.0,91.0,37.1,93.0,100.0


In [48]:
df3.columns

Index(['ID', 'TriageLevel', 'AgeInYrs', 'GenderDesc', 'DischargeDisposition',
       'PresentingComplaint', 'PresentingComplaintDesc', 'AdmitLocation',
       'PatientService', 'BloodPressure_LastEDReading', 'systolic',
       'diastolic', 'temp', 'pulse', 'o2sat'],
      dtype='object')

In [49]:
# Keep only the numeric triage / vital-sign columns. `.copy()` makes this an
# independent frame, so later cells that modify df3 do not write into a slice
# of the table that was read from disk.
df3 = df3[['TriageLevel', 'AgeInYrs', 'systolic',
       'diastolic', 'temp', 'pulse', 'o2sat']].copy()

In [50]:
df3.head()

Unnamed: 0,TriageLevel,AgeInYrs,systolic,diastolic,temp,pulse,o2sat
0,3.0,12.0,117.0,72.0,36.8,96.0,98.0
1,2.0,80.0,102.0,55.0,36.5,63.0,93.0
2,3.0,3.0,,,,178.0,98.0
3,3.0,56.0,153.0,73.0,36.8,81.0,97.0
4,2.0,69.0,151.0,91.0,37.1,93.0,100.0


In [51]:
# Fill each column's gaps with that column's mean. Rebinding (rather than
# inplace=True) avoids writing into a frame that came from a column selection.
# NOTE(review): the means are taken over every row, before the later
# train/test split — confirm that is acceptable.
df3 = df3.fillna(df3.mean())

In [68]:
data2 = pd.concat([df1,df2,df3], axis = 1)

In [69]:
data2.head()

Unnamed: 0,outcome,target,service,target2,discharge,target3,dispo,target4,SubjectiveNotes,MedicalHistory,pmhx,combo,combo_clean,TriageLevel,AgeInYrs,systolic,diastolic,temp,pulse,o2sat
0,discharge,1,discharge,1,discharge,1,,,,,,,,3.0,12.0,117.0,72.0,36.8,96.0,98.0
1,discharge,1,discharge,1,discharge,1,,,,,,,,2.0,80.0,102.0,55.0,36.5,63.0,93.0
2,discharge,1,discharge,1,discharge,1,,,,,,,,3.0,3.0,129.942469,77.794607,36.823675,178.0,98.0
3,discharge,1,discharge,1,discharge,1,,,,,,,,3.0,56.0,153.0,73.0,36.8,81.0,97.0
4,discharge,1,discharge,1,discharge,1,,,,,,,,2.0,69.0,151.0,91.0,37.1,93.0,100.0


In [56]:
data2.columns

Index(['outcome', 'target', 'service', 'target2', 'discharge', 'target3',
       'dispo', 'target4', 'SubjectiveNotes', 'MedicalHistory', 'pmhx',
       'combo', 'combo_clean', 'TriageLevel', 'AgeInYrs', 'systolic',
       'diastolic', 'temp', 'pulse', 'o2sat'],
      dtype='object')

In [70]:
# Remove the columns that are not carried forward; target3, SubjectiveNotes
# and the numeric columns remain.
unused_cols = ['outcome', 'target', 'service', 'target2', 'discharge',
       'dispo', 'target4', 'MedicalHistory', 'pmhx',
       'combo', 'combo_clean']
data2 = data2.drop(columns=unused_cols)

In [71]:
data2.head()

Unnamed: 0,target3,SubjectiveNotes,TriageLevel,AgeInYrs,systolic,diastolic,temp,pulse,o2sat
0,1,,3.0,12.0,117.0,72.0,36.8,96.0,98.0
1,1,,2.0,80.0,102.0,55.0,36.5,63.0,93.0
2,1,,3.0,3.0,129.942469,77.794607,36.823675,178.0,98.0
3,1,,3.0,56.0,153.0,73.0,36.8,81.0,97.0
4,1,,2.0,69.0,151.0,91.0,37.1,93.0,100.0


In [72]:
data2.shape

(136993, 9)

In [73]:
# Discard rows that still have a missing value and show the remaining shape.
data2 = data2.dropna()
data2.shape

(110457, 9)

In [74]:
data2 = data2.sample(n=1000,random_state=42)

In [75]:
# Text column first, then the seven numeric columns; labels come from target3.
input_cols = ['SubjectiveNotes','TriageLevel', 'AgeInYrs', 'systolic',
       'diastolic', 'temp', 'pulse', 'o2sat']
inputs = np.array(data2[input_cols])
labels = np.array(data2['target3'])
inputs.shape, labels.shape

((1000, 8), (1000,))

In [76]:
X_train, X_test, y_train, y_test = model_selection.train_test_split(inputs, labels, test_size=0.1, random_state=42)

In [77]:
Counter(y_train), Counter(y_test)

(Counter({0: 93, 1: 807}), Counter({0: 6, 1: 94}))

In [80]:
# Column 0 of the split inputs holds the note text; labels pass through unchanged.
train_texts, test_texts = X_train[:,0], X_test[:,0]
train_labels, test_labels = y_train, y_test

In [82]:
train_texts.shape, test_texts.shape

((900,), (100,))

In [84]:
# Wrap each note's word pieces (at most 510 of them) in [CLS] ... [SEP].
train_tokens = [['[CLS]'] + tokenizer.tokenize(note)[:510] + ['[SEP]'] for note in train_texts]
test_tokens = [['[CLS]'] + tokenizer.tokenize(note)[:510] + ['[SEP]'] for note in test_texts]

len(train_tokens), len(test_tokens)
                   

(900, 100)

In [85]:
# Map tokens to vocabulary ids, then pad / truncate every row at the end so
# each one is exactly 512 ids long.
pad_kwargs = dict(maxlen=512, truncating="post", padding="post", dtype="int")
train_tokens_ids = pad_sequences([tokenizer.convert_tokens_to_ids(toks) for toks in train_tokens], **pad_kwargs)
test_tokens_ids = pad_sequences([tokenizer.convert_tokens_to_ids(toks) for toks in test_tokens], **pad_kwargs)

train_tokens_ids.shape, test_tokens_ids.shape

((900, 512), (100, 512))

In [94]:
# Labels as numpy arrays; the means show the share of positive (1) labels.
train_y, test_y = np.array(train_labels), np.array(test_labels)
train_y.shape, test_y.shape, train_y.mean(), test_y.mean()

((900,), (100,), 0.8966666666666666, 0.94)

In [87]:
# 1.0 where the token id is positive, 0.0 elsewhere, as nested lists of floats.
train_masks = (train_tokens_ids > 0).astype(float).tolist()
test_masks = (test_tokens_ids > 0).astype(float).tolist()

In [95]:
# Everything after the text column is numeric; convert those columns to float arrays.
train_nums = np.asarray(X_train[:,1:], dtype=float)
test_nums = np.asarray(X_test[:,1:], dtype=float)

In [89]:
train_nums.shape, test_nums.shape

((900, 7), (100, 7))

In [96]:
# Labels are reshaped to one column and cast to float32. The numeric features
# are float64 arrays (built with .astype(float)), so they get the same cast;
# left as float64 they would not match the float32 weights of the model's
# linear layers.
train_tokens_tensor = torch.tensor(train_tokens_ids)
train_y_tensor = torch.tensor(train_y.reshape(-1, 1)).float()
train_nums_tensor = torch.tensor(train_nums).float()

test_tokens_tensor = torch.tensor(test_tokens_ids)
test_y_tensor = torch.tensor(test_y.reshape(-1, 1)).float()
test_nums_tensor = torch.tensor(test_nums).float()

train_masks_tensor = torch.tensor(train_masks)
test_masks_tensor = torch.tensor(test_masks)

In [97]:
# Bundle the tensors into datasets first, then attach samplers and loaders:
# training batches are drawn in random order, test batches in stored order.
train_dataset = TensorDataset(train_tokens_tensor, train_nums_tensor, train_masks_tensor, train_y_tensor)
test_dataset = TensorDataset(test_tokens_tensor, test_nums_tensor, test_masks_tensor, test_y_tensor)

train_sampler, test_sampler = RandomSampler(train_dataset), SequentialSampler(test_dataset)

train_dataloader = DataLoader(train_dataset, sampler=train_sampler, batch_size=BATCH_SIZE)
test_dataloader = DataLoader(test_dataset, sampler=test_sampler, batch_size=BATCH_SIZE)


In [99]:
# Build a fresh two-input classifier and move it to the device selected
# earlier. Unlike `.cuda()`, `.to(device)` also works when `device` fell back
# to the CPU.
bert_clf2 = BertTwoInputBinaryClassifier()
bert_clf2 = bert_clf2.to(device)

In [100]:
x = torch.tensor(train_tokens_ids[:3]).to(device)
y = torch.tensor(train_nums[:3]).to(device)
x.shape, y.shape

(torch.Size([3, 512]), torch.Size([3, 7]))

In [105]:
# NOTE(review): forward() is declared as forward(tokens, masks=None), so `y`
# (the numeric features built in the previous cell) is bound to `masks` here
# and reaches BERT as its attention_mask — the model has no parameter for a
# second input yet.
bert_clf2.forward(x,y)

RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.DoubleTensor instead (while checking arguments for embedding)