In [12]:
## This is a sample training result - please check test_results for the full results

In [9]:
import pandas as pd
import numpy as np

import torch
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
import copy

import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
#BERT clinic
import transformers
from transformers import BertModel, BertTokenizer
from transformers import BertModel, BertForSequenceClassification
from transformers.optimization import AdamW
from transformers import AutoTokenizer, AutoModel

In [3]:
import torch.optim as optim

In [4]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

In [5]:
#Check if CUDA is available
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


In [6]:
dtrain = pd.read_csv('project_dtrain_clean.csv', index_col = 0)
dval = pd.read_csv('project_dval_clean.csv', index_col = 0)
dtest = pd.read_csv('project_dtest_clean.csv', index_col = 0)

## Create data loader with Bert tokenizer

In [7]:
tokenizer = AutoTokenizer.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")


In [8]:
#Preprocess data using BERT CLINIC tokenizer
#tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
def bert_load(data):
    '''
    Load in data
    Return BERT's preprocessed inputs including token_id, mask, label
    '''
    token_ids = []
    attention_masks = []
    for row in data['TEXT']:
        #row = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', row)
        encoded_dict = tokenizer.encode_plus(row,
                                            add_special_tokens= True, #add [CLS], [SEP]
                                            max_length = 512,  
                                            pad_to_max_length = True, #pad and truncate
                                            return_attention_mask = True, #construct attention mask
                                            return_tensors = 'pt') #return pytorch tensor
        
        token_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    token_ids = torch.cat(token_ids,dim=0)
    attention_masks = torch.cat(attention_masks,dim=0)
    labels = torch.tensor(data['READMIT'].values)
    data_out = TensorDataset(token_ids, attention_masks, labels)
    return data_out
        
datatrain = bert_load(dtrain)   
dataval = bert_load(dval)
datatest = bert_load(dtest)

In [41]:
BATCH_SIZE = 12
train_loaderB = DataLoader(datatrain,
                           batch_size=BATCH_SIZE,
                           shuffle=True)
                           

val_loaderB = DataLoader(dataval,
                         batch_size=BATCH_SIZE,
                         shuffle= True)
                         

test_loaderB = DataLoader(datatest,
                         batch_size=BATCH_SIZE,
                         shuffle= False)

## Train as feature extractor

In [11]:
class BertClassification(nn.Module):
  
    def __init__(self):
        super(BertClassification, self).__init__()
        self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
        for param in self.bert.parameters():
            param.requires_grad = False
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)
        
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids,  attention_mask=attention_mask)
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        #outputs = F.sigmoid(outputs)
        return outputs
    

In [39]:
modelBERT = BertClassification()
modelBERT.to(device)

BertClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine

In [14]:
def trainBERT(model, train_loader, val_loader, num_epoch=20, lr =5e-5 ):
    # Training steps
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=lr, eps= 1e-8) 
    #optimizer = optim.Adam(model.parameters(), lr = lr)
    
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []
    auc = []
    best_auc = 0.
    best_model = copy.deepcopy(model.state_dict())

    for epoch in range(num_epoch):
        model.train()
        #Initialize
        correct = 0
        total = 0
        total_loss = 0
     
        for i, (data, mask, labels) in enumerate(train_loader):
            data, mask, labels = data.to(device), mask.to(device), labels.to(device, dtype=torch.long)
            optimizer.zero_grad()

            outputs = model(data, token_type_ids = None,
                                  attention_mask= mask,
                                  labels =None)
            
            loss = loss_fn(outputs.view(-1,2), labels.view(-1))
         

            loss.backward()
            optimizer.step()
            label_cpu = labels.squeeze().to('cpu').numpy()
            pred = outputs.data.max(-1)[1].to('cpu').numpy()
            total += labels.size(0)
            correct += float(sum((pred ==label_cpu)))
            total_loss += loss.item()
            
            
        acc = correct/total
       
        t_loss = total_loss/total
        train_loss.append(t_loss)
        train_acc.append(acc)
        # report performance
        
        print('Epoch: ',epoch)
        print('Train set | Accuracy: {:6.4f} | Loss: {:6.4f}'.format(acc, t_loss))     
    
    # Evaluate after every epoch
        #Reset the initialization
        correct = 0
        total = 0
        total_loss = 0
        model.eval()
        
        predictions =[]
        truths= []

        with torch.no_grad():
            for i, (data, mask, labels) in enumerate(val_loader):
                data, mask, labels = data.to(device), mask.to(device), labels.to(device, dtype=torch.long)


                optimizer.zero_grad()

                outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =None)
                #va_loss = loss_fn(outputs.squeeze(-1), labels)
                va_loss = loss_fn(outputs.view(-1,2), labels.view(-1))

                label_cpu = labels.squeeze().to('cpu').numpy()
                
                pred = outputs.data.max(-1)[1].to('cpu').numpy()
                total += labels.size(0)
                correct += float(sum((pred ==label_cpu)))
                total_loss += va_loss.item()
                
                predictions += list(pred)
                truths += list(label_cpu)
                       
            v_acc = correct/total
            v_loss = total_loss/total
            val_loss.append(v_loss)
            val_acc.append(v_acc)
            
            
            v_auc = roc_auc_score(truths, predictions)
            auc.append(v_auc)
            
            elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
            print('Validation set | Accuracy: {:6.4f} | AUC: {:6.4f} | Loss: {:4.2f} | time elapse: {:>9}'.format(
                v_acc, v_auc, v_loss, elapse))
            print('-'*10)
            
            if v_auc > best_auc:
                best_auc = v_auc
                best_model = copy.deepcopy(model.state_dict())

    print('Best validation auc: {:6.4f}'.format(best_auc))
    model.load_state_dict(best_model)     
    return train_loss, train_acc, val_loss, val_acc, v_auc, model

In [22]:
modelBERT_FC = BERT_FC()
modelBERT_FC.to(device)

BERT_FC(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(28996, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    

In [28]:
#Train only as feature extractor- batch size = 12
train_loss_BERTC4, train_acc_BERTC4, val_loss_BERTC4, val_acc_BERTC4, val_auc_BERTC4, model_BERTC4 = trainBERT(modelBERT_FC, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=5,lr =2e-3)

Epoch:  0
Train set | Accuracy: 0.7655 | Loss: 0.0445
Validation set | Accuracy: 0.7587 | AUC: 0.5455 | Loss: 0.04 | time elapse:  00:03:16
----------
Epoch:  1
Train set | Accuracy: 0.7801 | Loss: 0.0401
Validation set | Accuracy: 0.7809 | AUC: 0.5573 | Loss: 0.04 | time elapse:  00:06:32
----------
Epoch:  2
Train set | Accuracy: 0.8228 | Loss: 0.0339
Validation set | Accuracy: 0.7234 | AUC: 0.6456 | Loss: 0.05 | time elapse:  00:09:48
----------
Epoch:  3
Train set | Accuracy: 0.8785 | Loss: 0.0253
Validation set | Accuracy: 0.7972 | AUC: 0.6179 | Loss: 0.04 | time elapse:  00:13:05
----------
Epoch:  4
Train set | Accuracy: 0.9284 | Loss: 0.0154
Validation set | Accuracy: 0.7890 | AUC: 0.6619 | Loss: 0.06 | time elapse:  00:16:20
----------
Best validation auc: 0.6619


## Train Full

In [11]:
#del modelBERT
#torch.cuda.empty_cache()

In [10]:
class BertClassification(nn.Module):
  
    def __init__(self):
        super(BertClassification, self).__init__()
        self.bert = AutoModel.from_pretrained("emilyalsentzer/Bio_Discharge_Summary_BERT")
        self.dropout = nn.Dropout(0.1)
        self.classifier = nn.Linear(768, 2)
        
        nn.init.xavier_normal_(self.classifier.weight)
        
    def forward(self, input_ids, token_type_ids=None, attention_mask=None, labels=None):
        _, pooled_output = self.bert(input_ids,  attention_mask=attention_mask)
        pooled_output = self.dropout(pooled_output)
        outputs = self.classifier(pooled_output)
        #outputs = F.sigmoid(outputs)
        return outputs

modelBERT = BertClassification()
modelBERT.to(device)

In [19]:
#Train full - batch size = 12
train_loss_BERTC3, train_acc_BERTC3, val_loss_BERTC3, val_acc_BERTC3, val_auc_BERTC3, model_BERTC3 = trainBERT(modelBERT, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=5,lr =2e-5)

Epoch:  0
Train set | Accuracy: 0.7671 | Loss: 0.0447
Validation set | Accuracy: 0.7760 | AUC: 0.5213 | Loss: 0.04 | time elapse:  00:03:16
----------
Epoch:  1
Train set | Accuracy: 0.7855 | Loss: 0.0394
Validation set | Accuracy: 0.7180 | AUC: 0.6490 | Loss: 0.04 | time elapse:  00:06:33
----------
Epoch:  2
Train set | Accuracy: 0.8277 | Loss: 0.0331
Validation set | Accuracy: 0.7565 | AUC: 0.6435 | Loss: 0.04 | time elapse:  00:09:49
----------
Epoch:  3
Train set | Accuracy: 0.8888 | Loss: 0.0232
Validation set | Accuracy: 0.7858 | AUC: 0.6520 | Loss: 0.05 | time elapse:  00:13:05
----------
Epoch:  4
Train set | Accuracy: 0.9391 | Loss: 0.0133
Validation set | Accuracy: 0.7939 | AUC: 0.6417 | Loss: 0.06 | time elapse:  00:16:21
----------
Best validation auc: 0.6520


In [22]:
#Train full - batch size = 6
train_loss_BERTC, train_acc_BERTC, val_loss_BERTC, val_acc_BERTC, val_auc_BERTC, model_BERTC = trainBERT(modelBERT, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=5,lr =2e-5)

Epoch:  0
Train set | Accuracy: 0.7640 | Loss: 0.0890
Validation set | Accuracy: 0.7782 | AUC: 0.5270 | Loss: 0.08 | time elapse:  00:20:17
----------
Epoch:  1
Train set | Accuracy: 0.7837 | Loss: 0.0787
Validation set | Accuracy: 0.7164 | AUC: 0.6531 | Loss: 0.09 | time elapse:  00:40:36
----------
Epoch:  2
Train set | Accuracy: 0.8277 | Loss: 0.0652
Validation set | Accuracy: 0.7749 | AUC: 0.6364 | Loss: 0.08 | time elapse:  01:00:55
----------
Epoch:  3
Train set | Accuracy: 0.8975 | Loss: 0.0444
Validation set | Accuracy: 0.8037 | AUC: 0.6437 | Loss: 0.09 | time elapse:  01:21:12
----------
Epoch:  4
Train set | Accuracy: 0.9503 | Loss: 0.0232
Validation set | Accuracy: 0.7793 | AUC: 0.6340 | Loss: 0.11 | time elapse:  01:41:36
----------
Best validation auc: 0.6531
