In [3]:
###Note: This is a sample training and results for BERT Base - please see full results at result_test.ipynb

In [2]:
import pandas as pd
import numpy as np

#from sklearn import preprocessing
from collections import Counter
import re
import torch
from torch.nn.utils.rnn import pad_sequence
from torch.nn.utils.rnn import pack_padded_sequence
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, Dataset, TensorDataset
import copy

import time
import warnings
warnings.filterwarnings("ignore")

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score

import matplotlib.pyplot as plt
import seaborn as sns

# Data Loader - base

In [4]:
dtrain = pd.read_csv('project_dtrain.csv', index_col = 0)
dval = pd.read_csv('project_dval.csv', index_col = 0)
dtest = pd.read_csv('project_dtest.csv', index_col = 0)

In [15]:
#Check if CUDA is available
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla V100-SXM2-16GB


# BERT - PRETRAINED

In [3]:
#Check if CUDA is available
if torch.cuda.is_available():    
    # Tell PyTorch to use the GPU.    
    device = torch.device("cuda")

    print('There are %d GPU(s) available.' % torch.cuda.device_count())

    print('We will use the GPU:', torch.cuda.get_device_name(0))

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [4]:
import transformers
from transformers import BertModel, BertTokenizer
from transformers import BertModel, BertForSequenceClassification
from transformers.optimization import AdamW

In [5]:
dtrain = pd.read_csv('project_dtrain.csv', index_col = 0)
dval = pd.read_csv('project_dval.csv', index_col = 0)
dtest = pd.read_csv('project_dtest.csv', index_col = 0)

In [6]:
#Preprocess data using BERT tokenizer
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased',do_lower_case=True)
def bert_load(data):
    '''
    Load in data
    Return BERT's preprocessed inputs including token_id, mask, label
    '''
    token_ids = []
    attention_masks = []
    for row in data['TEXT']:
        row = re.sub('[\\(\[#.!?,\'\/\])0-9]', ' ', row)
        encoded_dict = tokenizer.encode_plus(row,
                                            add_special_tokens= True, #add [CLS], [SEP]
                                            max_length = 512,  
                                            pad_to_max_length = True, #pad and truncate
                                            return_attention_mask = True, #construct attention mask
                                            return_tensors = 'pt') #return pytorch tensor
        
        token_ids.append(encoded_dict['input_ids'])
        attention_masks.append(encoded_dict['attention_mask'])
    
    token_ids = torch.cat(token_ids,dim=0)
    attention_masks = torch.cat(attention_masks,dim=0)
    labels = torch.tensor(data['READMIT'].values)
    data_out = TensorDataset(token_ids, attention_masks, labels)
    return data_out
        
datatrain = bert_load(dtrain)   
dataval = bert_load(dval)
datatest = bert_load(dtest)

In [7]:
BATCH_SIZE = 12
train_loaderB = DataLoader(datatrain,
                           batch_size=BATCH_SIZE,
                           shuffle=True)
                           

val_loaderB = DataLoader(dataval,
                         batch_size=BATCH_SIZE,
                         shuffle= True)
                         

test_loaderB = DataLoader(datatest,
                         batch_size=BATCH_SIZE,
                         shuffle= False)

In [8]:
torch.manual_seed(2020)
def trainBERT(model, train_loader, val_loader, num_epoch=20):
    # Training steps
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=2e-5, eps= 1e-8) 
    
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []
    auc = []
    best_auc = 0.
    best_model = copy.deepcopy(model.state_dict())

    for epoch in range(num_epoch):
        model.train()
        #Initialize
        correct = 0
        total = 0
        total_loss = 0

        for i, (data, mask, labels) in enumerate(train_loader):
            data, mask, labels = data.to(device), mask.to(device), labels.to(device)
            model.zero_grad()

            loss, outputs = model(data, token_type_ids = None,
                                  attention_mask= mask,
                                  labels =labels)

            loss.backward()
            optimizer.step()
            label_cpu = labels.squeeze().to('cpu').numpy()
            pred = outputs.data.max(-1)[1].to('cpu').numpy()
            total += labels.size(0)
            correct += float(sum((pred ==label_cpu)))
            total_loss += loss.item()

        acc = correct/total
        t_loss = total_loss/total
        train_loss.append(t_loss)
        train_acc.append(acc)
        # report performance
        
        print('Epoch: ',epoch)
        print('Train set | Accuracy: {:6.4f} | Loss: {:6.4f}'.format(acc, t_loss))     
    
    # Evaluate after every epoch
        #Reset the initialization
        correct = 0
        total = 0
        total_loss = 0
        model.eval()
        
        predictions =[]
        truths= []

        with torch.no_grad():
            for i, (data, mask, labels) in enumerate(val_loader):
                data, mask, labels = data.to(device), mask.to(device), labels.to(device)
                model.zero_grad()

                va_loss, outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =labels)

                label_cpu = labels.squeeze().to('cpu').numpy()
                
                pred = outputs.data.max(-1)[1].to('cpu').numpy()
                total += labels.size(0)
                correct += float(sum((pred ==label_cpu)))
                total_loss += va_loss.item()
                
                predictions += list(pred)
                truths += list(label_cpu)
                       
            v_acc = correct/total
            v_loss = total_loss/total
            val_loss.append(v_loss)
            val_acc.append(v_acc)
            
            v_auc = roc_auc_score(truths, predictions)
            auc.append(v_auc)
            
            elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
            print('Validation set | Accuracy: {:6.4f} | AUC: {:6.4f} | Loss: {:4.2f} | time elapse: {:>9}'.format(
                v_acc, v_auc, v_loss, elapse))
            print('-'*10)
            
            if v_auc > best_auc:
                best_auc = v_auc
                best_model = copy.deepcopy(model.state_dict())

    print('Best validation auc: {:6.4f}'.format(best_auc))
    model.load_state_dict(best_model)     
    return train_loss, train_acc, val_loss, val_acc, v_auc, model
        

In [30]:
# del modelBERT
# torch._C._cuda_emptyCache()

In [9]:
#Empty cache
#torch.cuda.empty_cache()
modelBERT = BertForSequenceClassification.from_pretrained("bert-base-uncased", num_labels=2)
modelBERT.to(device)

BertForSequenceClassification(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0): BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, element

## PRETRAINED BERT 

In [106]:
#Train BERT with batch size 12 for 2 epochs
train_loss_BERT, train_acc_BERT, val_loss_BERT, val_acc_BERT, val_auc_BERT, model_BERT = trainBERT(modelBERT, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=2)

Epoch:  0
Train set | Accuracy: 0.7830 | Loss: 0.0393
Validation set | Accuracy: 0.7316 | AUC: 0.5834 | Loss: 0.04 | time elapse:  00:03:14
----------
Epoch:  1
Train set | Accuracy: 0.8137 | Loss: 0.0351
Validation set | Accuracy: 0.7679 | AUC: 0.6232 | Loss: 0.04 | time elapse:  00:06:29
----------
Best validation accuracy: 0.7679


In [10]:
#Train BERT with batch size 12 for 6 epochs
train_loss_BERT2, train_acc_BERT2, val_loss_BERT2, val_acc_BERT2, val_auc_BERT2, model_BERT2 = trainBERT(modelBERT, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=6)

Epoch:  0
Train set | Accuracy: 0.7754 | Loss: 0.0446
Validation set | Accuracy: 0.7766 | AUC: 0.5000 | Loss: 0.04 | time elapse:  00:05:33
----------
Epoch:  1
Train set | Accuracy: 0.7761 | Loss: 0.0413
Validation set | Accuracy: 0.7495 | AUC: 0.5889 | Loss: 0.04 | time elapse:  00:11:07
----------
Epoch:  2
Train set | Accuracy: 0.8034 | Loss: 0.0363
Validation set | Accuracy: 0.7088 | AUC: 0.6241 | Loss: 0.04 | time elapse:  00:16:40
----------
Epoch:  3
Train set | Accuracy: 0.8535 | Loss: 0.0296
Validation set | Accuracy: 0.7625 | AUC: 0.6154 | Loss: 0.04 | time elapse:  00:22:13
----------
Epoch:  4
Train set | Accuracy: 0.9139 | Loss: 0.0195
Validation set | Accuracy: 0.7749 | AUC: 0.6269 | Loss: 0.06 | time elapse:  00:27:46
----------
Epoch:  5
Train set | Accuracy: 0.9461 | Loss: 0.0123
Validation set | Accuracy: 0.7858 | AUC: 0.6218 | Loss: 0.06 | time elapse:  00:33:20
----------
Best validation auc: 0.6269


In [None]:
import matplotlib.pyplot as plt
epoch = np.arange(0,6,1)

plt.figure(figsize=(10,7))
plt.plot(epoch, train_loss_BERT2, label='train loss')
plt.plot(epoch, val_loss_BERT2, label='validation loss')
plt.title('Plot of train and validation loss per epoch',fontsize = 15)
plt.xlabel('epoch',fontsize = 15)
plt.xticks(epoch)
plt.ylabel('loss',fontsize = 15)
plt.legend()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(epoch, train_acc_BERT2, label='train accuracy')
plt.plot(epoch, val_acc_BERT2, label='validation accuracy')
plt.title('Plot of train and validation accuracy per epoch',fontsize=15)
plt.xlabel('epoch',fontsize = 15)
plt.ylabel('accuracy rate',fontsize = 15)
plt.xticks(epoch)
plt.legend()

In [None]:
plt.figure(figsize=(10,7))
plt.plot(epoch, train_auc_BERT2, label='Best AUC: 0.6269')
plt.title('Plot of train and validation AUC per epoch',fontsize=15)
plt.xlabel('epoch',fontsize = 15)
plt.ylabel('auc',fontsize = 15)
plt.xticks(epoch)
plt.legend()

In [12]:
torch.save(model_BERT2,'model_bert_pretrained.pth')

In [13]:
#TEST SET
torch.manual_seed(2020)
model = model_BERT2.to(device)
model.eval()
total =0.
correct = 0.
predictions =[]
truths= []

with torch.no_grad():        
    for i, (data, mask, labels) in enumerate(test_loaderB):
        data, mask, labels = data.to(device), mask.to(device), labels.to(device)
        
        model.zero_grad()

        _, outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =labels)

        label_cpu = labels.squeeze().to('cpu').numpy()
        pred = outputs.data.max(-1)[1].to('cpu').numpy()
        total += labels.size(0)
        correct += float(sum((pred ==label_cpu))) 
        
        predictions += list(pred)
        truths += list(label_cpu)
        
    v_auc = roc_auc_score(truths, predictions)
    v_acc = correct/total
    
    
print('Test set | Accuracy: {:6.4f}'.format(v_acc))
print('Test set | AUC: {:6.4f}'.format(v_auc))

Test set | Accuracy: 0.8021
Test set | AUC: 0.6530


## TRAIN CLASSIFIER

In [20]:
#Freeze all layers except the classifier -> train classifier
torch.cuda.empty_cache()
model = model_BERT2.to(device)

In [21]:
for param in model.bert.parameters():
    param.requires_grad = False

In [None]:
## TRIAL 1- forget to mention classifier

In [17]:
train_loss_BERT_fc, train_acc_BERT_fc, val_loss_BERT_fc, val_acc_BERT_fc,val_auc_BERT_fc, model_BERT2_fc = trainBERT(model, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=5)

Epoch:  0
Train set | Accuracy: 0.9682 | Loss: 0.0088
Validation set | Accuracy: 0.7652 | AUC: 0.6362 | Loss: 0.07 | time elapse:  00:02:24
----------
Epoch:  1
Train set | Accuracy: 0.9640 | Loss: 0.0092
Validation set | Accuracy: 0.7652 | AUC: 0.6362 | Loss: 0.07 | time elapse:  00:04:49
----------
Epoch:  2
Train set | Accuracy: 0.9637 | Loss: 0.0091
Validation set | Accuracy: 0.7652 | AUC: 0.6370 | Loss: 0.07 | time elapse:  00:07:14
----------
Epoch:  3
Train set | Accuracy: 0.9640 | Loss: 0.0089
Validation set | Accuracy: 0.7657 | AUC: 0.6356 | Loss: 0.07 | time elapse:  00:09:39
----------
Epoch:  4
Train set | Accuracy: 0.9642 | Loss: 0.0091
Validation set | Accuracy: 0.7652 | AUC: 0.6362 | Loss: 0.07 | time elapse:  00:12:04
----------
Best validation auc: 0.6370


In [18]:
#TEST SET
torch.manual_seed(2020)
model =  model_BERT2_fc.to(device)
model.eval()
total =0.
correct = 0.
predictions =[]
truths= []

with torch.no_grad():        
    for i, (data, mask, labels) in enumerate(test_loaderB):
        data, mask, labels = data.to(device), mask.to(device), labels.to(device)
        
        model.zero_grad()

        _, outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =labels)

        label_cpu = labels.squeeze().to('cpu').numpy()
        pred = outputs.data.max(-1)[1].to('cpu').numpy()
        total += labels.size(0)
        correct += float(sum((pred ==label_cpu))) 
        
        predictions += list(pred)
        truths += list(label_cpu)
        
    v_auc = roc_auc_score(truths, predictions)
    v_acc = correct/total
    
    
print('Test set | Accuracy: {:6.4f}'.format(v_acc))
print('Test set | AUC: {:6.4f}'.format(v_auc))

Test set | Accuracy: 0.7923
Test set | AUC: 0.6683


In [None]:
## TRIAL 2-mmention classifier + increase learning rate

In [None]:
for param in model.bert.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

In [24]:
#torch.manual_seed(2020)
def trainBERT_classifier(model, train_loader, val_loader, num_epoch=20, lr = 0.001):
    # Training steps
    start_time = time.time()
    loss_fn = nn.CrossEntropyLoss()
    optimizer = AdamW(model.parameters(), lr=lr, eps= 1e-8) 
    
    train_loss = []
    train_acc = []
    val_loss = []
    val_acc = []
    auc = []
    best_auc = 0.
    best_model = copy.deepcopy(model.state_dict())

    for epoch in range(num_epoch):
        model.train()
        #Initialize
        correct = 0
        total = 0
        total_loss = 0

        for i, (data, mask, labels) in enumerate(train_loader):
            data, mask, labels = data.to(device), mask.to(device), labels.to(device)
            model.zero_grad()

            loss, outputs = model(data, token_type_ids = None,
                                  attention_mask= mask,
                                  labels =labels)

            loss.backward()
            optimizer.step()
            label_cpu = labels.squeeze().to('cpu').numpy()
            pred = outputs.data.max(-1)[1].to('cpu').numpy()
            total += labels.size(0)
            correct += float(sum((pred ==label_cpu)))
            total_loss += loss.item()

        acc = correct/total
        t_loss = total_loss/total
        train_loss.append(t_loss)
        train_acc.append(acc)
        # report performance
        
        print('Epoch: ',epoch)
        print('Train set | Accuracy: {:6.4f} | Loss: {:6.4f}'.format(acc, t_loss))     
    
    # Evaluate after every epoch
        #Reset the initialization
        correct = 0
        total = 0
        total_loss = 0
        model.eval()
        
        predictions =[]
        truths= []

        with torch.no_grad():
            for i, (data, mask, labels) in enumerate(val_loader):
                data, mask, labels = data.to(device), mask.to(device), labels.to(device)
                model.zero_grad()

                va_loss, outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =labels)

                label_cpu = labels.squeeze().to('cpu').numpy()
                
                pred = outputs.data.max(-1)[1].to('cpu').numpy()
                total += labels.size(0)
                correct += float(sum((pred ==label_cpu)))
                total_loss += va_loss.item()
                
                predictions += list(pred)
                truths += list(label_cpu)
                       
            v_acc = correct/total
            v_loss = total_loss/total
            val_loss.append(v_loss)
            val_acc.append(v_acc)
            
            v_auc = roc_auc_score(truths, predictions)
            auc.append(v_auc)
            
            elapse = time.strftime('%H:%M:%S', time.gmtime(int((time.time() - start_time))))
            print('Validation set | Accuracy: {:6.4f} | AUC: {:6.4f} | Loss: {:4.2f} | time elapse: {:>9}'.format(
                v_acc, v_auc, v_loss, elapse))
            print('-'*10)
            
            if v_auc > best_auc:
                best_auc = v_auc
                best_model = copy.deepcopy(model.state_dict())

    print('Best validation auc: {:6.4f}'.format(best_auc))
    model.load_state_dict(best_model)     
    return train_loss, train_acc, val_loss, val_acc, v_auc, model
        

In [25]:
train_loss_BERT_fc2, train_acc_BERT_fc2, val_loss_BERT_fc2, val_acc_BERT_fc2,val_auc_BERT_fc2, model_BERT2_fc2 = trainBERT_classifier(model, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=5)

Epoch:  0
Train set | Accuracy: 0.9665 | Loss: 0.0091
Validation set | Accuracy: 0.7684 | AUC: 0.6348 | Loss: 0.06 | time elapse:  00:02:24
----------
Epoch:  1
Train set | Accuracy: 0.9653 | Loss: 0.0087
Validation set | Accuracy: 0.7608 | AUC: 0.6472 | Loss: 0.08 | time elapse:  00:04:50
----------
Epoch:  2
Train set | Accuracy: 0.9684 | Loss: 0.0086
Validation set | Accuracy: 0.7739 | AUC: 0.6340 | Loss: 0.09 | time elapse:  00:07:15
----------
Epoch:  3
Train set | Accuracy: 0.9647 | Loss: 0.0087
Validation set | Accuracy: 0.7749 | AUC: 0.6407 | Loss: 0.07 | time elapse:  00:09:40
----------
Epoch:  4
Train set | Accuracy: 0.9656 | Loss: 0.0092
Validation set | Accuracy: 0.7603 | AUC: 0.6460 | Loss: 0.08 | time elapse:  00:12:06
----------
Best validation auc: 0.6472


In [26]:
#TEST SET
torch.manual_seed(2020)
model =  model_BERT2_fc2.to(device)
model.eval()
total =0.
correct = 0.
predictions =[]
truths= []

with torch.no_grad():        
    for i, (data, mask, labels) in enumerate(test_loaderB):
        data, mask, labels = data.to(device), mask.to(device), labels.to(device)
        
        model.zero_grad()

        _, outputs = model(data, token_type_ids = None,
                                      attention_mask= mask,
                                      labels =labels)

        label_cpu = labels.squeeze().to('cpu').numpy()
        pred = outputs.data.max(-1)[1].to('cpu').numpy()
        total += labels.size(0)
        correct += float(sum((pred ==label_cpu))) 
        
        predictions += list(pred)
        truths += list(label_cpu)
        
    v_auc = roc_auc_score(truths, predictions)
    v_acc = correct/total
    
    
print('Test set | Accuracy: {:6.4f}'.format(v_acc))
print('Test set | AUC: {:6.4f}'.format(v_auc))

Test set | Accuracy: 0.7852
Test set | AUC: 0.6724


In [36]:
torch.save(model_BERT2_fc2,'model_bert_pretrained_fc.pth')

In [33]:
epoch = np.arange(0,5,1)

0.6459870098172155

In [None]:
### Change learning rate 0.0001

In [34]:
model =  model_BERT2.to(device)

for param in model.bert.parameters():
    param.requires_grad = False

for param in model.classifier.parameters():
    param.requires_grad = True

In [35]:
train_loss_BERT_fc3, train_acc_BERT_fc3, val_loss_BERT_fc3, val_acc_BERT_fc3,val_auc_BERT_fc3, model_BERT2_fc3 = trainBERT_classifier(model, 
                                                                                     train_loaderB, 
                                                                                     val_loaderB, 
                                                                                     num_epoch=5,
                                                                                    lr = 0.0001)

Epoch:  0
Train set | Accuracy: 0.9664 | Loss: 0.0086
Validation set | Accuracy: 0.7706 | AUC: 0.6397 | Loss: 0.07 | time elapse:  00:02:25
----------
Epoch:  1
Train set | Accuracy: 0.9669 | Loss: 0.0085
Validation set | Accuracy: 0.7674 | AUC: 0.6402 | Loss: 0.07 | time elapse:  00:04:50
----------
Epoch:  2
Train set | Accuracy: 0.9656 | Loss: 0.0085
Validation set | Accuracy: 0.7684 | AUC: 0.6400 | Loss: 0.07 | time elapse:  00:07:16
----------
Epoch:  3
Train set | Accuracy: 0.9684 | Loss: 0.0080
Validation set | Accuracy: 0.7684 | AUC: 0.6409 | Loss: 0.07 | time elapse:  00:09:41
----------
Epoch:  4
Train set | Accuracy: 0.9660 | Loss: 0.0082
Validation set | Accuracy: 0.7711 | AUC: 0.6400 | Loss: 0.07 | time elapse:  00:12:07
----------
Best validation auc: 0.6409
