### GPU check

In [None]:
# GPU availability check and device initialization
import torch

# Select the compute device once; later cells read the module-level `device`.
if not torch.cuda.is_available():
    # No usable CUDA device: everything will run on the CPU.
    print('No GPU available, using the CPU instead.')
    device = torch.device("cpu")
else:
    # Route tensors and models to the GPU and report what was found.
    device = torch.device("cuda")
    gpu_count = torch.cuda.device_count()
    print('There are %d GPU(s) available.' % gpu_count)
    gpu_name = torch.cuda.get_device_name(0)
    print('We will use the GPU:', gpu_name)

There are 1 GPU(s) available.
We will use the GPU: Tesla K80


In [None]:
# Pinned to the release recorded in this notebook's install output, so the
# transformers APIs used below (AdamW, encode_plus, ...) stay available.
# %pip (rather than !pip) installs into the environment of the running kernel.
%pip install -q transformers==4.12.5

Collecting transformers
  Downloading transformers-4.12.5-py3-none-any.whl (3.1 MB)
[K     |████████████████████████████████| 3.1 MB 13.2 MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
  Downloading tokenizers-0.10.3-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (3.3 MB)
[K     |████████████████████████████████| 3.3 MB 33.3 MB/s 
Collecting sacremoses
  Downloading sacremoses-0.0.46-py3-none-any.whl (895 kB)
[K     |████████████████████████████████| 895 kB 36.3 MB/s 
Collecting huggingface-hub<1.0,>=0.1.0
  Downloading huggingface_hub-0.1.2-py3-none-any.whl (59 kB)
[K     |████████████████████████████████| 59 kB 7.4 MB/s 
Collecting pyyaml>=5.1
  Downloading PyYAML-6.0-cp37-cp37m-manylinux_2_5_x86_64.manylinux1_x86_64.manylinux_2_12_x86_64.manylinux2010_x86_64.whl (596 kB)
[K     |████████████████████████████████| 596 kB 44.9 MB/s 
Installing collected packages: pyyaml, tokenizers, sacremoses, huggingface-hub, transformers
  Atte

### Essential Library Imports

In [None]:
# Importing essential libraries
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import MultiLabelBinarizer
from sklearn.metrics import f1_score
from sklearn.metrics import label_ranking_average_precision_score,log_loss
import random
import time

import torch.nn as nn
import torch.nn.functional as F
from transformers import BertModel
from transformers import AdamW, get_linear_schedule_with_warmup

from transformers import BertTokenizer
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler, random_split

## Data Preprocessing functions

In [None]:
def split(string):
    """Break a ';'-separated label field into a list of label names.

    The value is converted with ``str`` first, so non-string inputs are
    split on their string representation.
    """
    label_field = str(string)
    return label_field.split(';')

def preprocess_dataset(df, classes=None):
    """Build the model input text and multi-hot label matrix from a raw dataframe.

    Parameters
    ----------
    df : pandas.DataFrame
        Must provide the columns 'title', 'abstract' and 'label'. 'label'
        holds the ';'-separated label names of each row.
    classes : sequence of str, optional
        Fixed label vocabulary (and column order) forwarded to
        ``MultiLabelBinarizer``. With the default ``None`` the vocabulary is
        inferred from ``df`` itself, so two dataframes that do not contain
        exactly the same label set produce differently shaped/ordered label
        matrices. Pass the same ``classes`` for every split to avoid that.

    Returns
    -------
    text : numpy.ndarray
        One string per row: ``title + ' [SEP] ' + abstract``.
    labels : numpy.ndarray
        Multi-hot matrix with one row per sample and one column per label.
    """
    def as_text(value):
        # A missing cell is read by pandas as NaN (a float); concatenating it
        # with a string would raise TypeError, so treat it as empty text.
        return '' if pd.isna(value) else str(value)

    # Append the title and abstract information for text
    title = df['title'].map(as_text)
    abstract = df['abstract'].map(as_text)
    text = (title + ' [SEP] ' + abstract).values

    # Converting labels to a multi-hot encoded matrix
    label_lists = df['label'].apply(split)
    label_mlb = MultiLabelBinarizer(classes=classes)
    labels = np.array(label_mlb.fit_transform(label_lists))

    return text, labels

In [None]:
def prepare_dataloader(text,labels=np.array([]),is_test_data=False,batch_size=4,val_split=0.05):
    """Tokenize ``text`` with the BERT tokenizer and wrap it in PyTorch DataLoaders.

    Parameters
    ----------
    text : iterable of str
        Documents to encode; each one is padded/truncated to 512 tokens.
    labels : array-like, optional
        Label rows aligned with ``text``. Required unless ``is_test_data``.
    is_test_data : bool
        When true, ``labels`` is ignored and a single, unshuffled DataLoader
        yielding ``(input_ids, attention_mask)`` batches is returned.
    batch_size : int
        Batch size of every DataLoader that is created.
    val_split : float
        Fraction of the samples held out for validation (training mode only).

    Returns
    -------
    DataLoader
        If ``is_test_data`` is true.
    (DataLoader, DataLoader)
        ``(train_dataloader, val_dataloader)`` otherwise; batches are
        ``(input_ids, attention_mask, labels)``.

    Raises
    ------
    ValueError
        In training mode, if ``labels`` does not have one row per text.
    """
    # Load the BERT tokenizer.
    print('Loading BERT tokenizer...')
    tokenizer = BertTokenizer.from_pretrained('bert-base-uncased', do_lower_case=True)

    # Tokenizing text
    MAX_LEN = 512
    input_ids = []
    attention_masks = []
    for sent in text:
        encoded_sent = tokenizer.encode_plus(
            text = sent,
            add_special_tokens = True,         # Add `[CLS]` and `[SEP]`
            max_length = MAX_LEN,              # Length to truncate/pad to
            padding = 'max_length',            # Replaces the deprecated pad_to_max_length=True
            truncation = True,                 # Explicit, so no "truncation not activated" warning
            return_attention_mask = True,      # Return attention mask
        )
        input_ids.append(encoded_sent['input_ids'])
        attention_masks.append(encoded_sent['attention_mask'])

    input_ids = torch.tensor(input_ids)
    attention_masks = torch.tensor(attention_masks)

    # For the test dataset: keep the original order and return a single loader
    if is_test_data:
        dataset = TensorDataset(input_ids, attention_masks)
        dataloader = DataLoader(dataset, shuffle = False, batch_size=batch_size)
        return dataloader

    # For the training dataset: fail with a clear message instead of the
    # size-mismatch assertion TensorDataset would raise.
    if len(labels) != len(input_ids):
        raise ValueError(
            'labels must contain one row per text: got {} label rows for {} texts'.format(
                len(labels), len(input_ids)))
    labels = torch.tensor(labels)
    dataset = TensorDataset(input_ids, attention_masks, labels)

    # Create a train-validation split.
    val_size = int(val_split * len(dataset))
    train_size = len(dataset) - val_size
    # Divide the dataset by randomly selecting samples.
    train_dataset, val_dataset = random_split(dataset, [train_size, val_size])
    # Print training and validation size
    print('{:>5,} training samples'.format(train_size))
    print('{:>5,} validation samples'.format(val_size))

    # Creating training and validation dataloaders
    train_dataloader = DataLoader(
            train_dataset,  # The training samples.
            sampler = RandomSampler(train_dataset), # Select batches randomly
            batch_size = batch_size # Trains with this batch size.
        )
    val_dataloader = DataLoader(
            val_dataset, # The validation samples.
            sampler = SequentialSampler(val_dataset), # Pull out batches sequentially.
            batch_size = batch_size # Evaluate with this batch size.
        )

    return train_dataloader, val_dataloader

## BERT with Attention Model classes

In [None]:
class EmbeddingAttention(nn.Module):
    """Token-level attention that re-weights each token embedding.

    A two-layer MLP (Linear -> LeakyReLU -> Linear) scores every token, the
    scores are soft-maxed across the token axis (second-to-last dimension),
    and each embedding is multiplied by its weight. The weights are rescaled
    by the number of tokens so that they average to 1; a subsequent mean over
    tokens then equals the attention-weighted sum of the embeddings.
    """

    def __init__(self, num_input_features, num_hidden_features):
        """
        @param   num_input_features: size of each token embedding
        @param   num_hidden_features: width of the hidden scoring layer
        """
        super(EmbeddingAttention,self).__init__()
        self.l1 = nn.Linear(num_input_features,num_hidden_features)
        self.act_1 = nn.LeakyReLU()
        self.l2 = nn.Linear(num_hidden_features, 1) # the final attention weight for the input
        self.attn = nn.Softmax(dim=-2)
        # Placeholder until the first forward pass stores real weights.
        self.attention_weights = torch.zeros((1,1))

    def getAttentionWeights(self):
        """Return the (rescaled) attention weights of the most recent forward pass."""
        return self.attention_weights

    def forward(self,x): # input format ==> (..., num_tokens, num_input_features)
        """Weight every token embedding in ``x`` by its attention weight.

        @param    x (torch.Tensor): embeddings with tokens on the second-to-last
                      axis and features on the last axis
        @return   tensor of the same shape as ``x``
        """
        token_scores = self.l2(self.act_1(self.l1(x)))
        # Rescale by the actual token count (previously hard-coded to 511,
        # which was only right for 512-token inputs with [CLS] removed).
        num_tokens = x.size(-2)
        self.attention_weights = self.attn(token_scores) * num_tokens
        # Broadcasting multiplies every feature of a token by that token's weight.
        return torch.mul(self.attention_weights,x)

In [None]:
class BertClassifier(nn.Module):
    """
        BERT encoder + token attention + linear head for multi-label classification.

        Sub-modules (attribute names are part of the checkpoint format):
          - bert:       pretrained ``bert-base-uncased`` BertModel
          - attention:  EmbeddingAttention over the token embeddings
          - classifier: nn.Sequential holding the single output Linear layer
    """
    def __init__(self, freeze_bert=False, num_labels=7):
        """
        @param   freeze_bert (bool): Set `False` to fine-tune the Bert model,
                     `True` to keep its weights fixed
        @param   num_labels (int): number of output logits (labels)
        """
        super(BertClassifier,self).__init__()
        # Hidden size of Bert and of the attention scoring layer
        bert_hidden_size = 768
        attention_hidden_size = 10

        self.bert = BertModel.from_pretrained("bert-base-uncased")

        self.attention = EmbeddingAttention(bert_hidden_size, attention_hidden_size)

        # Kept inside nn.Sequential so parameter names stay `classifier.0.*`.
        self.classifier = nn.Sequential(
                            nn.Linear(bert_hidden_size, num_labels)
                          )

        # Freeze the Bert Model
        if freeze_bert:
            for param in self.bert.parameters():
                param.requires_grad = False

    def forward(self,input_ids,attention_mask):
        """
        Feed input to BERT and the classifier to compute logits.
        @param    input_ids (torch.Tensor): an input tensor with shape (batch_size,
                      max_length)
        @param    attention_mask (torch.Tensor): a tensor that hold attention mask
                      information with shape (batch_size, max_length)
        @return   logits (torch.Tensor): an output tensor with shape (batch_size,
                      num_labels)
        """

        # Get the BERT word embeddings
        outputs = self.bert(input_ids=input_ids,
                           attention_mask = attention_mask)

        # Pass all the token embeddings (except CLS) to the attention layer.
        # NOTE(review): attention_mask is only given to BERT; padded positions
        # still take part in the attention softmax and the mean below.
        important_tokens = outputs[0][:,1:,:]
        # Call the module itself (not .forward) so registered hooks are honoured.
        attention_out = self.attention(important_tokens)

        # Pool the attention-weighted embeddings of the tokens
        mean_att = torch.mean(attention_out,dim=1)

        # Feed the pooled document embedding to classifier to compute logits
        logit = self.classifier(mean_att)

        return logit

### Model training helper functions

In [None]:
def save_ckp(state):
    """Write a checkpoint dict to 'checkpoint_ep<epoch>.pt' in the working directory.

    ``state['epoch']`` supplies the number used in the file name; the whole
    ``state`` object is serialized with ``torch.save``.
    """
    target_path = f"checkpoint_ep{state['epoch']!s}.pt"
    torch.save(state, target_path)
    
def load_ckp(checkpoint_fpath, model, optimizer,scheduler):
    """Restore model, optimizer and scheduler state from a saved checkpoint.

    The checkpoint is expected to be a dict with the keys 'state_dict',
    'optimizer' and 'scheduler'. All three objects are updated in place and
    also returned as ``(model, optimizer, scheduler)``.
    """
    # Map stored tensors onto the device the model currently lives on, so a
    # checkpoint written on a GPU machine also loads on a CPU-only runtime.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else None
    checkpoint = torch.load(checkpoint_fpath, map_location=target_device)
    model.load_state_dict(checkpoint['state_dict'])
    optimizer.load_state_dict(checkpoint['optimizer'])
    scheduler.load_state_dict(checkpoint['scheduler'])
    return model,optimizer,scheduler

In [None]:
def initialize_model(epochs=4):
    """Create a fresh BertClassifier together with its optimizer and LR scheduler.

    Reads two module-level names: ``device`` (where the model is placed) and
    ``train_dataloader`` (its length fixes the number of scheduler steps).

    @param    epochs (int): number of training epochs the schedule should span
    @return   (model, optimizer, scheduler)
    """
    # BERT stays trainable (freeze_bert=False) and the model is moved to `device`.
    model = BertClassifier(freeze_bert=False)
    model.to(device)

    # AdamW over every parameter of the model.
    adamw = AdamW(
        model.parameters(),
        lr=5e-5,
        eps=1e-8,  # Default epsilon value
    )

    # Linear decay without warm-up over (batches per epoch) * epochs steps.
    lr_schedule = get_linear_schedule_with_warmup(
        adamw,
        num_warmup_steps=0,  # Default value
        num_training_steps=len(train_dataloader) * epochs,
    )
    return model, adamw, lr_schedule

In [None]:
# Loss function shared by train() and evaluate(). BCEWithLogitsLoss takes the
# raw logits and float 0/1 targets, scoring each label independently
# (multi-label setting).
loss_fn = nn.BCEWithLogitsLoss()

def set_seed(seed_value=42):
    """Seed Python's, NumPy's and PyTorch's (CPU and CUDA) random generators."""
    seeders = (
        random.seed,
        np.random.seed,
        torch.manual_seed,
        torch.cuda.manual_seed_all,
    )
    for seed_generator in seeders:
        seed_generator(seed_value)

def train(model, train_dataloader, val_dataloader=None, epochs=4, evaluation=False, save_checkpoints=False):
    """Train the BertClassifier model.

    Uses the module-level ``optimizer``, ``scheduler``, ``loss_fn`` and
    ``device``; they must exist before this function is called.

    @param    model: the model to optimise
    @param    train_dataloader: yields (input_ids, attention_mask, labels) batches
    @param    val_dataloader: validation batches; required when ``evaluation`` is set
    @param    epochs (int): number of passes over ``train_dataloader``
    @param    evaluation (bool): run ``evaluate`` on ``val_dataloader`` after each epoch
    @param    save_checkpoints (bool): when true, write a checkpoint through
                  ``save_ckp`` after every epoch (off by default)
    @raises   ValueError: if ``evaluation`` is requested without a ``val_dataloader``
    """
    # Fail before training starts rather than after a full epoch has been spent.
    if evaluation and val_dataloader is None:
        raise ValueError("evaluation=True requires a val_dataloader")

    # Number of batches between two progress rows (the last batch of an epoch
    # is always reported).
    log_every = 50000

    # Start training loop
    print("Start training...\n")
    for epoch_i in range(epochs):

        # =======================================
        #               Training
        # =======================================
        # Print the header of the result table
        print(f"{'Epoch':^7} | {'Batch':^7} | {'Train Loss':^12} | {'Val Loss':^10} | {'Val Acc':^9} | {'Elapsed':^9}")
        print("-"*70)

        # Measure the elapsed time of each epoch
        t0_epoch, t0_batch = time.time(), time.time()

        # Reset tracking variables at the beginning of each epoch
        total_loss, batch_loss, batch_counts = 0, 0, 0

        # Put the model into the training mode (evaluate() switches it to eval mode)
        model.train()

        # For each batch of training data...
        for step, batch in enumerate(train_dataloader):
            batch_counts +=1
            # Load batch to the selected device
            b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

            # Zero out any previously calculated gradients
            model.zero_grad()

            # Perform a forward pass. This will return logits.
            logits = model(b_input_ids, b_attn_mask)

            # Compute loss and accumulate the loss values
            loss = loss_fn(logits, b_labels.float())
            loss_value = loss.item()
            batch_loss += loss_value
            total_loss += loss_value

            # Perform a backward pass to calculate gradients
            loss.backward()

            # Clip the norm of the gradients to 1.0 to prevent "exploding gradients"
            torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

            # Update parameters and the learning rate
            optimizer.step()
            scheduler.step()

            # Print the loss values and time elapsed every `log_every` batches
            # and on the final batch of the epoch
            if (step % log_every == 0 and step != 0) or (step == len(train_dataloader) - 1):
                # Time elapsed since the previous progress row
                time_elapsed = time.time() - t0_batch

                # Print training results
                print(f"{epoch_i + 1:^7} | {step:^7} | {batch_loss / batch_counts:^12.6f} | {'-':^10} | {'-':^9} | {time_elapsed:^9.2f}")

                # Reset batch tracking variables
                batch_loss, batch_counts = 0, 0
                t0_batch = time.time()

        # Calculate the average loss over the entire training data
        avg_train_loss = total_loss / len(train_dataloader)

        # Optionally save a checkpoint after every epoch; the state dicts are
        # only collected when they are actually going to be written.
        if save_checkpoints:
            save_ckp({
                'epoch': epoch_i + 1,
                'state_dict': model.state_dict(),
                'optimizer': optimizer.state_dict(),
                'scheduler': scheduler.state_dict()
            })

        print("-"*70)
        # =======================================
        #               Evaluation
        # =======================================
        if evaluation:
            # After the completion of each training epoch, measure the model's performance
            # on our validation set.
            val_loss, val_accuracy = evaluate(model, val_dataloader)

            # Time taken by the whole epoch, including evaluation
            time_elapsed = time.time() - t0_epoch

            print(f"{epoch_i + 1:^7} | {'-':^7} | {avg_train_loss:^12.6f} | {val_loss:^10.6f} | {val_accuracy:^9.2f} | {time_elapsed:^9.2f}")
            print("-"*70)
        print("\n")

    print("Training complete!")


def evaluate(model, val_dataloader):
    """After the completion of each training epoch, measure the model's performance
    on our validation set.

    Uses the module-level ``device`` and ``loss_fn``. The model is left in
    evaluation mode when the function returns.

    @param    model: the model to score
    @param    val_dataloader: yields (input_ids, attention_mask, labels) batches
    @return   (val_loss, val_accuracy): unweighted means of the per-batch loss
                  and per-batch thresholded accuracy
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    # Tracking variables
    val_accuracy = []
    val_loss = []

    # For each batch in our validation set...
    for batch in val_dataloader:
        # Load batch to the selected device
        b_input_ids, b_attn_mask, b_labels = tuple(t.to(device) for t in batch)

        # Compute logits and loss without tracking gradients.
        # (The per-batch dump of the attention tensor was removed: it buried
        # the training table under thousands of output lines.)
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
            loss = loss_fn(logits, b_labels.float())
        val_loss.append(loss.item())

        # Calculate the accuracy rate; the label count is read from the logits
        # instead of being hard-coded.
        num_labels = logits.size(-1)
        accuracy = accuracy_thresh(logits.view(-1, num_labels), b_labels.view(-1, num_labels))
        val_accuracy.append(accuracy)

    # Compute the average accuracy and loss over the validation set.
    val_loss = np.mean(val_loss)
    val_accuracy = np.mean(val_accuracy)
    return val_loss, val_accuracy


def accuracy_thresh(y_pred, y_true, thresh:float=0.5, sigmoid:bool=True):
    """Fraction of entries where the thresholded prediction equals the target.

    `y_pred` and `y_true` must have the same size. With `sigmoid` set, `y_pred`
    is treated as logits and squashed before it is compared with `thresh`.
    """
    scores = y_pred.sigmoid() if sigmoid else y_pred
    predicted_positive = scores > thresh
    matches = predicted_positive == y_true.byte()
    return matches.float().mean().item()

### Predictions and evaluation helper functions

In [None]:
def bert_predict(model, test_dataloader):
    """Perform a forward pass on the trained BERT model to predict probabilities and attention weights
    on the test set.

    Uses the module-level ``device``. Only the first two tensors of every
    batch (input ids and attention mask) are used.

    @param    model: a model exposing ``model.attention.getAttentionWeights()``
    @param    test_dataloader: yields batches starting with (input_ids, attention_mask)
    @return   (probs, attention_weights): NumPy arrays with the sigmoid
                  probabilities and the attention weights of all samples,
                  concatenated along the first axis in dataloader order
    """
    # Put the model into the evaluation mode. The dropout layers are disabled during
    # the test time.
    model.eval()

    all_probs = []
    attention_weights = []

    # For each batch in our test set...
    for batch in test_dataloader:
        # Load batch to the selected device
        b_input_ids, b_attn_mask = tuple(t.to(device) for t in batch)[:2]

        # Compute logits and attention weights
        with torch.no_grad():
            logits = model(b_input_ids, b_attn_mask)
        att_weight = model.attention.getAttentionWeights()

        # Move the results to the CPU right away so the whole test set is not
        # accumulated in device memory. (The per-batch print of the attention
        # tensor was removed; it flooded the cell output.)
        all_probs.append(logits.sigmoid().cpu())
        attention_weights.append(att_weight.detach().cpu())

    # Concatenate probabilities and weights from each batch
    probs = torch.cat(all_probs, dim=0).numpy()
    attention_weights_final = torch.cat(attention_weights,dim=0).numpy()

    return probs,attention_weights_final

In [None]:
def weighted_f1(labels, preds, threshold=0.5):
    """Binarize probabilities at ``threshold`` and return the weighted F1 score.

    Parameters
    ----------
    labels : array-like
        Ground-truth multi-hot label matrix.
    preds : array-like
        Predicted probabilities, same shape as ``labels``. Not modified.
    threshold : float
        Values strictly greater than ``threshold`` count as positive.

    Returns
    -------
    float
        ``f1_score(labels, binarized_preds, average='weighted')``.
    """
    # Threshold into a new array: writing into `preds` would overwrite the
    # caller's probabilities, which are still needed for ranking metrics.
    binary_preds = (np.asarray(preds) > threshold).astype(int)

    scores = f1_score(labels, binary_preds, average='weighted')
    return scores

## Training the model on the training set

### Preprocessing training data

In [None]:
# Colab only: mount Google Drive at /content/drive so the dataset CSVs and
# checkpoints referenced below are reachable. force_remount=True re-mounts
# even when the drive is already mounted.
from google.colab import drive
drive.mount('/content/drive',force_remount=True)

Mounted at /content/drive


In [None]:
# Raw training split; preprocess_dataset reads its 'title', 'abstract' and 'label' columns.
train_data = pd.read_csv('/content/drive/MyDrive/Information Retrieval/BC7-LitCovid-Train.csv')

In [None]:
# Seed every RNG first so that the random train/validation split made by
# prepare_dataloader (and the model initialisation that follows) is
# reproducible when the notebook is re-run from a fresh kernel.
set_seed(2021)

# Only the first 10,000 training rows are used; 15% of them are held out for validation.
train_text, train_labels = preprocess_dataset(train_data.head(10000))
train_dataloader, val_dataloader = prepare_dataloader(text=train_text,labels=train_labels,batch_size=4,val_split=0.15)

Loading BERT tokenizer...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


8,500 training samples
1,500 validation samples


### Training...

In [None]:
# Training setup: choose the epoch count and build the model, optimizer and scheduler.

epochs=6   # Set number of epochs for training

# Initialize the model. initialize_model reads the global `train_dataloader`,
# so the data-preparation cell must have been run first. `optimizer` and
# `scheduler` are later picked up as globals by train().
bert_classifier, optimizer, scheduler = initialize_model(epochs=epochs)
#
# Optional: resume from a previously saved checkpoint instead of starting fresh
# file = '/content/drive/MyDrive/Information Retrieval/AttentionV1/v1_model.pt'   
# load_ckp(file,bert_classifier, optimizer, scheduler)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.transform.LayerNorm.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight', 'cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [None]:
# Experiment note: single classifier layer, smaller attention hidden layer, more data.
# Train for `epochs` epochs and report validation loss/accuracy after each one.
train(bert_classifier, train_dataloader, val_dataloader, epochs=epochs, evaluation=True)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         ...,
         [1.0202],
         [1.0198],
         [1.0269]]], device='cuda:0')
tensor([[[0.9997],
         [0.9296],
         [0.9622],
         ...,
         [1.0131],
         [1.0164],
         [1.0187]],

        [[0.9856],
         [1.0066],
         [0.9978],
         ...,
         [1.0262],
         [1.0212],
         [1.0304]],

        [[1.0113],
         [0.9660],
         [0.9525],
         ...,
         [1.0412],
         [1.0390],
         [1.0444]],

        [[0.9760],
         [0.9623],
         [1.0005],
         ...,
         [0.9854],
         [1.0024],
         [0.9949]]], device='cuda:0')
tensor([[[0.9784],
         [1.0103],
         [0.9645],
         ...,
         [1.0335],
         [1.0179],
         [1.0375]],

        [[0.9189],
         [0.9876],
         [0.9841],
         ...,
         [1.0177],
         [1.0160],
         [1.0164]],

        [[1.0051],
         [1.0003],
         [

## Evaluating the model on the test set

### Preprocess test data

In [None]:
# Reading and preprocessing test dataset (the "Dev" CSV is used as the held-out test set).
# NOTE(review): preprocess_dataset fits a new label binarizer on every call, so
# test_labels columns line up with the training labels only if both splits
# contain exactly the same label set - confirm.
test_data = pd.read_csv('/content/drive/MyDrive/Information Retrieval/BC7-LitCovid-Dev.csv')
test_text, test_labels = preprocess_dataset(test_data)
# batch_size=1 and is_test_data=True: one unshuffled loader without labels.
test_dataloader = prepare_dataloader(text=test_text, batch_size=1, is_test_data=True)

Loading BERT tokenizer...


Truncation was not explicitly activated but `max_length` is provided a specific value, please use `truncation=True` to explicitly truncate examples to max length. Defaulting to 'longest_first' truncation strategy. If you encode pairs of sequences (GLUE-style) with the tokenizer you can select this strategy more precisely by providing a specific strategy to `truncation`.


### Load the trained model (if needed)

In [None]:
set_seed(2021) # Fix the RNG seeds before the model is built

# Initialize the model (BERT weights frozen; this instance is only used for inference)
final_model = BertClassifier(freeze_bert=True)
final_model.to(device)

# Load model from a file
# f_file = '/content/drive/MyDrive/Information Retrieval/AttentionV1/v1_model.pt'
f_file = '/content/drive/MyDrive/Information Retrieval/Bert-Attention/checkpoint_ep20.pt'
# Load from the path defined just above (the previous code read an undefined
# `file` variable and only worked with leftover kernel state). map_location
# lets a checkpoint saved on a GPU load on a CPU-only runtime as well.
ckp = torch.load(f_file, map_location=device)
final_model.load_state_dict(ckp['state_dict'])

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertModel: ['cls.predictions.decoder.weight', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.bias', 'cls.seq_relationship.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.seq_relationship.bias', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.dense.weight']
- This IS expected if you are initializing BertModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


<All keys matched successfully>

### Predictions and Evaluation

In [None]:
# Getting the predictions and attention weights on the test set.
# NOTE(review): this scores the in-memory `bert_classifier` from the training
# cells, not the `final_model` loaded from a checkpoint above - confirm which
# model is meant to be evaluated.
predictions,attention_weights = bert_predict(bert_classifier,test_dataloader)

[1;30;43mStreaming output truncated to the last 5000 lines.[0m
         [0.9943],
         [1.0064],
         [0.9979],
         [0.9988],
         [0.9602],
         [1.0003],
         [1.0081],
         [0.9936],
         [0.9814],
         [1.0099],
         [1.0041],
         [1.0106],
         [0.9932],
         [1.0139],
         [1.0048],
         [1.0041],
         [1.0170],
         [0.9824],
         [1.0151],
         [0.9978],
         [0.9891],
         [1.0062],
         [1.0185],
         [0.9987],
         [1.0195],
         [1.0178],
         [1.0265],
         [1.0261],
         [1.0211],
         [0.9849],
         [1.0057],
         [1.0028],
         [0.9815],
         [0.9809],
         [0.9930],
         [1.0054],
         [1.0067],
         [0.9980],
         [0.9796],
         [1.0144],
         [0.9960],
         [0.9876],
         [0.9874],
         [0.9932],
         [0.9999],
         [0.9963],
         [0.9837],
         [1.0170],
         [1.0200],
    

In [None]:
# Evaluating the predictions
# F1 is computed on thresholded labels, LRAP on the ranking of the raw
# probabilities. weighted_f1 is given a copy so that the probability array is
# guaranteed to reach the ranking metric unthresholded.
f1 = weighted_f1(test_labels, predictions.copy())
print("F1 score :", f1)
lrap_score = label_ranking_average_precision_score(test_labels, predictions)
print("LRAP score:",lrap_score)

F1 score : 0.8875371405027642
LRAP score: 0.8869981325863693


## Further analysis...

In [None]:
# Smallest attention weight over all test samples and tokens.
attention_weights.min()

0.111121185

In [None]:
# Largest attention weight over all test samples and tokens.
attention_weights.max()

1.7247238

In [None]:
# Persist only the trained weights (the model's state_dict) to Drive; optimizer
# and scheduler state are not part of this file.
torch.save(bert_classifier.state_dict(), '/content/drive/MyDrive/Information Retrieval/mini_trained_model.pt')