## Bert-BiLSTM

This notebook performs training and testing of the Bert-BiLSTM model for TSA on the IMDB movie review dataset

In [2]:
# import libraries
import torch
import torch.nn as nn
from torch.utils.data import TensorDataset, DataLoader, RandomSampler, SequentialSampler
import gc, os, time

import numpy as np
import pandas as pd
import seaborn as sns

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from transformers import AutoModel
from transformers import BertModel, BertTokenizer, BertForSequenceClassification
from transformers import AdamW

In [3]:
def set_seed(seed = 0): # Define seed for reproducability
    '''
    set random seed
    '''
    # random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)
device = 'cuda'

### Utility Functions

In [4]:
# Bert-BiLSTM Model definition
class BERT_Bi_Arch(nn.Module): 
    def __init__(self, bert):
        super(BERT_Bi_Arch, self).__init__()
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.softmax = nn.LogSoftmax(dim=1)
        self.lstm = nn.LSTM(768, 256, batch_first=True,bidirectional=True)
        self.linear = nn.Linear(256*2, 2)       

    def forward(self, sent_id, mask):
        sequence_output, pooled_output = self.bert(sent_id, attention_mask=mask, return_dict=False) 
        lstm_output, (h,c) = self.lstm(sequence_output) # extract the 1st token's embeddings
        hidden = torch.cat((lstm_output[:,-1, :256],lstm_output[:,0, 256:]),dim=-1)
        linear_output = self.linear(hidden.view(-1,256*2))
        return self.softmax(linear_output)

In [None]:
# splits dataset into train and test, converts class label values to integer representation
def read_dataset(df): 
    X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, shuffle=True) # train test split of 0.3
    
    # map labels to integers
    polarity_class = {"negative":0, "positive":1} # binary classes
    y_train = y_train.apply(lambda x: polarity_class[x])
    y_test = y_test.apply(lambda x: polarity_class[x])
    
    return X_train.tolist(), y_train, X_test.tolist(), y_test

# convert series to list
def pre_process_dataset(values): 
    new_values = list()
    
    for value in values:
        new_values.append(value)
    return new_values

# tokenize sentence inputs and generate attention masks
def data_process(data, labels): 
    input_ids = []
    attention_masks = []
    bert_tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")
    for sentence in data:
        bert_inp = bert_tokenizer.__call__(sentence, max_length=150, 
                                           padding='max_length', pad_to_max_length=True,
                                           truncation=True, return_token_type_ids=False)
        input_ids.append(bert_inp['input_ids'])
        attention_masks.append(bert_inp['attention_mask'])

    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    labels = np.array(labels)
    return input_ids, attention_masks, labels

# preprocesses and return data in terms of ids, masks and labels
def load_and_process(df): 
    train_data, train_labels, test_data, test_labels = read_dataset(df)

    train_input_ids, train_attention_masks, train_labels = data_process(pre_process_dataset(train_data), train_labels)
    test_input_ids, test_attention_masks, test_labels = data_process(pre_process_dataset(test_data), test_labels)

    return train_input_ids, train_attention_masks, train_labels,\
           test_input_ids, test_attention_masks, test_labels

In [None]:
# Train Model Function
def train(model, loss_function, batch_size, train_dataloader, optimizer):
    model.train()

    total_loss, total_accuracy = 0, 0
    total_preds = [] # empty list to save model predictions

    # iterate over batches
    total = len(train_dataloader)
    for i, batch in enumerate(train_dataloader):

        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>'  *(filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='') # accuracy={total_accuracy}

        batch = [r.to(device) for r in batch] # push the batch to gpu
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()
        
        model.zero_grad() # clear previously calculated gradients
        preds = model(sent_id, mask) # get model predictions for the current batch
        labels = labels.type(torch.LongTensor) # make sure it labels are int64 type
        
        loss = loss_function(preds, labels.to('cuda')) # compute the loss between actual and predicted values
        total_loss += float(loss.item()) # add on to the total loss
        loss.backward() # backward pass to calculate the gradients
        
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip the the gradients to 1.0. It helps in preventing the exploding gradient problem
        optimizer.step() # update parameters

        total_preds.append(preds.detach().cpu().numpy()) # append the model predictions

    gc.collect()
    torch.cuda.empty_cache()
    
    avg_loss = total_loss / (len(train_dataloader)*batch_size) # compute the training loss of the epoch

    # predictions are in the form of (no. of batches, size of batch, no. of classes).
    # reshape the predictions in form of (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)
    
    return avg_loss, total_preds # returns the loss and predictions

In [None]:
# Data loader function
def load_data(df):
    # load dataset
    train_input_ids, train_attention_masks, train_labels,\
    test_input_ids, test_attention_masks, test_labels = load_and_process(df)

    train_df = pd.DataFrame(list(zip(train_input_ids, train_attention_masks)), columns=['input_ids', 'attention_masks'])
    test_df = pd.DataFrame(list(zip(test_input_ids, test_attention_masks)), columns=['input_ids', 'attention_masks'])
    
    # import BERT Model and BERT Tokenizer
    bert = AutoModel.from_pretrained('bert-base-uncased')

    # Tokenization 
    # for train set
    train_seq = torch.tensor(train_df['input_ids'].tolist())
    train_mask = torch.tensor(train_df['attention_masks'].tolist())
    train_y = torch.tensor(train_labels.tolist())

    # for test set
    test_seq = torch.tensor(test_df['input_ids'].tolist())
    test_mask = torch.tensor(test_df['attention_masks'].tolist())
    test_y = torch.tensor(test_labels.tolist())

    # Create DataLoaders 
    batch_size = 16 
    train_data = TensorDataset(train_seq, train_mask, train_y) # wrap tensors
    train_sampler = RandomSampler(train_data) # sampler for sampling the data during training
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) # dataLoader for train set

    return bert, batch_size, train_dataloader, train_y, test_y, train_seq, test_seq, train_mask, test_mask

### Load and Preprocess Dataset

In [None]:
df = pd.read_csv('IMDB Dataset.csv') # read df
bert, batch_size, train_dataloader, train_y, test_y, train_seq, test_seq, train_mask, test_mask= load_data(df) # preprocess dataframe

### Training

In [None]:
# Train loop
for param in bert.parameters(): # freeze all the parameters, we are performing pre-training
    param.requires_grad = False

model = BERT_Bi_Arch(bert).to(device) # pass the pre-trained BERT to our defined architecture
optimizer = AdamW( # define the optimizer
    model.parameters(),
    lr = 5e-5, 
    eps = 1e-8
)

loss_function = nn.NLLLoss() # loss function

best_loss = float('inf') # set initial loss to infinite

epochs = 3
current = 1
train_loss_list = []

while current <= epochs: # for each epoch

    print(f'\nEpoch {current} / {epochs}:')

    train_loss, _ = train(model, loss_function, batch_size, train_dataloader, optimizer) # train model
    train_loss_list.append(train_loss)
    
    # evaluate model
    # valid_loss, _ = evaluate(model, loss_function, batch_size, val_dataloader, task)

    # save the best model
    if os.path.isfile('polarityBertBiLSTM.pth') == False:
        torch.save(model.state_dict(), 'polarityBertBiLSTM.pth')
        best_loss = train_loss_list[current-1]
        
    if len(train_loss_list) > 1:
        if train_loss_list[current-1] < best_loss:
            best_loss = train_loss_list[current-1]
            torch.save(model.state_dict(), 'polarityBertBiLSTM.pth')

                
    print(f'\n\nTraining Loss: {train_loss:.3f}')

    current = current + 1

else:
    # load weights of best model
    print("Loading polarity weights")
    model.load_state_dict(torch.load("polarityBertBiLSTM.pth"))
    print("Loaded polarity weights!")

  train_seq = torch.tensor(train_df['input_ids'].tolist())



Epoch 1 / 3:
NO1
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.02

Training Loss: 0.020

Epoch 2 / 3:
NO1
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01

Training Loss: 0.013

Epoch 3 / 3:
NO1
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01

Training Loss: 0.008
Loading polarity weights
Loaded polarity weights!

Predicting Results...


### Evaluation

Trainable parameters

In [6]:
# Get total trainable parameters for BERT-BiLSTM

bert = BertModel.from_pretrained('bert-base-uncased')
for param in bert.parameters(): # freeze all the parameters, we are performing pre-training
    param.requires_grad = False
model = BERT_Bi_Arch(bert).to(device) 
model.load_state_dict(torch.load("./stored_weights/polarityBertBiLSTM.pth"))
model.eval()

with torch.no_grad():
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"Total Trainable Parameters (BERT-BiLSTM): {pytorch_total_params}")

Total Trainable Parameters (BERT-BiLSTM): 111584514


Classification metric

In [24]:
# Model Evaluation
model = BERT_Bi_Arch(bert).to(device) 
model.load_state_dict(torch.load("../stored_weights/polarityBertBiLSTM.pth")) # load trained model
model.eval()

batch_size = 16
test_data = TensorDataset(test_seq, test_mask, test_y) # wrap tensors
test_dataloader = DataLoader(test_data, batch_size=batch_size) # dataLoader for train set

total_preds = []

start = time.time()
for i, batch in enumerate(test_dataloader):
    batch = [r.to(device) for r in batch] # push the batch to gpu
    sent_id, mask, labels = batch
    
    with torch.no_grad():
        preds = model(sent_id.to(device), mask.to(device))
        preds = preds.detach().cpu().numpy()

    preds = np.argmax(preds, axis=1)
    total_preds.append(preds) # append the model predictions
    
end = time.time()
print("Time taken to predict: ", end - start) # time taken for prediction
  
total_preds = np.concatenate(total_preds, axis=0)
precision, recall, fscore, _ = precision_recall_fscore_support(test_y, total_preds, average='macro') # calculate metrics
accuracy = accuracy_score(test_y, total_preds) # accuracy score

print(f"Accuracy: {accuracy}")
print(f"Precision: {precision}")
print(f"Recall: {recall}")
print(f"F1-score: {fscore}")
print(classification_report(test_y, total_preds))

Time taken to predict:  220.23912048339844
Accuracy: 0.9583333333333334
Precision: 0.9584075854988081
Recall: 0.9583324881481332
F1-score: 0.9583315908530592
              precision    recall  f1-score   support

           0       0.96      0.95      0.96      7499
           1       0.95      0.96      0.96      7501

    accuracy                           0.96     15000
   macro avg       0.96      0.96      0.96     15000
weighted avg       0.96      0.96      0.96     15000

