## BART

In [1]:
#import libraries
import gc
import os
import time
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader, RandomSampler

from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support

import transformers
from transformers import pipeline, AdamW, BartForSequenceClassification, BartTokenizer, BartModel

In [5]:
def set_seed(seed = 0): # Define seed for reproducability
    '''
    set random seed
    '''
    # random.seed(seed)
    torch.manual_seed(seed)
    np.random.seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True

set_seed(42)
device = 'cuda'

### Helper Functions

In [3]:
# BART model
class BARTModel(nn.Module):
  def __init__(self):
    super(BARTModel, self).__init__()
    self.bart = BartModel.from_pretrained("facebook/bart-base")
    self.bart_drop = nn.Dropout(0.3)
    self.out = nn.Linear(768, 1)

  def forward(self, input_ids, attention_mask):
    bart_outputs = self.bart(input_ids=input_ids, attention_mask=attention_mask)
    hidden_states = bart_outputs[0]  
    drop = self.bart_drop(hidden_states[:,0])  
    output = self.out(drop)

    return output

In [None]:
# process data into train test sets of tokens
def read_dataset(df): 
    X_train, X_test, y_train, y_test = train_test_split(df['review'], df['sentiment'], test_size=0.3, shuffle=True) # train test split of 0.3

    # map labels to integers
    polarity_class = {"negative":0, "positive":1} # binary classes
    y_train = y_train.apply(lambda x: polarity_class[x])
    y_test = y_test.apply(lambda x: polarity_class[x])

    return X_train.tolist(), y_train, X_test.tolist(), y_test

# convert series to list
def pre_process_dataset(values):
    new_values = list()
    for value in values:
        new_values.append(value)
    return new_values

# tokenize data to input_ids, attention_masks
def data_process(data, labels):
    input_ids = []
    attention_masks = []

    tokenizer = BartTokenizer.from_pretrained('facebook/bart-base', do_lower_case = True)

    for sentence in data:
        inp = tokenizer.encode_plus(sentence, max_length=150,
                                     padding='max_length',
                                     pad_to_max_length=True,
                                     truncation=True, return_token_type_ids=False)

        input_ids.append(inp['input_ids'])
        attention_masks.append(inp['attention_mask'])

    input_ids = np.asarray(input_ids)
    attention_masks = np.array(attention_masks)
    labels = np.array(labels)
    return input_ids, attention_masks, labels

# tokenize sentence inputs and generate attention masks
def load_and_process(df):
    train_data, train_labels, test_data, test_labels = read_dataset(df)

    train_input_ids, train_attention_masks, train_labels = data_process(pre_process_dataset(train_data), train_labels)
    test_input_ids, test_attention_masks, test_labels = data_process(pre_process_dataset(test_data), test_labels)

    return train_input_ids, train_attention_masks, train_labels,\
           test_input_ids, test_attention_masks, test_labels

# preprocesses and return data in terms of ids, masks and labels
def load_data(df):
    # load dataset
    train_input_ids, train_attention_masks, train_labels,\
    test_input_ids, test_attention_masks, test_labels = load_and_process(df)

    train_df = pd.DataFrame(list(zip(train_input_ids, train_attention_masks)), columns=['input_ids', 'attention_masks'])
    test_df = pd.DataFrame(list(zip(test_input_ids, test_attention_masks)), columns=['input_ids', 'attention_masks'])

    # import Model
    model = BARTModel()

    # Tokenization
    # for train set
    train_seq = torch.tensor(train_df['input_ids'].tolist())
    train_mask = torch.tensor(train_df['attention_masks'].tolist())
    train_y = torch.tensor(train_labels.tolist())

    # for test set
    test_seq = torch.tensor(test_df['input_ids'].tolist())
    test_mask = torch.tensor(test_df['attention_masks'].tolist())
    test_y = torch.tensor(test_labels.tolist())

    # Create DataLoaders
    batch_size = 16
    train_data = TensorDataset(train_seq, train_mask, train_y) # wrap tensors
    train_sampler = RandomSampler(train_data) # sampler for sampling the data during training
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size) # dataLoader for train set

    return model, batch_size, train_dataloader, train_y, test_y, train_seq, test_seq, train_mask, test_mask

# process BART models outputs
def preprocess_output(self, outputs):
  outputs = torch.sigmoid(outputs).cpu().detach().numpy()
  return outputs[0][0]

# model loss function
def loss_fn(outputs, labels):
  return nn.BCEWithLogitsLoss()(outputs, labels.view(-1, 1))

In [6]:
# Train Model 
def train(model, batch_size, train_dataloader, optimizer):
    model.train()

    total_loss = 0

    total_preds = []
    total_labels = []

    # iterate over batches
    total = len(train_dataloader)
    for i, batch in enumerate(train_dataloader):

        step = i+1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss/(total*batch_size))
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>'  *(filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='') # accuracy={total_accuracy}

        batch = [r.to(device) for r in batch] # push the batch to gpu
        sent_id, mask, labels = batch
        del batch
        gc.collect()
        torch.cuda.empty_cache()

        model.zero_grad()  
        preds = model(sent_id, mask)  
        labels = labels.type(torch.float)  

        loss = loss_fn(preds, labels)
        total_loss += loss.item()

        loss.backward()  

        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0) # clip the the gradients to 1.0, preventis the exploding gradient problem
        optimizer.step() # update parameters

    gc.collect()
    torch.cuda.empty_cache()

    avg_loss = total_loss / (len(train_dataloader)*batch_size) # compute the training loss of the epoch

    return avg_loss

### Load and Preprocess Dataset

In [7]:
df = pd.read_csv('IMDB Dataset.csv') # read df
bart, batch_size, train_dataloader, train_y, test_y, train_seq, test_seq, train_mask, test_mask= load_data(df) # preprocess dataframe

  train_seq = torch.tensor(train_df['input_ids'].tolist())


In [None]:
model = bart.to(device)
model.train()
# define the optimizer
optimizer = AdamW(
    model.parameters(),
    lr = 5e-5,
    eps = 1e-8
)

best_loss = 0.02 #float('inf') # set initial loss to infinite


epochs = 2
current = 1
train_loss_list = [0.02]

while current <= epochs: # for each epoch

    print(f'\nEpoch {current} / {epochs}:')

    train_loss = train(model, batch_size, train_dataloader, optimizer) # train model
    train_loss_list.append(train_loss) # store train loss

    # evaluate model
    # valid_loss, _ = evaluate(model, loss_function, batch_size, val_dataloader, task)

    # save the best model based on train loss
    if os.path.isfile('/content/BART_1.pth') == False:
        torch.save(model.state_dict(), '/content/bart.pth')
        best_loss = train_loss_list[current-1]

    if len(train_loss_list) > 1:
        if train_loss_list[current-1] < best_loss:
            best_loss = train_loss_list[current-1]
            torch.save(model.state_dict(), '/content/bart.pth')


    print(f'\n\nTraining Loss: {train_loss:.3f}')

    current = current + 1

    gc.collect()
    torch.cuda.empty_cache()

else:
    # load weights of best model
    print("Loading polarity weights")
    model.load_state_dict(torch.load("/content/bart.pth"))
    print("Loaded polarity weights!")

gc.collect()
torch.cuda.empty_cache()




Epoch 1 / 2:
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01

Training Loss: 0.015

Epoch 2 / 2:
Batch 2188/2188 |████████████████████████████████████████████████████████████████████████████████████████████████████| 100.00% complete, loss=0.01

Training Loss: 0.010
Loading polarity weights
Loaded polarity weights!


In [9]:
model = BARTModel().to(device) # initialize model
model.load_state_dict(torch.load("./stored_weights/bart_3rd_epoch.pth")) # load model
model.eval()

batch_size = 16
test_data = TensorDataset(test_seq, test_mask, test_y) # wrap tensors
test_dataloader = DataLoader(test_data, batch_size=batch_size) # dataLoader for train set

total_preds = []
true_labels = []

start = time.time()

with torch.no_grad():
  for i, batch in enumerate(test_dataloader):
      batch = [r.to(device) for r in batch] # push the batch to gpu
      sent_id, mask, labels = batch
      labels = labels.to(device, dtype = torch.float)
      
      outputs = model(sent_id, attention_mask=mask) # perform prediction

      true_labels.extend(labels.cpu().detach().numpy().tolist())
      total_preds.extend(torch.sigmoid(outputs).cpu().detach().numpy().tolist())

end = time.time()

time_test = end - start
print("Test time:", time_test)

Test time: 209.95807647705078


Trainable Parameter

In [6]:
model = BARTModel().to(device)
model.load_state_dict(torch.load("./stored_weights/bart_3rd_epoch.pth"))
model.eval()

with torch.no_grad():
    pytorch_total_params = sum(p.numel() for p in model.parameters() if p.requires_grad) # get total trainable params

print(f"Total Trainable Parameters (BART): {pytorch_total_params}")

Total Trainable Parameters (BART): 139421185


Classification metric

In [43]:
valid_outputs = np.array(total_preds) >= 0.5 # convert preds to 0 or 1 based on threshold
print(classification_report(true_labels, valid_outputs))

              precision    recall  f1-score   support

         0.0       0.87      0.91      0.89      7411
         1.0       0.91      0.87      0.89      7589

    accuracy                           0.89     15000
   macro avg       0.89      0.89      0.89     15000
weighted avg       0.89      0.89      0.89     15000

