In [1]:
# Import Libraries
##!pip install transformers==3.0.0
import gc
import os
import time

import re
import string
import numpy as np
import pandas as pd

import torch
import torch.nn as nn
from sklearn.model_selection import train_test_split
from sklearn.metrics import classification_report, accuracy_score, precision_recall_fscore_support
from transformers import AutoModel
from transformers import BertModel, BertTokenizer, BertForSequenceClassification

import warnings
warnings.filterwarnings('ignore')

In [2]:
# Choose the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(device)

cpu


In [3]:
# Bert Model

# for binary class
class BERT_Bi_Arch(nn.Module):
    """BERT encoder -> bidirectional LSTM -> linear layer, for 2-class classification.

    The forward pass returns log-probabilities (LogSoftmax over dim 1), so the
    matching loss is ``nn.NLLLoss``.
    """

    def __init__(self, bert):
        """Build the classifier around an encoder.

        Args:
            bert: the encoder module to use. It is stored as ``self.bert`` so that
                anything the caller did to it (e.g. setting ``requires_grad=False``
                on its parameters) applies to this model. When ``None``, a fresh
                ``bert-base-uncased`` ``BertModel`` is loaded instead.
        """
        super(BERT_Bi_Arch, self).__init__()
        # Previously the argument was ignored and a second encoder was always
        # loaded, which silently discarded any freezing done by the caller.
        self.bert = bert if bert is not None else BertModel.from_pretrained('bert-base-uncased')

        self.softmax = nn.LogSoftmax(dim=1)
        self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(256*2, 2)

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch_size, 2).

        Args:
            sent_id: token ids passed to the encoder as its first argument.
            mask: attention mask passed to the encoder as ``attention_mask``.
        """
        # Index instead of tuple-unpacking: works both when the encoder returns a
        # plain tuple and when it returns a dict-like ModelOutput (unpacking the
        # latter would yield its string keys rather than tensors).
        encoder_output = self.bert(sent_id, attention_mask=mask)
        sequence_output = encoder_output[0]

        # sequence_output has the following shape: (batch_size, sequence_length, 768)
        lstm_output, _ = self.lstm(sequence_output)

        # Forward direction read at the last time step, backward direction read at
        # the first time step (i.e. after each direction has seen the whole sequence).
        # NOTE(review): if inputs are padded to a fixed length, the last time step
        # may be a padding position — confirm this is intended.
        hidden = torch.cat((lstm_output[:, -1, :256], lstm_output[:, 0, 256:]), dim=-1)
        linear_output = self.linear(hidden.view(-1, 256*2))
        return self.softmax(linear_output)
    

# for multiclass
class BERT_Multi_Arch(nn.Module):
    """BERT encoder -> bidirectional LSTM -> linear layer, for 3-class classification.

    The forward pass returns log-probabilities (LogSoftmax over dim 1), so the
    matching loss is ``nn.NLLLoss``.
    """

    def __init__(self, bert):
        """Build the classifier around an encoder.

        Args:
            bert: the encoder module to use. It is stored as ``self.bert`` so that
                anything the caller did to it (e.g. setting ``requires_grad=False``
                on its parameters) applies to this model. When ``None``, a fresh
                ``bert-base-uncased`` ``BertModel`` is loaded instead.
        """
        super(BERT_Multi_Arch, self).__init__()
        # Previously the argument was ignored and a second encoder was always
        # loaded, which silently discarded any freezing done by the caller.
        self.bert = bert if bert is not None else BertModel.from_pretrained('bert-base-uncased')

        self.softmax = nn.LogSoftmax(dim=1)
        self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(256*2, 3)

    def forward(self, sent_id, mask):
        """Return log-probabilities of shape (batch_size, 3).

        Args:
            sent_id: token ids passed to the encoder as its first argument.
            mask: attention mask passed to the encoder as ``attention_mask``.
        """
        # Index instead of tuple-unpacking: works both when the encoder returns a
        # plain tuple and when it returns a dict-like ModelOutput (unpacking the
        # latter would yield its string keys rather than tensors).
        encoder_output = self.bert(sent_id, attention_mask=mask)
        sequence_output = encoder_output[0]

        # sequence_output has the following shape: (batch_size, sequence_length, 768)
        lstm_output, _ = self.lstm(sequence_output)

        # Forward direction read at the last time step, backward direction read at
        # the first time step (i.e. after each direction has seen the whole sequence).
        # NOTE(review): if inputs are padded to a fixed length, the last time step
        # may be a padding position — confirm this is intended.
        hidden = torch.cat((lstm_output[:, -1, :256], lstm_output[:, 0, 256:]), dim=-1)
        linear_output = self.linear(hidden.view(-1, 256*2))
        return self.softmax(linear_output)

In [4]:
# Dataset Functions
def read_dataset(task, path='movieCorpus.csv'):
    """Load the corpus CSV and return the slice needed for ``task``.

    Rows with ``Test == 0`` form the training split and rows with ``Test == 1``
    the test split. Subjectivity labels are encoded OBJECTIVE -> 0,
    SUBJECTIVE -> 1; polarity labels NEGATIVE -> 0, POSITIVE -> 1, NEUTRAL -> 2.
    A polarity value of "None" (or a missing value) is kept as the string "None".

    Args:
        task: one of
            "s"           -> polarity data, restricted to SUBJECTIVE rows;
            "o"           -> subjectivity data, all rows;
            "getDataSubj" -> the test split as a DataFrame, both label columns encoded;
            "getDataPol"  -> the SUBJECTIVE test rows as a DataFrame, both label columns encoded.
        path: location of the corpus CSV (defaults to the file used originally).

    Returns:
        For "s" / "o": ``(train_texts, train_labels, test_texts, test_labels)``
        where texts are lists and labels are pandas Series.
        For "getDataSubj" / "getDataPol": a DataFrame with the columns
        ``body``, ``Auto_labeller_eval_subj`` and ``Auto_labeller_eval_pol``.

    Raises:
        ValueError: if ``task`` is not one of the four supported values.
        KeyError: if a label column holds a value outside the known label set.
    """
    valid_tasks = ("s", "o", "getDataSubj", "getDataPol")
    if task not in valid_tasks:
        # Previously an unknown task fell through and returned None, which only
        # surfaced later as a confusing unpacking error in the caller.
        raise ValueError(f"Unknown task {task!r}; expected one of {valid_tasks}")

    subj_col = 'Auto_labeller_eval_subj'
    pol_col = 'Auto_labeller_eval_pol'
    keep_col = ['body', subj_col, pol_col]

    group_subj = {"OBJECTIVE": 0, "SUBJECTIVE": 1}
    group_pol = {"NEGATIVE": 0, "POSITIVE": 1, "NEUTRAL": 2, "None": "None"}

    def encode_subj(frame):
        """Return the encoded subjectivity column without modifying ``frame``."""
        return frame[subj_col].apply(lambda x: group_subj[x])

    def encode_pol(frame):
        """Return the encoded polarity column without modifying ``frame``."""
        # read_csv may parse the literal text "None" as a missing value depending
        # on the pandas version; treat missing the same as the string "None".
        return frame[pol_col].apply(lambda x: "None" if pd.isna(x) else group_pol[x])

    df = pd.read_csv(path)
    train = df.loc[df['Test'] == 0]
    test = df.loc[df['Test'] == 1]

    if task in ("o", "getDataSubj"):
        # .copy() so the assignments below write to an independent frame.
        test_subj = test[keep_col].copy()
        test_subj[subj_col] = encode_subj(test_subj)

        if task == "getDataSubj":
            test_subj[pol_col] = encode_pol(test_subj)
            return test_subj

        train_subj = train[keep_col].copy()
        train_subj[subj_col] = encode_subj(train_subj)
        return train_subj['body'].tolist(), train_subj[subj_col],\
               test_subj['body'].tolist(), test_subj[subj_col]

    # Polarity tasks only use rows labelled SUBJECTIVE.
    test_pol = test.loc[test[subj_col] == "SUBJECTIVE", keep_col].copy()
    test_pol[pol_col] = encode_pol(test_pol)

    if task == "getDataPol":
        test_pol[subj_col] = encode_subj(test_pol)
        return test_pol

    train_pol = train.loc[train[subj_col] == "SUBJECTIVE", keep_col].copy()
    train_pol[pol_col] = encode_pol(train_pol)
    return train_pol['body'].tolist(), train_pol[pol_col],\
           test_pol['body'].tolist(), test_pol[pol_col]

def pre_process_dataset(values):
    """Return the items of ``values`` as a new list.

    No transformation is applied; every item is carried over as-is, in order.
    """
    return [item for item in values]


def data_process(data, labels, max_length=150, tokenizer=None):
    """Tokenize sentences to fixed-length id/mask arrays.

    Args:
        data: iterable of sentences; each one is passed to the tokenizer on its own.
        labels: label sequence; converted to a NumPy array and returned unchanged otherwise.
        max_length: length every sentence is padded / truncated to (default 150).
        tokenizer: callable with the Hugging Face tokenizer call signature. When
            ``None`` the ``bert-base-uncased`` ``BertTokenizer`` is loaded.

    Returns:
        ``(input_ids, attention_masks, labels)`` as NumPy arrays.
    """
    if tokenizer is None:
        tokenizer = BertTokenizer.from_pretrained("bert-base-uncased")

    input_ids = []
    attention_masks = []
    for sentence in data:
        # padding='max_length' already requests fixed-length padding; the legacy
        # pad_to_max_length flag was redundant (and is deprecated), so it is dropped.
        encoded = tokenizer(sentence, max_length=max_length,
                            padding='max_length',
                            truncation=True, return_token_type_ids=False)

        input_ids.append(encoded['input_ids'])
        attention_masks.append(encoded['attention_mask'])

    return np.asarray(input_ids), np.array(attention_masks), np.array(labels)


def load_and_process(task):
    """Read the dataset for ``task`` and tokenize both splits.

    Returns a 6-tuple: the (input ids, attention masks, labels) triple for the
    training split followed by the same triple for the test split.
    """
    raw_train, y_train, raw_test, y_test = read_dataset(task)

    # Each call yields (input_ids, attention_masks, labels) for one split.
    encoded_train = data_process(pre_process_dataset(raw_train), y_train)
    encoded_test = data_process(pre_process_dataset(raw_test), y_test)

    train_ids, train_masks, train_y = encoded_train
    test_ids, test_masks, test_y = encoded_test
    return train_ids, train_masks, train_y, test_ids, test_masks, test_y

In [5]:
# Train Model Function
def train(model, loss_function, batch_size, train_dataloader, task, optimizer):
    """Run one training epoch and return ``(avg_loss, predictions)``.

    Args:
        model: module called as ``model(sent_id, mask)``.
        loss_function: callable ``loss_function(preds, labels)`` returning a scalar.
            If it exposes ``reduction == 'sum'`` the batch loss is treated as a
            sum over samples; otherwise it is treated as a per-batch mean.
        batch_size: kept for interface compatibility; the actual number of samples
            in each batch is read from the batch itself.
        train_dataloader: iterable of ``(sent_id, mask, labels)`` batches that supports ``len()``.
        task: unused here; kept for interface compatibility.
        optimizer: optimizer whose ``step()`` is called once per batch.

    Returns:
        avg_loss: mean loss per training sample over the epoch.
        total_preds: NumPy array of the model outputs for every sample,
            shape (number of samples, number of classes).

    Raises:
        ValueError: if ``train_dataloader`` has no batches.
    """
    total = len(train_dataloader)
    if total == 0:
        raise ValueError("train_dataloader has no batches to train on")

    model.train()

    # Run on whichever device the model's parameters live on.
    try:
        model_device = next(model.parameters()).device
    except StopIteration:
        model_device = torch.device('cpu')

    sum_reduction = getattr(loss_function, 'reduction', 'mean') == 'sum'

    total_loss = 0.0   # running sum of per-sample losses
    seen = 0           # number of samples processed so far

    # empty list to save model predictions
    total_preds = []

    # iterate over batches
    for i, batch in enumerate(train_dataloader):

        step = i + 1
        percent = "{0:.2f}".format(100 * (step / float(total)))
        lossp = "{0:.2f}".format(total_loss / seen if seen else 0.0)
        filledLength = int(100 * step // total)
        bar = '█' * filledLength + '>' * (filledLength < 100) + '.' * (99 - filledLength)
        print(f'\rBatch {step}/{total} |{bar}| {percent}% complete, loss={lossp}', end='')

        # push the batch to the model's device
        sent_id, mask, labels = [r.to(model_device) for r in batch]

        # clear previously calculated gradients
        model.zero_grad()

        # get model predictions for the current batch
        preds = model(sent_id, mask)

        # Cast labels to int64 in place on their current device.
        # (.type(torch.LongTensor) would move them to the CPU and break GPU runs.)
        labels = labels.long()

        # compute the loss between actual and predicted values
        loss = loss_function(preds, labels)

        # Accumulate a per-sample total so the epoch average is exact even when
        # the last batch is smaller than the others.
        n_samples = labels.size(0)
        batch_loss = float(loss.item())
        total_loss += batch_loss if sum_reduction else batch_loss * n_samples
        seen += n_samples

        # backward pass to calculate the gradients
        loss.backward()

        # clip the gradients to 1.0. It helps in preventing the exploding gradient problem
        torch.nn.utils.clip_grad_norm_(model.parameters(), 1.0)

        # update parameters
        optimizer.step()

        # model predictions may live on the GPU, so detach and move them to the CPU
        total_preds.append(preds.detach().cpu().numpy())

    # Release cached memory once per epoch rather than once per batch.
    gc.collect()
    torch.cuda.empty_cache()

    # mean loss per sample over the epoch
    avg_loss = total_loss / seen

    # stack the per-batch outputs into (number of samples, no. of classes)
    total_preds = np.concatenate(total_preds, axis=0)

    # returns the loss and predictions
    return avg_loss, total_preds

In [7]:
def load_data(task, batch_size=32):
    """Load and tokenize the dataset for ``task`` and build the training DataLoader.

    Args:
        task: task code forwarded to ``load_and_process``.
        batch_size: batch size of the training DataLoader (default 32).

    Returns:
        A 9-tuple ``(bert, batch_size, train_dataloader, train_y, test_y,
        train_seq, test_seq, train_mask, test_mask)`` where ``bert`` is the
        pretrained ``bert-base-uncased`` encoder and the remaining items other
        than ``batch_size`` and the DataLoader are torch tensors.
    """
    from torch.utils.data import TensorDataset, DataLoader, RandomSampler

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Load Data-set ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    train_input_ids, train_attention_masks, train_labels,\
    test_input_ids, test_attention_masks, test_labels = load_and_process(task)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Import BERT Model ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    bert = AutoModel.from_pretrained('bert-base-uncased')

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Convert to tensors ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    # Build tensors straight from the NumPy arrays. Going through a DataFrame of
    # per-row arrays and back to lists is slow and adds nothing. dtype is pinned
    # to int64 so the result does not depend on the platform's default int width.
    train_seq = torch.tensor(train_input_ids, dtype=torch.long)
    train_mask = torch.tensor(train_attention_masks, dtype=torch.long)
    train_y = torch.tensor(train_labels.tolist())

    test_seq = torch.tensor(test_input_ids, dtype=torch.long)
    test_mask = torch.tensor(test_attention_masks, dtype=torch.long)
    test_y = torch.tensor(test_labels.tolist())

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Create DataLoaders ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    # wrap tensors
    train_data = TensorDataset(train_seq, train_mask, train_y)

    # sample training batches in random order
    train_sampler = RandomSampler(train_data)

    # dataLoader for train set
    train_dataloader = DataLoader(train_data, sampler=train_sampler, batch_size=batch_size)

    return bert, batch_size,\
           train_dataloader,\
           train_y, test_y,\
           train_seq, test_seq,\
           train_mask, test_mask

In [8]:
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Run Model Functions ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
# ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#

def _checkpoint_info(task):
    """Map a task code to (weights file name, label used in log messages)."""
    if task == 's':
        return 'polarityBertBiLSTM.pth', 'polarity'
    return 'subjectivityBertBiLSTM.pth', 'subjectivity'


def _predict_in_batches(model, seq, mask, batch_size):
    """Run ``model`` over (seq, mask) in chunks of ``batch_size`` rows and stack the outputs."""
    chunks = []
    with torch.no_grad():
        for start in range(0, seq.size(0), batch_size):
            stop = start + batch_size
            out = model(seq[start:stop].to(device), mask[start:stop].to(device))
            chunks.append(out.detach().cpu().numpy())
    return np.concatenate(chunks, axis=0)


def run_model(task='s', epochs=3):
    """Train (optionally) and evaluate the BERT-BiLSTM classifier for one task.

    Args:
        task: 's' for polarity (3 classes) or 'o' for subjectivity (2 classes).
        epochs: number of training epochs. With 0, no training happens and the
            previously saved weights for the task are loaded and evaluated.

    Returns:
        NumPy array with the predicted class index for every test sample.

    Raises:
        ValueError: if ``task`` is not 's' or 'o'.
        FileNotFoundError: if no weights file exists for the task when it is loaded.
    """
    if task not in ('s', 'o'):
        raise ValueError(f"Unknown task {task!r}; expected 's' or 'o'")

    weights_path, label = _checkpoint_info(task)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Load Data ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    bert, batch_size,\
    train_dataloader,\
    _, test_y,\
    _, test_seq,\
    _, test_mask = load_data(task)

    # ~~~~~~~~~~~~~~~~~~~~~~~~~~~~ Freeze BERT Parameters ~~~~~~~~~~~~~~~~~~~~~~~~~~~~#
    # NOTE(review): this only affects training if the architecture class actually
    # uses the `bert` instance handed to it — verify.
    for param in bert.parameters():
        param.requires_grad = False

    # pass the pre-trained BERT to our defined architecture
    if task == 's':
        model = BERT_Multi_Arch(bert)
    else:
        model = BERT_Bi_Arch(bert)

    # push the model to the selected device
    model = model.to(device)

    # torch.optim.AdamW replaces the deprecated transformers.AdamW. weight_decay=0.0
    # matches the transformers default (torch's own default is 0.01).
    optimizer = torch.optim.AdamW(
        model.parameters(),
        lr=5e-5,
        eps=1e-8,
        weight_decay=0.0
    )

    # loss function (the models output log-probabilities)
    loss_function = nn.NLLLoss()

    # Keep the weights of the epoch with the lowest training loss. Starting from
    # infinity guarantees the first epoch is always saved, so a stale checkpoint
    # from an earlier run can never be mistaken for this run's best model.
    best_loss = float('inf')

    for current in range(1, epochs + 1):
        print(f'\nEpoch {current} / {epochs}:')

        train_loss, _ = train(model, loss_function, batch_size, train_dataloader, task, optimizer)

        if train_loss < best_loss:
            best_loss = train_loss
            torch.save(model.state_dict(), weights_path)

        print(f'\n\nTraining Loss: {train_loss:.3f}')

    # Load the best weights (or the previously saved ones when epochs == 0).
    # map_location lets a checkpoint written on a GPU be loaded on a CPU-only machine.
    print(f"Loading {label} weights...")
    model.load_state_dict(torch.load(weights_path, map_location=device))
    print(f"Loaded {label} weights!")

    # get predictions for test data
    print("\nPredicting Results...")
    model.eval()

    gc.collect()
    torch.cuda.empty_cache()

    # Predict in batches so the whole test set never has to fit in memory at once.
    start = time.time()
    preds = _predict_in_batches(model, test_seq, test_mask, batch_size)
    end = time.time()
    print("Time taken to predict: ", end - start)

    # model performance on test dataset
    print("\nTest Performance for Task "+task+":")

    ## precision_recall_fscore_support:
    ### micro: balanced dataset
    ### macro: imbalanced dataset where all classes are equally important
    ### weighted: imbalanced dataset but want to assign greater contribution to classes with more examples in the dataset
    ### binary: binary classification - Only report results for the class specified by pos_label
    preds = np.argmax(preds, axis=1)
    y_true = test_y.cpu().numpy()
    precision, recall, fscore, _ = precision_recall_fscore_support(y_true, preds, average='macro')

    ## accuracy score
    accuracy = accuracy_score(y_true, preds)

    print(f"Accuracy: {accuracy}")
    print(f"Precision: {precision}")
    print(f"Recall: {recall}")
    print(f"F1-score: {fscore}")

    print(classification_report(y_true, preds))

    return preds

In [9]:
# run everything
# Order matters: later cells read preds_list[0] as the polarity ('s') predictions
# and preds_list[1] as the subjectivity ('o') predictions.
tasks_list = ['s','o']
preds_list = []

for task in tasks_list:
    # parameters:
    ## task: 's' for sentiment, 'o' for opinion (default 's')
    ## epochs: number of epochs when training model (default 3, to load saved weights enter 0)
    preds = run_model(task=task, epochs=0)
    preds_list.append(preds)

Loading polarity weights...
Loaded polarity weights!

Predicting Results...
Time taken to predict:  503.9818260669708

Test Performance for Task s:
Accuracy: 0.8840579710144928
Precision: 0.8597478791037672
Recall: 0.8674814116461974
F1-score: 0.8628865610573335
              precision    recall  f1-score   support

           0       0.94      0.91      0.93       493
           1       0.94      0.91      0.92       710
           2       0.70      0.78      0.74       315

    accuracy                           0.88      1518
   macro avg       0.86      0.87      0.86      1518
weighted avg       0.89      0.88      0.89      1518

Loading subjectivity weights...
Loaded subjectivity weights!

Predicting Results...
Time taken to predict:  728.5104732513428

Test Performance for Task o:
Accuracy: 0.9048498845265589
Precision: 0.9165374779003477
Recall: 0.8545496290775505
F1-score: 0.8780448455017773
              precision    recall  f1-score   support

           0       0.94      0

###  Save Results

In [10]:
# Display the predicted class indices collected for each task.
preds_list

[array([1, 1, 1, ..., 2, 2, 2], dtype=int64),
 array([1, 1, 1, ..., 0, 0, 0], dtype=int64)]

In [11]:
# preds_list[0] holds the output of the first task run ('s', polarity).
pred_pol = preds_list[0].tolist()
# Re-read the test rows the polarity predictions correspond to.
pol = read_dataset("getDataPol")

In [12]:
# Preview the polarity test rows before predictions are attached.
pol.head()

Unnamed: 0,body,Auto_labeller_eval_subj,Auto_labeller_eval_pol
19284,Serenity with Anne Hathaway and Resort to Love...,1,1
19286,Dammit I kept saying she reminded me of someon...,1,1
19288,"No it's CGI, like Dude, in Free Guy.",1,1
19290,Right? or that Jungle Cruise redefined some ge...,1,2
19292,"Oh yeah, bringing the birthday cake to him whi...",1,0


In [13]:
# Attach predictions as a new column.
# NOTE(review): assumes pred_pol is in the same row order as `pol` — confirm.
pol['Classification Results'] = pred_pol
pol.head()

Unnamed: 0,body,Auto_labeller_eval_subj,Auto_labeller_eval_pol,Classification Results
19284,Serenity with Anne Hathaway and Resort to Love...,1,1,1
19286,Dammit I kept saying she reminded me of someon...,1,1,1
19288,"No it's CGI, like Dude, in Free Guy.",1,1,1
19290,Right? or that Jungle Cruise redefined some ge...,1,2,2
19292,"Oh yeah, bringing the birthday cake to him whi...",1,0,0


In [14]:
# Write the polarity rows with their predictions to disk (row index is not written).
pol.to_csv("BertBiLSTM_polarityClassificationResults.csv", index = False)

In [15]:
# preds_list[1] holds the output of the second task run ('o', subjectivity).
pred_subj = preds_list[1].tolist()
# Re-read the test rows the subjectivity predictions correspond to.
subj = read_dataset("getDataSubj")

In [16]:
# Preview the subjectivity test rows before predictions are attached.
subj.head()

Unnamed: 0,body,Auto_labeller_eval_subj,Auto_labeller_eval_pol
19284,Serenity with Anne Hathaway and Resort to Love...,1,1
19286,Dammit I kept saying she reminded me of someon...,1,1
19288,"No it's CGI, like Dude, in Free Guy.",1,1
19290,Right? or that Jungle Cruise redefined some ge...,1,2
19292,"Oh yeah, bringing the birthday cake to him whi...",1,0


In [17]:
# Attach predictions as a new column.
# NOTE(review): assumes pred_subj is in the same row order as `subj` — confirm.
subj['Classification Results'] = pred_subj

In [18]:
# Write the subjectivity rows with their predictions to disk (row index is not written).
subj.to_csv("BertBiLSTM_subjectivityClassificationResults.csv", index = False)

### References

- https://github.com/Rachel-loo/BERT-BiLSTM/blob/main/models/bert_BiLSTM.py