In [1]:
!pip install transformers



In [2]:
from google.colab import drive
 
# Attach Google Drive to the Colab file system, then record the Drive folder
# that every later cell prefixes to its file and model paths.
drive.mount('/content/drive')
current_directory = '/content/drive/My Drive/FSem/'

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


Make sure the current directory contains the test sets and the models.

Models:

ClozeOnly: https://drive.google.com/drive/folders/119WnpHBmM637M0SVk3buy-KPI5aWpsF0?usp=sharing

RocOnly: ...still training

Cloze + 5000 Roc: https://drive.google.com/drive/folders/1-XfWuEsxEAKUby35Zz_y9zEo6kyDuSRF?usp=sharing

# Headers and Global Variables

In [3]:
import csv
import torch
from torch.nn.functional import softmax
from transformers import BertForNextSentencePrediction, BertTokenizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report

# Run on the first CUDA device when one is available, otherwise on the CPU.
if torch.cuda.is_available():
    device = "cuda:0"
else:
    device = "cpu"

# Tokenizer shared by every evaluation below.
tokenizer = BertTokenizer.from_pretrained(
    'bert-large-uncased-whole-word-masking'
)

# Datasets

In [4]:
class ClozeTest(torch.utils.data.Dataset):
    """Cloze test file flattened into (story start, ending) pairs.

    The CSV file is read from ``current_directory + filename``. Its first row
    is treated as a header and discarded. In every other row the first column
    is ignored, the last column names the right ending ("1" or "2"), the two
    columns before it are the candidate endings, and everything in between is
    joined with single spaces to form the story start.

    Each row produces two consecutive samples, one per ending:
    ``([start, ending], label)`` with label 0 when that ending is the right
    one and 1 otherwise.

    Raises:
        ValueError: if a non-blank row has fewer than three columns or its
            last column is neither "1" nor "2" (after stripping whitespace).
    """

    def __init__(self, filename):
        self.data = []
        self.labels = []

        # newline='' hands newline processing to the csv module, which is what
        # its documentation asks for (keeps newlines inside quoted fields intact).
        with open(current_directory + filename, 'r', encoding='utf-8',
                  newline='') as d:
            reader = csv.reader(d, quotechar='"', delimiter=',',
                                quoting=csv.QUOTE_ALL, skipinitialspace=True)

            # Drop the header row; the default keeps an empty file from raising.
            next(reader, None)

            for sample in reader:
                # csv.reader yields an empty list for a blank line.
                if not sample:
                    continue
                if len(sample) < 3:
                    raise ValueError(
                        "%s, line %d: expected at least 3 columns, got %d"
                        % (filename, reader.line_num, len(sample)))

                start = " ".join(sample[1:-3])
                end1 = sample[-3]
                end2 = sample[-2]
                # Tolerate stray whitespace around the answer column.
                right_ending = sample[-1].strip()

                # Any other value would silently label BOTH endings as wrong.
                if right_ending not in ("1", "2"):
                    raise ValueError(
                        "%s, line %d: right ending must be '1' or '2', got %r"
                        % (filename, reader.line_num, sample[-1]))

                # Keep each sample a two-element list: test() indexes the
                # collated batch as stories[0] / stories[1].
                self.data.append([start, end1])
                self.labels.append(0 if right_ending == "1" else 1)

                self.data.append([start, end2])
                self.labels.append(0 if right_ending == "2" else 1)

    def __getitem__(self, idx):
        """Return ``([start, ending], label)`` for the sample at ``idx``."""
        X = self.data[idx]
        y = self.labels[idx]
        return X, y

    def __len__(self):
        """Return the number of (start, ending) samples (two per CSV row)."""
        assert len(self.data) == len(self.labels)
        return len(self.labels)

In [5]:
# Build the three test sets: original triggers, triggers removed, and
# triggers replaced by synonyms (one CSV file each).
(
    triggers_only_set,
    triggers_removed_only_set,
    triggers_synonymized_only_set,
) = map(
    ClozeTest,
    (
        'cloze_test_triggers_only.csv',
        'cloze_test_triggers_removed_only.csv',
        'cloze_test_triggers_synonymized_only.csv',
    ),
)

# Functions for Testing

In [6]:
def test(model, dataset, verbose=False, batch_size=10):
    """Evaluate a sentence-pair classifier on ``dataset`` and print a report.

    The model is moved to ``device`` and put in evaluation mode. The dataset
    is then run through it in batches, and scikit-learn's
    ``classification_report`` of true labels vs. predictions is printed.

    Args:
        model: model whose output exposes ``.logits`` with one row per pair
            and one column per class (e.g. ``BertForNextSentencePrediction``).
        dataset: dataset yielding ``([start, ending], label)`` samples, where
            label 0 marks the right ending (as ``ClozeTest`` produces).
        verbose: if True, print for every pair the decoded input, the
            probability (in percent) of class 0, and whether the prediction
            and the true label say "right ending".
        batch_size: number of pairs per forward pass (default 10).

    Returns:
        None. The report is printed, not returned.
    """
    # Send to GPU and switch to evaluation mode
    model = model.to(device)
    model.eval()

    devloader = torch.utils.data.DataLoader(dataset, batch_size=batch_size)

    pred_list, label_list = [], []

    # Inference only: without no_grad() every forward pass would record an
    # autograd graph that is never used.
    with torch.no_grad():
        for stories, labels in devloader:
            start = stories[0]
            end = stories[1]

            # Tokenize sentence pairs. All sequences in a batch must have the
            # same length, so shorter ones are padded with [PAD] tokens.
            tokenized_batch = tokenizer(start, padding=True, text_pair=end,
                                        return_tensors='pt').to(device)

            # Labels are deliberately not passed to the model: they would only
            # be used to compute a loss that this function never reads.
            logits = model(**tokenized_batch).logits

            # Predicted class = index of the larger logit. Index 0 is the
            # "right ending" class (see the label convention above).
            predictions = logits.argmax(dim=1)

            if verbose:
                probs = torch.softmax(logits, dim=1).cpu()
                # iterate over elements in batch
                for i, element_input_ids in enumerate(tokenized_batch.input_ids):
                    print(tokenizer.decode(element_input_ids))
                    print("Probability:", probs[i][0].item() * 100)
                    # Class 0 means "right ending", so compare against 0;
                    # bool(class_index) would print the opposite of what the
                    # probability line above refers to.
                    print("Predicted: ", predictions[i].item() == 0)
                    print("True label: ", labels[i].item() == 0)

            pred_list.extend(predictions.tolist())
            label_list.extend(labels.tolist())

    print(classification_report(label_list, pred_list))

In [7]:
def test_model(model):
    print("With triggers:")
    test(model, triggers_only_set)
    print("Triggers removed:")
    test(model, triggers_removed_only_set)
    print("Triggers synonymized:")
    test(model, triggers_synonymized_only_set)

# Cloze Only

In [8]:
# Load the checkpoint stored under 'shared/bertfornsp_clozeonly_finetuned10'
# in the Drive folder and score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'shared/bertfornsp_clozeonly_finetuned10'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.80      0.89      0.84       174
           1       0.88      0.78      0.82       174

    accuracy                           0.83       348
   macro avg       0.84      0.83      0.83       348
weighted avg       0.84      0.83      0.83       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.79      0.82      0.81       174
           1       0.82      0.79      0.80       174

    accuracy                           0.80       348
   macro avg       0.80      0.80      0.80       348
weighted avg       0.80      0.80      0.80       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.80      0.87      0.83       174
           1       0.86      0.79      0.82       174

    accuracy                           0.83       348
   macro avg       0.83      0.83      0.83       348
weighted avg       

# Roc Only

In [9]:
# Load the checkpoint stored under 'shared/bertfornsp_roc_finetuned'
# in the Drive folder and score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'shared/bertfornsp_roc_finetuned'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.16      0.03      0.05       174
           1       0.47      0.85      0.60       174

    accuracy                           0.44       348
   macro avg       0.31      0.44      0.33       348
weighted avg       0.31      0.44      0.33       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.19      0.03      0.06       174
           1       0.47      0.85      0.60       174

    accuracy                           0.44       348
   macro avg       0.33      0.44      0.33       348
weighted avg       0.33      0.44      0.33       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.19      0.03      0.06       174
           1       0.47      0.85      0.60       174

    accuracy                           0.44       348
   macro avg       0.33      0.44      0.33       348
weighted avg       

# 1. Roc 2. Cloze

In [10]:
# Load the checkpoint stored under 'shared/bertfornsp_cloze_finetuned10'
# in the Drive folder and score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'shared/bertfornsp_cloze_finetuned10'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.83      0.93      0.88       174
           1       0.92      0.80      0.86       174

    accuracy                           0.87       348
   macro avg       0.87      0.87      0.87       348
weighted avg       0.87      0.87      0.87       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.82      0.84      0.83       174
           1       0.84      0.82      0.83       174

    accuracy                           0.83       348
   macro avg       0.83      0.83      0.83       348
weighted avg       0.83      0.83      0.83       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.83      0.90      0.87       174
           1       0.89      0.82      0.85       174

    accuracy                           0.86       348
   macro avg       0.86      0.86      0.86       348
weighted avg       

# 1. Cloze 2. Roc

In [11]:
# Load the checkpoint stored under 'alternative/trained_model'
# in the Drive folder and score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'alternative/trained_model'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.54      0.99      0.70       174
           1       0.93      0.16      0.27       174

    accuracy                           0.57       348
   macro avg       0.74      0.57      0.48       348
weighted avg       0.74      0.57      0.48       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.54      0.98      0.69       174
           1       0.87      0.16      0.26       174

    accuracy                           0.57       348
   macro avg       0.70      0.57      0.48       348
weighted avg       0.70      0.57      0.48       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.54      0.98      0.70       174
           1       0.90      0.16      0.26       174

    accuracy                           0.57       348
   macro avg       0.72      0.57      0.48       348
weighted avg       

# 1. Roc 2. Cloze with Squared Error Loss, Hypothesis Only data with target probability (0.5, 0.5)

In [12]:
# Load the checkpoint stored under
# 'shared/bertfornsp_cloze_finetuned_bias_reduced10' in the Drive folder and
# score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'shared/bertfornsp_cloze_finetuned_bias_reduced10'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.86      0.89      0.87       174
           1       0.88      0.85      0.87       174

    accuracy                           0.87       348
   macro avg       0.87      0.87      0.87       348
weighted avg       0.87      0.87      0.87       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.84      0.83      0.83       174
           1       0.83      0.84      0.84       174

    accuracy                           0.84       348
   macro avg       0.84      0.84      0.84       348
weighted avg       0.84      0.84      0.84       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.84      0.87      0.86       174
           1       0.87      0.83      0.85       174

    accuracy                           0.85       348
   macro avg       0.85      0.85      0.85       348
weighted avg       

# Cloze with 5 000 Roc Stories mixed in

In [13]:
# Load the checkpoint stored under 'shared/bertfornsp_mixed5'
# in the Drive folder and score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'shared/bertfornsp_mixed5'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.78      0.93      0.85       174
           1       0.91      0.74      0.81       174

    accuracy                           0.83       348
   macro avg       0.84      0.83      0.83       348
weighted avg       0.84      0.83      0.83       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.80      0.90      0.85       174
           1       0.88      0.78      0.83       174

    accuracy                           0.84       348
   macro avg       0.84      0.84      0.84       348
weighted avg       0.84      0.84      0.84       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.78      0.93      0.85       174
           1       0.91      0.74      0.81       174

    accuracy                           0.83       348
   macro avg       0.84      0.83      0.83       348
weighted avg       

# Cloze with 10 000 Roc Stories mixed in

In [14]:
# Load the checkpoint stored under 'shared/bertfornsp_mixed_more_roc5'
# in the Drive folder and score it on all three trigger test sets.
model = BertForNextSentencePrediction.from_pretrained(
    current_directory + 'shared/bertfornsp_mixed_more_roc5'
)
test_model(model)

With triggers:
              precision    recall  f1-score   support

           0       0.76      0.96      0.85       174
           1       0.95      0.70      0.81       174

    accuracy                           0.83       348
   macro avg       0.85      0.83      0.83       348
weighted avg       0.85      0.83      0.83       348

Triggers removed:
              precision    recall  f1-score   support

           0       0.77      0.91      0.83       174
           1       0.89      0.73      0.80       174

    accuracy                           0.82       348
   macro avg       0.83      0.82      0.82       348
weighted avg       0.83      0.82      0.82       348

Triggers synonymized:
              precision    recall  f1-score   support

           0       0.74      0.97      0.84       174
           1       0.95      0.67      0.78       174

    accuracy                           0.82       348
   macro avg       0.85      0.82      0.81       348
weighted avg       