In [1]:
# Install the Hugging Face transformers package into the runtime.
# NOTE(review): the version is not pinned; the recorded output below resolved
# to transformers 4.4.2 — consider pinning for reproducibility.
!pip install transformers

Collecting transformers
[?25l  Downloading https://files.pythonhosted.org/packages/ed/d5/f4157a376b8a79489a76ce6cfe147f4f3be1e029b7144fa7b8432e8acb26/transformers-4.4.2-py3-none-any.whl (2.0MB)
[K     |████████████████████████████████| 2.0MB 18.5MB/s 
[?25hCollecting tokenizers<0.11,>=0.10.1
[?25l  Downloading https://files.pythonhosted.org/packages/71/23/2ddc317b2121117bf34dd00f5b0de194158f2a44ee2bf5e47c7166878a97/tokenizers-0.10.1-cp37-cp37m-manylinux2010_x86_64.whl (3.2MB)
[K     |████████████████████████████████| 3.2MB 52.5MB/s 
Collecting sacremoses
[?25l  Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)
[K     |████████████████████████████████| 890kB 55.9MB/s 
Building wheels for collected packages: sacremoses
  Building wheel for sacremoses (setup.py) ... [?25l[?25hdone
  Created wheel for sacremoses: filename=sacremoses-0.0.43-cp37-none-any.whl size=893262 sha256=84b7

In [2]:
from google.colab import drive
 
# NOTE(review): current_directory is not referenced by any code visible in this
# notebook; later cells build their paths from modelpath/datapath instead —
# confirm whether those were meant to be derived from this folder.
current_directory = '/content/drive/My Drive/Semantics/BERT_ROC/'
# Mount Google Drive at /content/drive.
drive.mount('/content/drive')

Mounted at /content/drive


Make sure the current directory contains the test sets and the models.

Models:

ClozeOnly: https://drive.google.com/drive/folders/119WnpHBmM637M0SVk3buy-KPI5aWpsF0?usp=sharing

RocOnly: https://drive.google.com/drive/folders/1eVdGH5DVEDCrGMSc9thTwJo18btzXras?usp=sharing

Cloze + 5000 Roc: https://drive.google.com/drive/folders/1-XfWuEsxEAKUby35Zz_y9zEo6kyDuSRF?usp=sharing

# Headers and Global Variables

In [3]:
import csv
import torch
from torch.nn.functional import softmax
from torch.nn.functional import relu
from transformers import BertForNextSentencePrediction, BertTokenizer
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from tqdm import tqdm
from transformers import AdamW
from transformers import get_linear_schedule_with_warmup
import matplotlib
from matplotlib import pyplot as plt
from matplotlib import pyplot as plt
from IPython.display import display, HTML

import os
import sys
from pathlib import Path

# Project root = parent of the directory that contains sys.argv[0].
# NOTE(review): inside a notebook kernel sys.argv[0] is usually the kernel
# launcher rather than this notebook — confirm that models/ and datasets/
# resolve to the intended folders.
project_path = Path(os.path.dirname(os.path.realpath(sys.argv[0]))).parent
# Both paths end with "/" because callers concatenate file names directly.
modelpath = str(project_path.joinpath('models')) + "/"
datapath = str(project_path.joinpath('datasets')) + "/"

# Use the first GPU when available, otherwise fall back to CPU.
device = "cuda:0" if torch.cuda.is_available() else "cpu"

# Base names of fine-tuned checkpoints (an epoch suffix is appended by getModelFileName).
CLOZE_MODEL = 'bertfornsp_clozeonly_finetuned'
ROC_MODEL = 'bertfornsp_roc_finetuned'
# underlying pretrained LM
BASE_MODEL = 'bert-large-uncased-whole-word-masking'

# Default hyper-parameters of the training functions below.
BATCH_SIZE = 12
WARMUP_EPOCHS = 1
TRAIN_EPOCHS = 10
# -1 = start from scratch (training loops iterate from last_epoch + 1).
LAST_EPOCH = -1

# Datasets

In [4]:
class RocStories(torch.utils.data.Dataset):
    """ROCStories turned into a binary "is this the right ending?" dataset.

    Every row of ``roc_stories.csv`` yields two samples:

    * ``[context, true ending]`` with label ``0``
    * ``[context, ending of another story]`` with label ``1``

    The context is columns ``2 .. -2`` of the row joined by spaces; the ending
    is the last column.

    NOTE(review): unlike ClozeTest, no header row is dropped here — confirm
    that roc_stories.csv does not start with a header line.

    :param short: If truthy, only the first ``SHORT_SIZE`` rows are used.
    """

    # Number of CSV rows kept when ``short`` is requested.
    SHORT_SIZE = 5000

    def __init__(self, short = False):
        from random import shuffle

        with open(datapath + 'roc_stories.csv',
                  'r', encoding='utf-8') as d:
            reader = csv.reader(d, quotechar='"', delimiter=',',
                                quoting=csv.QUOTE_ALL, skipinitialspace=True)
            dataset = list(reader)

        if short:
            dataset = dataset[:self.SHORT_SIZE]

        stories = [" ".join(sample[2:-1]) for sample in dataset]
        endings = [sample[-1] for sample in dataset]
        assert len(stories) == len(endings)

        # Negative endings are the true endings of other stories, drawn by
        # shuffling. A plain shuffle may leave an ending on its own story,
        # which would label a true ending as wrong, so those are repaired.
        wrong_endings = endings.copy()
        shuffle(wrong_endings)
        self._separate_wrong_endings(endings, wrong_endings)

        self.data = []
        self.labels = []
        for story, ending, wrong_ending in zip(stories, endings, wrong_endings):
            # True ending
            self.data.append([story, ending])
            self.labels.append(0)

            # Wrong ending
            self.data.append([story, wrong_ending])
            self.labels.append(1)

    @staticmethod
    def _separate_wrong_endings(endings, wrong_endings):
        """Swap entries of ``wrong_endings`` in place until none equals the true ending at its index.

        Best effort: if no valid swap partner exists (e.g. a single story, or
        all endings identical) the entry is left unchanged.
        """
        n = len(endings)
        for i in range(n):
            if wrong_endings[i] != endings[i]:
                continue
            for offset in range(1, n):
                j = (i + offset) % n
                # The swap must not create a collision at either position.
                if wrong_endings[j] != endings[i] and wrong_endings[i] != endings[j]:
                    wrong_endings[i], wrong_endings[j] = wrong_endings[j], wrong_endings[i]
                    break

    def __getitem__(self, idx):
        """Return ``([context, ending], label)`` for sample ``idx``."""
        X = self.data[idx]
        y = self.labels[idx]
        return X, y

    def __len__(self):
        """Number of samples (two per story)."""
        assert len(self.data) == len(self.labels)
        return len(self.labels)

In [5]:
class ClozeTest(torch.utils.data.Dataset):
    """Story Cloze data as independent (context, ending) pairs.

    Each CSV row (after the header row, which is dropped) produces two
    samples, one per candidate ending. A sample is ``[context, ending]`` and
    its label is ``0`` when that ending is the one named by the last column
    ("1" or "2"), otherwise ``1``.
    """

    def __init__(self, dev=True, hypothesis_only=False, file = None):
        """
        :param dev: When no ``file`` is given, load 'cloze_test.csv' if True, else 'cloze_train.csv'.
        :param hypothesis_only: Replaces story with empty string. Only keeps endings as they are.
        :param file: Optional file name inside ``datapath``; takes precedence over ``dev``.
        """
        # Resolve which CSV to read; an explicit file name wins over dev.
        if file is not None:
            csv_path = datapath + file
        elif dev:
            csv_path = datapath + 'cloze_test.csv'
        else:
            csv_path = datapath + 'cloze_train.csv'

        with open(csv_path, 'r', encoding='utf-8') as handle:
            rows = list(csv.reader(handle, quotechar='"', delimiter=',',
                                   quoting=csv.QUOTE_ALL, skipinitialspace=True))
            # The first row is the header.
            rows.pop(0)

        self.data = []
        self.labels = []

        for row in rows:
            # Column 0 is skipped; the last three columns are ending 1, ending 2, answer.
            context = " ".join(row[1:-3])
            if hypothesis_only:
                context = ""
            candidates = (("1", row[-3]), ("2", row[-2]))
            answer = row[-1]

            for tag, ending in candidates:
                self.data.append([context, ending])
                self.labels.append(0 if tag == answer else 1)

    def __getitem__(self, idx):
        """Return ``([context, ending], label)`` for sample ``idx``."""
        return self.data[idx], self.labels[idx]

    def __len__(self):
        """Number of samples (two per CSV row)."""
        assert len(self.data) == len(self.labels)
        return len(self.labels)

In [7]:
class ClozeTest_MC(torch.utils.data.Dataset):
    """Story Cloze data in multiple-choice form: one sample per story.

    A sample is ``[context, ending1, ending2]``; its label is ``0`` when the
    last CSV column is "1" (first ending is right), otherwise ``1``.
    """

    def __init__(self, dev=True,  hypothesis_only=False, file = None):
        """
        :param dev: When no ``file`` is given, load 'cloze_test.csv' if True, else 'cloze_train.csv'.
        :param hypothesis_only: Replaces the story context with an empty string.
        :param file: Optional file name inside ``datapath``; takes precedence over ``dev``.
        """
        # Resolve which CSV to read; an explicit file name wins over dev.
        if file is not None:
            csv_path = datapath + file
        elif dev:
            csv_path = datapath + 'cloze_test.csv'
        else:
            csv_path = datapath + 'cloze_train.csv'

        with open(csv_path, 'r', encoding='utf-8') as handle:
            rows = list(csv.reader(handle, quotechar='"', delimiter=',',
                                   quoting=csv.QUOTE_ALL, skipinitialspace=True))
            # The first row is the header.
            rows.pop(0)

        self.data = []
        self.labels = []

        for row in rows:
            # Column 0 is skipped; the last three columns are ending 1, ending 2, answer.
            context = " ".join(row[1:-3])
            if hypothesis_only:
                context = ""
            first_ending = row[-3]
            second_ending = row[-2]
            answer = row[-1]

            self.data.append([context, first_ending, second_ending])
            self.labels.append(1 if answer != "1" else 0)

    def __getitem__(self, idx):
        """Return ``([context, ending1, ending2], label)`` for sample ``idx``."""
        return self.data[idx], self.labels[idx]

    def __len__(self):
        """Number of samples (one per CSV row)."""
        assert len(self.data) == len(self.labels)
        return len(self.labels)

# Auxiliary Functions

In [9]:
def getModelFileName(model_name, last_epoch):
    """Build the checkpoint path ``modelpath + model_name + str(last_epoch)``.

    :param model_name: Base name of the checkpoint (a string).
    :param last_epoch: Epoch suffix; converted with ``str`` so ints and "" both work.
    :return: The concatenated path as a string.
    """
    checkpoint_prefix = modelpath + model_name
    return checkpoint_prefix + str(last_epoch)

In [10]:
def weight_diff(model1, model2):
    """Plot, per encoder sub-layer, how much each parameter differs between two models.

    For every parameter under ``bert.encoder.layer`` the mean squared
    difference between ``model1`` and ``model2`` is computed. Parameters are
    grouped by their name without the leading layer index and the trailing
    parameter type; one bar chart is shown per group, with the layer index on
    the x axis ("weight" bars at ``i``, all other parameters at ``i + 0.5``).

    :param model1: Model exposing ``bert.encoder.layer`` (supplies the parameter names).
    :param model2: Model with the same architecture; may live on a different device.
    """
    diff = torch.nn.MSELoss() # diff(a, b) = ((a - b) ** 2).mean()

    # Keyed by group name. Dicts keep insertion order, so plots come out in
    # the order the groups appear in the model instead of set-iteration order.
    xweights, yweights, xbiases, ybiases = dict(), dict(), dict(), dict()

    # Pure measurement: no autograd graph is needed.
    with torch.no_grad():
        for (name, parameter1), parameter2 in zip(
            model1.bert.encoder.layer.named_parameters(),
            model2.bert.encoder.layer.parameters()
        ):
            # The training functions move only the trained model to `device`;
            # align devices so the comparison also works when that is a GPU.
            parameter2 = parameter2.to(parameter1.device)
            difference = diff(parameter1, parameter2).item()

            # Names look like "<layer index>.<sub-module path>.<weight|bias>".
            name = name.split(".")
            xtick = float(name[0])
            layer_name = ".".join(name[1:-1])
            parameter_type = name[-1]

            if layer_name not in xweights:
                xweights[layer_name], xbiases[layer_name] = list(), list()
                yweights[layer_name], ybiases[layer_name] = list(), list()

            if parameter_type == "weight":
                yweights[layer_name].append(difference)
                xweights[layer_name].append(xtick + 0.0)
            else: # if parameter_type == "bias"
                ybiases[layer_name].append(difference)
                xbiases[layer_name].append(xtick + 0.5)

    for name in xweights:
        plt.bar(xweights[name], yweights[name], width=0.4, label="weight")
        plt.bar(xbiases[name], ybiases[name], width=0.4, label="bias")
        plt.xticks(xweights[name])
        plt.legend()
        plt.title(name)
        plt.show()

# Functions for Training and Testing

In [11]:
def train(cloze_test, model_file=BASE_MODEL, batch_size=BATCH_SIZE,
          warmup_epochs=WARMUP_EPOCHS, train_epochs=TRAIN_EPOCHS,
          last_epoch=LAST_EPOCH, verbose=False, model_name=None):
    """Fine-tune a BertForNextSentencePrediction model on (context, ending) pairs.

    A checkpoint is saved after every epoch under
    ``getModelFileName(model_name, epoch + 1)``. After training, the loss
    curve is plotted and the trained weights are compared with the initial
    ones via ``weight_diff``.

    :param cloze_test: Truthy -> train on ``ClozeTest(dev=False)``; falsy -> ``RocStories()``.
    :param model_file: Name or path handed to ``from_pretrained`` for the model.
    :param batch_size: Mini-batch size of the training loader.
    :param warmup_epochs: Number of epochs of linear learning-rate warmup.
    :param train_epochs: Epoch index at which training stops (exclusive).
    :param last_epoch: Last finished epoch; training iterates from ``last_epoch + 1``.
        NOTE(review): values >= 0 are forwarded to the scheduler as a step
        count; whether resuming works without further optimizer state has not
        been verified here.
    :param verbose: Print epoch, batch and loss for every step.
    :param model_name: Base name for the saved checkpoints (required).
    :raises ValueError: If ``model_name`` is None.
    """
    # Fail before any training is done rather than when the first checkpoint is saved.
    if model_name is None:
        raise ValueError("model_name is required: a checkpoint is saved after every epoch")

    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
    model = BertForNextSentencePrediction.from_pretrained(model_file)
    # The old weights are saved in model_old to be used to compare to model
    model_old = BertForNextSentencePrediction.from_pretrained(model_file)

    #Send to GPU and allow Training
    model = model.to(device)
    model.train()

    trainloader = torch.utils.data.DataLoader(
        ClozeTest(dev=False) if cloze_test else RocStories(),
        batch_size=batch_size, shuffle=True
    )

    #LR maybe needs to be optimized
    optimizer = AdamW(model.parameters(), lr=1e-5)
    n_batches =  len(trainloader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=(warmup_epochs * n_batches),
        num_training_steps=(train_epochs * n_batches),
        last_epoch=max(-1, last_epoch * n_batches) # actually, last_step
    )
    losses = []

    epochs_range = range(last_epoch + 1, train_epochs)
    for epoch in tqdm(epochs_range):

        for batchId, (stories, labels) in enumerate(trainloader):
            # this is PyTorch-specific as gradients get accumulated
            optimizer.zero_grad()

            # Default collation turns a batch of [context, ending] pairs into
            # two parallel sequences: all contexts and all endings.
            start = stories[0]
            end = stories[1]

            labels = labels.to(device)

            # Tokenize sentence pairs.
            # All sequences in batch processing must be same length.
            # Therefore we use padding to fill shorter sequences
            # with uninterpreted [PAD] tokens)
            tokenized_batch = tokenizer(start, padding = True, text_pair = end,
                                        return_tensors='pt').to(device)

            loss = model(**tokenized_batch, labels = labels).loss
            if verbose:
                # Numbers must be formatted explicitly; str + int raises TypeError.
                print(f"Epoch {epoch + 1} Batch {batchId} of {n_batches} "
                      f"Loss: {loss.item()}")
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            scheduler.step() # the schedule is defined in steps: advance once per batch

        model.save_pretrained(
            getModelFileName(model_name, epoch + 1)
        )

    # Loss function change over steps is plotted below.
    plt.plot(losses)
    plt.xticks(
        ticks=[(i - last_epoch - 1) * n_batches for i in epochs_range],
        labels=list(epochs_range)
    )
    plt.title(("Story Cloze" if cloze_test else "ROCStories") + " Training")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    # Models are compared
    weight_diff(model, model_old)

In [12]:
def test(model_file=BASE_MODEL, verbose = False, cloze_test = None):
    """Evaluate a next-sentence-prediction model on (context, ending) pairs.

    Each pair is classified independently (argmax over the two logits) and a
    scikit-learn classification report of predictions against the dataset
    labels is printed.

    :param model_file: Name or path handed to ``from_pretrained`` for the model.
    :param verbose: Print decoded input, probability, prediction and label per
        sample (the progress bar is disabled in that case).
    :param cloze_test: Dataset yielding ``([context, ending], label)``;
        defaults to a fresh ``ClozeTest()``.
    """
    # Build the default lazily so the CSV is read when the function is called,
    # not when it is defined.
    if cloze_test is None:
        cloze_test = ClozeTest()

    softmax = torch.nn.Softmax(dim=1)
    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
    model = BertForNextSentencePrediction.from_pretrained(model_file)

    #Send to GPU and allow Evaluation
    model = model.to(device)
    model.eval()

    #Dataloader
    devloader = torch.utils.data.DataLoader(cloze_test, batch_size=10)

    pred_list, label_list = list(), list()

    # Inference only: skip building autograd graphs.
    with torch.no_grad():
        for stories, labels in tqdm(devloader, disable=verbose):

            start = stories[0]
            end = stories[1]

            # Tokenize sentence pairs.
            # All sequences in batch processing must be same length.
            # Therefore we use padding to fill shorter sequences
            # with uninterpreted [PAD] tokens)
            tokenized_batch = tokenizer(start, padding = True, text_pair = end,
                                        return_tensors='pt').to(device)

            #Send to GPU
            labels = labels.to(device)

            outputs = model(**tokenized_batch, labels = labels)
            logits = outputs.logits

            # The predicted class is the index of the larger logit.
            predictions = logits.argmax(dim=1).int()
            probs = softmax(logits).cpu().detach()

            # Extra info print() if verbose
            if verbose:
                # iterate over elements in batch
                for i, element_input_ids in enumerate(tokenized_batch.input_ids):
                    print(tokenizer.decode(element_input_ids))
                    print("Probability:", probs[i][0].item() * 100)
                    print("Predicted: ", bool(predictions[i]))
                    print("True label: ", bool(labels[i]))

            pred_list.extend(predictions.tolist())
            label_list.extend(labels.tolist())

    #print(confusion_matrix(label_list, pred_list))
    print(classification_report(label_list, pred_list))


In [13]:
def train_MC(cloze_test = None, model_file=BASE_MODEL, batch_size=BATCH_SIZE,
          warmup_epochs=WARMUP_EPOCHS, train_epochs=TRAIN_EPOCHS,
          last_epoch=LAST_EPOCH, verbose=False, model_name=None):
    """Fine-tune the NSP model in multiple-choice fashion.

    Both candidate endings of a story are scored by the same model; the two
    logit pairs are combined as ``logits(end1) + flip(logits(end2))`` and
    trained with cross-entropy against the dataset label. A checkpoint is
    saved after every epoch under ``getModelFileName(model_name, epoch + 1)``;
    afterwards the loss curve is plotted and ``weight_diff`` is shown.

    :param cloze_test: Dataset yielding ``([context, ending1, ending2], label)``;
        defaults to a fresh ``ClozeTest_MC(dev=False)``.
    :param model_file: Name or path handed to ``from_pretrained`` for the model.
    :param batch_size: Mini-batch size of the training loader.
    :param warmup_epochs: Number of epochs of linear learning-rate warmup.
    :param train_epochs: Epoch index at which training stops (exclusive).
    :param last_epoch: Last finished epoch; training iterates from ``last_epoch + 1``.
    :param verbose: Accepted for signature parity with ``train``; currently unused.
    :param model_name: Base name for the saved checkpoints (required).
    :raises ValueError: If ``model_name`` is None.
    """
    # Fail before any training is done rather than when the first checkpoint is saved.
    if model_name is None:
        raise ValueError("model_name is required: a checkpoint is saved after every epoch")

    # Build the default lazily so the CSV is read when the function is called,
    # not when it is defined.
    if cloze_test is None:
        cloze_test = ClozeTest_MC(dev=False)

    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
    model = BertForNextSentencePrediction.from_pretrained(model_file)
    # The old weights are saved in model_old to be used to compare to model
    model_old = BertForNextSentencePrediction.from_pretrained(model_file)

    #Send to GPU and allow Training
    model = model.to(device)
    model.train()

    trainloader = torch.utils.data.DataLoader(cloze_test, batch_size=batch_size, shuffle=True)

    #LR maybe needs to be optimized
    optimizer = AdamW(model.parameters(), lr=1e-5)
    n_batches =  len(trainloader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=(warmup_epochs * n_batches),
        num_training_steps=(train_epochs * n_batches),
        last_epoch=max(-1, last_epoch * n_batches) # actually, last_step
    )

    loss_fct = torch.nn.CrossEntropyLoss()

    losses = []
    epochs_range = range(last_epoch + 1, train_epochs)
    for epoch in tqdm(epochs_range):

        for batchId, (stories, labels) in enumerate(trainloader):
            # this is PyTorch-specific as gradients get accumulated
            optimizer.zero_grad()

            start = stories[0]
            end1 = stories[1]
            end2 = stories[2]

            tokenized_batch_end1 = tokenizer(start, padding = True, text_pair = end1,
                                        return_tensors='pt').to(device)

            tokenized_batch_end2 = tokenizer(start, padding = True, text_pair = end2,
                                        return_tensors='pt').to(device)

            #Send to GPU
            labels = labels.to(device)

            logits0 = model(**tokenized_batch_end1).logits
            logits1 = model(**tokenized_batch_end2).logits

            # Flipping the second pair aligns its two columns with the first:
            # column 0 then supports "ending 1 is right", column 1 "ending 2 is right".
            logits_combined = logits0 + logits1.flip(-1)
            loss = loss_fct(logits_combined.view(-1,2), labels.view(-1))
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            scheduler.step() # the schedule is defined in steps: advance once per batch

        model.save_pretrained(
            getModelFileName(model_name, epoch + 1)
        )

    # Loss function change over steps is plotted below.
    plt.plot(losses)
    plt.xticks(
        ticks=[(i - last_epoch - 1) * n_batches for i in epochs_range],
        labels=list(epochs_range)
    )
    plt.title(("Story Cloze" if cloze_test else "ROCStories") + " Training")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    # Models are compared
    weight_diff(model, model_old)

In [14]:
def test_MC(model_file=BASE_MODEL, verbose = False, cloze_test = None):
    """Evaluate a model in multiple-choice fashion on two-ending stories.

    Both endings are scored, the logits are combined as
    ``logits(end1) + flip(logits(end2))`` and the argmax is compared with the
    dataset label; a scikit-learn classification report is printed.

    :param model_file: Name or path handed to ``from_pretrained`` for the model.
    :param verbose: Print both decoded inputs, probability, prediction and
        label per story (the progress bar is disabled in that case).
    :param cloze_test: Dataset yielding ``([context, ending1, ending2], label)``;
        defaults to a fresh ``ClozeTest_MC(dev=True)``.
    """
    # Build the default lazily so the CSV is read when the function is called,
    # not when it is defined.
    if cloze_test is None:
        cloze_test = ClozeTest_MC(dev=True)

    softmax = torch.nn.Softmax(dim=1)
    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
    model = BertForNextSentencePrediction.from_pretrained(model_file)

    #Send to GPU and allow Evaluation
    model = model.to(device)
    model.eval()

    #Dataloader
    devloader = torch.utils.data.DataLoader(cloze_test, batch_size=10)

    pred_list, label_list = list(), list()

    # Inference only: skip building autograd graphs.
    with torch.no_grad():
        for stories, labels in tqdm(devloader, disable=verbose):

            start = stories[0]
            end1 = stories[1]
            end2 = stories[2]

            tokenized_batch_end1 = tokenizer(start, padding = True, text_pair = end1,
                                        return_tensors='pt').to(device)

            tokenized_batch_end2 = tokenizer(start, padding = True, text_pair = end2,
                                        return_tensors='pt').to(device)

            #Send to GPU
            labels = labels.to(device)

            logits0 = model(**tokenized_batch_end1).logits
            logits1 = model(**tokenized_batch_end2).logits

            # Column 0 supports "ending 1 is right", column 1 "ending 2 is right".
            logits = logits0 + logits1.flip(-1)

            predictions = logits.argmax(dim=1).int()
            probs = softmax(logits).cpu().detach()

            # Extra info print() if verbose
            if verbose:
                # iterate over elements in batch; there are two tokenized
                # inputs per story (one per ending), so both are shown
                for i in range(len(labels)):
                    print(tokenizer.decode(tokenized_batch_end1.input_ids[i]))
                    print(tokenizer.decode(tokenized_batch_end2.input_ids[i]))
                    print("Probability:", probs[i][0].item() * 100)
                    print("Predicted: ", bool(predictions[i]))
                    print("True label: ", bool(labels[i]))

            pred_list.extend(predictions.tolist())
            label_list.extend(labels.tolist())

    #print(confusion_matrix(label_list, pred_list))

    print(classification_report(label_list, pred_list))

In [None]:
def train_mixed(model_file=BASE_MODEL, batch_size=BATCH_SIZE,
          warmup_epochs=WARMUP_EPOCHS, train_epochs=TRAIN_EPOCHS,
          last_epoch=LAST_EPOCH, verbose=False, model_name=None):
    """Fine-tune on the Story Cloze training pairs plus the short ROCStories set.

    The samples of ``RocStories(short=True)`` are appended to those of
    ``ClozeTest(dev=False)`` and the model is trained on the union. A
    checkpoint is saved after every epoch under
    ``getModelFileName(model_name, epoch + 1)``; afterwards the loss curve is
    plotted and ``weight_diff`` is shown.

    :param model_file: Name or path handed to ``from_pretrained`` for the model.
    :param batch_size: Mini-batch size of the training loader.
    :param warmup_epochs: Number of epochs of linear learning-rate warmup.
    :param train_epochs: Epoch index at which training stops (exclusive).
    :param last_epoch: Last finished epoch; training iterates from ``last_epoch + 1``.
    :param verbose: Print epoch, batch and loss for every step.
    :param model_name: Base name for the saved checkpoints (required).
    :raises ValueError: If ``model_name`` is None.
    """
    # Fail before any training is done rather than when the first checkpoint is saved.
    if model_name is None:
        raise ValueError("model_name is required: a checkpoint is saved after every epoch")

    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)
    model = BertForNextSentencePrediction.from_pretrained(model_file)
    # The old weights are saved in model_old to be used to compare to model
    model_old = BertForNextSentencePrediction.from_pretrained(model_file)

    #Send to GPU and allow Training
    model = model.to(device)
    model.train()

    # Merge both sources into one dataset by extending the freshly built
    # ClozeTest instance (both store [context, ending] pairs and 0/1 labels).
    cloze = ClozeTest(dev=False)
    roc = RocStories(short = True)
    cloze.data.extend(roc.data)
    cloze.labels.extend(roc.labels)

    trainloader = torch.utils.data.DataLoader(cloze, batch_size=batch_size, shuffle=True)

    #LR maybe needs to be optimized
    optimizer = AdamW(model.parameters(), lr=1e-5)
    n_batches =  len(trainloader)
    scheduler = get_linear_schedule_with_warmup(
        optimizer,
        num_warmup_steps=(warmup_epochs * n_batches),
        num_training_steps=(train_epochs * n_batches),
        last_epoch=max(-1, last_epoch * n_batches) # actually, last_step
    )
    losses = []

    epochs_range = range(last_epoch + 1, train_epochs)
    for epoch in tqdm(epochs_range):

        for batchId, (stories, labels) in enumerate(trainloader):
            # this is PyTorch-specific as gradients get accumulated
            optimizer.zero_grad()

            start = stories[0]
            end = stories[1]

            labels = labels.to(device)

            # Tokenize sentence pairs.
            # All sequences in batch processing must be same length.
            # Therefore we use padding to fill shorter sequences
            # with uninterpreted [PAD] tokens)
            tokenized_batch = tokenizer(start, padding = True, text_pair = end,
                                        return_tensors='pt').to(device)

            loss = model(**tokenized_batch, labels = labels).loss
            if verbose:
                # Numbers must be formatted explicitly; str + int raises TypeError.
                print(f"Epoch {epoch + 1} Batch {batchId} of {n_batches} "
                      f"Loss: {loss.item()}")
            losses.append(loss.item())

            loss.backward()
            optimizer.step()
            scheduler.step() # the schedule is defined in steps: advance once per batch

        model.save_pretrained(
            getModelFileName(model_name, epoch + 1)
        )

    # Loss function change over steps is plotted below.
    plt.plot(losses)
    plt.xticks(
        ticks=[(i - last_epoch - 1) * n_batches for i in epochs_range],
        labels=list(epochs_range)
    )
    # This function has no cloze_test argument; the data is always the mix.
    plt.title("Story Cloze + ROCStories Training")
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    plt.show()

    # Models are compared
    weight_diff(model, model_old)

# Training

## RocOnly 

In [None]:
# Fine-tune on ROCStories only (cloze_test=False) for two epochs without
# warmup, then evaluate the checkpoint saved after the final epoch on test()'s
# default dataset.
train_epochs_roc = 2
train(train_epochs=train_epochs_roc, cloze_test=False, batch_size=32, warmup_epochs=0, model_name=ROC_MODEL)
test(getModelFileName(ROC_MODEL, train_epochs_roc))

HBox(children=(FloatProgress(value=0.0, description='Downloading', max=231508.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=28.0, style=ProgressStyle(description_w…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=466062.0, style=ProgressStyle(descripti…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=434.0, style=ProgressStyle(description_…




HBox(children=(FloatProgress(value=0.0, description='Downloading', max=1345000548.0, style=ProgressStyle(descr…




Some weights of the model checkpoint at bert-large-uncased-whole-word-masking were not used when initializing BertForNextSentencePrediction: ['cls.predictions.bias', 'cls.predictions.transform.dense.weight', 'cls.predictions.transform.dense.bias', 'cls.predictions.transform.LayerNorm.weight', 'cls.predictions.transform.LayerNorm.bias', 'cls.predictions.decoder.weight']
- This IS expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForNextSentencePrediction from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
  0%|          | 0/2 [00:00<?, ?it/s]

## ClozeOnly

In [None]:
# Fine-tune on the Story Cloze training file only, with default batch size and warmup.
# The model_name literal has the same value as the CLOZE_MODEL constant.
train_epochs_cloze = 10
train(train_epochs = train_epochs_cloze, cloze_test=True, model_name="bertfornsp_clozeonly_finetuned")

## Cloze + 5000Roc

In [None]:
# Fine-tune on Story Cloze plus the short ROCStories subset for five epochs.
# NOTE(review): checkpoints are written as "bertfornsp_mixed_more_roc<epoch>",
# while test_testset loads "bertfornsp_mixed5" — confirm which name is intended.
train_epochs_cloze = 5
train_mixed(model_name = "bertfornsp_mixed_more_roc", train_epochs = train_epochs_cloze)

# Trigger Words

In [None]:
def vocab_distribution(dev_only=True, train_only=False, hard = True, token_ids = False):
    """Count, per token, how often it occurs in right vs. wrong endings.

    :param token_ids: Return words when False, token_ids when True
    :param dev_only: Include the endings of the dev/test file (``ClozeTest(dev=True)``)
    :param train_only: Include the endings of the training file
        (``ClozeTest(dev=False)``); this is where the model gets biased.
    :param hard: Reserved for a future hard test set. ClozeTest accepts no
        such argument, so the value is currently ignored.
    :return: ``(ending_tokens, word_count)`` where ``ending_tokens`` maps each
        token to ``[count with label 0, count with label 1]`` and
        ``word_count`` is the total number of counted tokens.
    """
    data = []
    labels = []

    # Each flag loads the split it is named after. The training split is
    # loaded first so that the token order with both flags set is unchanged.
    if train_only:
        cloze_train = ClozeTest(dev=False)
        data.extend(cloze_train.data)
        labels.extend(cloze_train.labels)

    if dev_only:
        cloze_dev = ClozeTest(dev=True)
        data.extend(cloze_dev.data)
        labels.extend(cloze_dev.labels)

    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)

    ending_tokens = {}
    word_count = 0

    for story, label in zip(data, labels):
        end = story[1]
        # Drop the first and last id, which the tokenizer adds around the text.
        tokens = tokenizer(end).input_ids[1:-1]

        for token in tokens:
            if not token_ids: token = tokenizer.decode(token).replace(" ", "")
            word_count += 1
            if token not in ending_tokens:
                ending_tokens[token] = [0,0]
            ending_tokens[token][label] += 1

    return ending_tokens, word_count

def pmi(class_count, other_class_count, word_count):
    """
    Pointwise mutual information of a token with one of the two classes.

    :param class_count: Number of occurences in the class you want to calculate the pmi with
    :param other_class_count: Number of occurences in the other class
    :param word_count: Total word count
    :return: 0 if the token never occurs in the class, else the natural log of
        the ratio computed below
    """
    import math

    if not class_count < 1:
        # Relative frequency inside the class vs. half the overall relative frequency.
        in_class = class_count / word_count
        overall = (class_count + other_class_count)/(word_count*2)
        return math.log(in_class / overall)
    return 0

def class_prob(class_count, other_class_count):
    """Share of a token's occurrences that fall into the class of interest.

    :param class_count: Occurrences in the class of interest
    :param other_class_count: Occurrences in the other class
    :return: ``class_count / (class_count + other_class_count)``
    """
    total = class_count + other_class_count
    return class_count/total

In [None]:
def get_trigger_words(hard = False, dev_only = True, train_only = True, min_occurences = 30, token_ids = False):
    """
    Rank ending tokens by their PMI with the right (label 0) and wrong (label 1) class.

    Prints the top 100 rows of both rankings as tables and returns the full rankings.

    :param token_ids: Return words when False, token_ids when True
    :param dev_only: Forwarded to vocab_distribution
    :param train_only: Forwarded to vocab_distribution
    :param hard: Forwarded to vocab_distribution
    :param min_occurences: Only return trigger words occurring at least this often in total
    :return: ``(pos_triggers, neg_triggers)``; each row is
        ``[token, count in class, pmi, class likelihood]``, sorted by pmi descending
    """
    vocab_dis, word_count = vocab_distribution(dev_only=dev_only, train_only=train_only, hard=hard, token_ids=token_ids)

    # Keep only tokens whose total count reaches the threshold.
    frequent = [
        (token, counts[0], counts[1])
        for token, counts in vocab_dis.items()
        if counts[0] + counts[1] >= min_occurences
    ]

    def by_pmi(row):
        return row[2]

    pos_triggers = sorted(
        ([token, n_pos, pmi(n_pos, n_neg, word_count), class_prob(n_pos, n_neg)]
         for token, n_pos, n_neg in frequent),
        key=by_pmi, reverse=True
    )
    neg_triggers = sorted(
        ([token, n_neg, pmi(n_neg, n_pos, word_count), class_prob(n_neg, n_pos)]
         for token, n_pos, n_neg in frequent),
        key=by_pmi, reverse=True
    )

    from tabulate import tabulate
    print(tabulate(pos_triggers[:100], headers=['Token', 'n', 'pmi', 'pos_class_likelihood']))
    print("\n")
    print(tabulate(neg_triggers[:100], headers=['Token', 'n', 'pmi', 'neg_class_likelihood']))

    return pos_triggers, neg_triggers


In [None]:
# Print (and return) the trigger-word tables for tokens that occur at least five times.
get_trigger_words(hard = False, dev_only = False, train_only = True, min_occurences = 5, token_ids = False)

# Experiments

In [None]:
def test_testset(file, hypothesis_only):
    cloze_test = ClozeTest(dev=True, hypothesis_only = hypothesis_only, file=file)
    cloze_test_mc = ClozeTest_MC(dev=True, hypothesis_only=hypothesis_only, file=file)
    

    print("\nBert\n")
    test(BASE_MODEL, cloze_test = cloze_test)
    test_MC(BASE_MODEL, cloze_test = cloze_test_mc)

    print("\nRocOnly\n")
    test(getModelFileName(ROC_MODEL, ""), cloze_test = cloze_test)
    test_MC(getModelFileName(ROC_MODEL, ""), cloze_test = cloze_test_mc)

    print("\nClozeOnly\n")
    test(getModelFileName("bertfornsp_clozeonly_finetuned", "10"), cloze_test = cloze_test)
    test_MC(getModelFileName("bertfornsp_clozeonly_finetuned", "10"), cloze_test = cloze_test_mc)

    print("\nRocCloze\n")
    test(getModelFileName("bertfornsp_cloze_finetuned", "10"), cloze_test = cloze_test)
    test_MC(getModelFileName("bertfornsp_cloze_finetuned", "10"), cloze_test = cloze_test_mc)

    print("\nCloze + 5000 Roc\n")
    test(getModelFileName("bertfornsp_mixed", "5"), cloze_test = cloze_test)
    test_MC(getModelFileName("bertfornsp_mixed", "5"), cloze_test = cloze_test_mc)