In [14]:
!pip install transformers



# Imports and Global Variables

In [15]:
import torch
from transformers import BertTokenizer
import csv
from tabulate import tabulate

# Pretrained checkpoint identifier handed to BertTokenizer.from_pretrained below.
BASE_MODEL = "bert-large-uncased-whole-word-masking"
# Story Cloze CSV file; a relative path, so it resolves against the working directory.
CLOZE_TEST_PATH = "../datasets/cloze_test.csv"

# Datasets

In [16]:
class ClozeTest(torch.utils.data.Dataset):
    """Story Cloze CSV flattened into (story context, candidate ending) samples.

    Every CSV row contributes two samples, one per candidate ending.  A sample
    is labelled ``0`` when its ending is the one named in the row's last
    column, and ``1`` otherwise.
    """

    def __init__(self, filepath):
        """Read the CSV at ``filepath`` and fill ``self.data`` / ``self.labels``.

        :param filepath: Path to the CSV file. The first row is treated as a
            header and skipped. In each remaining row the first column is
            ignored, the last three columns are ending 1, ending 2 and the
            number of the right ending ("1" or "2"), and the columns in
            between are joined with single spaces to form the story context.
        """
        self.data = []
        self.labels = []

        # newline='' is what the csv module asks for, so that line breaks
        # embedded in quoted fields are handled by the reader itself.
        with open(filepath, 'r', encoding='utf-8', newline='') as d:
            reader = csv.reader(d, quotechar='"', delimiter=',',
                                quoting=csv.QUOTE_ALL, skipinitialspace=True)

            # Skip the header row; the default keeps an empty file from raising.
            next(reader, None)

            for sample in reader:
                # csv.reader yields [] for blank lines (e.g. a trailing empty
                # line); those carry no story and would break the indexing below.
                if not sample:
                    continue

                start = " ".join(sample[1:-3])
                end1 = sample[-3]
                end2 = sample[-2]
                right_ending = sample[-1]

                self.data.append([start, end1])
                self.labels.append(0 if "1" == right_ending else 1)

                self.data.append([start, end2])
                self.labels.append(0 if "2" == right_ending else 1)

    def __getitem__(self, idx):
        """Return ``([story context, ending], label)`` for sample ``idx``."""
        X = self.data[idx]
        y = self.labels[idx]
        return X, y

    def __len__(self):
        """Return the number of samples (two per parsed CSV row)."""
        assert len(self.data) == len(self.labels)
        return len(self.labels)

# Trigger Words

In [17]:
def vocab_distribution(filepath, token_ids=False):
    """
    Count, per label, how often each token occurs in the candidate endings.

    :param filepath: Path to the CSV file with Story Cloze set
    :param token_ids: Return words when False, token_ids when True
    :return: Tuple ``(ending_tokens, word_count)``. ``ending_tokens`` maps each
        token to a two-element list of counts indexed by the sample label;
        ``word_count`` is the total number of tokens counted.
    """
    cloze = ClozeTest(filepath)
    tokenizer = BertTokenizer.from_pretrained(BASE_MODEL)

    ending_tokens = {}
    word_count = 0

    for story, label in zip(cloze.data, cloze.labels):
        ending = story[1]
        # Drop the first and last ids; the tokenizer output is assumed to be
        # wrapped in one leading and one trailing special token.
        ids = tokenizer(ending).input_ids[1:-1]

        for token in ids:
            if token_ids:
                key = token
            else:
                key = tokenizer.decode(token).replace(" ", "")
            word_count += 1
            ending_tokens.setdefault(key, [0, 0])[label] += 1

    return ending_tokens, word_count

def pmi(class_count, other_class_count, word_count):
    """
    Pointwise mutual information between a token and one class.

    :param class_count: Number of occurrences in the class you want to calculate the pmi with
    :param other_class_count: Number of occurrences in the other class
    :param word_count: Total word count
    :return: 0 when class_count is below 1; otherwise the natural log of
        (class_count / word_count) divided by
        ((class_count + other_class_count) / (2 * word_count)).
    """
    import math

    if class_count >= 1:
        joint = class_count / word_count
        # The factor 2 presumably reflects two equally sized classes - confirm.
        expected = (class_count + other_class_count) / (word_count*2)
        return math.log(joint / expected)
    return 0

def class_prob(class_count, other_class_count):
    """
    Share of a token's occurrences that fall in the class of interest.

    :param class_count: Number of occurrences in the class of interest
    :param other_class_count: Number of occurrences in the other class
    :return: class_count divided by the sum of both counts
    """
    total = class_count + other_class_count
    return class_count / total

In [18]:
def get_trigger_words(filepath, min_occurences = 30, token_ids = False):
    """
    Print and return the per-class trigger-word tables for the candidate endings.

    :param filepath: Path to the CSV file with Story Cloze set
    :param min_occurences: Only keep tokens whose total count over both
        classes is at least this value
    :param token_ids: Return words when False, token_ids when True
    :return: Tuple ``(pos_triggers, neg_triggers)``. Each is a list of
        ``[token, count in class, pmi, class likelihood]`` rows sorted by pmi
        in descending order; "pos" is label 0 and "neg" is label 1.
    """

    vocab_dis, word_count = vocab_distribution(filepath, token_ids=token_ids)

    pos_triggers = []
    neg_triggers = []

    for word, (pos_count, neg_count) in vocab_dis.items():
        # Rare tokens are left out of both tables.
        if pos_count + neg_count < min_occurences:
            continue

        pos_triggers.append([
            word,
            pos_count,
            pmi(pos_count, neg_count, word_count),
            class_prob(pos_count, neg_count),
        ])
        neg_triggers.append([
            word,
            neg_count,
            pmi(neg_count, pos_count, word_count),
            class_prob(neg_count, pos_count),
        ])

    # Column 2 holds the pmi; highest first.
    pos_triggers.sort(key=lambda row: row[2], reverse=True)
    neg_triggers.sort(key=lambda row: row[2], reverse=True)

    print(tabulate(pos_triggers, headers=['Token', 'n', 'pmi', 'pos_class_likelihood']))
    print("\n")
    print(tabulate(neg_triggers, headers=['Token', 'n', 'pmi', 'neg_class_likelihood']))

    # Hand the tables back as well, so callers can use them beyond the printout.
    return pos_triggers, neg_triggers

In [19]:
if __name__ == "__main__":
    # Print the trigger-word tables for tokens seen at least 10 times, as decoded tokens.
    get_trigger_words(filepath=CLOZE_TEST_PATH, min_occurences=10, token_ids=False)

Token            n          pmi    pos_class_likelihood
------------  ----  -----------  ----------------------
eventually      18   0.693147                 1
yes             11   0.606136                 0.916667
saved           10   0.597837                 0.909091
finally         24   0.575364                 0.888889
learned         15   0.567984                 0.882353
ended           11   0.526093                 0.846154
wonderful       11   0.526093                 0.846154
brought         11   0.526093                 0.846154
now             31   0.516216                 0.837838
first           20   0.510826                 0.833333
love            14   0.498991                 0.823529
things           9   0.492476                 0.818182
grateful        16   0.470004                 0.8
realized        16   0.470004                 0.8
even             8   0.470004                 0.8
right           15   0.456758                 0.789474
able            22   0.451985 