In [27]:
import nltk
from collections import defaultdict
import tqdm
import numpy as np
import torch
from transformers import GPT2Tokenizer as tok
from transformers import GPT2LMHeadModel as head
import pandas as pd
import math

In [2]:
N = 2

In [3]:
tok = lambda d, l='english': nltk.tokenize.word_tokenize(d, language=l, preserve_line=False)

In [4]:
"""
Citation: Zeman, Daniel; et al., 2024, 
  Universal Dependencies 2.15, LINDAT/CLARIAH-CZ digital library at the Institute of Formal and Applied Linguistics (ÚFAL), Faculty of Mathematics and Physics, Charles University, 
  http://hdl.handle.net/11234/1-5787.
"""
eng_data = open("ud-215/ud-treebanks-v2.15/UD_English-GUM/en_gum-ud-train.txt").read()
ngrams = nltk.ngrams(tok(eng_data), N)

In [5]:
table = defaultdict(lambda: defaultdict(int))

for bigram in tqdm.tqdm(ngrams):
    w1, w2 = bigram
    table[w1][w2] += 1

162760it [00:00, 1735219.68it/s]


In [6]:
eng_data_test = open("ud-215/ud-treebanks-v2.15/UD_English-GUM/en_gum-ud-test.txt").read()

In [25]:
perplexity = 1
for bigram in nltk.ngrams(tok(eng_data_test), N):
    w1, w2 = bigram
    if table[w1][w2] == 0:
        perplexity *= 2
    else:
        perplexity *= max(1 / (table[w1][w2] / (sum(table[w1].values()))), 2)


In [26]:
perplexity

inf

In [28]:
model = head.from_pretrained('gpt2')
tokenizer = tok.from_pretrained('gpt2')

In [31]:
s1 = '<|endoftext|>The coach smiled at the player tossed a frisbee.'
s2 = '<|endoftext|>L\'allenatore sorrise al giocatore che aveva lanciato il frisbee.'

In [32]:
def evaluate(token_ids):
    model.eval()
    with torch.no_grad():
        output = model(token_ids)
    softmax = torch.nn.Softmax(dim=-1) 
    return softmax(output.logits).squeeze(0)
    
token_ids = torch.Tensor(tokenizer.encode(s1)).int()
evaluate(token_ids)

tensor([[6.6199e-04, 2.4113e-02, 9.5428e-04,  ..., 1.5595e-08, 1.2056e-08,
         1.9246e-03],
        [4.5156e-07, 4.0817e-06, 8.7872e-07,  ..., 6.5628e-07, 2.2708e-08,
         4.6907e-06],
        [2.8929e-05, 3.3060e-05, 6.8108e-07,  ..., 4.3267e-09, 6.2591e-06,
         1.0636e-05],
        ...,
        [9.7112e-09, 1.7678e-09, 1.0148e-11,  ..., 4.9267e-15, 3.0244e-12,
         5.4265e-09],
        [1.3881e-04, 4.8386e-05, 1.2973e-07,  ..., 4.3097e-09, 5.6733e-08,
         1.8539e-05],
        [5.8190e-07, 2.5833e-05, 2.2614e-07,  ..., 1.1884e-11, 3.3631e-11,
         4.2481e-03]])

In [33]:
def log_prefix_probability(s, i):
    toks = tokenizer.encode(s)
    sx = evaluate(torch.Tensor(toks).int())

    return sum(torch.log(sx[j][toks[j + 1]]) for j in range(i))

-(log_prefix_probability(s1, 7) - log_prefix_probability(s1, 6))

tensor(12.4271)

In [46]:
log_prefix_probability(s1, 6)

tensor(-30.7355)

In [45]:
log_prefix_probability(s2, 6)

tensor(-37.2877)

In [None]:
class Surprisal:
    def __init__(self, model_name, input_file):
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.model = AutoModel.from_pretrained(model_name, output_hidden_states=True)

        self.model.eval()

    def evaluate(token_ids):
        with torch.no_grad():
            output = model(token_ids)
        softmax = torch.nn.Softmax(dim=-1) 
        return softmax(output.logits).squeeze(0)