In [1]:
%load_ext autoreload
%autoreload 2
import os
# Move the working directory to the parent of the notebook's folder so that
# project-relative imports and paths resolve.
# The starting directory is remembered the first time this cell runs; a bare
# relative `os.chdir("../")` would climb one more level on every re-run of the
# cell and silently break later relative paths.
if "NOTEBOOK_DIR" not in globals():
    NOTEBOOK_DIR = os.getcwd()
os.chdir(os.path.join(NOTEBOOK_DIR, os.pardir))

In [2]:
import torch
import torch.nn.functional as F
from xent.tasks import Closure
from xent.models import M
from xent.lang import X
from xent.dataprocessing import Wikipedia
from xent.config import *

In [3]:
# Two model wrappers from xent.models. `model` drives the Closure task and is
# used for detokenizing; `checker_model` is the one whose logits are scored.
# NOTE(review): "gpt2" is presumably the architecture/tokenizer name and
# "M0"/"M1-zero" with base="base"/"closure" presumably select checkpoints —
# confirm against xent.models.M.
model = M("gpt2", "M0", base="base")
checker_model = M("gpt2", "M1-zero", base="closure")

In [4]:
# Wikipedia corpus wrapper from xent.dataprocessing.
# NOTE(review): split=0.8 is presumably the train fraction — confirm.
corpus_generator = Wikipedia(split=0.8)
# Keep a reference to the bound method (it is not called here); it is passed
# as a callable to task.generate in a later cell.
# NOTE(review): no random seed is set in the visible cells, so the sampled
# text presumably differs between runs.
get_test_sample = corpus_generator.get_random_test_text

Loading dataset shards:   0%|          | 0/41 [00:00<?, ?it/s]

In [5]:
# Closure task built on top of the base model.
task = Closure(model)
# Generate one synthetic example from a sampled test text. The result is a
# tensor that is later indexed as synth[0, ...] and fed directly to the
# checker model.
# NOTE(review): space="tokens" presumably requests token ids rather than text —
# confirm in xent.tasks.
synth = task.generate(get_test_sample, space="tokens")

In [6]:
# Locate the X.xreturn marker inside the generated sequence; the result is
# used as an integer position below.
# NOTE(review): what find_xstring returns when the marker is absent is not
# visible here — verify before relying on it.
cut = task.find_xstring(synth, X.xreturn)
# First position used for scoring in the following cells.
# NOTE(review): 6 is a magic offset, presumably the number of tokens to skip
# past the marker/header — confirm and consider naming it.
CUT = cut + 6

In [7]:
# Score the synthetic sample with the checker model, without tracking gradients.
checker_model.model.eval()
print(synth.shape)
with torch.no_grad():
    logits = checker_model.model(synth).logits
    # Logits at position t are scored against the token at t + 1, so the
    # slice starting at CUT is paired with targets starting at CUT + 1.
    scored_logits = logits[0, CUT:-1]
    scored_targets = synth[0, CUT + 1:]
    # Per-token cross-entropy (no reduction) over the scored region.
    loss = F.cross_entropy(scored_logits, scored_targets, reduction="none")
    # Predicted distribution at each scored position and its most likely token.
    probs = F.softmax(scored_logits, dim=-1)
    highest_prob_tokens = probs.argmax(dim=-1)

torch.Size([1, 991])


In [8]:
# Decode the generated sequence back to text and show it.
decoded_sample = model.detokenize(synth[0])
print(decoded_sample)

 since 1991. The episodes are recorded before a live theatre audience, with two programmes being recorded at each performance and Naismith traditionally performs the duties of "warmup artist". This usually involves testing sound recording levels by means of a "patronising audience participation exercise" and a joke. Naismith also provides the voiceovers for the show, such as when the host talks about something appearing on the laser display board, he is "the mystery voice for listeners at home".

Personal life
He married Belinda Campbell in Oxfordshire in June 2002.

Books 
 The Little Book of Mornington Crescent. 2000. 
 with Graeme Garden and Barry Cryer: Hamish and Dougal: You'll Have Had Your Tea?. 2005. 
 Uxbridge English Dictionary (I'm Sorry I Haven't a Clue). 2005.

References

External links

1965 births
Alumni of the University of Cambridge
BBC
@##$$##@ closure(())>:ç%ç>:
 1991: 10
.: 1
 The: 2
 episodes: 10
 are: 2
 recorded: 5
 before: 6
 a: 4
 live: 4
 theatre: 8
 audience

In [9]:
# Side-by-side table of the actual tokens and the checker model's top-1
# predictions, with the per-token loss of the number token in each row.
#
# Prediction index i (in `highest_prob_tokens` / `loss`) lines up with
# position CUT + 1 + i of `synth`, because the logits slice starts at CUT and
# its targets start at CUT + 1.
# NOTE(review): the fixed stride assumes every row of the generated listing
# spans exactly four tokens — confirm for multi-token words or numbers.
tokens_per_row = 4
word_offset = 3  # prediction-aligned index of the first row's word token
num_offset = 5   # prediction-aligned index of the first row's number token

word_origin = synth[0, CUT + 1 + word_offset::tokens_per_row]
origin = synth[0, CUT + 1 + num_offset::tokens_per_row]
word_genera = highest_prob_tokens[word_offset::tokens_per_row]
genera = highest_prob_tokens[num_offset::tokens_per_row]
num_losses = loss[num_offset::tokens_per_row]

print(f"{'actual':20}  | {'generated':21} | loss")
print("----------------------|-----------------------|--------")
for actual_word, actual_num, pred_word, pred_num, num_loss in zip(
    word_origin, origin, word_genera, genera, num_losses
):
    actual_word_str = model.detokenize(actual_word)
    actual_num_str = model.detokenize(actual_num)
    pred_word_str = model.detokenize(pred_word)
    pred_num_str = model.detokenize(pred_num)
    # Show a word containing a line break as a literal "\n" so that each
    # table row stays on one line.
    actual_word_str = "\\n" if "\n" in actual_word_str else actual_word_str
    pred_word_str = "\\n" if "\n" in pred_word_str else pred_word_str
    print(f"{actual_word_str:15} {actual_num_str:5} | {pred_word_str:15} {pred_num_str:5} | {num_loss:.4f}")

actual                | generated             | loss
----------------------|-----------------------|--------
.                1    | .:               2    | 1.9139
 The             2    |  The             3    | 1.0323
 episodes        10   |  episodes        10   | 1.1869
 are             2    |  are             2    | 0.6892
 recorded        5    |  recorded        5    | 1.3148
 before          6    |  before          5    | 1.4863
 a               4    |  a               3    | 1.4978
 live            4    |  live            6    | 2.1502
 theatre         8    |  theatre         6    | 2.2560
 audience        3    |  audience        1    | 2.0893
,                2    | ,                2    | 0.5800
 with            3    |  with            3    | 0.3920
 two             5    |  two             5    | 0.6791
 programmes      7    |  programmes      7    | 1.6222
 being           3    |  being           3    | 0.9036
 recorded        2    |  recorded        2    | 0.9632
 at        