In [2]:
%load_ext autoreload
%autoreload 2

In [3]:
import torch
import torch.nn.functional as F

from transformers import AutoModelForCausalLM, AutoTokenizer

from synth import CrossEntropyDifferential

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
device = torch.device("cuda")

model_name = "gpt2"
model = AutoModelForCausalLM.from_pretrained(model_name).to(device)
tokenizer = AutoTokenizer.from_pretrained(model_name)

tokenizer.pad_token = tokenizer.eos_token
model.config.pad_token_id = model.config.eos_token_id

cross_entropy_differential = CrossEntropyDifferential(model, tokenizer, device)



In [5]:
model

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)

: 

In [5]:
Y = """ thing in the world, all the mozzarella and the tomato sauce on my margherita are amazing. 
My Italian friend Angelo told me that his favourite pizza is the quattroformaggi rossa with salsiccia. 
When I was a kid, I used to watch the pizza maker create his pizzas, he was from Romania but a very nice gentleman I have to say."""

prompt = f"""{Y}\n::INTRODUCE::"""
input = tokenizer(prompt, return_tensors='pt', padding=True).to(device)

outputs = torch.tensor([])
for i in range(3):
    torch.cuda.empty_cache()
    output = model.generate(
        input.input_ids,
        attention_mask=input.attention_mask,
        max_length=len(input.input_ids[0])+6,
        num_return_sequences=10,
        no_repeat_ngram_size=2,
        do_sample=True,
        temperature=1
    )
    outputs = torch.cat((outputs, output.to(torch.device('cpu'))), dim=0)

results = []
for i in output:
    answ = tokenizer.decode(i[len(input.input_ids[0]):], skip_special_tokens=True)
    results.append((
        cross_entropy_differential(answ, Y, diff=True).item(),
        answ
        ))
best_result = min(results, key=lambda x: x[0])

Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.
Setting `pad_token_id` to `eos_token_id`:50256 for open-end generation.


In [6]:
print(best_result)

(0.021950483322143555, '\nWet n Wild is')


In [7]:
with torch.no_grad():
    output = model(**input)

output

CausalLMOutputWithCrossAttentions(loss=None, logits=tensor([[[ -37.6448,  -37.5860,  -42.0073,  ...,  -45.8884,  -44.8885,
           -38.8211],
         [ -79.4698,  -79.1837,  -84.6170,  ...,  -86.7144,  -85.5297,
           -80.2997],
         [ -95.1870,  -93.1776,  -96.9971,  ...,  -97.7816,  -98.6849,
           -94.2064],
         ...,
         [ -84.8391,  -86.9780,  -86.3004,  ...,  -95.9692,  -99.2742,
           -84.2696],
         [ -79.4165,  -83.2990,  -82.5779,  ...,  -91.6714,  -92.3742,
           -79.7206],
         [-102.8867, -103.3748, -105.3497,  ..., -112.3392, -112.5212,
           -98.4017]]], device='cuda:0'), past_key_values=((tensor([[[[-1.7836,  2.1728,  0.4900,  ..., -1.1411, -1.0535,  1.7519],
          [-2.0899,  3.0310,  1.0003,  ..., -1.0146, -1.5198,  2.2534],
          [-2.2444,  2.6332,  1.9227,  ..., -0.6722, -1.5328,  2.0305],
          ...,
          [-1.6866,  2.8878,  2.1436,  ...,  0.0226, -2.2237,  0.5068],
          [-1.0117,  2.4791,  1.664