## Packages



In [None]:
!pip install datasets



In [None]:
from transformers import GPT2TokenizerFast, GPT2LMHeadModel

import torch
import textwrap
import warnings
warnings.filterwarnings('ignore')

## Helpers



In [None]:
##################################################
## helper function (nicer printing)
##################################################

def pretty_print(s):
    """Print decoded token ids as wrapped text below an 'Output:' header.

    `s` is handed to the notebook-level `tokenizer` for decoding, with
    special tokens skipped; the resulting text is wrapped at 80 columns.
    """
    separator = '-' * 80
    print("Output:\n" + separator)
    decoded_text = tokenizer.decode(s, skip_special_tokens=True)
    print(textwrap.fill(decoded_text, width=80))

## Obtaining a pretrained LLM



In [None]:

# Checkpoint to load; switch to the commented-out line for the smaller model.
# model_to_use = "gpt2"
model_to_use = "gpt2-large"
print(f"Using model:  {model_to_use}")

# Tokenizer that belongs to the chosen pre-trained LM.
tokenizer = GPT2TokenizerFast.from_pretrained(model_to_use)

# Causal LM instance; the pad token id is set to the tokenizer's EOS token id.
model = GPT2LMHeadModel.from_pretrained(
    model_to_use,
    output_scores=True,
    pad_token_id=tokenizer.eos_token_id,
)

# Show the resulting model configuration.
print(model.config)

Using model:  gpt2-large
GPT2Config {
  "_name_or_path": "gpt2-large",
  "activation_function": "gelu_new",
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "embd_pdrop": 0.1,
  "eos_token_id": 50256,
  "initializer_range": 0.02,
  "layer_norm_epsilon": 1e-05,
  "model_type": "gpt2",
  "n_ctx": 1024,
  "n_embd": 1280,
  "n_head": 20,
  "n_inner": null,
  "n_layer": 36,
  "n_positions": 1024,
  "output_scores": true,
  "pad_token_id": 50256,
  "reorder_and_upcast_attn": false,
  "resid_pdrop": 0.1,
  "scale_attn_by_inverse_layer_idx": false,
  "scale_attn_weights": true,
  "summary_activation": null,
  "summary_first_dropout": 0.1,
  "summary_proj_to_labels": true,
  "summary_type": "cls_index",
  "summary_use_proj": true,
  "task_specific_params": {
    "text-generation": {
      "do_sample": true,
      "max_length": 50
    }
  },
  "transformers_version": "4.40.2",
  "use_cache": true,
  "vocab_size": 50257
}



## Using the LLM for text generation



In [None]:
# text to expand
prompt = "Once a vampire fell in love with a pixie so that they"

# translate the prompt into tokens
input_tokens = tokenizer(prompt, return_tensors="pt").input_ids
print(input_tokens)

# Sampling is stochastic: fix the seed so that re-running the notebook
# (Restart & Run All) reproduces the same continuation.
torch.manual_seed(0)

# Top-k sampling: at each step sample the next token from the 50 most
# probable candidates, generating at most 100 new tokens.
outputs = model.generate(input_tokens,
                         max_new_tokens=100,
                         do_sample=True,
                         top_k=50,
                       )

print("\nTop-k sampling:\n")
pretty_print(outputs[0])

#+begin_example
tensor([[ 7454,   257, 23952,  3214,   287,  1842,   351,   257,   279, 39291,
           523,   326,   484]])

Top-k sampling:

Output:
--------------------------------------------------------------------------------
Once a vampire fell in love with a pixie so that they could continue to breed,
their children were affected by the blood.  The blood turned the pixies into
human beings in the process and they became responsible for killing other
vampires, humans and creatures created by Satan himself.  They were killed in
the battle in 1082, as they attempted to feed on a witch named Anna.  Other
Names  German: Aigars von Fraunhilde (literally, "Aguaries of Fraunhilde") — The
witch
#+end_example

In [None]:
# Beam-search decoding options: six beams, no repeated 4-grams, early stopping.
beam_search_options = dict(
    max_new_tokens=100,
    num_beams=6,
    no_repeat_ngram_size=4,
    early_stopping=True,
)
outputs = model.generate(input_tokens, **beam_search_options)

print("\nBeam search:\n")
pretty_print(outputs[0])

#+begin_example

Beam search:

Output:
--------------------------------------------------------------------------------
Once a vampire fell in love with a pixie so that they could feed on her blood,
the pixie would become a vampire herself, and the vampire would become a pixie
herself, and so on and so forth. The pixie would then become a vampire again,
and then a pixie again, and so forth and so on, until the pixie became a vampire
and the vampire became a pixie, and then the pixie was a vampire again and the
vampire was a pixie and so on.  The pixie would eventually become a
#+end_example

## Accessing next-word probabilities



In [None]:
# Labels are the input tokens themselves. The model shifts them internally so
# that position i is scored against token i+1; the first token therefore never
# acts as a target, and masking it with -100 (the ignore index) only makes
# that explicit.
labels        = torch.clone(input_tokens)
labels[0,0]   = -100
output_word2  = model(input_tokens[:,0:2], labels= labels[:,0:2])
output_prompt = model(input_tokens, labels=input_tokens)

# negative log-likelihood of provided labels
# `.loss` is the MEAN NLL over the predicted tokens. Because of the internal
# shift there are only (sequence length - 1) predictions, so that is the
# factor that converts the mean back into the summed NLL of the prompt.
num_predicted_tokens = input_tokens.size(1) - 1
nll_word2  = output_word2.loss
nll_output = output_prompt.loss * num_predicted_tokens
print("NLL of second word: ", nll_word2.item())
print("NLL of whole output:", nll_output.item())

NLL of second word:  3.040785789489746
NLL of whole output: 51.008323669433594

In [None]:
# logits of provided labels
print(output_word2.logits)
# next-word log probabilities:
print(torch.nn.functional.log_softmax(output_word2.logits, dim = 1))

tensor([[[ 2.3684,  0.9006, -4.1059,  ..., -6.9914, -4.4546,  0.0598],
         [-0.9339,  0.0542, -3.9052,  ..., -6.6439, -4.8402, -1.2681]]],
       grad_fn=<UnsafeViewBackward0>)
tensor([[[-0.0361, -0.3569, -0.7985,  ..., -0.8819, -0.5188, -0.2351],
         [-3.3384, -1.2034, -0.5978,  ..., -0.5344, -0.9044, -1.5630]]],
       grad_fn=<LogSoftmaxBackward0>)

## Accessing the embeddings (hidden states)



In [None]:
# set flag 'output_hidden_states' to true
output = model(input_tokens, output_hidden_states = True)

# this is a tuple with first element the embeddings of each token in the input
hidden_states = output.hidden_states
# so, access the first object from the tuple
embeddings = hidden_states[0]
# and print its size and content
print(embeddings.size())
print("Embedding of last word in input:\n", embeddings[0,0-1])

torch.Size([1, 13, 1280])
Embedding of last word in input:
 tensor([ 0.0360,  0.0201, -0.0314,  ...,  0.0598,  0.0014, -0.0129],
       grad_fn=<SelectBackward0>)

## [Excursion:] Using data from `datasets`



In [None]:
import math

# `load_dataset` was used without being imported, which raises a NameError
# on a fresh kernel.
from datasets import load_dataset

# Load the WikiText-2 (raw) test split and tokenize it as one long text.
test = load_dataset("wikitext", "wikitext-2-raw-v1", split="test")
encodings = tokenizer("\n\n".join(test["text"]), return_tensors="pt")

# Take a short chunk (tokens 10..49) to score.
# NOTE: this rebinds `input_tokens`, which held the story prompt in earlier cells.
input_tokens = encodings.input_ids[:,10:50]

pretty_print(input_tokens[0])

# Passing the inputs as labels makes the model return the average
# cross-entropy (mean NLL per predicted token) in `.loss`.
output = model(input_tokens, labels = input_tokens)

# Read the average cross-entropy loss once and reuse it below.
average_loss = output.loss.item()
print("Average NLL for wikipedia chunk", average_loss)

# Perplexity is the exponential of the average NLL.
perplexity = math.exp(average_loss)

# Display perplexity
print("Perplexity:", perplexity)

Output:
--------------------------------------------------------------------------------
  Robert Boulter is an English film, television and theatre actor. He had a
guest @-@ starring role on the television series The Bill in 2000. This was
followed by a starring role
Average NLL for wikipedia chunk 3.621708869934082
Perplexity: 37.401427417447366
