In [1]:
import torch
from transformers import GPT2LMHeadModel, GPT2Tokenizer

In [3]:
# Load the pretrained "gpt2" checkpoint: the tokenizer first, then the
# language model (transformer stack plus LM head). Both come from the same
# checkpoint name so their vocabularies match.
tokenizer = GPT2Tokenizer.from_pretrained("gpt2")
model = GPT2LMHeadModel.from_pretrained("gpt2")

In [4]:
# Print the model's module tree: every named submodule with its type and settings
# (token/position embeddings, the stack of transformer blocks, final layer norm, LM head).
print(model)

GPT2LMHeadModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2SdpaAttention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
)


In [5]:
# Show the tokenizer's configuration: vocabulary size, maximum sequence length,
# padding/truncation sides and the special tokens it defines.
print(tokenizer)

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	50256: AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
}


In [63]:
# Encode the prompt as PyTorch tensors ("pt"); the resulting mapping is
# unpacked straight into the model call in the next step.
input_text = "basketball is a fun game to play"
inputs = tokenizer(text=input_text, return_tensors="pt")



In [59]:
# Run a single forward pass over the tokenized prompt.
# torch.no_grad() turns off autograd tracking: this is inference only, so no
# backward graph is needed and the logits come back without a grad_fn.
# return_dict=True makes the model return an output object whose fields
# (such as .logits) are accessed by name; hidden states are requested as well.
with torch.no_grad():
    outputs = model(**inputs, output_hidden_states=True, return_dict=True)

# Logits have shape (batch, sequence_length, vocab_size):
# one score per vocabulary entry at every input position.
print(outputs.logits.shape)
print(outputs.logits)



torch.Size([1, 7, 50257])
tensor([[[ -32.7464,  -31.6252,  -34.7533,  ...,  -40.4119,  -39.8658,
           -32.9583],
         [-113.1515, -111.2932, -114.8908,  ..., -123.0035, -120.1136,
          -116.1134],
         [-112.0150, -110.5219, -113.5100,  ..., -122.4088, -117.5997,
          -112.2077],
         ...,
         [ -91.6831,  -93.1320, -102.4978,  ..., -107.1496, -106.9473,
           -96.0684],
         [-140.7971, -141.8635, -146.4763,  ..., -150.3814, -149.1989,
          -142.0862],
         [-107.5068, -109.3211, -119.5146,  ..., -122.9288, -123.6888,
          -112.1354]]], grad_fn=<UnsafeViewBackward0>)


In [60]:
# Convert the logits into a probability distribution over the vocabulary
# by applying softmax along the last (vocabulary) axis.
probabilities = outputs.logits.softmax(dim=-1)

# Print the id of the most likely token at sequence position 2 (the third
# input token). Softmax preserves ordering, so the argmax of the probabilities
# and the argmax of the raw logits are the same id.
print(probabilities[0, 2].argmax())
print(outputs.logits[0, 2].argmax())

tensor(1049)
tensor(1049)


In [62]:
# Greedy decoding per position: for each input position, take the token with
# the highest probability and print its text. The token printed for position i
# is the model's guess for the token that FOLLOWS input token i.
for position_probs in probabilities[0]:
    print(tokenizer.decode(position_probs.argmax()))

# Print the original prompt for comparison with the predictions above.
print(input_text)

.
 a
 great
 game
,
 watch
.
basketball is a fun game to play


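It works! The model correctly predicts the most likely next word at each position! We learned that, given an input text of N tokens, the model returns N sets of logits: the logits at position i are its prediction for the token that follows the first i tokens, so the last position holds the prediction for the word that would come after the whole input. For example, after "basketball is" the model predicts " a", which matches the actual next word in our input.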