In [1]:
import torch
from transformers import GPT2DoubleHeadsModel, GPT2Tokenizer

In [2]:
# Run on the first CUDA GPU when one is visible; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
# Name of the pretrained checkpoint. The same value is passed to every
# from_pretrained() call in this notebook so tokenizer and model stay in sync.
version: str = "gpt2"

# GPT2Tokenizer

In [5]:
# Load the pretrained tokenizer for the `version` checkpoint.
# As its repr shows, the only special tokens it defines are bos/eos/unk (all
# "<|endoftext|>"); there is no classification token yet.
tokenizer: GPT2Tokenizer = GPT2Tokenizer.from_pretrained(version)
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [7]:
# Register "[CLS]" as the tokenizer's classification token. The call returns how
# many tokens were actually new to the vocabulary. The new token receives an id
# past the end of the pretrained vocabulary, so the model's embedding matrix has
# to be resized before use, and the new embedding is untrained until the model
# is fine-tuned (we should train it also!).
num_added_tokens = tokenizer.add_special_tokens({"cls_token": "[CLS]"})
num_added_tokens

1

In [8]:
# Two candidate sentences, each ending with the [CLS] token whose position is
# later handed to the multiple-choice head.
choices = [
    "Hello, my dog is cute [CLS]",
    "Hello, my cat is cute [CLS]",
]
# Convert every sentence into its list of token ids.
encoded_choices = list(map(tokenizer.encode, choices))
encoded_choices

[[15496, 11, 616, 3290, 318, 13779, 50257],
 [15496, 11, 616, 3797, 318, 13779, 50257]]

In [9]:
# For each encoded choice, find the position of the (first) [CLS] token id.
cls_token_location = [
    ids.index(tokenizer.cls_token_id)
    for ids in encoded_choices
]
cls_token_location

[6, 6]

In [10]:
# Stack the encoded choices and add a leading batch axis, giving a tensor of
# shape (batch=1, num_choices=2, seq_len).
# Token ids are indices into the embedding table, so they must stay integer
# (int64). Casting them to float16 breaks the embedding lookup and also corrupts
# the values, because float16 cannot represent ids such as 50257 exactly.
input_ids = torch.tensor(encoded_choices, dtype=torch.long).unsqueeze(0).to(device)  # Batch size: 1, number of choices: 2
input_ids

tensor([[[15496,    11,   616,  3290,   318, 13779, 50257],
         [15496,    11,   616,  3797,   318, 13779, 50257]]], device='cuda:0')

In [11]:
# Position of the [CLS] token inside each choice, shape (batch=1, num_choices=2).
# These are positions (indices) into the sequence axis, so they are kept as
# int64; the model expects an integer index tensor here, not a floating-point one.
mc_token_ids = torch.tensor([cls_token_location], dtype=torch.long).to(device)  # Batch size: 1
mc_token_ids

tensor([[6, 6]], device='cuda:0')

# GPT2DoubleHeadsModel

The GPT-2 transformer with a language-modeling head and a multiple-choice classification head on top, e.g. for RocStories/SWAG tasks. The two heads are two linear layers: the language-modeling head has its weights tied to the input embeddings, and the classification head takes as input the hidden state at a specified classification-token index in the input sequence (passed as `mc_token_ids`).

In [12]:
# Load GPT-2 with both heads (language modeling + multiple choice) and move it
# to `device`.
# Half precision is requested only when running on CUDA. On the CPU fallback the
# weights stay in float32, because CPU float16 kernels are slow and, depending
# on the PyTorch version, not implemented for every op.
# NOTE: the multiple-choice head is not part of the pretrained checkpoint and is
# newly initialised (see the warning this cell prints), so its scores are not
# meaningful until the model is fine-tuned.
model: GPT2DoubleHeadsModel = GPT2DoubleHeadsModel.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
model

Some weights of GPT2DoubleHeadsModel were not initialized from the model checkpoint at gpt2 and are newly initialized: ['multiple_choice_head.summary.bias', 'multiple_choice_head.summary.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2DoubleHeadsModel(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (lm_head): Linear(in_features=768, out_features=50257, bias=False)
  (multiple_choice_head): SequenceSummary(
    (summary): Linear(in_features=768, out_features=1, bias=True)


In [13]:
# Update the model embeddings with the new vocabulary size: the tokenizer grew
# by one token ([CLS]), so the embedding table needs one extra row.
# This must run before the forward pass; otherwise the [CLS] id would index past
# the end of the embedding table.
# The call returns the resized input embedding module, displayed below.
embedding_layer = model.resize_token_embeddings(len(tokenizer))
embedding_layer

Embedding(50258, 768)

In [14]:
# Switch to evaluation mode so the model's Dropout layers are disabled.
model.eval()
# No gradients are needed for this forward pass, so run it without autograd tracking.
with torch.inference_mode():
    # mc_token_ids gives, per choice, the position ([CLS]) the multiple-choice
    # head should read. No labels are passed, so both losses come back as None.
    outputs = model(input_ids, mc_token_ids=mc_token_ids)
outputs

GPT2DoubleHeadsModelOutput(loss=None, mc_loss=None, logits=tensor([[[[ -35.2362,  -35.3266,  -38.9754,  ...,  -43.9975,  -36.4580,
              0.7955],
          [-112.6171, -114.5832, -116.5724,  ..., -118.8059, -111.6917,
              3.3742],
          [ -88.7434,  -89.8643,  -93.1977,  ...,  -96.1782,  -92.1273,
              2.7337],
          ...,
          [-116.7280, -119.3949, -121.7259,  ..., -124.6102, -121.6092,
              3.4111],
          [ -77.4425,  -80.4463,  -88.0498,  ...,  -93.6345,  -84.0666,
              1.7587],
          [ -89.7752,  -89.5057,  -91.4326,  ...,  -96.7385,  -89.9483,
              2.4538]],

         [[ -35.2362,  -35.3266,  -38.9754,  ...,  -43.9975,  -36.4580,
              0.7955],
          [-112.6171, -114.5832, -116.5724,  ..., -118.8059, -111.6917,
              3.3742],
          [ -88.7434,  -89.8643,  -93.1977,  ...,  -96.1782,  -92.1273,
              2.7337],
          ...,
          [-115.4049, -118.4459, -120.5038,  ..., -122

In [19]:
# Output of the language-modeling head: a 4-D tensor of scores over the
# vocabulary. Per the Hugging Face docs the axes are
# (batch, num_choices, seq_len, vocab_size).
lm_logits = outputs.logits
lm_logits

tensor([[[[ -35.2362,  -35.3266,  -38.9754,  ...,  -43.9975,  -36.4580,
              0.7955],
          [-112.6171, -114.5832, -116.5724,  ..., -118.8059, -111.6917,
              3.3742],
          [ -88.7434,  -89.8643,  -93.1977,  ...,  -96.1782,  -92.1273,
              2.7337],
          ...,
          [-116.7280, -119.3949, -121.7259,  ..., -124.6102, -121.6092,
              3.4111],
          [ -77.4425,  -80.4463,  -88.0498,  ...,  -93.6345,  -84.0666,
              1.7587],
          [ -89.7752,  -89.5057,  -91.4326,  ...,  -96.7385,  -89.9483,
              2.4538]],

         [[ -35.2362,  -35.3266,  -38.9754,  ...,  -43.9975,  -36.4580,
              0.7955],
          [-112.6171, -114.5832, -116.5724,  ..., -118.8059, -111.6917,
              3.3742],
          [ -88.7434,  -89.8643,  -93.1977,  ...,  -96.1782,  -92.1273,
              2.7337],
          ...,
          [-115.4049, -118.4459, -120.5038,  ..., -122.9722, -120.1530,
              3.3537],
          [ -78.10

In [20]:
# Output of the multiple-choice head: one score per choice, here shape (1, 2).
# The head was newly initialised when the model was loaded (see the loading
# warning), so these values are not meaningful predictions without fine-tuning.
mc_logits = outputs.mc_logits
mc_logits

tensor([[8.6657, 8.2169]], device='cuda:0')