In [1]:
import torch
from transformers import GPT2ForTokenClassification, GPT2Tokenizer

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Sample sentence that is tokenized and classified in the cells below.
text = "HuggingFace is a company based in Paris and New York"
# Name of the pretrained checkpoint used for both the tokenizer and the model.
version = "gpt2"

# GPT2Tokenizer

In [4]:
# Load the pretrained tokenizer that belongs to the selected checkpoint.
tokenizer: GPT2Tokenizer = GPT2Tokenizer.from_pretrained(
    pretrained_model_name_or_path=version,
)
tokenizer

GPT2Tokenizer(name_or_path='gpt2', vocab_size=50257, model_max_length=1024, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

In [5]:
# The GPT-2 tokenizer defines no pad token (only bos/eos/unk), so it has to be set manually;
# the end-of-text token is reused for padding here.
tokenizer.pad_token = tokenizer.eos_token

## tokenizer([sequence])

In [6]:
# Tokenize the sample sentence and move the resulting tensors to the target device.
#
# Only the device is passed to `.to()`: the tokenizer returns a `BatchEncoding`, whose `.to()`
# takes a device and no dtype. The integer `input_ids` / `attention_mask` must stay integer
# anyway, because token ids are used as indices into the embedding table.
# Background on passing a dtype to `.to()`:
# https://github.com/huggingface/transformers/issues/16359
# NOTE(review): that discussion presumably concerns feature-extractor outputs, not tokenizer
# outputs; confirm before reintroducing a dtype here.
inputs = tokenizer(
    text,                     # a single sentence or a batch of sentences
    return_tensors = "pt"     # output tensor format: "np", "pt", "tf" or "jax"
).to(device)

print(inputs.keys())
print(inputs["input_ids"])
print(inputs["attention_mask"])  # marks which positions hold real tokens

dict_keys(['input_ids', 'attention_mask'])
tensor([[48098,  2667, 32388,   318,   257,  1664,  1912,   287,  6342,   290,
           968,  1971]], device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')


In [7]:
# Show the token ids produced for the sample sentence.
print(inputs["input_ids"])

tensor([[48098,  2667, 32388,   318,   257,  1664,  1912,   287,  6342,   290,
           968,  1971]], device='cuda:0')


# GPT2ForTokenClassification

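GPT-2 model with a token classification head on top (a linear layer on top of the hidden-states output), e.g. for Named-Entity-Recognition (NER) tasks. Note that the `gpt2` checkpoint does not contain weights for this head, so it is randomly initialized here and the predictions below are not meaningful until the model is fine-tuned.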

In [8]:
# Load GPT-2 with a token-classification head and move it to the target device.
#
# Half precision is requested only when running on CUDA. On the CPU fallback the weights stay
# in float32, because float16 kernels are missing or slow for some ops on CPU (depending on
# the PyTorch version).
# NOTE: the classifier weights are not part of the "gpt2" checkpoint, so they are newly
# initialized on load; the model must be fine-tuned before its predictions are meaningful.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model: GPT2ForTokenClassification = GPT2ForTokenClassification.from_pretrained(
    version,
    torch_dtype=model_dtype,
).to(device)
model

Some weights of GPT2ForTokenClassification were not initialized from the model checkpoint at gpt2 and are newly initialized: ['classifier.bias', 'classifier.weight']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


GPT2ForTokenClassification(
  (transformer): GPT2Model(
    (wte): Embedding(50257, 768)
    (wpe): Embedding(1024, 768)
    (drop): Dropout(p=0.1, inplace=False)
    (h): ModuleList(
      (0-11): 12 x GPT2Block(
        (ln_1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (attn): GPT2Attention(
          (c_attn): Conv1D()
          (c_proj): Conv1D()
          (attn_dropout): Dropout(p=0.1, inplace=False)
          (resid_dropout): Dropout(p=0.1, inplace=False)
        )
        (ln_2): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
        (mlp): GPT2MLP(
          (c_fc): Conv1D()
          (c_proj): Conv1D()
          (act): NewGELUActivation()
          (dropout): Dropout(p=0.1, inplace=False)
        )
      )
    )
    (ln_f): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
  )
  (dropout): Dropout(p=0.1, inplace=False)
  (classifier): Linear(in_features=768, out_features=2, bias=True)
)

In [9]:
# Put the model in evaluation mode, then run one forward pass under inference mode.
# The encoding holds exactly `input_ids` and `attention_mask`, so it is unpacked
# directly into keyword arguments.
model.eval()
with torch.inference_mode():
    outputs = model(**inputs)
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[-0.2198, -2.0408],
         [-0.0274, -5.5659],
         [-0.5673, -5.7001],
         [-0.4255, -5.8274],
         [ 0.0472, -5.3559],
         [-0.1623, -6.3321],
         [ 0.0589, -3.9881],
         [-0.0214, -4.8949],
         [-0.4345, -5.0491],
         [ 0.2037, -7.1237],
         [ 0.5711, -4.7993],
         [-0.1935, -4.3793]]], device='cuda:0'), hidden_states=None, attentions=None)

In [10]:
# Raw (unnormalized) classification scores: one row per token, one column per label.
outputs.logits

tensor([[[-0.2198, -2.0408],
         [-0.0274, -5.5659],
         [-0.5673, -5.7001],
         [-0.4255, -5.8274],
         [ 0.0472, -5.3559],
         [-0.1623, -6.3321],
         [ 0.0589, -3.9881],
         [-0.0214, -4.8949],
         [-0.4345, -5.0491],
         [ 0.2037, -7.1237],
         [ 0.5711, -4.7993],
         [-0.1935, -4.3793]]], device='cuda:0')

In [11]:
# Normalize the per-token scores into probabilities over the label dimension (last axis).
torch.softmax(outputs.logits, dim=-1)

tensor([[[8.6070e-01, 1.3930e-01],
         [9.9608e-01, 3.9169e-03],
         [9.9413e-01, 5.8655e-03],
         [9.9551e-01, 4.4878e-03],
         [9.9552e-01, 4.4826e-03],
         [9.9791e-01, 2.0873e-03],
         [9.8282e-01, 1.7176e-02],
         [9.9241e-01, 7.5879e-03],
         [9.9019e-01, 9.8090e-03],
         [9.9934e-01, 6.5686e-04],
         [9.9537e-01, 4.6304e-03],
         [9.8502e-01, 1.4982e-02]]], device='cuda:0')

In [15]:
# For every token, take the index of the highest-scoring label.
predicted_token_class_ids = torch.argmax(outputs.logits, dim=-1)
predicted_token_class_ids

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

In [16]:
# Map the predicted class ids of the first (and only) sequence to their label names.
[model.config.id2label[class_id] for class_id in predicted_token_class_ids[0].tolist()]

['LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0',
 'LABEL_0']