In [2]:
import torch
from transformers import Data2VecTextForTokenClassification, AutoTokenizer
from PIL import Image
import requests

In [3]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
# Example sentence that is tokenized and classified in the cells below.
text = "HuggingFace is a company based in Paris and New York"
# Hugging Face Hub checkpoint id; used to load both the tokenizer and the model.
version = "facebook/data2vec-text-base"

# AutoTokenizer

In [5]:
# Load the tokenizer that belongs to the checkpoint.
# AutoTokenizer is only a factory: from_pretrained() returns an instance of a concrete
# tokenizer class, never an AutoTokenizer instance, so the variable is intentionally
# not annotated as `AutoTokenizer`.
tokenizer = AutoTokenizer.from_pretrained(version)
tokenizer

RobertaTokenizerFast(name_or_path='facebook/data2vec-text-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

## special ids and tokens

In [6]:
# Print the special-token ids first, then the special-token strings, one list per line.
for special_values in (tokenizer.all_special_ids, tokenizer.all_special_tokens):
    print(special_values)

[0, 2, 3, 1, 50264]
['<s>', '</s>', '<unk>', '<pad>', '<mask>']


## tokenize the input text

In [7]:
# Tokenize the example sentence into PyTorch tensors and move them to the target device.
# Only the device is passed to .to(): the batch holds integer token ids and an integer
# attention mask, which must not be cast to float16 (BatchEncoding.to() takes a device only).
inputs = tokenizer(
    text = text,
    return_tensors = "pt",      # output tensor format: "np", "pt", "tf" or "jax"
    padding = True,             # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,  # when using max_length, padding must be set to "max_length"
    add_special_tokens = False, # no special tokens for token classification: they would be classified too, which is meaningless
).to(device)
inputs

{'input_ids': tensor([[40710,  3923, 34892,    16,    10,   138,   716,    11,  2201,     8,
           188,   469]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [8]:
# Token ids only, without the attention mask.
inputs["input_ids"]

tensor([[40710,  3923, 34892,    16,    10,   138,   716,    11,  2201,     8,
           188,   469]], device='cuda:0')

# Data2VecTextForTokenClassification

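Data2VecText model with a token classification head on top (a linear layer on top of the hidden-states output), e.g. for Named-Entity-Recognition (NER) tasks. Note that the `facebook/data2vec-text-base` checkpoint does not contain weights for this head, so it is randomly initialized when loaded; the logits and predicted labels below are therefore not meaningful until the model is fine-tuned on a downstream task.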

In [9]:
# Load the encoder with a token-classification head and move it to the target device.
# float16 is used only on CUDA. On the CPU fallback, half-precision inference is slow or
# unsupported for some ops depending on the PyTorch version, so float32 is used there.
model: Data2VecTextForTokenClassification = Data2VecTextForTokenClassification.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
model

Some weights of Data2VecTextForTokenClassification were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['classifier.weight', 'classifier.bias']
You should probably TRAIN this model on a down-stream task to be able to use it for predictions and inference.


Data2VecTextForTokenClassification(
  (data2vec_text): Data2VecTextModel(
    (embeddings): Data2VecTextForTextEmbeddings(
      (word_embeddings): Embedding(50265, 768, padding_idx=1)
      (position_embeddings): Embedding(514, 768, padding_idx=1)
      (token_type_embeddings): Embedding(1, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): Data2VecTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x Data2VecTextLayer(
          (attention): Data2VecTextAttention(
            (self): Data2VecTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): Data2VecTextSelfOutput(
              (dense): Linear(in_featur

In [10]:
# Put the model in evaluation mode, then run a single forward pass.
model.eval()
# inference_mode() disables autograd tracking for everything computed inside the block.
with torch.inference_mode():
    # `inputs` is unpacked into keyword arguments (input_ids, attention_mask).
    outputs = model(**inputs)
outputs

TokenClassifierOutput(loss=None, logits=tensor([[[ 0.0560, -0.2222],
         [-0.0379, -0.1015],
         [-0.0192, -0.1713],
         [ 0.0570, -0.2213],
         [-0.0308, -0.0979],
         [ 0.0099, -0.1354],
         [ 0.0579, -0.1366],
         [-0.0423, -0.0977],
         [-0.0051, -0.2859],
         [-0.0529, -0.1133],
         [-0.0538, -0.1152],
         [-0.0469, -0.1105]]], device='cuda:0'), hidden_states=None, attentions=None)

In [11]:
# Per-token class scores from the classification head: print the shape, then the values.
logits = outputs.logits
print(logits.shape, logits, sep="\n")

torch.Size([1, 12, 2])
tensor([[[ 0.0560, -0.2222],
         [-0.0379, -0.1015],
         [-0.0192, -0.1713],
         [ 0.0570, -0.2213],
         [-0.0308, -0.0979],
         [ 0.0099, -0.1354],
         [ 0.0579, -0.1366],
         [-0.0423, -0.0977],
         [-0.0051, -0.2859],
         [-0.0529, -0.1133],
         [-0.0538, -0.1152],
         [-0.0469, -0.1105]]], device='cuda:0')


In [12]:
# Index of the highest-scoring label for every token (reduction over the last axis).
logits.argmax(dim=-1)

tensor([[0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]], device='cuda:0')

In [13]:
# Show the raw input sentence again for comparison with the per-token predictions.
text

'HuggingFace is a company based in Paris and New York'

In [15]:
# Split the sentence into sub-word token strings (no ids, no special tokens added here).
# In the output, a leading "Ġ" marks a token that was preceded by a space.
tokenizer.tokenize(text)

['Hug',
 'ging',
 'Face',
 'Ġis',
 'Ġa',
 'Ġcompany',
 'Ġbased',
 'Ġin',
 'ĠParis',
 'Ġand',
 'ĠNew',
 'ĠYork']

In [14]:
# Same encoding as before, but as plain Python lists: no return_tensors and no device move.
tokenizer(text=text, add_special_tokens=False)

{'input_ids': [40710, 3923, 34892, 16, 10, 138, 716, 11, 2201, 8, 188, 469], 'attention_mask': [1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1]}