In [2]:
import torch
from transformers import Data2VecTextModel, AutoTokenizer
from PIL import Image
import requests

In [3]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda:0")
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

In [4]:
# Hugging Face Hub checkpoint identifier.
version = "facebook/data2vec-text-base"

# Small batch of example sentences to encode.
text = [
    "a photo of 2 cats",
    "a photo of a dog",
    "a plane in the blue sky",
]

# AutoTokenizer

In [5]:
# Load the tokenizer that ships with the checkpoint. AutoTokenizer picks the
# concrete class from the checkpoint; for this one the repr below shows a
# RobertaTokenizerFast, so the AutoTokenizer annotation is only nominal.
tokenizer: AutoTokenizer = AutoTokenizer.from_pretrained(version)
tokenizer

RobertaTokenizerFast(name_or_path='facebook/data2vec-text-base', vocab_size=50265, model_max_length=512, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<unk>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'sep_token': AddedToken("</s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'cls_token': AddedToken("<s>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'mask_token': AddedToken("<mask>", rstrip=False, lstrip=True, single_word=False, normalized=True)}, clean_up_tokenization_spaces=True)

## special ids and tokens

In [6]:
# First line: ids of the special tokens; second line: their string forms.
print(tokenizer.all_special_ids, tokenizer.all_special_tokens, sep="\n")

[0, 2, 3, 1, 50264]
['<s>', '</s>', '<unk>', '<pad>', '<mask>']


## Tokenizing the text

In [6]:
# Tokenize the whole batch and move the resulting tensors to the target device.
#
# Only the device is passed to `.to()`: `input_ids` and `attention_mask` are
# integer tensors that feed an embedding lookup, so they must not be cast to
# float16 (and `BatchEncoding.to` takes a device, not a dtype). Half precision
# belongs on the model weights, not on the token ids.
inputs = tokenizer(
    text = text,                # a list of strings or a single string
    return_tensors = "pt",      # returned tensor format: np / pt / tf / jax
    padding = True,             # padding strategy: one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,  # when using max_length, set padding to "max_length"
    add_special_tokens = True,  # add the special tokens (e.g. <s>, </s>) around each text
).to(device)
inputs

{'input_ids': tensor([[    0,   102,  1345,     9,   132, 10017,     2,     1],
        [    0,   102,  1345,     9,    10,  2335,     2,     1],
        [    0,   102,  3286,    11,     5,  2440,  6360,     2]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [7]:
# Token-id matrix, one row per input sentence; shorter rows end in the <pad> id (1).
inputs["input_ids"]

tensor([[    0,   102,  1345,     9,   132, 10017,     2,     1],
        [    0,   102,  1345,     9,    10,  2335,     2,     1],
        [    0,   102,  3286,    11,     5,  2440,  6360,     2]],
       device='cuda:0')

# Data2VecTextModel (Encoder)

The bare Data2VecText Model for text transformer outputting raw hidden-states without any specific head on top.

In [8]:
# Load the bare encoder (no task head) and place it on the selected device.
#
# float16 is used only when running on CUDA: the notebook falls back to CPU
# when no GPU is available, and half-precision kernels on CPU are slow or
# missing depending on the PyTorch build, so float32 is used there instead.
#
# Expected load-time warnings for this checkpoint: the `lm_head.*` weights are
# dropped (this class has no LM head) and the pooler weights are newly
# initialized, so `pooler_output` is not a trained representation.
model_dtype = torch.float16 if device.type == "cuda" else torch.float32
model: Data2VecTextModel = Data2VecTextModel.from_pretrained(version, torch_dtype=model_dtype).to(device)
model

Some weights of the model checkpoint at facebook/data2vec-text-base were not used when initializing Data2VecTextModel: ['lm_head.dense.bias', 'lm_head.layer_norm.weight', 'lm_head.layer_norm.bias', 'lm_head.bias', 'lm_head.dense.weight']
- This IS expected if you are initializing Data2VecTextModel from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing Data2VecTextModel from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).
Some weights of Data2VecTextModel were not initialized from the model checkpoint at facebook/data2vec-text-base and are newly initialized: ['data2vec_text.pooler.dense.weight', 'data2vec_text.pooler.dense.bias']
You should probably TRAIN this model on a down-stream task to be able to use it f

Data2VecTextModel(
  (embeddings): Data2VecTextForTextEmbeddings(
    (word_embeddings): Embedding(50265, 768, padding_idx=1)
    (position_embeddings): Embedding(514, 768, padding_idx=1)
    (token_type_embeddings): Embedding(1, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): Data2VecTextEncoder(
    (layer): ModuleList(
      (0-11): 12 x Data2VecTextLayer(
        (attention): Data2VecTextAttention(
          (self): Data2VecTextSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): Data2VecTextSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-05, el

In [9]:
# Put the model in evaluation mode (the architecture above contains Dropout
# layers) before running a forward pass.
model.eval()
# inference_mode() disables autograd tracking for this forward pass.
with torch.inference_mode():
    # `inputs` is unpacked into keyword arguments (input_ids, attention_mask).
    outputs = model(**inputs)
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 0.2198, -0.0450,  0.0257,  ..., -0.0156, -0.1477,  0.1281],
         [-0.0509, -0.2195, -0.2235,  ..., -0.0862,  0.0320,  0.0646],
         [-0.0436, -0.2229, -0.2218,  ..., -0.0922,  0.0268,  0.0648],
         ...,
         [-0.0318, -0.2363, -0.2247,  ..., -0.1017,  0.0320,  0.0439],
         [ 0.2022, -0.0683,  0.0710,  ...,  0.1358, -0.0658,  0.0317],
         [-0.0504, -0.2287, -0.2265,  ..., -0.0950,  0.0350,  0.0562]],

        [[ 0.2167, -0.0465,  0.0330,  ..., -0.0103, -0.1450,  0.1309],
         [-0.0503, -0.2002, -0.2133,  ..., -0.0886,  0.0212,  0.0546],
         [-0.0443, -0.2049, -0.2129,  ..., -0.0947,  0.0176,  0.0545],
         ...,
         [-0.0489, -0.2061, -0.2146,  ..., -0.0915,  0.0269,  0.0502],
         [ 0.2069, -0.0628,  0.0759,  ...,  0.1504, -0.0647,  0.0299],
         [-0.0430, -0.2117, -0.2160,  ..., -0.1011,  0.0226,  0.0481]],

        [[ 0.2076, -0.0443,  0.0135,  ..., -0.0358, -

In [10]:
# Per-token embeddings from the final encoder layer: shape first, values second.
print(outputs.last_hidden_state.shape, outputs.last_hidden_state, sep="\n")

torch.Size([3, 8, 768])
tensor([[[ 0.2198, -0.0450,  0.0257,  ..., -0.0156, -0.1477,  0.1281],
         [-0.0509, -0.2195, -0.2235,  ..., -0.0862,  0.0320,  0.0646],
         [-0.0436, -0.2229, -0.2218,  ..., -0.0922,  0.0268,  0.0648],
         ...,
         [-0.0318, -0.2363, -0.2247,  ..., -0.1017,  0.0320,  0.0439],
         [ 0.2022, -0.0683,  0.0710,  ...,  0.1358, -0.0658,  0.0317],
         [-0.0504, -0.2287, -0.2265,  ..., -0.0950,  0.0350,  0.0562]],

        [[ 0.2167, -0.0465,  0.0330,  ..., -0.0103, -0.1450,  0.1309],
         [-0.0503, -0.2002, -0.2133,  ..., -0.0886,  0.0212,  0.0546],
         [-0.0443, -0.2049, -0.2129,  ..., -0.0947,  0.0176,  0.0545],
         ...,
         [-0.0489, -0.2061, -0.2146,  ..., -0.0915,  0.0269,  0.0502],
         [ 0.2069, -0.0628,  0.0759,  ...,  0.1504, -0.0647,  0.0299],
         [-0.0430, -0.2117, -0.2160,  ..., -0.1011,  0.0226,  0.0481]],

        [[ 0.2076, -0.0443,  0.0135,  ..., -0.0358, -0.1222,  0.1235],
         [-0.0471, -0

In [11]:
# One pooled vector per sentence: shape first, values second.
# Caveat: the load warning reports the pooler weights as newly initialized for
# this checkpoint, so these values do not come from a trained pooling head.
print(outputs.pooler_output.shape, outputs.pooler_output, sep="\n")

torch.Size([3, 768])
tensor([[-0.0132, -0.1235, -0.3335,  ...,  0.2239, -0.0440, -0.2046],
        [-0.0118, -0.1264, -0.3320,  ...,  0.2229, -0.0477, -0.2024],
        [-0.0216, -0.1353, -0.3531,  ...,  0.2325, -0.0437, -0.2197]],
       device='cuda:0')


In [12]:
# `hidden_states` is only populated when the forward pass is asked for it; the
# earlier call used the defaults, which leaves the field as None and makes this
# cell display nothing. Re-run the encoder with output_hidden_states=True to
# obtain the tuple of per-layer hidden states.
with torch.inference_mode():
    outputs_with_hidden_states = model(**inputs, output_hidden_states=True)
outputs_with_hidden_states.hidden_states