In [2]:
import torch
from transformers import Owlv2Processor, Owlv2TextModel

In [3]:
# Prefer the first CUDA GPU when one is visible; otherwise fall back to the CPU.
device_name = "cuda:0" if torch.cuda.is_available() else "cpu"
device = torch.device(device_name)
device

device(type='cuda', index=0)

In [4]:
# Hugging Face Hub checkpoint id used for every `from_pretrained` call in this notebook.
version = "google/owlv2-base-patch16-ensemble"

# Text prompts that get tokenized and encoded.
texts = [
    "a photo of 2 cats",
    "a photo of a dog",
    "a plane in the blue sky",
]

# Owlv2Processor

In [5]:
# Load the processor published with the `version` checkpoint. Its repr (shown
# as the cell output) lists the two parts it wraps: an image processor and a
# CLIP tokenizer.
processor: Owlv2Processor = Owlv2Processor.from_pretrained(version)
processor

Owlv2Processor:
- image_processor: Owlv2ImageProcessor {
  "do_normalize": true,
  "do_pad": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "Owlv2ImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Owlv2Processor",
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 960,
    "width": 960
  }
}

- tokenizer: CLIPTokenizerFast(name_or_path='google/owlv2-base-patch16-ensemble', vocab_size=49408, model_max_length=16, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|startoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '!'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	0: AddedToken("!", rstrip=False, lstrip=False, single_word=False, normalized=False, special=True),
	49406: AddedToken("<|startoftext|>", rstrip=False

## processor

In [10]:
# Tokenize the prompts into one padded batch of PyTorch tensors and move it to
# `device`. Only the device is passed to `.to()`: the batch holds integer token
# ids and an attention mask, so a float16 cast would not change anything, and
# a device-only call works for both BatchFeature and BatchEncoding results.
inputs = processor(
    text=texts,               # a single string or a list of strings
    return_tensors="pt",      # tensor format of the result: "np", "pt", "tf" or "jax"
    padding=True,             # padding strategy: True, "longest", "max_length" or "do_not_pad"
    # max_length=max_length,  # when using max_length, also set padding="max_length"
    add_special_tokens=True,  # wrap each text in the tokenizer's special start/end tokens
).to(device)
inputs

{'input_ids': tensor([[49406,   320,  1125,   539,   273,  3989, 49407,     0],
        [49406,   320,  1125,   539,   320,  1929, 49407,     0],
        [49406,   320,  5363,   530,   518,  1746,  2390, 49407]],
       device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')}

In [11]:
# Token-id matrix, one row per prompt. Per the decoded text further down,
# 49406 is <|startoftext|>, 49407 is <|endoftext|>, and 0 is the pad token "!".
inputs["input_ids"]

tensor([[49406,   320,  1125,   539,   273,  3989, 49407,     0],
        [49406,   320,  1125,   539,   320,  1929, 49407,     0],
        [49406,   320,  5363,   530,   518,  1746,  2390, 49407]],
       device='cuda:0')

## batch_decode

In [12]:
# Turn every row of token ids back into text: first verbatim (special and
# padding tokens kept), then with the special tokens stripped.
token_ids = inputs["input_ids"]
for skip_special in (False, True):
    print(processor.batch_decode(token_ids, skip_special_tokens=skip_special))

['<|startoftext|>a photo of 2 cats <|endoftext|>!', '<|startoftext|>a photo of a dog <|endoftext|>!', '<|startoftext|>a plane in the blue sky <|endoftext|>']
['a photo of 2 cats', 'a photo of a dog', 'a plane in the blue sky']


## decode

In [13]:
# Decode a single sequence (the first prompt), with and without special tokens.
first_row = inputs["input_ids"][0]
print(processor.decode(first_row))
print(processor.decode(first_row, skip_special_tokens=True))

<|startoftext|>a photo of 2 cats <|endoftext|>!
a photo of 2 cats


# Owlv2TextModel

In [15]:
# Load only the text tower of the checkpoint in half precision, then move it
# onto the selected device.
text_model: Owlv2TextModel = Owlv2TextModel.from_pretrained(
    version,
    torch_dtype=torch.float16,
)
text_model = text_model.to(device)
text_model

Owlv2TextModel(
  (text_model): Owlv2TextTransformer(
    (embeddings): Owlv2TextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(16, 512)
    )
    (encoder): Owlv2Encoder(
      (layers): ModuleList(
        (0-11): 12 x Owlv2EncoderLayer(
          (self_attn): Owlv2Attention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): Owlv2MLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,),

In [16]:
# Run one forward pass in eval mode without autograd tracking. Hidden states
# and attention maps are requested explicitly because later cells inspect
# `outputs.hidden_states` and `outputs.attentions`; without these flags both
# fields come back as None and those cells display nothing.
text_model.eval()
with torch.inference_mode():
    outputs = text_model(
        **inputs,
        output_hidden_states=True,
        output_attentions=True,
    )
outputs

BaseModelOutputWithPooling(last_hidden_state=tensor([[[-1.8305e-02,  2.3669e-02,  2.3141e-02,  ..., -2.8261e-01,
           1.5751e-01, -4.0075e-01],
         [ 5.8490e-01,  6.6315e-01,  1.1305e+00,  ..., -9.7600e-01,
           4.5851e-01, -6.4424e-01],
         [-2.4948e-01,  4.0482e-01,  1.5877e+00,  ...,  8.9683e-01,
           8.3434e-01, -1.5613e+00],
         ...,
         [ 8.0426e-01,  4.5553e-01, -7.7922e-02,  ..., -1.1559e+00,
           1.5640e+00, -9.3029e-02],
         [ 1.3706e+00,  1.4272e+00,  5.0517e-01,  ..., -5.6414e-01,
           9.8100e-01,  3.6156e-01],
         [ 1.5388e+00,  1.1661e+00,  5.8623e-01,  ..., -6.0757e-01,
           4.8703e-01,  4.3506e-01]],

        [[-1.8305e-02,  2.3669e-02,  2.3141e-02,  ..., -2.8261e-01,
           1.5751e-01, -4.0075e-01],
         [ 5.8490e-01,  6.6315e-01,  1.1305e+00,  ..., -9.7600e-01,
           4.5851e-01, -6.4424e-01],
         [-2.4948e-01,  4.0482e-01,  1.5877e+00,  ...,  8.9683e-01,
           8.3434e-01, -1.5613e

In [17]:
# Output of the last encoder layer. The shape shown is
# (3 prompts, 8 token positions, 512 features).
outputs.last_hidden_state.shape

torch.Size([3, 8, 512])

In [18]:
# Pooled text representation: the sequence dimension is gone, leaving one
# 512-dimensional vector per prompt.
# NOTE(review): CLIP-style text encoders usually take the hidden state at the
# end-of-text token here rather than averaging over positions — confirm for OWLv2.
outputs.pooler_output.shape

torch.Size([3, 512])

In [19]:
# Per-layer hidden states. This field is only populated when the forward pass
# is called with output_hidden_states=True (or the model config enables it);
# otherwise it is None and this cell displays nothing.
outputs.hidden_states

In [20]:
# Per-layer attention weights. This field is only populated when the forward
# pass is called with output_attentions=True (or the model config enables it);
# otherwise it is None and this cell displays nothing.
outputs.attentions