In [1]:
import torch
from transformers import CLIPTokenizer, CLIPTextModelWithProjection

In [2]:
# Run on the first CUDA GPU when one is visible to PyTorch, otherwise on the CPU.
device = torch.device("cuda:0") if torch.cuda.is_available() else torch.device("cpu")
device

device(type='cuda', index=0)

In [3]:
# Checkpoint name handed to from_pretrained() by the tokenizer and the text model.
version = "openai/clip-vit-base-patch32"

# Example sentences that are tokenized and encoded together as one batch.
sequence = [
    "a photo of 2 cats",
    "a photo of a dog",
    "a plane in the blue sky",
]

# Optional length cap; only referenced by the commented-out `max_length` argument
# of the tokenizer call in the cells shown here.
max_length = 20

# CLIPTokenizer

In [4]:
# Load the tokenizer that belongs to the selected CLIP checkpoint and display its configuration.
tokenizer: CLIPTokenizer = CLIPTokenizer.from_pretrained(
    pretrained_model_name_or_path=version,
)
tokenizer

CLIPTokenizer(name_or_path='openai/clip-vit-base-patch32', vocab_size=49408, model_max_length=77, is_fast=False, padding_side='right', truncation_side='right', special_tokens={'bos_token': AddedToken("<|startoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'eos_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'unk_token': AddedToken("<|endoftext|>", rstrip=False, lstrip=False, single_word=False, normalized=True), 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)

## tokenizer([sequence])

In [5]:
# Tokenize the whole sentence batch into padded PyTorch tensors and move them to `device`.
inputs = tokenizer(
    sequence,                           # batch of sentences
    truncation = True,                  # truncate anything longer than max_length
    padding = True,                     # padding strategy, one of [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,          # maximum length; defaults to the model's maximum length when not set
    add_special_tokens = True,          # add the special tokens to the text
    return_length = True,               # also return the length of every sequence
    return_overflowing_tokens = False,  # tokens beyond the truncation length are dropped by default; True returns those overflowing pieces as well
    return_tensors = "pt"               # output format: np / pt / tf / jax
).to(device)
# Only the device is passed to .to(): BatchEncoding.to() does not accept a dtype, and the
# token ids / attention mask have to stay integer tensors for the embedding lookup.
# Half precision is applied to the model weights, not to the tokenizer output.

print(inputs.keys())
print(inputs["input_ids"])      # token ids, with '<|startoftext|>' / '<|endoftext|>' added (same ids as encode)
print(inputs["attention_mask"]) # 1 for real tokens, 0 for padding
print(inputs["length"])         # number of tokens per sentence before padding

dict_keys(['input_ids', 'length', 'attention_mask'])
tensor([[49406,   320,  1125,   539,   273,  3989, 49407, 49407],
        [49406,   320,  1125,   539,   320,  1929, 49407, 49407],
        [49406,   320,  5363,   530,   518,  1746,  2390, 49407]],
       device='cuda:0')
tensor([[1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1, 1, 1]], device='cuda:0')
tensor([7, 7, 8], device='cuda:0')


# CLIPTextModelWithProjection

CLIP Text Model with a projection layer on top (a linear layer on top of the pooled output).

In [6]:
# Load the CLIP text encoder with its projection head and move it to `device`.
# Half precision is used only on CUDA. On the CPU fallback the weights stay float32,
# because half-precision kernels are missing (or very slow) in many CPU builds of PyTorch.
text_model: CLIPTextModelWithProjection = CLIPTextModelWithProjection.from_pretrained(
    version,
    torch_dtype=torch.float16 if device.type == "cuda" else torch.float32,
).to(device)
text_model

CLIPTextModelWithProjection(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm(

In [7]:
# Put the model in evaluation mode, then run a single forward pass over the
# tokenized batch with autograd tracking disabled.
text_model.eval()
with torch.inference_mode():
    outputs = text_model(attention_mask=inputs["attention_mask"], input_ids=inputs["input_ids"])
outputs

CLIPTextModelOutput(text_embeds=tensor([[ 0.3023,  0.0104, -0.6035,  ..., -0.5600, -0.2901, -0.1535],
        [ 0.0932,  0.2764, -0.4137,  ..., -0.5852, -0.2590,  0.1193],
        [ 0.0693, -0.1149, -0.2575,  ..., -0.0537, -0.0485,  0.0403]],
       device='cuda:0'), last_hidden_state=tensor([[[ 0.3393,  0.1165,  0.1020,  ...,  0.2468,  0.5906,  0.1013],
         [ 1.9753, -0.5844,  0.3685,  ...,  1.1658,  0.8050, -0.9801],
         [ 1.0580, -0.9600,  1.0018,  ..., -0.5155, -0.1437, -1.9444],
         ...,
         [ 1.3097, -0.8255,  1.2024,  ...,  0.3621,  0.1972, -1.5122],
         [-0.0836, -0.2263,  0.3341,  ...,  0.2727, -0.2437, -1.7002],
         [-0.1426, -0.2440,  0.3692,  ...,  0.2555, -0.2330, -1.7032]],

        [[ 0.3393,  0.1165,  0.1020,  ...,  0.2468,  0.5906,  0.1013],
         [ 1.9753, -0.5844,  0.3685,  ...,  1.1658,  0.8050, -0.9801],
         [ 1.0580, -0.9600,  1.0018,  ..., -0.5155, -0.1437, -1.9444],
         ...,
         [-0.1433, -0.5163,  1.7099,  ..., -0

In [8]:
# Text embeddings after the projection layer
# (per the original note: the same result as get_text_features).
# Prints the shape first, then the tensor itself, one per line.
print(outputs.text_embeds.shape, outputs.text_embeds, sep="\n")

torch.Size([3, 512])
tensor([[ 0.3023,  0.0104, -0.6035,  ..., -0.5600, -0.2901, -0.1535],
        [ 0.0932,  0.2764, -0.4137,  ..., -0.5852, -0.2590,  0.1193],
        [ 0.0693, -0.1149, -0.2575,  ..., -0.0537, -0.0485,  0.0403]],
       device='cuda:0')


In [9]:
# Output of the last encoder layer (per the original note: identical to what CLIPTextModel returns).
# Displayed as a torch.Size.
outputs.last_hidden_state.size()

torch.Size([3, 8, 512])

In [10]:
# Hidden states field of the model output. No output is shown for this cell; the forward
# call above did not pass output_hidden_states=True, so this is presumably None — TODO confirm.
outputs.hidden_states

In [11]:
# Attention weights field of the model output. No output is shown for this cell; the forward
# call above did not pass output_attentions=True, so this is presumably None — TODO confirm.
outputs.attentions