In [1]:
import torch
from transformers import ChineseCLIPProcessor, ChineseCLIPTextModel

In [2]:
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
device

device(type='cuda', index=0)

In [3]:
version = "OFA-Sys/chinese-clip-vit-base-patch16"
max_length = 20
text = ["杰尼龟", "妙蛙种子", "小火龙", "皮卡丘"]

# ChineseCLIPProcessor

In [4]:
processor: ChineseCLIPProcessor = ChineseCLIPProcessor.from_pretrained(version)
processor

ChineseCLIPProcessor:
- image_processor: ChineseCLIPImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": false,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "ChineseCLIPFeatureExtractor",
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "ChineseCLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

- tokenizer: BertTokenizerFast(name_or_path='OFA-Sys/chinese-clip-vit-base-patch16', vocab_size=21128, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'unk_token': '[UNK]', 'sep_token': '[SEP]', 'pad_token': '[PAD]', 'cls_token': '[CLS]', 'mask_token': '[MASK]'}, clean_up_tokenization_spaces=True)

## processor

In [5]:
inputs = processor(
    text = text,                # 可以为列表或单个string
    return_tensors = "pt",      # 返回数据格式 np pt tf jax
    padding = True,             # 填充方式选择 [True, 'longest', 'max_length', 'do_not_pad']
    # max_length = max_length,  # 如果使用max_length要将padding设置为 "max_length"
    add_special_tokens = True,  # text添加特殊key
).to(device, torch.float16)
inputs

{'input_ids': tensor([[ 101, 3345, 2225, 7991,  102,    0],
        [ 101, 1975, 6032, 4905, 2094,  102],
        [ 101, 2207, 4125, 7987,  102,    0],
        [ 101, 4649, 1305,  687,  102,    0]], device='cuda:0'), 'token_type_ids': tensor([[0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0],
        [0, 0, 0, 0, 0, 0]], device='cuda:0'), 'attention_mask': tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0]], device='cuda:0')}

In [6]:
inputs["input_ids"]

tensor([[ 101, 3345, 2225, 7991,  102,    0],
        [ 101, 1975, 6032, 4905, 2094,  102],
        [ 101, 2207, 4125, 7987,  102,    0],
        [ 101, 4649, 1305,  687,  102,    0]], device='cuda:0')

In [7]:
inputs["attention_mask"]

tensor([[1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 1],
        [1, 1, 1, 1, 1, 0],
        [1, 1, 1, 1, 1, 0]], device='cuda:0')

# CLIPTextModel

The text model from CHINESE_CLIP without any head or projection on top.

In [8]:
text_model: ChineseCLIPTextModel = ChineseCLIPTextModel.from_pretrained(version, torch_dtype=torch.float16).to(device)
text_model

Some weights of the model checkpoint at OFA-Sys/chinese-clip-vit-base-patch16 were not used when initializing ChineseCLIPTextModel: ['text_model.encoder.layer.10.attention.self.value.weight', 'text_model.encoder.layer.6.output.LayerNorm.bias', 'vision_model.encoder.layers.0.layer_norm2.weight', 'vision_model.pre_layrnorm.weight', 'text_model.encoder.layer.5.attention.output.LayerNorm.bias', 'text_model.encoder.layer.5.intermediate.dense.weight', 'vision_model.encoder.layers.3.mlp.fc1.bias', 'text_model.encoder.layer.9.output.dense.weight', 'vision_model.encoder.layers.5.layer_norm2.weight', 'text_model.encoder.layer.5.attention.output.dense.weight', 'logit_scale', 'vision_model.encoder.layers.3.mlp.fc2.weight', 'text_model.encoder.layer.6.intermediate.dense.bias', 'vision_model.encoder.layers.4.self_attn.v_proj.bias', 'vision_model.encoder.layers.10.self_attn.k_proj.weight', 'text_model.encoder.layer.3.attention.self.query.bias', 'vision_model.encoder.layers.2.layer_norm1.weight', 'vis

ChineseCLIPTextModel(
  (embeddings): ChineseCLIPTextEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): ChineseCLIPTextEncoder(
    (layer): ModuleList(
      (0-11): 12 x ChineseCLIPTextLayer(
        (attention): ChineseCLIPTextAttention(
          (self): ChineseCLIPTextSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): ChineseCLIPTextSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, ele

In [9]:
text_model.eval()
with torch.inference_mode():
    outputs = text_model(
        input_ids = inputs["input_ids"],
        attention_mask = inputs["attention_mask"],
    )
outputs

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[ 4.2313e-01, -7.0572e-01,  1.8726e-01,  ...,  7.4099e-01,
           1.1313e+00, -7.1933e-01],
         [-1.0530e+00, -4.2926e-01, -1.1825e-01,  ..., -6.3781e-01,
          -6.3158e-01, -6.6794e-01],
         [ 1.0290e+00, -6.0097e-02, -1.2589e+00,  ...,  1.2395e+00,
          -5.7168e-01, -1.4174e-01],
         [-3.3115e-01, -6.0512e-01, -5.8363e-01,  ..., -1.2125e-01,
           3.3261e-02, -6.6432e-01],
         [ 4.2290e-01, -2.2424e-01, -7.1796e-01,  ...,  3.1229e-01,
           2.2947e-01,  4.7062e-01],
         [-1.2025e+00,  3.4781e-01, -6.4434e-02,  ..., -9.3520e-02,
           1.1780e+00, -1.3417e+00]],

        [[ 7.7450e-01, -5.5769e-01,  7.1401e-01,  ...,  9.1673e-01,
           1.2666e+00, -3.2234e-01],
         [ 2.0000e-02,  3.1104e-01,  3.4901e-01,  ...,  7.3439e-01,
           1.7366e-01, -7.1819e-01],
         [ 3.5890e-01,  7.2860e-01,  1.1539e+00,  ...,  7.1729e-01,
          -8.2187e-01, -4.6

In [10]:
# 最后一层的输出
outputs.last_hidden_state.shape

torch.Size([4, 6, 768])

In [11]:
# 对文字长度进行pool
outputs.pooler_output.shape

torch.Size([4, 768])

In [12]:
outputs.hidden_states

In [13]:
outputs.attentions