In [1]:
import torch
from transformers import (
    ChineseCLIPConfig,
    ChineseCLIPModel,
    ChineseCLIPTextConfig,
    ChineseCLIPTextModel,
    ChineseCLIPVisionConfig,
    ChineseCLIPVisionModel,
)

In [2]:
# Run on the first CUDA GPU when PyTorch can see one; otherwise stay on the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda", 0)
else:
    device = torch.device("cpu")
device

device(type='cuda', index=0)

# ChineseCLIPConfig

In [3]:
# Initializing a ChineseCLIPConfig from the library defaults (no arguments are passed).
# NOTE(review): this was described as an "OFA-Sys/chinese-clip-vit-base-patch16 style"
# configuration, but the default ChineseCLIPVisionConfig printed later in this notebook
# reports patch_size 32 — confirm before relying on that description.
config = ChineseCLIPConfig()
config

ChineseCLIPConfig {
  "_commit_hash": null,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "logit_scale_init_value": 2.6592,
  "model_type": "chinese_clip",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_fact

In [4]:
# Build the full ChineseCLIP model from `config`, then place it on `device`.
# NOTE(review): no checkpoint is loaded in this cell, so the weights are presumably
# freshly initialised rather than pretrained — confirm before using for inference.
model: ChineseCLIPModel = ChineseCLIPModel(config)
model = model.to(device)
model

ChineseCLIPModel(
  (text_model): ChineseCLIPTextModel(
    (embeddings): ChineseCLIPTextEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): ChineseCLIPTextEncoder(
      (layer): ModuleList(
        (0-11): 12 x ChineseCLIPTextLayer(
          (attention): ChineseCLIPTextAttention(
            (self): ChineseCLIPTextSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): ChineseCLIPTextSelfOutput(
              (dense): Linear(in_features=768, out_features=7

In [5]:
# Display the configuration object attached to the model.
model.config

ChineseCLIPConfig {
  "_commit_hash": null,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "logit_scale_init_value": 2.6592,
  "model_type": "chinese_clip",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_fact

# ChineseCLIPTextConfig

In [6]:
# Initializing a ChineseCLIPTextConfig with default values (text side only;
# the vision config is created in a separate cell)
text_config = ChineseCLIPTextConfig()
text_config

ChineseCLIPTextConfig {
  "attention_probs_dropout_prob": 0.1,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.1,
  "hidden_size": 768,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "max_position_embeddings": 512,
  "model_type": "chinese_clip_text_model",
  "num_attention_heads": 12,
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "position_embedding_type": "absolute",
  "transformers_version": "4.31.0",
  "type_vocab_size": 2,
  "use_cache": true,
  "vocab_size": 30522
}

In [10]:
# Build the text-only model from `text_config`, then move it to `device`.
text_model: ChineseCLIPTextModel = ChineseCLIPTextModel(text_config)
text_model = text_model.to(device)

# ChineseCLIPVisionConfig

In [11]:
# Initializing a ChineseCLIPVisionConfig with default values
vision_config = ChineseCLIPVisionConfig()
vision_config

ChineseCLIPVisionConfig {
  "attention_dropout": 0.0,
  "hidden_act": "quick_gelu",
  "hidden_size": 768,
  "image_size": 224,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-05,
  "model_type": "chinese_clip_vision_model",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 32,
  "projection_dim": 512,
  "transformers_version": "4.31.0"
}

In [12]:
# Build the vision-only model from `vision_config`, then move it to `device`.
vision_model: ChineseCLIPVisionModel = ChineseCLIPVisionModel(vision_config)
vision_model = vision_model.to(device)

# from_text_vision_configs

In [13]:
# Combine the separately created text and vision configs into one ChineseCLIPConfig.
# This rebinds `config`, replacing the default-constructed config from earlier.
config = ChineseCLIPConfig.from_text_vision_configs(
    text_config=text_config,
    vision_config=vision_config,
)
config

ChineseCLIPConfig {
  "_commit_hash": null,
  "initializer_factor": 1.0,
  "initializer_range": 0.02,
  "logit_scale_init_value": 2.6592,
  "model_type": "chinese_clip",
  "projection_dim": 512,
  "text_config": {
    "_name_or_path": "",
    "add_cross_attention": false,
    "architectures": null,
    "attention_probs_dropout_prob": 0.1,
    "bad_words_ids": null,
    "begin_suppress_tokens": null,
    "bos_token_id": null,
    "chunk_size_feed_forward": 0,
    "cross_attention_hidden_size": null,
    "decoder_start_token_id": null,
    "diversity_penalty": 0.0,
    "do_sample": false,
    "early_stopping": false,
    "encoder_no_repeat_ngram_size": 0,
    "eos_token_id": null,
    "exponential_decay_length_penalty": null,
    "finetuning_task": null,
    "forced_bos_token_id": null,
    "forced_eos_token_id": null,
    "hidden_act": "gelu",
    "hidden_dropout_prob": 0.1,
    "hidden_size": 768,
    "id2label": {
      "0": "LABEL_0",
      "1": "LABEL_1"
    },
    "initializer_fact

In [14]:
# Rebuild the full model from the current `config` and move it to `device`.
# This rebinds `model`, so the previously built instance is no longer referenced by that name.
model: ChineseCLIPModel = ChineseCLIPModel(config)
model = model.to(device)