In [2]:
import torch

Documentation for the CLIP text model is at https://huggingface.co/docs/transformers/v4.42.0/en/model_doc/clip#transformers.CLIPTextModel

Use the text encoder in CLIP to encode text

In [3]:
from transformers import AutoTokenizer, CLIPTextModelWithProjection

# Single checkpoint id shared by the tokenizer and the text model below.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"

# Load the tokenizer and the text encoder (with projection head) from that checkpoint.
tokenizer = AutoTokenizer.from_pretrained(CLIP_CHECKPOINT)
textModel = CLIPTextModelWithProjection.from_pretrained(CLIP_CHECKPOINT)

  from .autonotebook import tqdm as notebook_tqdm


In [4]:
# Print the text model's module tree (layer types and their dimensions).
print(textModel)

CLIPTextModelWithProjection(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm(

In [5]:
# Encode a single caption with the CLIP text model.
text = "A photo of a cat and a dog."

# Tokenize into PyTorch tensors that can be unpacked straight into the model call.
inputs = tokenizer(text, return_tensors="pt", padding=True)

# Inference only: disable autograd so no graph is recorded and the returned
# tensors carry no grad_fn.
with torch.no_grad():
    outputs = textModel(**inputs)


In [6]:
# Inspect the class of the object returned by the text model call.
type(outputs)

transformers.models.clip.modeling_clip.CLIPTextModelOutput

In [7]:

# Pull out the two fields of interest once, then report shape and values for each.
hidden_states = outputs.last_hidden_state
text_embeds = outputs.text_embeds

print(f"Text features: {hidden_states.shape}")
print(f"Text features: {hidden_states}")

# NOTE(review): the "poller" label is presumably a typo for "pooler"; it is kept
# as-is here so the printed text does not change.
print(f"poller output:  {text_embeds.shape}")
print(f"poller output:  {text_embeds}")


Text features: torch.Size([1, 11, 768])
Text features: tensor([[[-0.3884,  0.0229, -0.0522,  ..., -0.4899, -0.3066,  0.0675],
         [ 0.0290, -1.3258,  0.3085,  ..., -0.5257,  0.9768,  0.6652],
         [ 1.1565,  0.1318,  0.7895,  ..., -2.1024, -1.1519, -0.3311],
         ...,
         [-1.3661,  0.4994,  2.1031,  ..., -1.2776, -0.5481, -0.0732],
         [-1.1322,  1.0092,  2.5098,  ..., -0.7523, -0.4378,  0.4130],
         [-1.5944,  0.8174,  3.0335,  ...,  0.6500, -0.5789, -0.0236]]],
       grad_fn=<NativeLayerNormBackward0>)
poller output:  torch.Size([1, 768])
poller output:  tensor([[ 1.1577e-01,  2.7692e-01,  3.2398e-01, -1.9000e-01,  6.5517e-01,
          1.0890e-01,  6.6157e-01, -5.1472e-02, -9.7006e-02, -5.8035e-01,
         -1.5322e-01, -3.2100e-01,  1.5443e-01,  2.4305e-01, -1.0212e-01,
          2.0283e-02,  7.2655e-02,  1.3206e-01, -4.8887e-01,  1.2021e-01,
         -2.6471e-01,  1.0733e-01,  3.5001e-01,  7.7082e-01, -2.2825e-01,
         -7.4334e-02, -5.6145e-01, -9

In [8]:

from transformers import CLIPImageProcessor, CLIPVisionModelWithProjection

# Single checkpoint id shared by the image processor and the vision model below.
CLIP_CHECKPOINT = "openai/clip-vit-large-patch14"

# Load the preprocessing config and the vision encoder (with projection head).
image_processor = CLIPImageProcessor.from_pretrained(CLIP_CHECKPOINT)
vision_model = CLIPVisionModelWithProjection.from_pretrained(CLIP_CHECKPOINT)

In [9]:
# Print the image processor's preprocessing configuration.
print(image_processor)

CLIPImageProcessor {
  "crop_size": {
    "height": 224,
    "width": 224
  },
  "do_center_crop": true,
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "CLIPImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "shortest_edge": 224
  }
}



In [10]:
# Print the vision model's module tree (layer types and their dimensions).
print(vision_model)

CLIPVisionModelWithProjection(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (position_embedding): Embedding(257, 1024)
    )
    (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-23): 24 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=1024, out_featur

In [11]:
import requests
from PIL import Image

# Sample image used for the vision-encoder walkthrough.
url = "http://images.cocodataset.org/val2017/000000039769.jpg"

# requests applies no timeout by default, so bound the wait explicitly, and
# surface HTTP errors here instead of letting PIL fail on an error page.
response = requests.get(url, stream=True, timeout=30)
response.raise_for_status()

# Decode directly from the streamed response body.
image = Image.open(response.raw)
image.show()

In [12]:
# Run the CLIP image processor on the downloaded image.
# NOTE(review): this rebinds `image` from the PIL image to the processor's
# output, so re-running this cell feeds the processed output back into
# `preprocess`. A distinct name would make the cell re-runnable, but the cells
# that index `image['pixel_values']` would need the same rename.
image = image_processor.preprocess(image)

In [13]:
# Shape of the first entry of the processed pixel values.
print(image['pixel_values'][0].shape)

(3, 224, 224)


In [14]:
# Convert the first processed image to a tensor and add a leading batch dimension.
pixel_values = torch.tensor(image['pixel_values'][0]).unsqueeze(0)

# Inference only: disable autograd so no graph is recorded for the forward pass.
with torch.no_grad():
    output = vision_model(pixel_values)

In [15]:
# Shape of the vision model's last hidden state.
output.last_hidden_state.shape

torch.Size([1, 257, 1024])

In [16]:
# Shape of the image embedding returned by the vision model.
output.image_embeds.shape

torch.Size([1, 768])

In [33]:
# Development helper: pick up on-disk edits to the decoder module without
# restarting the kernel.
import importlib

# Import the module itself first. Importing the class by name before the reload
# would raise ImportError against a stale cached module that predates the class,
# and the reload below would then never run.
import model.seq2seq_encoder_modleing

# Re-execute the module so the latest source is used.
importlib.reload(model.seq2seq_encoder_modleing)

# Bind the class from the freshly reloaded module.
from model.seq2seq_encoder_modleing import CustomTransformerDecoder

In [34]:
# NOTE(review): this call raises "embed_dim must be divisible by num_heads".
# 128 is divisible by 8, so the positional arguments presumably do not map to
# (embed_dim, num_heads, ...) in that order — check CustomTransformerDecoder's
# signature and pass the arguments by keyword.
# NOTE(review): `model` is also the package name bound by
# `import model.seq2seq_encoder_modleing`; assigning to it here hides the package.
model = CustomTransformerDecoder(128, 8, 1024, 0.1, 6)

AssertionError: embed_dim must be divisible by num_heads