In [13]:
import torch
from transformers import pipeline
from transformers import AutoTokenizer, CLIPTextModel

Documentation for the clip model is at https://huggingface.co/docs/transformers/v4.42.0/en/model_doc/clip#transformers.CLIPTextModel

Use the text encoder in CLIP to encode images

In [14]:
tokenizer = AutoTokenizer.from_pretrained("openai/clip-vit-large-patch14")
textModel = CLIPTextModel.from_pretrained("openai/clip-vit-large-patch14")

In [27]:
# print out the text model's architecture
print(textModel)

CLIPTextModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 768)
      (position_embedding): Embedding(77, 768)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPAttention(
            (k_proj): Linear(in_features=768, out_features=768, bias=True)
            (v_proj): Linear(in_features=768, out_features=768, bias=True)
            (q_proj): Linear(in_features=768, out_features=768, bias=True)
            (out_proj): Linear(in_features=768, out_features=768, bias=True)
          )
          (layer_norm1): LayerNorm((768,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=768, out_features=3072, bias=True)
            (fc2): Linear(in_features=3072, out_features=768, bias=True)
          )
          (layer_norm2): LayerNorm((768,), eps=1e

In [26]:
text = "A photo of a cat and a dog."

inputs = tokenizer(text, return_tensors="pt", padding=True)
outputs = textModel(**inputs)

print(f"Text features: {outputs.last_hidden_state.shape}")
print(f"Text features: {outputs.last_hidden_state}")

print(f"poller output:  {outputs.pooler_output.shape}")
print(f"poller output:  {outputs.pooler_output}")


Text features: torch.Size([1, 11, 768])
Text features: tensor([[[-0.3884,  0.0229, -0.0522,  ..., -0.4899, -0.3066,  0.0675],
         [ 0.0290, -1.3258,  0.3085,  ..., -0.5257,  0.9768,  0.6652],
         [ 1.1565,  0.1318,  0.7895,  ..., -2.1024, -1.1519, -0.3311],
         ...,
         [-1.3661,  0.4994,  2.1031,  ..., -1.2776, -0.5481, -0.0732],
         [-1.1322,  1.0092,  2.5098,  ..., -0.7523, -0.4378,  0.4130],
         [-1.5944,  0.8174,  3.0335,  ...,  0.6500, -0.5789, -0.0236]]],
       grad_fn=<NativeLayerNormBackward0>)
poller output:  torch.Size([1, 768])
poller output:  tensor([[-1.5944e+00,  8.1737e-01,  3.0335e+00, -3.9715e-01, -1.5791e+00,
         -4.0794e-01, -1.7446e+00, -5.4422e-01,  1.6605e-02, -5.3731e-01,
         -1.7802e-01, -8.7173e-02, -1.5956e+00, -2.1102e-01, -7.0768e-01,
          2.9283e-01, -1.1573e+00, -7.9795e-01,  7.5977e-01,  3.3996e+00,
          9.5154e-01,  1.2283e+00, -1.7141e+00,  3.7402e-01,  2.1903e-01,
         -2.1255e+00, -5.5613e-01, -9

Text features: torch.Size([1, 7, 768])
Text features: tensor([[[-0.3884,  0.0229, -0.0522,  ..., -0.4899, -0.3066,  0.0675],
         [ 0.0290, -1.3258,  0.3085,  ..., -0.5257,  0.9768,  0.6652],
         [ 1.1565,  0.1318,  0.7895,  ..., -2.1024, -1.1519, -0.3311],
         ...,
         [ 0.5830, -0.1372,  2.1518,  ..., -1.0520, -0.1529,  0.0968],
         [-0.0781,  0.9827,  0.6915,  ..., -2.8869,  0.0210, -0.4127],
         [-1.2166, -0.5148,  0.4800,  ..., -0.1383,  0.8114,  0.5570]]],
       grad_fn=<NativeLayerNormBackward0>)
