In [None]:
import os

import huggingface_hub
import numpy as np
import tensorflow as tf
from PIL import Image
from transformers import AutoProcessor, TFCLIPVisionModel

# Checkpoint identifier passed to both `from_pretrained` calls below.
MODEL_NAME = "openai/clip-vit-base-patch32"
# Directory handed to `from_pretrained(cache_dir=...)` for the processor and the model.
CACHE_DIR = "/tmp"
# Folder whose regular files are listed and fed through the model.
IMAGES_FOLDER = "./examples"

In [None]:
# Build the preprocessing pipeline that matches the MODEL_NAME checkpoint.
processor = AutoProcessor.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

In [None]:
# Instantiate the TensorFlow CLIP vision model from the MODEL_NAME checkpoint.
model = TFCLIPVisionModel.from_pretrained(MODEL_NAME, cache_dir=CACHE_DIR)

In [None]:
# Collect the regular files that sit directly inside IMAGES_FOLDER
# (sub-directories are skipped by the isfile check).
# os.listdir yields entries in arbitrary order, so the paths are sorted to make
# the processing order -- and the list displayed below -- reproducible.
test_files = sorted(
    path
    for path in (os.path.join(IMAGES_FOLDER, entry) for entry in os.listdir(IMAGES_FOLDER))
    if os.path.isfile(path)
)
test_files

In [None]:
# Run each example image through the processor and the vision model, printing
# the shape of the preprocessed input and of the resulting embedding.
for file in test_files:
    # Open inside a context manager so the file handle is released promptly;
    # convert() loads the pixel data into a new image that outlives the block.
    with Image.open(file) as raw_image:
        image = raw_image.convert("RGB")

    inputs = processor(images=image, return_tensors="tf")
    # pixel_values is passed to the model exactly as the processor produced it
    # (expected to be channels-first, e.g. (1, 3, 224, 224) -- confirm against
    # the printed shape); no transpose is applied.
    print(f"Inputs Size: {inputs['pixel_values'].shape}  file: {file}")

    outputs = model(**inputs)
    # Use the pooled per-image embedding. `last_hidden_state` would instead
    # return the full per-token sequence, which is not a single embedding.
    pooled_output = outputs.pooler_output

    print(f"Embeddings Size: {pooled_output.shape} ")