In [None]:
from transformers import CLIPProcessor, CLIPVisionModelWithProjection
import torch, pathlib
from PIL import Image

device = "cuda" if torch.cuda.is_available() else "cpu"

model_name = "openai/clip-vit-large-patch14" 
processor  = CLIPProcessor.from_pretrained(model_name)
vision_enc = CLIPVisionModelWithProjection.from_pretrained(model_name).to(device)
vision_enc.eval()

  from .autonotebook import tqdm as notebook_tqdm
Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.52, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPVisionModelWithProjection(
  (vision_model): CLIPVisionTransformer(
    (embeddings): CLIPVisionEmbeddings(
      (patch_embedding): Conv2d(3, 1024, kernel_size=(14, 14), stride=(14, 14), bias=False)
      (position_embedding): Embedding(257, 1024)
    )
    (pre_layrnorm): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-23): 24 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (v_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (q_proj): Linear(in_features=1024, out_features=1024, bias=True)
            (out_proj): Linear(in_features=1024, out_features=1024, bias=True)
          )
          (layer_norm1): LayerNorm((1024,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=1024, out_fe

In [None]:
print(f"device {device}")

device cpu


: 

In [None]:
import cv2
import numpy as np

video_path = '../movie.mp4'
cap = cv2.VideoCapture(video_path)

print(f"number of frames: {int(cap.get(cv2.CAP_PROP_FRAME_COUNT))}")
print(f"frame rate: {cap.get(cv2.CAP_PROP_FPS)}")
print(f"length in seconds: {cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS)}")

seconds = np.arange(0, cap.get(cv2.CAP_PROP_FRAME_COUNT) / cap.get(cv2.CAP_PROP_FPS))
seconds

number of frames: 87720
frame rate: 24.0
length in seconds: 3655.0


array([0.000e+00, 1.000e+00, 2.000e+00, ..., 3.652e+03, 3.653e+03,
       3.654e+03], shape=(3655,))

In [None]:
from PIL import Image

for sec in seconds:
    cap.set(cv2.CAP_PROP_POS_MSEC, sec*1000) 
    ret, frame = cap.read()

    # save frame as image
    if ret:
        frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB) 
        image = Image.fromarray(frame)
        image.save(f"../frames/{int(sec)}.jpg")
    else:
        print(f"Failed to read frame at {sec} seconds.")

Failed to read frame at 3655.0 seconds.


In [None]:
# apply CLIP to the frames
import os
frames_path = pathlib.Path("../frames")

features = []
for frame_file in seconds:
    print(f"Processing {frames_path}/{int(frame_file)}.jpg")
    image = Image.open(f"{frames_path}/{int(frame_file)}.jpg").convert("RGB")
    inputs = processor(images=image, return_tensors="pt").to(device)
    
    with torch.no_grad():
        outputs = vision_enc(**inputs)
    
    image_features = outputs.image_embeds
    print(f"feature shape: {image_features.shape}")
    
    features.append(image_features.cpu().numpy())

features = np.concatenate(features, axis=0)
print(f"features shape: {features.shape}")

In [25]:
features = np.concatenate(features, axis=0)
print(f"features shape: {features.shape}")

features shape: (3655, 768)


In [26]:
# save features as mat
import scipy.io as sio
sio.savemat("vision_clip_features.mat", {"features": features})