# In [None]:  (notebook cell marker, kept as a comment so the file parses as Python)
import os
import cv2
import torch
from PIL import Image
import numpy as np
from torchvision import transforms
from transformers import CLIPProcessor, CLIPModel

# ---- Settings ----
VIDEO_PATH = "sample_outdoor_video.mp4"  # video the frames are sampled from
FRAME_SAVE_PATH = "frames/"  # directory the extracted PNG frames are written to
FRAME_INTERVAL = 60  # keep one frame out of every N frames read
MODEL_NAME = "openai/clip-vit-base-patch32"  # identifier passed to from_pretrained

# ---- Make sure the frame directory exists ----
os.makedirs(FRAME_SAVE_PATH, exist_ok=True)

# ---- Pick a device, then load the CLIP model and its processor ----
# Use the GPU when torch reports one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"

model = CLIPModel.from_pretrained(MODEL_NAME)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(MODEL_NAME)

# ---- Step 1: Extract Frames ----
def extract_frames(video_path, interval, output_dir):
    """Save every ``interval``-th frame of a video as a PNG in ``output_dir``.

    Frames are written as ``frame_0000.png``, ``frame_0001.png``, ... and are
    numbered by the order in which they are saved, not by their position in
    the video. Frame 0 is always kept.

    Args:
        video_path: Path of the video file to read.
        interval: Keep one frame out of every ``interval`` frames read.
            Must be a positive integer.
        output_dir: Directory that receives the PNG files; created if it
            does not exist yet.

    Returns:
        The number of frames written to ``output_dir``.

    Raises:
        ValueError: If ``interval`` is less than 1.
        OSError: If the video cannot be opened.
    """
    # Validate first: interval == 0 would otherwise surface as a bare
    # ZeroDivisionError from the modulo below, after the video was opened.
    if interval < 1:
        raise ValueError(f"interval must be a positive integer, got {interval!r}")

    # The function is parameterised on output_dir, so it must not rely on the
    # caller having created that directory beforehand.
    os.makedirs(output_dir, exist_ok=True)

    cap = cv2.VideoCapture(video_path)
    saved_count = 0
    try:
        # Without this check an unreadable/missing video just yields zero
        # frames and the failure goes unnoticed.
        if not cap.isOpened():
            raise OSError(f"Could not open video: {video_path}")

        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                # No more frames could be read.
                break

            if frame_count % interval == 0:
                # OpenCV delivers BGR; convert to RGB before handing to PIL.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                image_pil = Image.fromarray(frame_rgb)
                image_path = os.path.join(output_dir, f"frame_{saved_count:04d}.png")
                image_pil.save(image_path)
                saved_count += 1

            frame_count += 1
    finally:
        # Release the capture handle even if conversion or saving fails.
        cap.release()

    return saved_count

# ---- Step 2: Get CLIP Embeddings ----
def get_clip_embedding(image_path):
    """Return the CLIP image features for the image at ``image_path``.

    The image is opened and converted to RGB, preprocessed with the
    module-level ``processor``, and passed to ``model.get_image_features`` on
    ``device`` with gradient tracking disabled. The result is squeezed, moved
    to the CPU and returned as a NumPy array.
    """
    rgb_image = Image.open(image_path).convert("RGB")

    batch = processor(images=rgb_image, return_tensors="pt")
    batch = batch.to(device)

    # Inference only, so no autograd graph is needed.
    with torch.no_grad():
        features = model.get_image_features(**batch)

    squeezed = features.squeeze()
    return squeezed.cpu().numpy()

# ---- Run Pipeline ----
print("Extracting frames...")
extract_frames(VIDEO_PATH, FRAME_INTERVAL, FRAME_SAVE_PATH)

print("Computing embeddings...")
embeddings = []
# NOTE: every .png found in FRAME_SAVE_PATH is embedded, including frames left
# over from an earlier run; clear the directory first if that is not wanted.
image_files = sorted(f for f in os.listdir(FRAME_SAVE_PATH) if f.endswith(".png"))

for filename in image_files:
    path = os.path.join(FRAME_SAVE_PATH, filename)
    emb = get_clip_embedding(path)
    embeddings.append((filename, emb))

print(f"Computed embeddings for {len(embeddings)} frames")

# Optional: Save embeddings to file
# Passing the list of (filename, vector) tuples straight to np.save makes
# NumPy coerce a ragged structure into an array, which raises ValueError on
# NumPy >= 1.24. Build the (N, 2) object array explicitly instead: column 0
# holds the filename, column 1 the embedding array.
records = np.empty((len(embeddings), 2), dtype=object)
for row, (frame_name, frame_emb) in enumerate(embeddings):
    records[row, 0] = frame_name
    records[row, 1] = frame_emb

# Object arrays are stored via pickle; read back with
# np.load("clip_embeddings.npy", allow_pickle=True).
np.save("clip_embeddings.npy", records, allow_pickle=True)
print("Saved embeddings to clip_embeddings.npy")
