In [None]:
import open_clip
import cv2
from PIL import Image
import torch
import numpy as np
from glob import glob
import os
import h5py
from tqdm import tqdm
from collections import defaultdict


In [None]:
# Pick the GPU when one is available, otherwise fall back to the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

# Load the OpenCLIP ViT-H/14 (quickgelu) model with the DFN-5B weights together
# with its matching image preprocessing transform. The second return value
# (the training-time transform) is not needed for feature extraction.
model, _, preprocess = open_clip.create_model_and_transforms(
    model_name='ViT-H-14-quickgelu',
    pretrained='dfn5b',
    device=device
)
# Keep the model on the selected device instead of forcing `.cuda()`, which
# would raise on a CPU-only runtime despite the fallback above.
model.to(device)
model.eval()  # inference mode: disables dropout / batch-norm updates

output_dir = '/content/drive/MyDrive/AIC/features'    # Save 1 .npy per image
os.makedirs(output_dir, exist_ok=True)


In [None]:
# Root folder holding the extracted keyframes (searched recursively).
root_dir = "/content/drive/MyDrive/AIC/keyframes"

# Collect every .jpg below root_dir, then order the list in place so the
# processing order is the same on every run.
image_paths = glob(root_dir + "/**/*.jpg", recursive=True)
image_paths.sort()

print(f"Found {len(image_paths)} images.")

In [None]:
# Number of images encoded per forward pass; lower this if the GPU runs out of memory.
batch_size = 256

# Group the keyframes by their parent directory so that each folder is
# processed (and its output subfolder created) as one unit.
folder_to_images = defaultdict(list)
for img_path in image_paths:
    folder_to_images[os.path.dirname(img_path)].append(img_path)

for folder_path, image_files in tqdm(folder_to_images.items(), desc="Processing folders"):
    # Mirror the keyframe folder layout under output_dir. The directory is
    # created once per folder rather than once per image.
    rel_path = os.path.relpath(folder_path, root_dir)
    save_folder = os.path.join(output_dir, rel_path)
    os.makedirs(save_folder, exist_ok=True)

    # Sort image paths for consistent processing
    image_files = sorted(image_files)

    # Process in batches
    for i in range(0, len(image_files), batch_size):
        batch_files = image_files[i:i + batch_size]
        batch_images = []
        valid_files = []

        for img_path in batch_files:
            try:
                # The context manager closes the file handle even when
                # decoding or preprocessing fails part-way through.
                with Image.open(img_path) as img:
                    image_tensor = preprocess(img.convert('RGB'))
            except Exception as e:
                # Best effort: report the unreadable image and keep going.
                print(f"Error loading {img_path}: {e}")
                continue
            batch_images.append(image_tensor)
            valid_files.append(img_path)

        # Every image in this batch failed to load; nothing to encode.
        if not batch_images:
            continue

        # Encode features and L2-normalise each embedding.
        input_tensor = torch.stack(batch_images).to(device)
        with torch.no_grad():
            features = model.encode_image(input_tensor)
            features = features / features.norm(dim=-1, keepdim=True)

        # Move the whole batch to host memory in one transfer instead of
        # issuing one device-to-host copy per image.
        features_np = features.cpu().numpy()

        # Save one .npy per image, named after the image, inside the
        # mirrored folder (same location as output_dir/<relative path>.npy).
        for img_path, feat in zip(valid_files, features_np):
            stem = os.path.splitext(os.path.basename(img_path))[0]
            np.save(os.path.join(save_folder, stem + '.npy'), feat)