In [15]:
import json
import re
import torch
import numpy as np
from PIL import Image
from difflib import SequenceMatcher
from transformers import CLIPProcessor, CLIPModel
from sentence_transformers import SentenceTransformer
from sklearn.metrics.pairwise import cosine_similarity
import os





#### How Caption Similarity is Calculated

The script calculates caption similarity using Sentence Transformers (BERT-based embeddings). Each caption is converted into a numerical vector representation using the "all-MiniLM-L6-v2" model. These embeddings capture the semantic meaning of the text. To compare two captions, the script uses cosine similarity, which measures how close the two vectors are in high-dimensional space. If the similarity score is greater than **0.85**, the captions are considered duplicates, and the older frame may be replaced with the newer one.

#### How Image Similarity is Calculated

The script calculates image similarity using CLIP embeddings, which encode images into feature vectors that represent their content. Each frame is passed through OpenAI’s CLIP model, generating a vector that describes the image in a way that captures semantic and structural details. The script then uses cosine similarity to compare these vectors—if the similarity is greater than **0.9**, the images are considered visually redundant. In such cases, the older frame is replaced by the newer one.

In [18]:

# Pretrained checkpoint identifiers used by the embedding helpers below
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"
CAPTION_CHECKPOINT = "sentence-transformers/all-MiniLM-L6-v2"

# Image similarity: CLIP model together with its matching preprocessor
clip_model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
clip_processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Caption similarity: Sentence Transformer text encoder
caption_model = SentenceTransformer(CAPTION_CHECKPOINT)

def extract_frame_number(frame_name):
    """Return the integer that follows 'frame_' in a frame file name.

    Names that contain no 'frame_<digits>' part yield ``float('inf')``.
    """
    found = re.search(r'frame_(\d+)', frame_name)
    if found is None:
        return float('inf')
    return int(found.group(1))

def extract_video_id_from_filename(filename):
    """Return the 11-character ID that precedes '_processed' in ``filename``.

    Example: 'Ilg3gGewQ5U_processed.json' gives 'Ilg3gGewQ5U'. Returns None
    when the name does not contain such a pattern.
    """
    found = re.search(r'([0-9A-Za-z_-]{11})_processed', filename)
    if found is None:
        return None
    return found.group(1)

def get_image_embedding(image_path):
    """Compute the CLIP image embedding for the image stored at ``image_path``.

    The image is opened, converted to RGB, preprocessed with the module-level
    ``clip_processor`` and passed through ``clip_model.get_image_features``
    with gradient tracking disabled.

    Args:
        image_path: Path of the image file to embed.

    Returns:
        The image features with singleton dimensions squeezed out, as a
        NumPy array.

    Raises:
        Whatever ``PIL.Image.open`` raises when the file is missing or is not
        a readable image.
    """
    # The context manager releases the file handle as soon as the RGB copy
    # exists, instead of leaving cleanup to garbage collection.
    with Image.open(image_path) as source_image:
        image = source_image.convert("RGB")
    inputs = clip_processor(images=image, return_tensors="pt")
    with torch.no_grad():
        embedding = clip_model.get_image_features(**inputs)
    return embedding.squeeze().numpy()

def get_caption_embedding(caption):
    """Encode ``caption`` with the module-level Sentence Transformer model and return the result."""
    sentence_vector = caption_model.encode(caption)
    return sentence_vector

def cosine_sim(vec1, vec2):
    """Return the cosine similarity of ``vec1`` and ``vec2`` as a single value."""
    # cosine_similarity works on batches, so wrap each vector as a one-row
    # batch and read back the only entry of the resulting 1x1 matrix.
    pairwise = cosine_similarity([vec1], [vec2])
    return pairwise[0][0]

def _first_similar_index(embedding, kept_embeddings, threshold):
    """Return the index of the first kept embedding whose cosine similarity to ``embedding`` exceeds ``threshold``, or None."""
    for index, kept_embedding in enumerate(kept_embeddings):
        if cosine_sim(embedding, kept_embedding) > threshold:
            return index
    return None


def filter_frames(json_file, similarity_threshold=0.85, image_threshold=0.9, base_image_folder="processed_frames"):
    """
    Filters frames based on both caption similarity and image similarity.

    A frame counts as a duplicate of an already-kept frame when its caption
    embedding OR its image embedding is more similar than the corresponding
    threshold. A duplicate replaces the kept frame only when its frame number
    is higher; otherwise it is discarded.

    Args:
        json_file: Path to the JSON file to filter. Its base name must contain
            '<11-char video id>_processed'. The file is read as a sequence of
            dicts with at least the keys "frame" and "caption".
        similarity_threshold: Caption cosine similarity above which two
            captions are treated as duplicates.
        image_threshold: Image cosine similarity above which two images are
            treated as duplicates.
        base_image_folder: Folder that contains one sub-folder per video ID
            holding the frame images.

    Returns:
        The kept frame dicts, sorted by frame number.

    Raises:
        ValueError: If no video ID can be extracted from the file name.

    Side effects:
        Writes the result next to the input as '<name>_filtered<ext>' and
        prints the output path.
    """
    video_id = extract_video_id_from_filename(os.path.basename(json_file))
    if not video_id:
        raise ValueError("Could not extract video ID from JSON filename.")

    image_folder = f"{base_image_folder}/{video_id}"

    with open(json_file, "r", encoding="utf-8") as f:
        data = json.load(f)

    # Three parallel lists: entry i of each embedding list always belongs to
    # filtered_frames[i]. Matching by index (rather than by caption text or
    # image path) keeps the bookkeeping correct after a frame is replaced.
    filtered_frames = []
    kept_caption_embeddings = []
    kept_image_embeddings = []

    for frame in data:
        frame_path = f"{image_folder}/{frame['frame']}"
        frame_number = extract_frame_number(frame["frame"])

        # Compute embeddings
        caption_embedding = get_caption_embedding(frame["caption"])
        image_embedding = get_image_embedding(frame_path)

        # Earliest kept frame that is similar by caption and by image
        caption_match = _first_similar_index(caption_embedding, kept_caption_embeddings, similarity_threshold)
        image_match = _first_similar_index(image_embedding, kept_image_embeddings, image_threshold)
        matches = [index for index in (caption_match, image_match) if index is not None]

        if not matches:
            # Nothing similar has been kept yet: this is a new frame
            filtered_frames.append(frame)
            kept_caption_embeddings.append(caption_embedding)
            kept_image_embeddings.append(image_embedding)
            continue

        # A similar frame exists; replace it only if the new frame is more recent
        existing_frame_index = min(matches)
        if extract_frame_number(filtered_frames[existing_frame_index]["frame"]) < frame_number:
            filtered_frames[existing_frame_index] = frame
            kept_caption_embeddings[existing_frame_index] = caption_embedding
            kept_image_embeddings[existing_frame_index] = image_embedding

    # Sort frames by their chronological order (the parallel embedding lists
    # are no longer needed past this point, so they are not re-ordered)
    filtered_frames.sort(key=lambda x: extract_frame_number(x["frame"]))

    # Save the filtered data. splitext only touches the final extension, so
    # the output can never be the same path as the input.
    root, ext = os.path.splitext(json_file)
    output_file = f"{root}_filtered{ext}"
    with open(output_file, "w", encoding="utf-8") as f:
        json.dump(filtered_frames, f, indent=4, ensure_ascii=False)

    print(f"Filtered JSON saved to: {output_file}")
    return filtered_frames



In [19]:
# Example usage: filter one processed-caption file and display the kept frames
processed_json = "output_data/Ilg3gGewQ5U_processed.json"
filtered_data = filter_frames(processed_json)
filtered_data

Filtered JSON saved to: output_data/Ilg3gGewQ5U_processed_filtered.json


[{'frame': 'frame_0.jpg',
  'label': 'matchstick',
  'caption': 'a black background with a white and red flower',
  'extracted_text': ''},
 {'frame': 'frame_25.jpg',
  'label': 'web site, website, internet site, site',
  'caption': 'pi rec intuitive walkthon derivative in commatic graphs',
  'extracted_text': '* Recap\n*Intuitive walkthrough\n\n* Derivatives in\ncomputational graphs\n\na) = o(2\\)\n\nCol...) = (a — y)?\n\nAL) = wHgE—D 4 yD)\n\nDesired\noutput\n\npre\n\n'},
 {'frame': 'frame_50.jpg',
  'label': 'breastplate, aegis, egis',
  'caption': 'a computer generated image of a red and blue line',
  'extracted_text': ''},
 {'frame': 'frame_70.jpg',
  'label': 'analog clock',
  'caption': 'what is the cost of difference?',
  'extracted_text': 'What’s the “cost”\n\n- of this difference?\n\nN\nWSN ZO ELG cS N 0 0\nSO ew, ge O2\nRES LO Wes! A 8:\nSSG NEG er a\nWoe: PO AESO SIAN ZI O4\n784 9: Xe KSA Et\nPINT EER Fe: Os\nYC ° et\n\n'},
 {'frame': 'frame_75.jpg',
  'label': 'scoreboard',

### Just Caption Code

In [None]:


# # Load CLIP model for image similarity
# clip_model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
# clip_processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")

# # Load Sentence Transformer for caption similarity
# caption_model = SentenceTransformer("sentence-transformers/all-MiniLM-L6-v2")

# def similar(a, b):
#     """Computes similarity ratio between two strings using SequenceMatcher."""
#     return SequenceMatcher(None, a, b).ratio()

# def extract_frame_number(frame_name):
#     """Extracts the numerical part of the frame name."""
#     match = re.search(r'frame_(\d+)', frame_name)
#     return int(match.group(1)) if match else float('inf')

# def get_image_embedding(image_path):
#     """Extracts CLIP embedding for an image."""
#     image = Image.open(image_path).convert("RGB")
#     inputs = clip_processor(images=image, return_tensors="pt")
#     with torch.no_grad():
#         embedding = clip_model.get_image_features(**inputs)
#     return embedding.squeeze().numpy()

# def get_caption_embedding(caption):
#     """Extracts sentence embedding for a caption using Sentence Transformers."""
#     return caption_model.encode(caption)

# def cosine_sim(vec1, vec2):
#     """Computes cosine similarity between two vectors."""
#     return cosine_similarity([vec1], [vec2])[0][0]

# def filter_frames(json_file, similarity_threshold=0.85, image_threshold=0.9, image_folder="processed_frames/"):
#     """
#     Filters frames based on both caption similarity and image similarity.
#     """
#     with open(json_file, "r", encoding="utf-8") as f:
#         data = json.load(f)

#     seen_captions = {}
#     seen_images = {}

#     for frame in data:
#         caption = frame["caption"]
#         frame_path = f"{image_folder}/{frame['frame']}"
#         frame_number = extract_frame_number(frame["frame"])

#         # Compute embeddings
#         caption_embedding = get_caption_embedding(caption)
#         image_embedding = get_image_embedding(frame_path)

#         # Check caption similarity
#         found_similar_caption = None
#         for seen_caption, seen_caption_embedding in seen_captions.items():
#             if cosine_sim(caption_embedding, seen_caption_embedding) > similarity_threshold:
#                 found_similar_caption = seen_caption
#                 break

#         # Check image similarity
#         found_similar_image = None
#         for seen_image_path, seen_image_embedding in seen_images.items():
#             if cosine_sim(image_embedding, seen_image_embedding) > image_threshold:
#                 found_similar_image = seen_image_path
#                 break

#         if found_similar_caption or found_similar_image:
#             # If a similar frame exists, replace it only if the new frame is more recent
#             existing_frame = seen_captions.get(found_similar_caption, seen_images.get(found_similar_image))
#             if existing_frame and extract_frame_number(existing_frame["frame"]) < frame_number:
#                 if found_similar_caption:
#                     seen_captions[found_similar_caption] = frame
#                 if found_similar_image:
#                     seen_images[found_similar_image] = image_embedding
#         else:
#             seen_captions[caption] = frame
#             seen_images[frame_path] = image_embedding

#     # Sort frames by their chronological order
#     filtered_frames = sorted(seen_captions.values(), key=lambda x: extract_frame_number(x["frame"]))

#     # Save the filtered data
#     output_file = json_file.replace(".json", "_filtered.json")
#     with open(output_file, "w", encoding="utf-8") as f:
#         json.dump(filtered_frames, f, indent=4, ensure_ascii=False)

#     print(f"Filtered JSON saved to: {output_file}")
#     return filtered_frames

# # Example usage:
# filtered_data = filter_frames("output_data/Ilg3gGewQ5U_processed.json")


In [None]:
# # Example usage:
# filtered_data = filter_frames("output_data/Ilg3gGewQ5U_processed.json")
# filtered_data