Frame processing

In [None]:
import os
import cv2
from tqdm import tqdm
import torch
from torch.utils.data import Dataset
import numpy as np
from transformers import ViTModel, ViTImageProcessor
from PIL import Image
import json

def extract_frames_from_video(video_path, frame_interval=30, resize=(224, 224)):
    """
    Extract frames from a video at a fixed interval and resize them.

    Args:
        video_path (str): Path to the video file.
        frame_interval (int): Keep one frame out of every `frame_interval`
            decoded frames (e.g., 30 = one frame every ~1s at 30 FPS).
            Must be >= 1.
        resize (tuple): Desired output size (width, height).

    Returns:
        tuple:
            frames (list[PIL.Image]): Extracted RGB frames as PIL Images
                (empty when no frame could be read).
            fps (float): Frames per second reported by OpenCV for the video.

    Raises:
        ValueError: If `frame_interval` is smaller than 1.
    """
    # Validate up front: an interval of 0 would otherwise fail with a
    # ZeroDivisionError in the middle of decoding.
    if frame_interval < 1:
        raise ValueError(f"frame_interval must be >= 1, got {frame_interval}")

    cap = cv2.VideoCapture(video_path)
    frames = []
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        frame_count = 0
        while True:
            ret, frame = cap.read()
            if not ret:
                break
            if frame_count % frame_interval == 0:
                # OpenCV decodes to BGR; PIL expects RGB.
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frame_resized = cv2.resize(frame_rgb, resize)
                frames.append(Image.fromarray(frame_resized))
            frame_count += 1
    finally:
        # Always free the capture handle, even if conversion/resizing raised.
        cap.release()
    return frames, fps


class VideoFeatureDataset(Dataset):
    """
    PyTorch Dataset for loading pre-extracted video frame features from .npy files.

    Each .npy file is expected to contain an array of shape
    (num_frames, num_tokens, feature_dim).
    """
    def __init__(self, feature_dir):
        """
        Args:
            feature_dir (str): Directory containing .npy feature files.
        """
        self.feature_dir = feature_dir
        # os.listdir() returns entries in arbitrary order; sort so that a given
        # index always maps to the same video across runs and machines.
        self.feature_files = sorted(
            f for f in os.listdir(feature_dir) if f.endswith(".npy")
        )

    def __len__(self):
        """Return the number of .npy feature files found in the directory."""
        return len(self.feature_files)

    def __getitem__(self, idx):
        """
        Load features for the given index.

        Args:
            idx (int): Index of the feature file (in sorted file-name order).

        Returns:
            tuple:
                features (Tensor): Loaded features converted to a float32 tensor.
                video_id (str): Name of the video (file name without extension).
        """
        file_name = self.feature_files[idx]
        path = os.path.join(self.feature_dir, file_name)
        features = torch.from_numpy(np.load(path)).float()
        video_id = os.path.splitext(file_name)[0]
        return features, video_id


def setup_model(device='cpu', model_name="google/vit-base-patch16-224-in21k", output_dim=256):
    """
    Initialize the Vision Transformer (ViT) feature extractor and projection layer.

    The projection layer is newly created here (no pretrained weights are
    loaded for it), so its weights depend on the current torch RNG state.

    Args:
        device (str): Device to place model on ('cpu', 'cuda', or 'mps').
        model_name (str): HuggingFace checkpoint used for both the image
            processor and the ViT backbone.
        output_dim (int): Output size of the linear projection.

    Returns:
        tuple:
            vit_model (ViTModel): Pretrained ViT model, moved to `device`.
            f_fc (torch.nn.Linear): Linear projection layer
                (ViT hidden size -> `output_dim`), moved to `device`.
            processor (ViTImageProcessor): Image processor for preprocessing frames.
    """
    processor = ViTImageProcessor.from_pretrained(model_name)
    vit_model = ViTModel.from_pretrained(model_name)
    vit_model.to(device)
    # Read the width from the loaded config instead of hard-coding 768 so the
    # projection stays correct for other ViT checkpoints.
    hidden_size = vit_model.config.hidden_size
    f_fc = torch.nn.Linear(hidden_size, output_dim).to(device)
    return vit_model, f_fc, processor


def extract_256d_features_tokenwise(vit_model, f_fc, processor, images, device, batch_size=32):
    """
    Extract token-wise features and project them with a linear layer.

    Both modules are switched to eval mode and run without gradient tracking.

    Args:
        vit_model (ViTModel): Pretrained ViT model.
        f_fc (torch.nn.Linear): Projection layer applied to every token embedding.
        processor (ViTImageProcessor): Preprocessor for ViT.
        images (list[PIL.Image]): List of frames to process.
        device (str): Device for inference (the inputs are moved there; the
            models must already live on it).
        batch_size (int): Number of frames sent through the model at once.

    Returns:
        torch.Tensor: CPU tensor of shape (num_frames, num_tokens, proj_dim).
            For an empty `images` list an empty tensor of shape
            (0, 0, proj_dim) is returned.

    Raises:
        ValueError: If `batch_size` is smaller than 1.
    """
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    vit_model.eval()
    f_fc.eval()

    # torch.cat() rejects an empty list, so handle "no frames" explicitly.
    if len(images) == 0:
        return torch.empty((0, 0, f_fc.out_features))

    all_features = []
    with torch.no_grad():
        for start in range(0, len(images), batch_size):
            batch_imgs = images[start:start + batch_size]
            inputs = processor(images=batch_imgs, return_tensors="pt")
            inputs = {k: v.to(device) for k, v in inputs.items()}
            outputs = vit_model(**inputs)
            token_embeddings = outputs.last_hidden_state  # (B, num_tokens, hidden)
            projected = f_fc(token_embeddings)            # (B, num_tokens, proj_dim)
            # Move each batch to CPU right away to keep device memory bounded.
            all_features.append(projected.cpu())
    return torch.cat(all_features, dim=0)


if __name__ == "__main__":
    # Pick the best available backend: Apple MPS, then CUDA, then CPU.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Paths for input videos and output embeddings
    video_folder = "/Users/jumita/Downloads/Book5"
    frame_feature_dir = "/Users/jumita/Downloads/final/frame"
    os.makedirs(frame_feature_dir, exist_ok=True)

    # Single source of truth: used for extraction AND recorded in the config.
    frame_interval = 30

    video_extensions = {".mp4", ".avi", ".mov", ".mkv"}
    # Sorted so the processing order is reproducible (os.listdir order is arbitrary).
    video_paths = sorted(
        os.path.join(video_folder, f)
        for f in os.listdir(video_folder)
        if os.path.isfile(os.path.join(video_folder, f))
        and os.path.splitext(f)[1].lower() in video_extensions
    )

    vit_model, f_fc, processor = setup_model(device=device)

    num_saved = 0
    for video_path in tqdm(video_paths, desc="Extracting frame features"):
        video_name = os.path.splitext(os.path.basename(video_path))[0]
        frames, fps = extract_frames_from_video(video_path, frame_interval=frame_interval)

        if len(frames) == 0:
            print(f"[Warning] No frames extracted from {video_name}, skipping.")
            continue

        features = extract_256d_features_tokenwise(vit_model, f_fc, processor, frames, device)

        # Save extracted features to .npy
        save_feat_path = os.path.join(frame_feature_dir, f"{video_name}.npy")
        np.save(save_feat_path, features.numpy())
        num_saved += 1

    # Save config info for reference
    config = {
        "model_name": "google/vit-base-patch16-224-in21k",
        # Taken from the actual projection layer rather than a duplicated literal.
        "feature_dim": int(f_fc.out_features),
        "frame_interval": frame_interval,
        "device": device,
        "num_videos": len(video_paths),
        # Videos that produced no frames are skipped above and not counted here.
        "num_videos_saved": num_saved,
    }
    config_path = os.path.join(frame_feature_dir, "feature_config.json")
    with open(config_path, "w") as f:
        json.dump(config, f, indent=4)
    print(f"[Config Saved] {config_path}")


Extracting frame features: 100%|██████████| 39/39 [01:22<00:00,  2.13s/it]

[Config Saved] /Users/jumitatran/Downloads/not_in_excel/frame/feature_config.json





OCR and audio - extract, clean, and embed

In [None]:
import os
import cv2
import re
import numpy as np
from tqdm import tqdm
import torch
import whisper
import easyocr
import nltk
from nltk.corpus import words
from datasets import Dataset
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn
import json
import glob
import pandas as pd

# Turn off parallelism in the HuggingFace tokenizers library via its env switch.
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# ===== NLP normalization =====
# Fetch the NLTK "words" corpus (quietly) and build a lower-cased lookup set
# that serves as the English vocabulary for the filters below.
nltk.download("words", quiet=True)
english_vocab = {entry.lower() for entry in words.words()}

def normalize(word):
    """
    Normalize a word for vocabulary lookup.

    Every run of regex non-word characters (``\\W``) is deleted, then the
    result is lower-cased. Letters, digits and underscores are kept.
    """
    word_chars_only = re.sub(r'\W+', '', word)
    return word_chars_only.lower()

def is_english_word(word):
    """
    Decide whether a token counts as English text.

    The token is normalized first; it is accepted when the normalized form
    consists only of digits or appears in `english_vocab`.

    Returns:
        bool: True for numbers and known English words, False otherwise.
    """
    token = normalize(word)
    if token.isdigit():
        return True
    return token in english_vocab

def clean_and_filter_english(text):
    """
    Keep only the whitespace-separated tokens that pass `is_english_word`.

    Args:
        text (str): Input text.

    Returns:
        str: The surviving tokens (in their original form and order) joined
            by single spaces; empty string when nothing survives.
    """
    # str.split() without arguments already ignores leading/trailing whitespace.
    tokens = text.split()
    return " ".join(filter(is_english_word, tokens))

# ===== OCR from image =====
def extract_ocr_text(frame, reader):
    """
    Run EasyOCR on one frame and return its English-filtered text.

    Args:
        frame (np.ndarray): Frame image from video (BGR format).
        reader (easyocr.Reader): EasyOCR reader object.

    Returns:
        str: Filtered text of all detections joined by spaces. Detections
            whose text is empty after filtering are dropped.
    """
    detections = reader.readtext(frame, detail=1, paragraph=False)
    # Each detection is (bounding box, text, confidence); only the text is used.
    cleaned = (clean_and_filter_english(raw) for _box, raw, _score in detections)
    return " ".join(piece for piece in cleaned if piece)

# ===== ASR transcript lookup by timestamp =====
def find_transcript_for_time(segments, timestamp):
    """
    Look up the ASR text that is being spoken at a given moment.

    Args:
        segments (list): Whisper transcription segments, each a mapping with
            'start', 'end' and 'text'.
        timestamp (float): Timestamp (seconds) to look up.

    Returns:
        str: Text of the first segment whose [start, end] range (inclusive)
            contains the timestamp, or an empty string if none does.
    """
    covering = (
        seg['text']
        for seg in segments
        if seg['start'] <= timestamp <= seg['end']
    )
    return next(covering, "")

# ===== Extract text from video =====
def extract_texts_from_video(video_path, whisper_model, reader, fps_interval=5):
    """
    Extract combined text from video using:
    - Whisper ASR for audio transcription.
    - EasyOCR for text in frames.

    Frames are sampled every `fps_interval` seconds; for each sampled frame the
    English-filtered ASR text covering that timestamp is concatenated with the
    English-filtered OCR text of the frame.

    Args:
        video_path (str): Path to the video file.
        whisper_model: Whisper model for ASR.
        reader: EasyOCR reader object.
        fps_interval (int): Interval in seconds for OCR frame sampling.

    Returns:
        str: Combined (ASR + OCR) text extracted from video, space-joined over
            all sampled frames (empty string if nothing was found).
    """
    # Run ASR before opening the capture so no video handle is held open
    # during the (slow) transcription.
    results = whisper_model.transcribe(video_path, language="en", verbose=False)
    segments = results.get("segments", [])

    # NOTE(review): only the segment covering each sampled timestamp is used,
    # so a segment spanning several samples is repeated and a segment that
    # falls entirely between two samples is omitted - confirm this is intended.
    all_texts = []
    cap = cv2.VideoCapture(video_path)
    try:
        fps = cap.get(cv2.CAP_PROP_FPS)
        # Clamp to >= 1: a product below one frame would truncate to 0 and
        # make the modulo below raise ZeroDivisionError.
        interval = max(1, int(fps * fps_interval)) if fps > 0 else 1
        frame_id = 0

        while cap.isOpened():
            ret, frame = cap.read()
            if not ret:
                break
            if frame_id % interval == 0:
                # extract_ocr_text already returns English-filtered text, so
                # no second filtering pass is needed for the OCR part.
                ocr_text = extract_ocr_text(frame, reader)
                timestamp = frame_id / fps if fps > 0 else 0
                asr_text = clean_and_filter_english(
                    find_transcript_for_time(segments, timestamp)
                )

                combined_text = (asr_text + " " + ocr_text).strip()
                if combined_text:
                    all_texts.append(combined_text)
            frame_id += 1
    finally:
        # Release the capture even if OCR or filtering raised.
        cap.release()

    return " ".join(all_texts).strip()

# ===== Text embedding model with projection =====
class BERTEmbeddingWithProjection(nn.Module):
    """
    Transformer-based text embedding extractor with a linear projection.

    The encoder's hidden state at sequence position 0 (the [CLS] token for
    BERT) is projected to `output_dim`. The projection layer is newly created
    in the constructor; no pretrained weights are loaded for it.

    Args:
        output_dim (int): Size of the projected embedding (default 256).
        model_name (str): HuggingFace checkpoint for the encoder
            (default "bert-base-uncased").
    """
    def __init__(self, output_dim=256, model_name="bert-base-uncased"):
        super().__init__()
        self.base = AutoModel.from_pretrained(model_name)
        self.proj = nn.Linear(self.base.config.hidden_size, output_dim)

    def forward(self, input_ids, attention_mask=None, token_type_ids=None):
        """
        Encode a tokenized batch.

        Args:
            input_ids (Tensor): Token ids, indexed as (batch, seq_len).
            attention_mask (Tensor, optional): Passed through to the encoder.
            token_type_ids (Tensor, optional): Passed through to the encoder.

        Returns:
            Tensor: Projected embeddings of shape (batch, output_dim).
        """
        out = self.base(
            input_ids=input_ids,
            attention_mask=attention_mask,
            token_type_ids=token_type_ids,
        )
        first_token = out.last_hidden_state[:, 0, :]
        return self.proj(first_token)

def create_and_save_text_embeddings(texts, video_ids, model, tokenizer, device="cpu", batch_size=8, save_dir="./", max_length=128):
    """
    Create and save text embeddings for a list of texts.

    One file `<video_id>.npy` is written to `save_dir` per text.

    Args:
        texts (list[str]): List of extracted texts.
        video_ids (list[str]): Corresponding video IDs (same length as `texts`).
        model (nn.Module): Text embedding model. Only the inputs are moved to
            `device` here, so the model must already be on that device.
        tokenizer: HuggingFace tokenizer for input encoding.
        device (str): Device to run inference on ("cpu", "cuda", "mps").
        batch_size (int): Number of samples per batch.
        save_dir (str): Directory to save .npy embeddings.
        max_length (int): Token limit; longer texts are truncated to it.

    Raises:
        ValueError: If `texts` and `video_ids` differ in length, or if
            `batch_size` is smaller than 1.
    """
    # Slicing/zip would silently pair or drop entries on a length mismatch,
    # producing embeddings saved under the wrong IDs - fail loudly instead.
    if len(texts) != len(video_ids):
        raise ValueError(
            f"texts and video_ids must have the same length "
            f"({len(texts)} != {len(video_ids)})"
        )
    if batch_size < 1:
        raise ValueError(f"batch_size must be >= 1, got {batch_size}")

    os.makedirs(save_dir, exist_ok=True)
    model.eval()
    with torch.no_grad():
        for start in range(0, len(texts), batch_size):
            batch_texts = texts[start:start + batch_size]
            batch_ids = video_ids[start:start + batch_size]

            enc = tokenizer(
                batch_texts,
                return_tensors="pt",
                padding=True,
                truncation=True,
                max_length=max_length,
            )
            enc = {k: v.to(device) for k, v in enc.items()}
            embeddings = model(**enc).cpu().numpy()

            for vid, emb in zip(batch_ids, embeddings):
                save_path = os.path.join(save_dir, f"{vid}.npy")
                np.save(save_path, emb)
                print(f"Saved embedding for {vid} at {save_path}")

# ===== Main execution =====
# ===== Main execution =====
if __name__ == "__main__":
    folder_path = "/Users/jumitatran/Downloads/Book5"
    output_text_path = "/Users/jumitatran/Downloads/extracted_texts2.json"
    embedding_dir = "/Users/jumitatran/Downloads/not_in_excel/text"
    device = "cpu"

    # Load labels for reference. They are not used below; loading them first
    # makes a missing/unreadable workbook fail before the long extraction
    # instead of after it.
    df = pd.read_excel("/Users/jumitatran/Downloads/Book5.xlsx")
    labels = df["label"].tolist()

    whisper_model = whisper.load_model("small")
    use_gpu_easyocr = (device == "cuda")
    reader = easyocr.Reader(['en'], gpu=use_gpu_easyocr)

    # Sorted so the processing order is reproducible across runs.
    video_files = sorted(glob.glob(os.path.join(folder_path, "*.mp4")))
    print(f"Found {len(video_files)} video files.")

    all_texts = []
    video_ids = []

    # Extract text from videos (always a string, possibly empty).
    for video_path in tqdm(video_files):
        merged_text = extract_texts_from_video(video_path, whisper_model, reader)
        all_texts.append(merged_text)
        video_ids.append(os.path.splitext(os.path.basename(video_path))[0])

    # Save extracted text to JSON for inspection. This happens before the
    # embedding model is loaded so the expensive extraction result is on disk
    # even if model loading fails.
    with open(output_text_path, "w", encoding="utf-8") as f:
        json.dump(dict(zip(video_ids, all_texts)), f, ensure_ascii=False, indent=2)
    print(f"Saved extracted texts for {len(video_ids)} videos at {output_text_path}")

    # Initialize embedding model
    embed_model = BERTEmbeddingWithProjection(output_dim=256).to(device)
    embed_model.eval()
    embed_tokenizer = AutoTokenizer.from_pretrained("bert-base-uncased")

    # Create embeddings for all texts
    create_and_save_text_embeddings(
        all_texts,
        video_ids,
        embed_model,
        embed_tokenizer,
        device=device,
        batch_size=8,
        save_dir=embedding_dir
    )
    print(f"Saved {len(video_ids)} embeddings to folder: {embedding_dir}")


Using CPU. Note: This module is much faster with a GPU.


Found 10 video files.


100%|██████████| 17912/17912 [00:13<00:00, 1288.64frames/s]
100%|██████████| 2080/2080 [00:11<00:00, 189.04frames/s]
100%|██████████| 9209/9209 [00:08<00:00, 1098.20frames/s]
 39%|███▉      | 1950/4950 [00:01<00:02, 1240.89frames/s]
  0%|          | 0/831 [00:00<?, ?frames/s]
100%|██████████| 766/766 [00:00<00:00, 898.24frames/s]
100%|██████████| 16941/16941 [00:08<00:00, 1931.86frames/s]
100%|██████████| 3692/3692 [00:13<00:00, 264.87frames/s]
100%|██████████| 487/487 [00:00<00:00, 706.55frames/s]
100%|██████████| 1272/1272 [00:01<00:00, 1149.54frames/s]
100%|██████████| 10/10 [09:16<00:00, 55.63s/it]


Saved extracted texts for 10 videos at /Users/jumitatran/Downloads/extracted_texts2.json


  return forward_call(*args, **kwargs)


Saved embedding for 7504644259056831766 at /Users/jumitatran/Downloads/not_in_excel/text/7504644259056831766.npy
Saved embedding for 7496083643278822699 at /Users/jumitatran/Downloads/not_in_excel/text/7496083643278822699.npy
Saved embedding for 7520700854324694294 at /Users/jumitatran/Downloads/not_in_excel/text/7520700854324694294.npy
Saved embedding for 7515801813078101279 at /Users/jumitatran/Downloads/not_in_excel/text/7515801813078101279.npy
Saved embedding for 7509974643906710806 at /Users/jumitatran/Downloads/not_in_excel/text/7509974643906710806.npy
Saved embedding for 7517408801704643895 at /Users/jumitatran/Downloads/not_in_excel/text/7517408801704643895.npy
Saved embedding for 7496217736339541270 at /Users/jumitatran/Downloads/not_in_excel/text/7496217736339541270.npy
Saved embedding for 7522994842775850270 at /Users/jumitatran/Downloads/not_in_excel/text/7522994842775850270.npy
Saved embedding for 7518306957858475282 at /Users/jumitatran/Downloads/not_in_excel/text/7518306

Metadata embedding

In [None]:
import os
import numpy as np
import pandas as pd
import torch
from transformers import AutoTokenizer, AutoModel
import torch.nn as nn

class TikTokProcessorWithProjection(nn.Module):
    """
    Text embedding processor for TikTok captions using BERT with a projection layer.

    This class:
    - Loads a pretrained BERT model (default: bert-base-uncased).
    - Adds a linear projection layer to reduce embedding dimension (e.g. 768 -> 256).
      The layer is newly created in the constructor; no pretrained weights
      are loaded for it.
    - Encodes a list of texts into projected embeddings.

    Args:
        model_name (str): HuggingFace model name (default "bert-base-uncased").
        output_dim (int): Target dimensionality of output embeddings (default 256).
        device (str): Device to run the model ("cpu" / "cuda" / "mps").
    """
    def __init__(self, model_name="bert-base-uncased", output_dim=256, device="cpu"):
        super().__init__()
        self.device = device
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)
        self.bert = AutoModel.from_pretrained(model_name)
        self.proj = nn.Linear(self.bert.config.hidden_size, output_dim)
        self.to(device)

    def forward(self, texts, batch_size=32, max_length=256):
        """
        Generate embeddings for a list of texts.

        The module is put into eval mode and run without gradient tracking.

        Args:
            texts (list[str]): List of input texts to encode.
            batch_size (int): Number of samples per batch for processing.
            max_length (int): Max token length for truncation.

        Returns:
            np.ndarray: Numpy array of shape (len(texts), output_dim); an
                empty (0, output_dim) float32 array when `texts` is empty.
        """
        # np.vstack() rejects an empty list, so handle "no texts" explicitly.
        if len(texts) == 0:
            return np.empty((0, self.proj.out_features), dtype=np.float32)

        embeddings = []
        self.eval()
        with torch.no_grad():
            for start in range(0, len(texts), batch_size):
                batch = texts[start:start + batch_size]
                inputs = self.tokenizer(batch, padding=True, truncation=True,
                                        max_length=max_length, return_tensors="pt").to(self.device)
                outputs = self.bert(**inputs, return_dict=True)
                # Hidden state at sequence position 0 ([CLS] for BERT).
                cls_emb = outputs.last_hidden_state[:, 0, :]
                projected = self.proj(cls_emb)
                embeddings.append(projected.cpu().numpy())
        return np.vstack(embeddings)


def load_data_from_excel(excel_path, desc_col='description', id_col='video_id'):
    """
    Load video descriptions and IDs from an Excel file.

    Rows missing either the description or the ID are dropped.

    Args:
        excel_path (str): Path to the Excel file.
        desc_col (str): Column name containing video descriptions.
        id_col (str): Column name containing video IDs.

    Returns:
        tuple: (list of descriptions, list of video IDs), both as strings and
            aligned row by row.
    """
    df = pd.read_excel(excel_path)
    df = df.dropna(subset=[desc_col, id_col])
    descriptions = df[desc_col].astype(str).tolist()

    ids = df[id_col]
    # A numeric ID column is loaded as float64 when it contains blanks (or very
    # large values); str() on such values yields "123.0" or scientific
    # notation, which would not match the ID-based file names used elsewhere.
    # Convert whole-number floats back to integers before stringifying.
    # NOTE(review): floats cannot represent integers above 2**53 exactly, so
    # long IDs should be stored as text cells in the workbook - confirm.
    if pd.api.types.is_float_dtype(ids):
        whole = np.isfinite(ids) & (ids == np.floor(ids)) & (ids.abs() < 2**63)
        if whole.all():
            ids = ids.astype("int64")
    video_ids = ids.astype(str).tolist()
    return descriptions, video_ids


if __name__ == "__main__":
    # Backend preference: Apple MPS first, then CUDA, otherwise CPU.
    if torch.backends.mps.is_available():
        device = "mps"
    elif torch.cuda.is_available():
        device = "cuda"
    else:
        device = "cpu"

    # Input workbook and destination folder for the caption embeddings.
    excel_path = "/Users/jumita/Downloads/Book1.xlsx"
    output_dir = "/Users/jumita/Downloads/excel/caption"
    os.makedirs(output_dir, exist_ok=True)

    # Descriptions and their video IDs, aligned row by row.
    all_texts, video_ids = load_data_from_excel(excel_path)

    # Encoder + projection, placed on the selected device.
    processor = TikTokProcessorWithProjection(
        model_name="bert-base-uncased",
        output_dim=256,
        device=device,
    )

    # One projected embedding per description.
    reduced_embeddings = processor(all_texts, batch_size=32)

    # Write each embedding to <output_dir>/<video_id>.npy.
    for vid, emb in zip(video_ids, reduced_embeddings):
        target = os.path.join(output_dir, f"{vid}.npy")
        np.save(target, emb)

    print(f"Saved {len(video_ids)} embeddings (256-d) to: {output_dir}")


Saved 74 embeddings (256-d) to: /Users/jumita/Downloads/excel/caption
