### Natural language media search
### Goals & scope
- Given one or many video/audio files and a natural-language query, return the most relevant segments with: start/end timestamps, transcript snippet, thumbnail(s), and a relevance score.
- **MVP scope**: Single-machine prototype that can index and search a handful of videos (hours of content).
- **Core capabilities** to learn: audio transcription and alignment, audio/visual/text embeddings, vector indexing (FAISS), multimodal retrieval & reranking.

In [143]:
# media_preprocessor.py
import ffmpeg
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import math
import uuid
import shutil
import os
import json
import sys
import subprocess  # <-- Added for ffprobe

# Optional VAD dependency: webrtcvad is a third-party package that may be absent.
# `wave` is imported in the same guard, so it is only bound when this try succeeds.
try:
    import webrtcvad
    import wave
    # Flag consulted by MediaPreprocessor to decide whether VAD segmentation can run.
    _VAD_AVAILABLE = True
except Exception:
    # Any import-time failure disables VAD instead of aborting the module load.
    _VAD_AVAILABLE = False
    print("Warning: 'webrtcvad' not found. VAD segmentation will be disabled.")


class MediaPreprocessor:
    """
    Turn one media file into artifacts that later pipeline stages can index.

    For every input file a fresh working directory is created below
    ``temp_dir``. Audio is extracted as loudness-normalized mono PCM WAV and
    split into segments (voice-activity based or fixed length). For video
    files, resized JPEG keyframes are extracted as well (scene-change based or
    at a fixed interval).
    """

    # webrtcvad accepts 10, 20 or 30 ms frames; 30 ms is used here.
    _VAD_FRAME_MS = 30
    # Voiced runs shorter than this many seconds are discarded.
    _VAD_MIN_SEGMENT_SEC = 0.5
    # Trailing fixed-length chunks shorter than this many seconds are skipped.
    _FIXED_MIN_SEGMENT_SEC = 1.0

    def __init__(self,
                 keyframe_interval: int = 5,
                 audio_segment_length: int = 30,
                 target_sample_rate: int = 16000,
                 frame_size: Tuple[int, int] = (224, 224),
                 temp_dir: str = "./media_processed",
                 target_lufs: float = -16.0,
                 use_vad: bool = True,
                 scene_detect: bool = True,
                 vad_aggressiveness: int = 2,
                 scene_threshold: float = 0.4,
                 verbose: bool = False
                ):
        """
        keyframe_interval: seconds between extracted keyframes (fixed-interval mode;
            values below 1 are treated as 1)
        audio_segment_length: seconds per chunk (fixed-length segmentation)
        target_sample_rate: sample rate of all produced WAV files, e.g. 16000 for Whisper
        frame_size: (width, height) for resizing keyframes (224,224)
        temp_dir: root for all outputs (a per-file UUID subdir will be created)
        target_lufs: target loudness in LUFS for loudnorm (default -16)
        use_vad: if True, attempt VAD-based segmentation (requires webrtcvad)
        scene_detect: if True, use ffprobe scene-change selection instead of fixed interval
        vad_aggressiveness: value passed to webrtcvad.Vad (0=least, 3=most aggressive)
        scene_threshold: scene score a frame must exceed to count as a scene change
        verbose: If True, prints all ffmpeg/ffprobe command output
        """
        self.keyframe_interval = int(keyframe_interval)
        self.audio_segment_length = int(audio_segment_length)
        self.target_sample_rate = int(target_sample_rate)
        self.frame_size = frame_size
        self.root_temp_dir = Path(temp_dir)
        self.root_temp_dir.mkdir(parents=True, exist_ok=True)
        self.target_lufs = float(target_lufs)
        # VAD is only enabled when it was requested AND webrtcvad could be imported.
        self.use_vad = bool(use_vad) and _VAD_AVAILABLE
        self.scene_detect = bool(scene_detect)
        self.vad_aggressiveness = int(vad_aggressiveness)
        self.scene_threshold = float(scene_threshold)
        self.verbose = bool(verbose)

        print("✅ MediaPreprocessor initialized:")
        print(f"  - Keyframe interval: {self.keyframe_interval}s")
        print(f"  - Audio segment length: {self.audio_segment_length}s")
        print(f"  - Target sample rate: {self.target_sample_rate}Hz")
        print(f"  - Frame size: {self.frame_size}")
        print(f"  - Root temp directory: {self.root_temp_dir.resolve()}")
        print(f"  - Target LUFS: {self.target_lufs}")
        print(f"  - VAD enabled: {self.use_vad} (webrtcvad available: {_VAD_AVAILABLE})")
        if self.use_vad:
            print(f"    - VAD Aggressiveness: {self.vad_aggressiveness}")
        print(f"  - Scene-detect keyframes: {self.scene_detect}")
        if self.scene_detect:
            print(f"    - Scene Threshold: {self.scene_threshold}")
        print(f"  - Verbose logging: {self.verbose}")

    # --------------------
    @staticmethod
    def _ffmpeg_error_text(err) -> str:
        """Return the stderr text of an ffmpeg error, or str(err) if there is none."""
        stderr = getattr(err, "stderr", None)
        return stderr.decode(errors="replace") if stderr else str(err)

    # --------------------
    @staticmethod
    def _parse_timestamps(ffprobe_output: str) -> List[float]:
        """
        Parse ffprobe CSV output (one timestamp per line) into floats.

        Only the first CSV field of each line is used. Blank lines, trailing
        separators and non-numeric fields (e.g. "N/A") are skipped so a single
        odd line does not discard all detected scene changes.
        """
        timestamps = []
        for line in ffprobe_output.splitlines():
            field = line.split(",")[0].strip()
            if not field:
                continue
            try:
                timestamps.append(float(field))
            except ValueError:
                continue
        return timestamps

    # --------------------
    @staticmethod
    def _voiced_runs(voiced_flags: List[bool]) -> List[Tuple[int, int]]:
        """
        Group contiguous truthy flags into (start_index, end_index_exclusive) runs.
        """
        runs = []
        run_start = None
        for idx, voiced in enumerate(voiced_flags):
            if voiced and run_start is None:
                run_start = idx
            elif not voiced and run_start is not None:
                runs.append((run_start, idx))
                run_start = None
        if run_start is not None:
            # The last run extends to the end of the flag list.
            runs.append((run_start, len(voiced_flags)))
        return runs

    # --------------------
    def _make_run_dir(self, file_path: Path) -> Path:
        """
        Create a unique directory for processing this file to avoid collisions.
        The name is "<file stem>_<8 hex chars of a random UUID>".
        """
        uid = uuid.uuid4().hex[:8]
        out_dir = self.root_temp_dir / f"{file_path.stem}_{uid}"
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir

    # --------------------
    def _get_duration(self, file_path: Path) -> float:
        """Return the container duration in seconds as reported by ffmpeg.probe."""
        probe = ffmpeg.probe(str(file_path))
        return float(probe['format']['duration'])

    # --------------------
    def _extract_audio(self, file_path: Path, out_dir: Path, normalize: bool = True) -> Path:
        """
        Extract audio as mono 16-bit PCM WAV (pcm_s16le) at target_sample_rate.
        Uses loudnorm single-pass if normalize=True.

        Returns the path of the written WAV file.
        Raises RuntimeError if ffmpeg fails.
        """
        out_path = out_dir / f"{file_path.stem}_audio.wav"
        stream = ffmpeg.input(str(file_path))
        if normalize:
            stream = stream.filter('loudnorm', I=self.target_lufs, TP=-1.5, LRA=7)

        try:
            (
                stream
                .output(str(out_path),
                        format='wav',
                        acodec='pcm_s16le',
                        ac=1,  # mono
                        ar=self.target_sample_rate)
                .overwrite_output()
                .run(quiet=(not self.verbose))
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"ffmpeg failed extracting audio: {self._ffmpeg_error_text(e)}") from e

        return out_path

    # --------------------
    def _cut_wav(self, audio_path: Path, chunk_path: Path, start: float, length: float) -> None:
        """
        Write `length` seconds of `audio_path` starting at `start` to `chunk_path`
        as mono pcm_s16le WAV. ffmpeg.Error propagates to the caller.
        """
        (
            ffmpeg
            .input(str(audio_path), ss=start, t=length)
            .output(str(chunk_path),
                    format='wav',
                    acodec='pcm_s16le',
                    ac=1,
                    ar=self.target_sample_rate)
            .overwrite_output()
            .run(quiet=(not self.verbose))
        )

    # --------------------
    def _segment_audio_fixed(self, audio_path: Path, out_dir: Path) -> List[Dict]:
        """
        Fixed-length segmentation: cut the audio into consecutive chunks of
        audio_segment_length seconds. A trailing chunk shorter than one second
        is skipped.

        Returns list of {path, start_sec, end_sec}.
        Raises ValueError if audio_segment_length is not positive and
        RuntimeError if ffmpeg fails on a chunk.
        """
        # Guard first: a zero length would otherwise surface as ZeroDivisionError.
        if self.audio_segment_length <= 0:
            raise ValueError(
                f"audio_segment_length must be a positive number of seconds, got {self.audio_segment_length}"
            )

        print(f"  Segmenting audio with fixed {self.audio_segment_length}s intervals...")
        duration = self._get_duration(audio_path)
        segments = []
        num_chunks = math.ceil(duration / self.audio_segment_length)

        for i in range(num_chunks):
            start = i * self.audio_segment_length
            seg_len = min(self.audio_segment_length, max(0.0, duration - start))
            if seg_len < self._FIXED_MIN_SEGMENT_SEC:
                continue

            chunk_path = out_dir / f"{audio_path.stem}_chunk_{i:04d}_{int(start)}s.wav"
            try:
                self._cut_wav(audio_path, chunk_path, start, seg_len)
            except ffmpeg.Error as e:
                raise RuntimeError(f"ffmpeg failed segmenting audio: {self._ffmpeg_error_text(e)}") from e

            segments.append({
                "path": str(chunk_path.resolve()),
                "start_sec": float(start),
                "end_sec": float(start + seg_len)
            })
        print(f"    Created {len(segments)} fixed-length audio segments.")
        return segments

    # --------------------
    def _segment_audio_vad(self, audio_path: Path, out_dir: Path) -> List[Dict]:
        """
        VAD-based segmentation using webrtcvad.
        Produces speech-only chunks: contiguous voiced 30 ms frames are merged
        and runs shorter than 0.5 s are dropped.

        Falls back to fixed-length segmentation when webrtcvad is unavailable
        or when no voiced segment is produced.

        Returns list of {path, start_sec, end_sec}.
        Raises ValueError if the WAV is not mono 16-bit PCM at target_sample_rate.
        """
        if not _VAD_AVAILABLE:
            print("  webrtcvad not available; falling back to fixed segmentation.")
            return self._segment_audio_fixed(audio_path, out_dir)

        print(f"  Segmenting audio with VAD (Aggressiveness: {self.vad_aggressiveness})...")

        # Explicit checks instead of assert: asserts are stripped under `python -O`.
        with wave.open(str(audio_path), 'rb') as wf:
            sample_rate = wf.getframerate()
            if sample_rate != self.target_sample_rate:
                raise ValueError(f"VAD requires {self.target_sample_rate}Hz, but file is {sample_rate}Hz")
            if wf.getnchannels() != 1:
                raise ValueError("VAD expects mono WAV")
            width = wf.getsampwidth()
            if width != 2:
                raise ValueError("VAD expects 16-bit PCM (2 bytes)")
            pcm = wf.readframes(wf.getnframes())

        vad = webrtcvad.Vad(self.vad_aggressiveness)
        frame_sec = self._VAD_FRAME_MS / 1000.0
        bytes_per_frame = int(sample_rate * frame_sec * width)

        voiced_flags = []
        failed_frames = 0
        for offset in range(0, len(pcm), bytes_per_frame):
            frame = pcm[offset:offset + bytes_per_frame]
            if len(frame) < bytes_per_frame:
                frame = frame.ljust(bytes_per_frame, b'\0')  # zero-pad the last frame
            try:
                voiced_flags.append(vad.is_speech(frame, sample_rate))
            except Exception:
                # Best effort: a frame the VAD rejects counts as non-speech,
                # but the failure is reported below instead of being hidden.
                failed_frames += 1
                voiced_flags.append(False)
        if failed_frames:
            print(f"    Warning: VAD could not classify {failed_frames} of {len(voiced_flags)} frames; "
                  "treating them as non-speech.")

        segments = []
        for start_frame, end_frame in self._voiced_runs(voiced_flags):
            start_time = start_frame * frame_sec
            end_time = end_frame * frame_sec
            seg_len = end_time - start_time
            if seg_len < self._VAD_MIN_SEGMENT_SEC:
                continue

            chunk_path = out_dir / f"{audio_path.stem}_vad_{start_time:.3f}s_{end_time:.3f}s.wav"
            try:
                self._cut_wav(audio_path, chunk_path, start_time, seg_len)
            except ffmpeg.Error as e:
                # A single failed chunk is skipped; the remaining ones are still produced.
                print(f"    Warning: ffmpeg failed extracting VAD segment: {self._ffmpeg_error_text(e)}")
                continue

            segments.append({"path": str(chunk_path.resolve()),
                             "start_sec": float(start_time),
                             "end_sec": float(end_time)})

        print(f"    Found {len(segments)} voiced audio segments.")
        if not segments:
            print("    No voiced segments found, falling back to fixed intervals.")
            return self._segment_audio_fixed(audio_path, out_dir)
        return segments

    # --------------------
    def _segment_audio(self, audio_path: Path, out_dir: Path) -> List[Dict]:
        """Segment with VAD when enabled, otherwise with fixed-length chunks."""
        if self.use_vad:
            return self._segment_audio_vad(audio_path, out_dir=out_dir)
        return self._segment_audio_fixed(audio_path, out_dir=out_dir)

    # --------------------
    def _extract_keyframes(self, file_path: Path, out_dir: Path) -> List[Dict]:
        """
        Extract keyframes.
        If scene_detect=True, uses ffprobe to find scene-changes; when that
        fails or finds no scene change, fixed-interval extraction is used
        for this call only (the instance setting is left untouched).
        Otherwise, extracts at fixed intervals.

        Returns list of {path, timestamp}.
        """
        if self.scene_detect:
            frames = self._extract_keyframes_scene(file_path, out_dir)
            if frames is not None:
                return frames
        return self._extract_keyframes_fixed(file_path, out_dir)

    # --------------------
    def _extract_keyframes_scene(self, file_path: Path, out_dir: Path) -> Optional[List[Dict]]:
        """
        Extract one resized JPEG per detected scene change.

        Returns None when detection failed or found nothing, so the caller can
        fall back to fixed-interval extraction.
        """
        print(f"  Extracting keyframes via scene-detect (Threshold: {self.scene_threshold})...")

        # Step 1: ask ffprobe for the timestamps of frames whose scene score
        # exceeds the threshold.
        # NOTE(review): the path is embedded unescaped in a lavfi filter string;
        # paths containing characters such as ':' , ',' or quotes will likely make
        # ffprobe fail, which triggers the fixed-interval fallback below.
        ffprobe_cmd = [
            "ffprobe",
            "-v", "error",
            "-f", "lavfi",
            "-i", f"movie={str(file_path.resolve())},select='gt(scene,{self.scene_threshold})'",
            "-show_frames",
            # best_effort_timestamp_time is used instead of pkt_pts_time, which
            # newer ffprobe builds no longer provide.
            "-show_entries", "frame=best_effort_timestamp_time",
            "-of", "csv=p=0"
        ]

        try:
            if self.verbose:
                print(f"    Running ffprobe cmd: {' '.join(ffprobe_cmd)}")
            result = subprocess.run(ffprobe_cmd, capture_output=True, text=True, check=True)
        except (subprocess.SubprocessError, OSError) as e:
            detail = (getattr(e, "stderr", None) or "").strip() or str(e)
            print(f"    ffprobe scene-detect failed: {detail}. Falling back to fixed interval.")
            return None

        timestamps = self._parse_timestamps(result.stdout)
        print(f"    Found {len(timestamps)} scene changes.")
        if not timestamps:
            # Mirrors the VAD path: an empty detection result falls back to fixed spacing.
            print("    No scene changes found, falling back to fixed intervals.")
            return None

        # Step 2: extract exactly one frame at each timestamp.
        frames = []
        for idx, ts in enumerate(timestamps):
            out_path = out_dir / f"{file_path.stem}_scene_{idx:05d}_{ts:.3f}s.jpg"
            try:
                (
                    ffmpeg
                    .input(str(file_path), ss=ts)  # seek to the timestamp
                    # The scale filter must be applied before .output(): output
                    # streams have no .filter() method.
                    .filter("scale", self.frame_size[0], self.frame_size[1])
                    .output(str(out_path),
                            vframes=1,  # exactly one frame
                            q=2)        # high quality JPEG
                    .overwrite_output()
                    .run(quiet=(not self.verbose))
                )
            except ffmpeg.Error as e:
                print(f"    Warning: Failed to extract frame at {ts}s: {self._ffmpeg_error_text(e)}")
                continue
            frames.append({"path": str(out_path.resolve()), "timestamp": ts})

        print(f"    Successfully extracted {len(frames)} scene-change frames.")
        return frames

    # --------------------
    def _extract_keyframes_fixed(self, file_path: Path, out_dir: Path) -> List[Dict]:
        """
        Extract one resized JPEG every keyframe_interval seconds (at least 1 s).

        Timestamps are derived from the frame index and the same clamped
        interval that is passed to the fps filter.
        Raises RuntimeError if ffmpeg fails.
        """
        interval = max(1, self.keyframe_interval)
        print(f"  Extracting keyframes via fixed-interval ({interval}s)...")
        prefix = f"{file_path.stem}_frame_"
        out_pattern = str(out_dir / f"{prefix}%06d.jpg")
        try:
            (
                ffmpeg
                .input(str(file_path))
                .filter("fps", fps=1.0 / interval)
                .filter("scale", self.frame_size[0], self.frame_size[1])
                .output(out_pattern, vsync="vfr", format='image2', q=2)
                .overwrite_output()
                .run(quiet=(not self.verbose))
            )
        except ffmpeg.Error as e:
            raise RuntimeError(
                f"ffmpeg fixed-interval keyframe extraction failed: {self._ffmpeg_error_text(e)}"
            ) from e

        # Match by prefix/suffix instead of a glob pattern so stems containing
        # glob metacharacters such as '[' or ']' are handled correctly.
        frame_files = sorted(
            p for p in out_dir.iterdir()
            if p.name.startswith(prefix) and p.suffix == ".jpg"
        )
        frames = [
            {"path": str(p.resolve()), "timestamp": float(idx * interval)}
            for idx, p in enumerate(frame_files)
        ]

        print(f"    Extracted {len(frames)} fixed-interval frames.")
        return frames

    # --------------------
    def _process_video_file(self, file_path: Path) -> Dict:
        """
        Run audio extraction, audio segmentation and keyframe extraction for a
        video file and return a description of everything produced.
        """
        run_dir = self._make_run_dir(file_path)
        print(f"Processing video -> working directory: {run_dir}")

        print("  Extracting normalized audio...")
        audio_path = self._extract_audio(file_path, out_dir=run_dir, normalize=True)

        print("  Segmenting audio...")
        audio_segments = self._segment_audio(audio_path, out_dir=run_dir)

        print("  Extracting keyframes...")
        keyframes = self._extract_keyframes(file_path, out_dir=run_dir)
        duration = self._get_duration(file_path)

        print(f"✅ Video processing complete for: {file_path.name}")
        return {
            "file_type": "video",
            "original_file": str(file_path.resolve()),
            "working_dir": str(run_dir.resolve()),
            "audio_segments": audio_segments,
            "video_keyframes": keyframes,
            "metadata": {
                "duration": duration,
                "frame_size": self.frame_size,
                "sample_rate": self.target_sample_rate
            }
        }

    # --------------------
    def _process_audio_file(self, file_path: Path) -> Dict:
        """
        Run audio extraction and segmentation for an audio file and return a
        description of everything produced.
        """
        run_dir = self._make_run_dir(file_path)
        print(f"Processing audio -> working directory: {run_dir}")

        print("  Extracting normalized audio...")
        audio_path = self._extract_audio(file_path, out_dir=run_dir, normalize=True)

        print("  Segmenting audio...")
        audio_segments = self._segment_audio(audio_path, out_dir=run_dir)

        duration = self._get_duration(audio_path)

        print(f"✅ Audio processing complete for: {file_path.name}")
        return {
            # Same key spelling as the video result so consumers can branch on it.
            "file_type": "audio",
            "original_file": str(file_path.resolve()),
            "working_dir": str(run_dir.resolve()),
            "audio_segments": audio_segments,
            "metadata": {
                "duration": duration,
                "sample_rate": self.target_sample_rate
            }
        }

    # --------------------
    def process_media_file(self, file_path: str) -> Dict:
        """
        Process a video or audio file, chosen by its file extension.

        Raises FileNotFoundError if the path does not exist and ValueError if
        the extension is not in the supported video/audio sets.
        """
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        video_exts = {'.mp4', '.mkv', '.mov', '.avi', '.webm', '.flv', '.wmv', '.mpeg', '.mpg', '.m4v', '.m2ts'}
        audio_exts = {'.mp3', '.wav', '.flac', '.aac', '.m4a', '.ogg', '.opus', '.wma', '.aiff'}

        ext = file_path.suffix.lower()
        if ext in video_exts:
            return self._process_video_file(file_path)
        if ext in audio_exts:
            return self._process_audio_file(file_path)
        raise ValueError(f"Unsupported file format: {ext}. Supported video: {video_exts}, Supported audio: {audio_exts}")

    # --------------------
    def cleanup_run_dir(self, run_dir: str):
        """
        Remove a previous working directory if needed.
        Does nothing when the path does not exist or is not a directory.
        """
        p = Path(run_dir)
        if p.exists() and p.is_dir():
            shutil.rmtree(p)
            print(f"Removed working dir: {p}")



In [145]:
import json

# Driver cell: preprocess a single media file and persist the result as JSON.

# Path of the media file to process.
my_media_file = "vid2.mp4"

# Fixed-length audio chunks and fixed-interval keyframes (VAD and scene detection off).
mp = MediaPreprocessor(scene_detect=False, use_vad=False)

try:
    # Preprocess, then show the result and write the same JSON text to disk.
    result = mp.process_media_file(my_media_file)
    json_str = json.dumps(result, indent=2)
    print(json_str)
    with open("result.json", mode="w") as f:
        f.write(json_str)
except FileNotFoundError:
    print(f"Error: Could not find the file at: {my_media_file}")
except Exception as e:
    # Notebook-level boundary: report any other failure instead of raising.
    print(f"An error occurred: {e}")

✅ MediaPreprocessor initialized:
  - Keyframe interval: 5s
  - Audio segment length: 30s
  - Target sample rate: 16000Hz
  - Frame size: (224, 224)
  - Root temp directory: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed
  - Target LUFS: -16.0
  - VAD enabled: False (webrtcvad available: True)
  - Scene-detect keyframes: False
  - Verbose logging: False
Processing video -> working directory: media_processed/vid2_fcd1acd8
  Extracting normalized audio...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


  Segmenting audio...
  Segmenting audio with fixed 30s intervals...


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Av

    Created 32 fixed-length audio segments.
  Extracting keyframes...
  Extracting keyframes via fixed-interval (5s)...
    Extracted 187 fixed-interval frames.
✅ Video processing complete for: vid2.mp4
{
  "file_type": "video",
  "original_file": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/vid2.mp4",
  "working_dir": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8",
  "audio_segments": [
    {
      "path": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0000_0s.wav",
      "start_sec": 0.0,
      "end_sec": 30.0
    },
    {
      "path": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0001_30s.wav",
      "start_sec": 30.0,
      "end_sec": 60.0
    },
    {
      "path": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0002_60s.wav",
      "start_sec": 60.0,
      "end_sec": 9

huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [146]:
# Serialize the preprocessing result and store a copy as sample.json.
json_str = json.dumps(result, indent=2)
with open("sample.json", mode="w") as f:
    f.write(json_str)

In [147]:
# Alias the preprocessing result under the name the transcription cell reads.
processed_media_data = result

In [None]:
import json
import concurrent.futures # Keep import in case we want to switch back easily
import os

import whisper
# Transcribe every audio segment of 'processed_media_data' with Whisper.
# Proceed only if the preprocessing cell has produced 'processed_media_data'.
if 'processed_media_data' in globals() and processed_media_data is not None:
    print("✅ 'processed_media_data' is defined and Whisper is available. Proceeding with transcription.")

    # 1️⃣ Load the Whisper model, reusing one left in the notebook by an earlier run.
    if 'whisper_model' not in globals() or whisper_model is None:
        whisper_model = whisper.load_model("base")  # you can also use "small", "medium", or "large"
        print("✅ Whisper model loaded.")
    else:
        print("✅ Using existing Whisper model.")

    def transcribe_segment(segment_data):
        """
        Transcribe one audio segment and store the outcome in the segment dict.

        Reads segment_data["path"]; writes segment_data["transcription"] and
        segment_data["word_timestamps"] in place and returns the same dict.
        On failure the transcription field holds an "Error: ..." message and
        word_timestamps is an empty list, so one bad segment does not stop
        the remaining ones.
        """
        audio_path = segment_data["path"]
        print(f"🎙️ Transcribing segment: {audio_path}")
        try:
            # word_timestamps=True requests word-level timing for later alignment.
            # NOTE(review): these times are presumably relative to the start of this
            # chunk file, not the original media — add segment_data["start_sec"]
            # downstream if absolute positions are needed; confirm.
            result = whisper_model.transcribe(audio_path, word_timestamps=True)

            if isinstance(result, dict) and "text" in result:
                print(f"Transcription successful for: {audio_path}")
                segment_data["transcription"] = result["text"]
                segment_data["word_timestamps"] = result["segments"]
            else:
                # Unexpected return type: record it rather than failing.
                error_message = f"Unexpected transcription result type for {audio_path}: {type(result)}. Result: {result}"
                print(f"Error: {error_message}")
                segment_data["transcription"] = f"Error: {error_message}"
                segment_data["word_timestamps"] = []
        except Exception as e:
            print(f"Error transcribing segment {audio_path}: {e}")
            segment_data["transcription"] = f"Error: {e}"
            segment_data["word_timestamps"] = []
        return segment_data

    # 2️⃣ Transcribe each audio segment sequentially.
    print("\nStarting sequential transcription...")
    for segment in processed_media_data["audio_segments"]:
        transcribe_segment(segment)

    print("\n✅ Sequential transcription complete!")

    # 3️⃣ (Optional) Save all results as JSON next to the other artifacts.
    if "working_dir" in processed_media_data:
        output_json_path = os.path.join(processed_media_data["working_dir"], "asr_output.json")
        try:
            with open(output_json_path, "w", encoding="utf-8") as f:
                json.dump(processed_media_data, f, indent=2)
            print(f"✅ Transcription results saved to: {output_json_path}")
        except Exception as e:
            print(f"Error saving transcription results to {output_json_path}: {e}")
    else:
        print("Warning: 'working_dir' not found in processed_media_data. Skipping saving results.")

else:
    # The guard above only fails when the data is missing or None. The previous
    # check of an undefined 'whisper_available' name raised NameError here.
    print("'processed_media_data' is not defined or is None. Skipping transcription.")

✅ 'processed_media_data' is defined and Whisper is available. Proceeding with transcription.
✅ Using existing Whisper model.

Starting sequential transcription...
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0000_0s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0000_0s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0001_30s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0001_30s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0002_60s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0002_60s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0003_90s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0003_90s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0004_120s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0004_120s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0005_150s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0005_150s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0006_180s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0006_180s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0007_210s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0007_210s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0008_240s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0008_240s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0009_270s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0009_270s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0010_300s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0010_300s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0011_330s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0011_330s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0012_360s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0012_360s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0013_390s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0013_390s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0014_420s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0014_420s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0015_450s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0015_450s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0016_480s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0016_480s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0017_510s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0017_510s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0018_540s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0018_540s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0019_570s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0019_570s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0020_600s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0020_600s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0021_630s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0021_630s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0022_660s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0022_660s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0023_690s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0023_690s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0024_720s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0024_720s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0025_750s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0025_750s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0026_780s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0026_780s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0027_810s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0027_810s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0028_840s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


Transcription successful for: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0028_840s.wav
🎙️ Transcribing segment: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid2_fcd1acd8/vid2_audio_chunk_0029_870s.wav


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)


In [None]:
def format_seconds(value):
    """Render a timestamp as ``'12.34s'``, or ``'N/A'`` when it is not numeric.

    Segments that lack ``start_sec``/``end_sec`` used to fall back to the
    string ``"N/A"``, which then crashed the ``:.2f`` format spec. Routing every
    value through this helper keeps the listing going for incomplete segments.
    """
    try:
        return f"{float(value):.2f}s"
    except (TypeError, ValueError):
        return "N/A"


# Print every transcription, but only if the preprocessing/transcription cells
# have already populated 'processed_media_data' in this notebook session.
if 'processed_media_data' in globals() and processed_media_data is not None and "audio_segments" in processed_media_data:
    print("--- Transcriptions ---")
    # Segments are numbered from 1 for readability.
    for i, segment in enumerate(processed_media_data["audio_segments"], start=1):
        transcription = segment.get("transcription", "No transcription available")
        start_time = format_seconds(segment.get("start_sec"))
        end_time = format_seconds(segment.get("end_sec"))
        print(f"Segment {i} ({start_time} - {end_time}): {transcription}")
    print("--- End of Transcriptions ---")
else:
    print("'processed_media_data' is not defined or does not contain audio segments. Please run the preprocessing and transcription steps first.")

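Use Sentence Transformers to convert the transcript snippets into dense vector representations suitable for similarity search against a natural-language query. (Note: the cells below currently compute these embeddings with CLIP rather than Sentence Transformers.)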

In [96]:
import torch
from transformers import CLIPProcessor, CLIPModel
from PIL import Image
import librosa
import numpy as np
import faiss
import json

# Pick the compute device: CUDA when PyTorch reports a usable GPU, CPU otherwise.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(f"Using device: {device}")

# Load the pretrained CLIP checkpoint: the model is moved onto the chosen
# device, and the matching processor is loaded from the same checkpoint name.
model_name = "openai/clip-vit-large-patch14"
clip_model = CLIPModel.from_pretrained(model_name)
clip_model = clip_model.to(device)
clip_processor = CLIPProcessor.from_pretrained(model_name)

Using device: cpu


config.json: 0.00B [00:00, ?B/s]

model.safetensors:   0%|          | 0.00/1.71G [00:00<?, ?B/s]

preprocessor_config.json:   0%|          | 0.00/316 [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/905 [00:00<?, ?B/s]

vocab.json: 0.00B [00:00, ?B/s]

merges.txt: 0.00B [00:00, ?B/s]

tokenizer.json: 0.00B [00:00, ?B/s]

special_tokens_map.json:   0%|          | 0.00/389 [00:00<?, ?B/s]

In [None]:
# Parallel lists: all_vectors[i] is the embedding described by all_metadata[i].
# Each entry is only appended once BOTH the vector and its metadata exist, so
# the FAISS row ids stay aligned with the metadata list.
all_vectors = []
all_metadata = []

# Size of the vectors returned by get_text_features / get_image_features.
# Both encoders project into the shared CLIP space, whose size is
# `projection_dim`; the text tower's hidden_size is not guaranteed to match it.
embed_dim = clip_model.config.projection_dim


def _to_unit_vector(features):
    """Return `features` as a float32 NumPy array with L2-normalized rows."""
    vector = features.cpu().numpy().astype('float32')
    faiss.normalize_L2(vector)  # in-place; unit length so L2 ranking tracks cosine
    return vector


def _embed_image(image):
    """Embed a PIL image with CLIP's image encoder and return unit-length features."""
    inputs = clip_processor(images=image, return_tensors="pt").to(device)
    with torch.no_grad():
        features = clip_model.get_image_features(**inputs)
    return _to_unit_vector(features)


# --- 1. Embed Text Transcriptions ---
print("Embedding text transcriptions...")
for segment in processed_media_data["audio_segments"]:
    transcription = segment.get("transcription", "")
    # Skip segments with no usable text (empty, or an error marker from ASR).
    if not transcription or transcription.startswith("Error:"):
        continue

    # Use the CLIP processor loaded alongside clip_model; truncation keeps long
    # transcripts within the text encoder's maximum input length.
    inputs = clip_processor(text=transcription, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_features = clip_model.get_text_features(**inputs)

    text_vector = _to_unit_vector(text_features)
    text_metadata = {
        "type": "text",
        "content": transcription,
        "start_sec": segment["start_sec"],
        "end_sec": segment["end_sec"]
    }
    all_vectors.append(text_vector)
    all_metadata.append(text_metadata)

# --- 2. Embed Video Keyframes ---
print("Embedding video keyframes...")
for frame in processed_media_data["video_keyframes"]:
    try:
        # The context manager releases the file handle; convert("RGB") gives the
        # processor a 3-channel image even for grayscale/paletted/RGBA files.
        with Image.open(frame["path"]) as image:
            image_vector = _embed_image(image.convert("RGB"))
        image_metadata = {
            "type": "image",
            "path": frame["path"],
            "timestamp": frame["timestamp"]
        }
        all_vectors.append(image_vector)
        all_metadata.append(image_metadata)
    except Exception as e:
        # Best-effort: one unreadable frame should not abort the whole indexing run.
        print(f"Error processing image {frame['path']}: {e}")

# --- 3. Embed Audio Segments (as Spectrograms) ---
# NOTE(review): the spectrogram is fed to CLIP's *image* encoder. CLIP is an
# image/text model, so how well these vectors match text queries is unverified;
# treat this modality as experimental.
print("Embedding audio spectrograms...")
for segment in processed_media_data["audio_segments"]:
    try:
        # Load audio file, resampled to 16 kHz.
        y, sr = librosa.load(segment["path"], sr=16000)

        # Mel spectrogram in dB relative to the loudest bin.
        S = librosa.feature.melspectrogram(y=y, sr=sr, n_mels=128)
        S_db = librosa.power_to_db(S, ref=np.max)

        # Min-max scale to [0, 1]. A constant spectrogram (e.g. digital silence)
        # has zero range; map it to all zeros instead of dividing by zero.
        db_min = S_db.min()
        db_range = S_db.max() - db_min
        if db_range > 0:
            S_norm = (S_db - db_min) / db_range
        else:
            S_norm = np.zeros_like(S_db)

        # Convert to an 8-bit, 3-channel PIL image (to mimic RGB).
        S_img_array = (S_norm * 255).astype(np.uint8)
        S_pil = Image.fromarray(S_img_array).convert("RGB")

        audio_vector = _embed_image(S_pil)
        audio_metadata = {
            "type": "audio",
            "path": segment["path"],
            "start_sec": segment["start_sec"],
            "end_sec": segment["end_sec"]
        }
        all_vectors.append(audio_vector)
        all_metadata.append(audio_metadata)
    except Exception as e:
        # Best-effort: report the failing segment and continue with the rest.
        print(f"Error processing audio {segment['path']}: {e}")

print(f"Total assets indexed: {len(all_vectors)}")

# Stack all vectors into a single (n_assets, embed_dim) float32 array.
# np.vstack rejects an empty list, so fall back to an empty matrix of the
# right width when nothing could be embedded.
if all_vectors:
    index_vectors = np.vstack(all_vectors)
else:
    index_vectors = np.empty((0, embed_dim), dtype='float32')

In [None]:
# Create a FAISS index
# IndexFlatL2 is an exact (brute-force) index that ranks by squared Euclidean
# distance. Our vectors are L2-normalized, so ||a - b||^2 = 2 - 2*cos(a, b):
# the smallest L2 distance is the highest cosine similarity, which is the
# similarity CLIP uses. The reported distances are NOT cosine values themselves.
#
# The dimensionality is read from the stacked vectors so the index always
# matches what is actually being added.
index = faiss.IndexFlatL2(index_vectors.shape[1])

# Add all our vectors to the index; row i corresponds to all_metadata[i].
index.add(index_vectors)

print(f"FAISS index built with {index.ntotal} vectors.")

# You can save the index and metadata for later
# faiss.write_index(index, "my_video.index")
# with open("my_video_metadata.json", "w") as f:
#    json.dump(all_metadata, f)

In [None]:
import os
import base64
from IPython.display import display, HTML, Video

# --- Make sure all your previous imports are loaded ---
# import torch
# from transformers import CLIPProcessor, CLIPModel
# from PIL import Image
# import librosa
# import numpy as np
# import faiss
# import json

In [None]:
def search_video(text_query, k=5):
    """
    Embed a text query with CLIP, search the FAISS index, and return the top hits.

    Relies on the notebook globals `clip_processor`, `clip_model`, `device`,
    `index` and `all_metadata` created by the earlier cells.

    Args:
        text_query: Natural-language query string.
        k: Maximum number of results to return. Fewer are returned when the
           index holds fewer than `k` vectors.

    Returns:
        A list of dicts ordered from best to worst match, each with:
          "rank"     - 1-based position in the result list,
          "metadata" - the matching entry from `all_metadata`,
          "distance" - distance reported by the index, as a Python float
                       (smaller means more similar).
        The list is empty when the index is empty or `k` is not positive.
    """
    print(f"\nSearching for: '{text_query}'")

    # 1. Embed the text query. Truncation keeps over-long queries within the
    #    text encoder's maximum input length instead of raising.
    inputs = clip_processor(text=text_query, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        query_features = clip_model.get_text_features(**inputs)

    # 2. Normalize the query vector the same way the indexed vectors were.
    query_vector = query_features.cpu().numpy().astype('float32')
    faiss.normalize_L2(query_vector)

    # 3. Search the FAISS index. Never ask for more neighbours than exist:
    #    FAISS pads missing results with label -1, and all_metadata[-1] would
    #    silently return the LAST asset rather than fail.
    k = min(k, index.ntotal)
    if k <= 0:
        return []
    D, I = index.search(query_vector, k)

    # 4. Package up the results
    results = []
    print("Top results found. Generating clips...")
    for result_index, distance in zip(I[0], D[0]):
        if result_index < 0:
            # Defensive: skip padding labels if the index still returned any.
            continue
        results.append({
            "rank": len(results) + 1,
            "metadata": all_metadata[int(result_index)],
            # Plain float so the result can be JSON-serialized.
            "distance": float(distance)
        })

    return results



In [None]:
def display_search_results_with_clips(results, original_video_path, working_dir, clip_duration=5):
    """
    Generates and displays search results with embedded video clips.

    For every result a short MP4 is cut from the original video with FFmpeg,
    embedded into the page as a base64 data URI, and shown next to the result's
    rank, distance and description.

    Assumes 'ffmpeg' is installed and accessible in your system's PATH.

    Args:
        results: List of result dicts with "rank", "metadata" and "distance"
                 keys, as produced by `search_video`.
        original_video_path: Path of the source video the clips are cut from.
        working_dir: Existing directory where the clip files are written
                     (`search_result_rank_<rank>.mp4`, overwritten on each call).
        clip_duration: Clip length in seconds for "image" results, centred on
                       the keyframe timestamp. Text/audio results use their own
                       start/end times.

    Returns:
        None. The combined HTML is rendered with IPython's `display`.
    """
    # Imported locally so this cell does not depend on another cell's imports.
    import html
    import subprocess

    # One HTML fragment per result; joined once at the end.
    cards = []

    for res in results:
        metadata = res["metadata"]
        distance = res["distance"]
        rank = res["rank"]
        kind = metadata["type"]

        # --- 1. Determine Timeframe & Content ---
        # Every piece of data-derived text is HTML-escaped: transcripts and
        # file paths may contain characters such as '<' or '&'.
        if kind in ("text", "audio"):
            start_sec = metadata["start_sec"]
            duration = metadata["end_sec"] - metadata["start_sec"]
            content_info = f"<b>[{kind.upper()}]</b> at {start_sec:.2f}s"
            if kind == "text":
                content_info += f": '<i>{html.escape(metadata['content'])}</i>'"
        elif kind == "image":
            # For an image, create a clip centered on the timestamp
            start_sec = max(0, metadata["timestamp"] - (clip_duration / 2))
            duration = clip_duration
            content_info = (
                f"<b>[IMAGE]</b> at {metadata['timestamp']:.2f}s "
                f"(Source frame: {html.escape(str(metadata['path']))})"
            )
        else:
            # Unknown result type: there is no time window to cut, so show the
            # entry without a clip instead of reusing values from a previous
            # iteration (or failing with a NameError on the first one).
            start_sec = None
            duration = None
            content_info = f"<b>[{html.escape(str(kind).upper())}]</b> (unsupported result type)"

        # --- 2. Generate the Clip using FFmpeg ---
        if start_sec is None:
            video_html = "<p>No clip available for this result type.</p>"
        else:
            clip_filename = f"search_result_rank_{rank}.mp4"
            clip_output_path = os.path.join(working_dir, clip_filename)

            # FFmpeg arguments:
            # -y: Overwrite output file
            # -ss: Seek to start time
            # -i: Input file
            # -t: Duration of the clip
            # -c:v libx264: Re-encode video (safer for clips)
            # -preset ultrafast: Encode very quickly
            # -c:a aac: Re-encode audio
            # -vf scale=480:-2: Resize to 480px width, keep the aspect ratio and
            #     round the height to an even number (libx264 rejects odd sizes)
            # Passed as a list (no shell), so paths containing quotes or spaces
            # cannot break or alter the command.
            ffmpeg_command = [
                "ffmpeg", "-y",
                "-ss", str(start_sec),
                "-i", str(original_video_path),
                "-t", str(duration),
                "-c:v", "libx264", "-preset", "ultrafast",
                "-c:a", "aac",
                "-vf", "scale=480:-2",
                clip_output_path,
            ]

            try:
                # check=True turns a failed encode into an exception, so a stale
                # clip left over from an earlier search is never displayed.
                subprocess.run(
                    ffmpeg_command,
                    stdout=subprocess.DEVNULL,
                    stderr=subprocess.DEVNULL,
                    check=True,
                )

                # --- 3. Embed Video using Base64 ---
                # A data URI keeps the clip inside the notebook output itself.
                with open(clip_output_path, "rb") as f:
                    video_data = f.read()
                video_base64 = base64.b64encode(video_data).decode("utf-8")
                video_src = f"data:video/mp4;base64,{video_base64}"

                video_html = f'<video controls width="480" src="{video_src}" type="video/mp4">Your browser does not support the video tag.</video>'

            except (subprocess.CalledProcessError, OSError) as e:
                # OSError covers a missing ffmpeg binary and unreadable output.
                video_html = f"<p>Error generating video clip: {html.escape(str(e))}</p>"

        # --- 4. Build HTML for this result ---
        cards.append(f"""
        <div style="border: 1px solid #ccc; border-radius: 8px; padding: 16px; margin-bottom: 16px; display: flex; align-items: flex-start; flex-wrap: wrap; background-color: #f9f9f9;">
            <div style="flex: 1; min-width: 300px; padding-right: 16px;">
                <h3 style="margin-top:0;">Rank {rank} (Distance: {distance:.4f})</h3>
                <p>{content_info}</p>
            </div>
            <div style="flex-shrink: 0;">
                {video_html}
            </div>
        </div>
        """)

    html_output = "<div>" + "".join(cards) + "</div>"

    # Display the final combined HTML
    display(HTML(html_output))

In [None]:
# 'processed_media_data' must already be in memory. If it is not, reload it
# from the saved JSON first, for example:
# with open("path/to/your/asr_output.json", "r") as f:
#     processed_media_data = json.load(f)

# Both paths needed for clip generation are read from the main data object.
original_video_path = processed_media_data["original_file"]
working_dir = processed_media_data["working_dir"]

# Step 1: look up the five closest indexed assets for a free-text query.
query = "people falling"
search_results = search_video(text_query=query, k=5)

# Step 2: render the hits, each with a clip cut from the original video.
display_search_results_with_clips(
    results=search_results,
    original_video_path=original_video_path,
    working_dir=working_dir,
)

In [140]:
# Re-run the search and let the notebook echo the raw result dicts.
search_video(text_query=query, k=5)


Searching for: 'boooooom'
Top results found. Generating clips...


[{'rank': 1,
  'metadata': {'type': 'text',
   'content': " just as Sunday's was and so far touch would be. It's not been any dastardly, feel dastardly challenges. I'm not saying there's not going to be masherano on the ball plays it forward chabby in space turns plays inside doing the yesterday's seat back from and roads in the yesterday. Be gay now. Pedaling space on the full side. Come and tell us looking but I'm not sure you want to give pedal of that much space.",
   'start_sec': 1560.0,
   'end_sec': 1590.0},
  'distance': 0.5497296},
 {'rank': 2,
  'metadata': {'type': 'text',
   'content': " the first book in the evening. Kiddia there. It's also five Sunday. Oh, and it was the wonderful board forward there. I think there's Buschets. Clips it through for Pedro. They're beating again. I think it's going to be a",
   'start_sec': 1860.0,
   'end_sec': 1890.0},
  'distance': 0.5507071},
 {'rank': 3,
  'metadata': {'type': 'text',
   'content': " It's always the danger of course we 