### Natural language media search
### Goals & scope
- Given one or many video/audio files and a natural-language query, return the most relevant segments with: start/end timestamps, transcript snippet, thumbnail(s), and a relevance score.
- **MVP scope** : Single-machine prototype that can index and search a handful of videos (hours of content).
- **Core capabilities** to learn: audio transcription and alignment, audio/visual/text embeddings, vector indexing (FAISS), multimodal retrieval & reranking.

In [4]:
# Required packages

In [1]:
# media_preprocessor.py
import ffmpeg
from pathlib import Path
from typing import List, Dict, Tuple, Optional
import math
import uuid
import shutil
import os
import json
import sys

# Optional VAD dependency
try:
    import webrtcvad
    import wave
    _VAD_AVAILABLE = True
except Exception:
    _VAD_AVAILABLE = False

class MediaPreprocessor:
    def __init__(self,
                 keyframe_interval: int = 5,
                 audio_segment_length: int = 30,
                 target_sample_rate: int = 16000,
                 frame_size: Tuple[int, int] = (224, 224),
                 temp_dir: str = "./media_processed",
                 target_lufs: float = -16.0,
                 use_vad: bool = False,
                 scene_detect: bool = False):
        """
        keyframe_interval: seconds between extracted keyframes (if scene_detect=False)
        audio_segment_length: seconds per chunk (used only if use_vad=False)
        target_sample_rate: e.g., 16000 for Whisper
        frame_size: (width, height) for resizing keyframes (224,224)
        temp_dir: root for all outputs (a per-file UUID subdir will be created)
        target_lufs: target loudness in LUFS for loudnorm (default -16)
        use_vad: if True, attempt VAD-based segmentation (requires webrtcvad)
        scene_detect: if True, use ffmpeg scene-change selection instead of fixed interval
        """
        self.keyframe_interval = int(keyframe_interval)
        self.audio_segment_length = int(audio_segment_length)
        self.target_sample_rate = int(target_sample_rate)
        self.frame_size = frame_size
        self.root_temp_dir = Path(temp_dir)
        self.root_temp_dir.mkdir(parents=True, exist_ok=True)
        self.target_lufs = float(target_lufs)
        self.use_vad = bool(use_vad) and _VAD_AVAILABLE
        self.scene_detect = bool(scene_detect)

        print("✅ MediaPreprocessor initialized:")
        print(f"  - Keyframe interval: {self.keyframe_interval}s")
        print(f"  - Audio segment length: {self.audio_segment_length}s")
        print(f"  - Target sample rate: {self.target_sample_rate}Hz")
        print(f"  - Frame size: {self.frame_size}")
        print(f"  - Root temp directory: {self.root_temp_dir.resolve()}")
        print(f"  - Target LUFS: {self.target_lufs}")
        print(f"  - VAD enabled: {self.use_vad} (webrtcvad available: {_VAD_AVAILABLE})")
        print(f"  - Scene-detect keyframes: {self.scene_detect}")

    # --------------------
    def _make_run_dir(self, file_path: Path) -> Path:
        """
        Create a unique directory for processing this file to avoid collisions.
        """
        uid = uuid.uuid4().hex[:8]
        out_dir = self.root_temp_dir / f"{file_path.stem}_{uid}"
        out_dir.mkdir(parents=True, exist_ok=True)
        return out_dir

    # --------------------
    def _get_duration(self, file_path: Path) -> float:
        probe = ffmpeg.probe(str(file_path))
        return float(probe['format']['duration'])

    # --------------------
    def _extract_audio(self, file_path: Path, out_dir: Path, normalize: bool = True) -> Path:
        """
        Extract audio as 16kHz mono PCM WAV (pcm_s16le).
        Uses loudnorm single-pass if normalize=True. For better accuracy you could
        run 2-pass loudnorm (analyze then apply measured params).
        """
        out_path = out_dir / f"{file_path.stem}_audio.wav"

        stream = ffmpeg.input(str(file_path))

        if normalize:
            # Simple single-pass loudnorm. For best fidelity: run two-pass approach (analyze, then apply).
            ffmpeg_stream = stream.filter('loudnorm', I=self.target_lufs, TP=-1.5, LRA=7)
        else:
            ffmpeg_stream = stream

        try:
            (
                ffmpeg_stream
                .output(str(out_path),
                        format='wav',
                        acodec='pcm_s16le',
                        ac=1,  # mono
                        ar=self.target_sample_rate)
                .overwrite_output()
                .run(quiet=True)
            )
        except ffmpeg.Error as e:
            # include stderr for debugging
            raise RuntimeError(f"ffmpeg failed extracting audio: {e.stderr.decode() if e.stderr else e}") from e

        return out_path

    # --------------------
    def _segment_audio_fixed(self, audio_path: Path, out_dir: Path) -> List[Dict]:
        """
        Fixed-length segmentation using ffmpeg segment (deterministic).
        Returns list of {path, start_sec, end_sec}
        """
        duration = self._get_duration(audio_path)
        segments = []
        num_chunks = math.ceil(duration / self.audio_segment_length)

        for i in range(num_chunks):
            start = i * self.audio_segment_length
            seg_len = min(self.audio_segment_length, max(0.0, duration - start))
            start_str = f"{start:.3f}"
            seg_name = f"{audio_path.stem}_chunk_{i:04d}_{int(start)}s.wav"
            chunk_path = out_dir / seg_name

            try:
                (
                    ffmpeg
                    .input(str(audio_path), ss=start, t=seg_len)
                    .output(str(chunk_path),
                            format='wav',
                            acodec='pcm_s16le',
                            ac=1,
                            ar=self.target_sample_rate)
                    .overwrite_output()
                    .run(quiet=True)
                )
            except ffmpeg.Error as e:
                raise RuntimeError(f"ffmpeg failed segmenting audio: {e.stderr.decode() if e.stderr else e}") from e

            segments.append({
                "path": str(chunk_path),
                "start_sec": float(start),
                "end_sec": float(start + seg_len)
            })

        return segments

    # --------------------
    def _segment_audio_vad(self, audio_path: Path, out_dir: Path, aggressiveness: int = 2) -> List[Dict]:
        """
        Very simple VAD-based segmentation using webrtcvad.
        Produces speech-only chunks by grouping contiguous voiced frames.
        Requirements: webrtcvad installed.
        Note: This is a basic implementation and may need tuning for production.
        """
        if not _VAD_AVAILABLE:
            print("webrtcvad not available; falling back to fixed segmentation.")
            return self._segment_audio_fixed(audio_path, out_dir)

        # read WAV
        with wave.open(str(audio_path), 'rb') as wf:
            sample_rate = wf.getframerate()
            assert wf.getnchannels() == 1, "VAD expects mono WAV"
            width = wf.getsampwidth()
            pcm = wf.readframes(wf.getnframes())

        vad = webrtcvad.Vad(aggressiveness)

        # frame size in ms. webrtcvad supports 10,20,30
        frame_ms = 30
        bytes_per_frame = int(sample_rate * (frame_ms / 1000.0) * width)
        frames = [pcm[i:i+bytes_per_frame] for i in range(0, len(pcm), bytes_per_frame)]

        voiced_flags = [False] * len(frames)
        for i, f in enumerate(frames):
            if len(f) < bytes_per_frame:
                # pad last frame
                f = f.ljust(bytes_per_frame, b'\0')
            try:
                voiced_flags[i] = vad.is_speech(f, sample_rate)
            except Exception:
                voiced_flags[i] = False

        # group contiguous voiced frames
        segments = []
        i = 0
        while i < len(voiced_flags):
            if voiced_flags[i]:
                start_frame = i
                while i < len(voiced_flags) and voiced_flags[i]:
                    i += 1
                end_frame = i - 1
                start_time = start_frame * (frame_ms / 1000.0)
                end_time = (end_frame + 1) * (frame_ms / 1000.0)
                seg_name = f"{audio_path.stem}_vad_{int(start_time)}s_{int(end_time)}s.wav"
                chunk_path = out_dir / seg_name
                seg_len = end_time - start_time
                # extract segment with ffmpeg for robust encoding
                try:
                    (
                        ffmpeg
                        .input(str(audio_path), ss=start_time, t=seg_len)
                        .output(str(chunk_path), format='wav', acodec='pcm_s16le', ac=1, ar=self.target_sample_rate)
                        .overwrite_output()
                        .run(quiet=True)
                    )
                except ffmpeg.Error as e:
                    raise RuntimeError(f"ffmpeg failed extracting VAD segment: {e.stderr.decode() if e.stderr else e}") from e

                segments.append({"path": str(chunk_path),
                                 "start_sec": float(start_time),
                                 "end_sec": float(end_time)})
            else:
                i += 1

        # if no voiced segments found, fallback to fixed
        if len(segments) == 0:
            return self._segment_audio_fixed(audio_path, out_dir)
        return segments

    # --------------------
    def _extract_keyframes(self, file_path: Path, out_dir: Path) -> List[Dict]:
        """
        Extract keyframes and produce list of {"path": str, "timestamp": float}.
        If scene_detect=True, extract frames only at scene changes; otherwise extract
        at fixed intervals (every keyframe_interval seconds).
        """
        frames = []
        if self.scene_detect:
            # Use scene-change detection
            out_pattern = str(out_dir / f"{file_path.stem}_frame_scene_%04d.jpg")
            try:
                (
                    ffmpeg
                    .input(str(file_path))
                    .filter("select", "gt(scene,0.4)")
                    .filter("scale", self.frame_size[0], self.frame_size[1])
                    .output(out_pattern, vsync="vfr", format='image2')
                    .overwrite_output()
                    .run(quiet=True)
                )
            except ffmpeg.Error as e:
                raise RuntimeError(f"ffmpeg scene-detect failed: {e.stderr.decode() if e.stderr else e}") from e

            # No exact timestamps provided; we will estimate by probing each frame file via ffmpeg.probe (slow),
            # or compute using ffmpeg select expression to include pts — for simplicity, estimate by reading
            # the creation order and mapping approximate times using frame count * keyframe_interval as fallback.
            frame_files = sorted(out_dir.glob(f"{file_path.stem}_frame_scene_*.jpg"))
            # try to read pts via ffprobe per frame — expensive; fallback to index*interval
            for idx, p in enumerate(frame_files):
                frames.append({"path": str(p), "timestamp": float(idx * self.keyframe_interval)})
            return frames

        # Fixed interval
        out_pattern = str(out_dir / f"{file_path.stem}_frame_%06d.jpg")
        try:
            # fps=1/N -> extract one frame every N seconds; ffmpeg expects fraction or float
            fps_value = 1.0 / max(1, self.keyframe_interval)
            (
                ffmpeg
                .input(str(file_path))
                .filter("fps", fps=fps_value)
                .filter("scale", self.frame_size[0], self.frame_size[1])
                .output(out_pattern, vsync="vfr", format='image2')
                .overwrite_output()
                .run(quiet=True)
            )
        except ffmpeg.Error as e:
            raise RuntimeError(f"ffmpeg fixed-interval keyframe extraction failed: {e.stderr.decode() if e.stderr else e}") from e

        frame_files = sorted(out_dir.glob(f"{file_path.stem}_frame_*.jpg"))
        # map each frame to a timestamp: frame_index * keyframe_interval (approximate)
        for idx, p in enumerate(frame_files):
            timestamp = float(idx * self.keyframe_interval)
            frames.append({"path": str(p), "timestamp": timestamp})

        return frames

    # --------------------
    def _process_video_file(self, file_path: Path) -> Dict:
        run_dir = self._make_run_dir(file_path)
        print(f"Processing video -> working directory: {run_dir}")

        audio_path = self._extract_audio(file_path, out_dir=run_dir, normalize=True)
        if self.use_vad:
            audio_segments = self._segment_audio_vad(audio_path, out_dir=run_dir)
        else:
            audio_segments = self._segment_audio_fixed(audio_path, out_dir=run_dir)

        keyframes = self._extract_keyframes(file_path, out_dir=run_dir)
        duration = self._get_duration(file_path)

        return {
            "file_type": "video",
            "original_file": str(file_path.resolve()),
            "working_dir": str(run_dir.resolve()),
            "audio_segments": audio_segments,
            "video_keyframes": keyframes,
            "metadata": {
                "duration": duration,
                "frame_size": self.frame_size,
                "sample_rate": self.target_sample_rate
            }
        }

    # --------------------
    def _process_audio_file(self, file_path: Path) -> Dict:
        run_dir = self._make_run_dir(file_path)
        print(f"Processing audio -> working directory: {run_dir}")

        audio_path = self._extract_audio(file_path, out_dir=run_dir, normalize=True)
        if self.use_vad:
            audio_segments = self._segment_audio_vad(audio_path, out_dir=run_dir)
        else:
            audio_segments = self._segment_audio_fixed(audio_path, out_dir=run_dir)

        duration = self._get_duration(audio_path)
        return {
            "file_type": "audio",
            "original_file": str(file_path.resolve()),
            "working_dir": str(run_dir.resolve()),
            "audio_segments": audio_segments,
            "metadata": {
                "duration": duration,
                "sample_rate": self.target_sample_rate
            }
        }

    # --------------------
    def process_media_file(self, file_path: str) -> Dict:
        file_path = Path(file_path)
        if not file_path.exists():
            raise FileNotFoundError(f"File not found: {file_path}")

        video_exts = {'.mp4', '.avi', '.mov', '.mkv', '.wmv', '.flv', '.webm'}
        audio_exts = {'.wav', '.mp3', '.flac', '.aac', '.ogg', '.m4a'}

        ext = file_path.suffix.lower()
        if ext in video_exts:
            return self._process_video_file(file_path)
        elif ext in audio_exts:
            return self._process_audio_file(file_path)
        else:
            raise ValueError(f"Unsupported file format: {ext}")

    # --------------------
    def cleanup_run_dir(self, run_dir: str):
        """
        Remove a previous working directory if needed.
        """
        p = Path(run_dir)
        if p.exists() and p.is_dir():
            shutil.rmtree(p)
            print(f"Removed working dir: {p}")

if __name__ == "__main__":
    # Quick CLI test (runs when the file is executed directly)
    if len(sys.argv) < 2:
        print("Usage: python media_preprocessor.py <path-to-media-file>")
        sys.exit(1)

    mp = MediaPreprocessor(use_vad=False, scene_detect=False)
    result = mp.process_media_file(sys.argv[1])
    print(json.dumps(result, indent=2))


✅ MediaPreprocessor initialized:
  - Keyframe interval: 5s
  - Audio segment length: 30s
  - Target sample rate: 16000Hz
  - Frame size: (224, 224)
  - Root temp directory: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed
  - Target LUFS: -16.0
  - VAD enabled: False (webrtcvad available: True)
  - Scene-detect keyframes: False


FileNotFoundError: File not found: -f

In [2]:
mymediaprocessor = MediaPreprocessor()



✅ MediaPreprocessor initialized:
  - Keyframe interval: 5s
  - Audio segment length: 30s
  - Target sample rate: 16000Hz
  - Frame size: (224, 224)
  - Root temp directory: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed
  - Target LUFS: -16.0
  - VAD enabled: False (webrtcvad available: True)
  - Scene-detect keyframes: False


In [4]:
import json

# --- Fix ---
# 1. Define your media file path directly
my_media_file = "vid1.mp4" 

# 2. Initialize the preprocessor
mp = MediaPreprocessor(use_vad=False, scene_detect=False)

try:
    # 3. Call the method with your path
    result = mp.process_media_file(my_media_file)
    print(json.dumps(result, indent=2))

except FileNotFoundError:
    print(f"Error: Could not find the file at: {my_media_file}")
except Exception as e:
    print(f"An error occurred: {e}")

✅ MediaPreprocessor initialized:
  - Keyframe interval: 5s
  - Audio segment length: 30s
  - Target sample rate: 16000Hz
  - Frame size: (224, 224)
  - Root temp directory: /Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed
  - Target LUFS: -16.0
  - VAD enabled: False (webrtcvad available: True)
  - Scene-detect keyframes: False
Processing video -> working directory: media_processed/vid1_c801eee7
{
  "file_type": "video",
  "original_file": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/vid1.mp4",
  "working_dir": "/Users/pratyushkhanal/Desktop/seniorseminar/media_nlp/media_processed/vid1_c801eee7",
  "audio_segments": [
    {
      "path": "media_processed/vid1_c801eee7/vid1_audio_chunk_0000_0s.wav",
      "start_sec": 0.0,
      "end_sec": 30.0
    },
    {
      "path": "media_processed/vid1_c801eee7/vid1_audio_chunk_0001_30s.wav",
      "start_sec": 30.0,
      "end_sec": 60.0
    },
    {
      "path": "media_processed/vid1_c801eee7/vid1_audio_chunk_000

## Implementation of parrallel ASR implementation

In [None]:
import os
import multiprocessing
import json
from pathlib import Path
from typing import List, Dict
from multiprocessing import Pool, cpu_count
import time # For simulating work

# --- Mock imports for required external libraries (Install these in a real project) ---
try:
    import torch
    # from transformers import CLIPProcessor, CLIPModel # Actual imports
    # import whisper
    _ML_AVAILABLE = True
except ImportError:
    print("Warning: torch/ML libraries not found. Using pure mock logic.")
    _ML_AVAILABLE = False
# ----------------------------------------------------------------------------------

# ==============================================================================
# ML MOCK FUNCTIONS
# Replace these with actual Whisper and CLIP model loading and inference logic
# ==============================================================================

# Model state to avoid reloading in child processes (though best practice is to load in each worker or use joblib)
# For simplicity with multiprocessing.Pool.map, we keep this simple.
_WHISPER_MODEL = None 
_CLIP_PROCESSOR = None
_CLIP_MODEL = None

def _load_whisper_model(model_name: str = "base"):
    """Mocks Whisper model loading."""
    global _WHISPER_MODEL
    if _WHISPER_MODEL is None:
        print(f"Loading MOCK Whisper model: {model_name}...")
        # In real code: _WHISPER_MODEL = whisper.load_model(model_name)
        _WHISPER_MODEL = {"model": f"whisper_{model_name}"}
    return _WHISPER_MODEL

def _load_clip_assets(model_name: str = "openai/clip-vit-base-patch32"):
    """Mocks CLIP model and processor loading."""
    global _CLIP_PROCESSOR, _CLIP_MODEL
    if _CLIP_MODEL is None:
        print(f"Loading MOCK CLIP assets: {model_name}...")
        # In real code: 
        # _CLIP_PROCESSOR = CLIPProcessor.from_pretrained(model_name)
        # _CLIP_MODEL = CLIPModel.from_pretrained(model_name)
        # _CLIP_MODEL.to("cuda" if torch.cuda.is_available() else "cpu")
        _CLIP_PROCESSOR = {"processor": "clip_proc"}
        _CLIP_MODEL = {"model": "clip_model"}
    return _CLIP_PROCESSOR, _CLIP_MODEL

def _run_whisper_inference_worker(chunk_path: str) -> Dict:
    """
    Worker function for parallel ASR. Processes a single audio chunk.
    NOTE: In a real scenario, you'd load the model inside the worker 
    or use `joblib.Parallel` with a shared model, or pass the model config.
    """
    try:
        # model = _load_whisper_model() # Reloading the model is often needed for processes
        
        # --- Mock Transcription Logic ---
        print(f"  [ASR Worker] Transcribing chunk: {Path(chunk_path).name}")
        time.sleep(1) # Simulate work
        
        transcription_text = f"This is the transcribed audio from segment starting at {Path(chunk_path).stem.split('_')[-1]}."
        
        return {
            "chunk_path": chunk_path,
            "transcription": transcription_text,
            "success": True
        }
    except Exception as e:
        print(f"Error transcribing {chunk_path}: {e}")
        return {
            "chunk_path": chunk_path,
            "transcription": "",
            "success": False,
            "error": str(e)
        }


# ==============================================================================
# MVP PHASE 1 IMPLEMENTATION
# ==============================================================================

class ProcessingPipeline:
    
    # --------------------------------------------------------------------------
    # 1. Parallel ASR Implementation (Highest Priority)
    # --------------------------------------------------------------------------
    
    @staticmethod
    def parallel_asr_transcription(audio_segments: List[Dict], model_name: str = "base", max_processes: int = 4) -> List[Dict]:
        """
        Uses Python's multiprocessing Pool to transcribe audio chunks in parallel.
        """
        print("🚀 Starting Parallel ASR Transcription...")
        
        chunk_paths = [seg['path'] for seg in audio_segments]
        
        # Limit the number of processes to prevent system overload
        num_processes = min(max_processes, cpu_count())
        print(f"Using {num_processes} parallel processes for ASR.")

        with Pool(processes=num_processes) as pool:
            # map applies the worker function to every chunk_path in the list
            results = pool.map(_run_whisper_inference_worker, chunk_paths)

        # Combine results back into the segment structure
        transcribed_segments = []
        for segment in audio_segments:
            result = next(r for r in results if r['chunk_path'] == segment['path'])
            
            segment['asr_result'] = {
                "transcription": result.get('transcription', ''),
                "success": result['success']
            }
            transcribed_segments.append(segment)

        print("✅ Parallel ASR Transcription Complete.")
        return transcribed_segments

    # --------------------------------------------------------------------------
    # 2. Scalable Embedding Extraction (Memory Management)
    # --------------------------------------------------------------------------
    
    @staticmethod
    def scalable_embedding_extraction(keyframes: List[Dict], batch_size: int = 16):
        """
        Processes keyframes in fixed batches to prevent GPU memory overflow 
        during CLIP embedding computation.
        """
        if not keyframes:
            return []

        print(f"🖼️ Starting Scalable Embedding Extraction with batch size: {batch_size}...")
        
        # Load ML assets (once) - MOCK
        _load_clip_assets() 
        # device = "cuda" if torch.cuda.is_available() else "cpu"
        # frame_width, frame_height = keyframes[0].get("frame_size", (224, 224)) # If metadata was stored
        
        embeddings = []
        
        # Iterate through keyframes in fixed batches
        for i in range(0, len(keyframes), batch_size):
            batch = keyframes[i:i + batch_size]
            
            # 1. Load images for the current batch (PIL.Image or similar)
            # image_paths = [frame['path'] for frame in batch]
            # images = [Image.open(p).convert("RGB") for p in image_paths]
            
            print(f"  [Embedding] Processing batch {i//batch_size + 1}/{math.ceil(len(keyframes)/batch_size)} ({len(batch)} frames)")
            time.sleep(0.5) # Simulate work
            
            try:
                # 2. Pre-process and move to device
                # inputs = _CLIP_PROCESSOR(images=images, return_tensors="pt", padding=True).to(device)
                
                # 3. Compute embeddings
                # with torch.no_grad():
                    # image_features = _CLIP_MODEL.get_image_features(**inputs)
                    # batch_embeddings = image_features / image_features.norm(p=2, dim=-1, keepdim=True)
                    # batch_embeddings_list = batch_embeddings.cpu().numpy().tolist()

                # --- Mock Embedding Logic ---
                mock_embedding_size = 512 
                batch_embeddings_list = [
                    [round(float(i + j + k*10) / 1000, 6) for j in range(mock_embedding_size)] 
                    for k in range(len(batch))
                ]
                
                # 4. Store results
                for frame, embedding in zip(batch, batch_embeddings_list):
                    frame['embedding'] = embedding
                    embeddings.append(frame)

            except Exception as e:
                print(f"Error processing batch starting at index {i}: {e}. Skipping batch.")
                # Append frames without embeddings to preserve structure
                embeddings.extend(batch) 
                continue

        print("✅ Scalable Embedding Extraction Complete.")
        return embeddings

    # --------------------------------------------------------------------------
    # Orchestrator
    # --------------------------------------------------------------------------
    
    @classmethod
    def run_pipeline(cls, preprocessor_result: Dict) -> Dict:
        """
        Orchestrates the MVP Phase 1 tasks using the preprocessor's output.
        """
        print("\n--- MVP Phase 1 Processing Started ---")
        
        # 1. Parallel ASR (runs on audio segments)
        if preprocessor_result.get("audio_segments"):
            transcribed_segments = cls.parallel_asr_transcription(
                audio_segments=preprocessor_result["audio_segments"], 
                model_name="base" 
            )
            preprocessor_result["audio_segments"] = transcribed_segments
        else:
            print("Skipping ASR: No audio segments found.")

        print("\n" + "-"*30 + "\n")

        # 2. Scalable Embedding Extraction (runs on keyframes)
        if preprocessor_result.get("video_keyframes"):
            processed_keyframes = cls.scalable_embedding_extraction(
                keyframes=preprocessor_result["video_keyframes"], 
                batch_size=16
            )
            preprocessor_result["video_keyframes"] = processed_keyframes
        else:
            print("Skipping Embedding Extraction: No video keyframes found.")
            
        print("\n--- MVP Phase 1 Processing Complete ---")
        return preprocessor_result

MEDIA_FILE_PATH = "vid1.mp4" 
# e.g., MEDIA_FILE_PATH = "/Users/username/Videos/sample.mp4" 
# --------------------------

if MEDIA_FILE_PATH == "path/to/your/media/file.mp4":
    print("🛑 Please update the 'MEDIA_FILE_PATH' variable with a valid file path before running.")
else:
    try:
        print(f"Starting processing for file: {MEDIA_FILE_PATH}")
        
        # 1. Run Preprocessing
        mp = MediaPreprocessor(keyframe_interval=5, audio_segment_length=30, use_vad=False, scene_detect=False)
        preprocessor_output = mp.process_media_file(MEDIA_FILE_PATH)
        
        print("\n--- Preprocessing Output Summary ---")
        print(f"Working Directory: {preprocessor_output['working_dir']}")
        print(f"Segments: {len(preprocessor_output.get('audio_segments', []))}, Frames: {len(preprocessor_output.get('video_keyframes', []))}")

        # 2. Run the MVP Pipeline
        final_result = ProcessingPipeline.run_pipeline(preprocessor_output)
        
        # 3. Output Final Result
        print("\n--- Final Consolidated Result (Summary) ---")
        # Ensure we only print the first few items for brevity
        result_summary = {
            "file_type": final_result["file_type"],
            "working_dir": final_result["working_dir"],
            "audio_segments_head": final_result.get("audio_segments", [])[:2],
            "video_keyframes_head": final_result.get("video_keyframes", [])[:2]
        }
        print(json.dumps(result_summary, indent=2))
        
        # Optional: Cleanup the temporary directory
        # mp.cleanup_run_dir(preprocessor_output["working_dir"]) 

    except FileNotFoundError as e:
        print(f"\n! Error: {e}. Check the 'MEDIA_FILE_PATH'.")
    except RuntimeError as e:
        print(f"\n! Error during FFmpeg operations: {e}. Check your FFmpeg installation and file integrity.")
    except Exception as e:
        print(f"\n! An unexpected error occurred: {e}")

In [2]:
import yt_dlp

In [3]:
help(yt_dlp)

Help on package yt_dlp:

NAME
    yt_dlp

PACKAGE CONTENTS
    YoutubeDL
    __main__
    __pyinstaller (package)
    aes
    cache
    compat (package)
    cookies
    dependencies (package)
    downloader (package)
    extractor (package)
    globals
    jsinterp
    minicurses
    networking (package)
    options
    plugins
    postprocessor (package)
    socks
    update
    utils (package)
    version
    webvtt

CLASSES
    builtins.object
        yt_dlp.YoutubeDL.YoutubeDL
    
    class YoutubeDL(builtins.object)
     |  YoutubeDL(params=None, auto_init=True)
     |  
     |  YoutubeDL class.
     |  
     |  YoutubeDL objects are the ones responsible of downloading the
     |  actual video file and writing it to disk if the user has requested
     |  it, among some other tasks. In most cases there should be one per
     |  program. As, given a video URL, the downloader doesn't know how to
     |  extract all the needed information, task that InfoExtractors do, it
     |  has 

In [None]:
y