```
conda init
conda create -n get-wav-and-jpg python=3.10 -y
conda activate get-wav-and-jpg
conda install -c conda-forge ffmpeg
pip install ipykernel
```

In [1]:
import os
import random
import subprocess
from pathlib import Path

In [None]:
# Dataset root. The test split is used here because it is the smaller dataset.
ROOT = "H:/VoxCeleb2/vox2_test"

# Input videos are read from <ROOT>/mp4; extracted audio and frames are
# written to sibling trees <ROOT>/wav and <ROOT>/jpg.
MP4_DIR, WAV_DIR, JPG_DIR = (Path(ROOT, sub) for sub in ("mp4", "wav", "jpg"))

In [3]:
def ensure_dir(path: Path) -> None:
    """Create ``path`` (and any missing parents) as a directory.

    Calling this on a directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # exist_ok=True already covers the "directory is there" case, so no
    # separate exists() check is needed (that check was also racy, and it
    # silently accepted a regular file sitting where the directory should be).
    path.mkdir(parents=True, exist_ok=True)

In [5]:
def extract_wav(mp4_path: Path, wav_path: Path, sample_rate: int = 22050):
    """Extract the audio of ``mp4_path`` into a mono 16-bit PCM WAV file.

    The parent directory of ``wav_path`` is created if needed and an existing
    output file is overwritten.

    Args:
        mp4_path: Source video file.
        wav_path: Destination WAV file.
        sample_rate: Output sample rate in Hz (defaults to 22050).

    Returns:
        True if ffmpeg exited with status 0, False otherwise. A failure is
        also reported on stdout so it is visible in the notebook output.

    Raises:
        FileNotFoundError: If the ``ffmpeg`` executable cannot be found.
    """
    ensure_dir(wav_path.parent)
    cmd = [
        "ffmpeg",
        "-v", "error",       # keep stderr limited to actual errors
        "-y",                # overwrite
        "-i", str(mp4_path),
        "-vn",               # no video
        "-ac", "1",          # mono
        "-ar", str(sample_rate),  # resample
        "-acodec", "pcm_s16le",
        str(wav_path)
    ]
    result = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,   # never let ffmpeg read the kernel's stdin
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,     # captured so failures can be reported
        text=True,
        errors="replace",           # tolerate undecodable bytes in ffmpeg output
    )
    if result.returncode != 0:
        error_lines = result.stderr.strip().splitlines()
        reason = error_lines[-1] if error_lines else f"exit code {result.returncode}"
        print(f"ffmpeg failed to extract audio from {mp4_path}: {reason}")
        return False
    return True

In [6]:
def extract_random_jpg(mp4_path: Path, jpg_path: Path):
    """Save one frame taken at a random timestamp of ``mp4_path`` as an image.

    The clip duration is read with ffprobe, a timestamp is drawn uniformly
    while staying 0.5 s away from both ends (for clips shorter than 1 s the
    midpoint is used), and ffmpeg writes the frame at that position to
    ``jpg_path``. The parent directory of ``jpg_path`` is created if needed
    and an existing output file is overwritten.

    The timestamp comes from the module-level ``random`` generator; seed it
    (``random.seed(...)``) beforehand if reproducible frames are required.

    Args:
        mp4_path: Source video file.
        jpg_path: Destination image file.

    Returns:
        True if a frame was written, False if the duration could not be read
        or ffmpeg failed. Failures are also reported on stdout.

    Raises:
        FileNotFoundError: If ``ffprobe`` or ``ffmpeg`` cannot be found.
    """
    ensure_dir(jpg_path.parent)

    probe = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", str(mp4_path)],
        stdin=subprocess.DEVNULL, capture_output=True, text=True, errors="replace"
    )

    try:
        duration = float(probe.stdout.strip())
    except ValueError:
        # Empty output or a non-numeric value: there is no usable duration.
        duration = float("nan")

    # Rejects NaN (all comparisons are False), infinities, zero and negatives.
    if not 0.0 < duration < float("inf"):
        print(f"Could not read duration for {mp4_path}")
        return False

    # Stay 0.5 s away from both ends. For clips shorter than 1 s the margin
    # shrinks to half the duration so the seek position never lands past the
    # end of the clip.
    margin = min(0.5, duration / 2)
    random_time = random.uniform(margin, duration - margin)

    cmd = [
        "ffmpeg",
        "-v", "error",       # keep stderr limited to actual errors
        "-y",
        "-ss", str(random_time),
        "-i", str(mp4_path),
        "-frames:v", "1",    # capture 1 frame
        str(jpg_path)
    ]
    result = subprocess.run(
        cmd,
        stdin=subprocess.DEVNULL,
        stdout=subprocess.DEVNULL,
        stderr=subprocess.PIPE,
        text=True,
        errors="replace",
    )
    # The file check guards against ffmpeg exiting with status 0 without
    # having encoded any frame.
    if result.returncode != 0 or not jpg_path.is_file():
        error_lines = result.stderr.strip().splitlines()
        reason = error_lines[-1] if error_lines else f"exit code {result.returncode}"
        print(f"ffmpeg failed to extract a frame from {mp4_path}: {reason}")
        return False
    return True

In [8]:
def process_vox2():
    """Convert every MP4 under MP4_DIR into a WAV file and one random JPG frame.

    Each output keeps the video's path relative to MP4_DIR, re-rooted under
    WAV_DIR / JPG_DIR with the suffix swapped. Progress is printed for the
    first file and for every 100th file after that.
    """
    videos = [*MP4_DIR.rglob("*.mp4")]
    n_videos = len(videos)
    print(f"Found {n_videos} MP4 files.")

    for idx, video in enumerate(videos, start=1):
        # Mirror the input directory layout in the output trees.
        rel_path = video.relative_to(MP4_DIR)
        wav_target = WAV_DIR / rel_path.with_suffix(".wav")
        jpg_target = JPG_DIR / rel_path.with_suffix(".jpg")

        # Report sparingly: the very first item, then every 100th.
        should_report = idx == 1 or idx % 100 == 0
        if should_report:
            print(f"[{idx}/{n_videos}] Processing {video}")

        extract_wav(video, wav_target)
        extract_random_jpg(video, jpg_target)

    print("DONE.")

In [9]:
process_vox2()

Found 780 MP4 files.
[1/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00017\01dfn2spqyE\00001.mp4
[100/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00017\lZf1RB6l5Gs\00140.mp4
[200/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00061\46tdg4vE31g\00017.mp4
[300/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00061\jT6eew_nWz4\00168.mp4
[400/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00061\nrI2uwhFFto\00255.mp4
[500/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00081\5sk7wU0pjgU\00029.mp4
[600/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00081\oICidf_tyKI\00171.mp4
[700/780] Processing H:\VoxCeleb2\vox2_preprocess_test\mp4\id00081\wAAMEC1OsRc\00258.mp4
DONE.
