```
conda init
conda create -n get-wav-and-jpg python=3.10 -y
conda activate get-wav-and-jpg
conda install -c conda-forge ffmpeg
pip install ipykernel
```

In [2]:
import os
import random
import subprocess
from pathlib import Path

In [3]:
# Dataset root. The VoxCeleb2 *test* split is used because it is the smaller one.
ROOT = "../VoxCeleb2/vox2_test"

# Input videos and the two mirrored output trees, all directly under ROOT.
MP4_DIR = Path(ROOT, "mp4")
WAV_DIR = Path(ROOT, "wav")
JPG_DIR = Path(ROOT, "jpg")

In [4]:
def ensure_dir(path: Path):
    """Create the directory ``path``, including any missing parents.

    Calling this for a directory that already exists is a no-op.

    Args:
        path: Directory to create.

    Raises:
        FileExistsError: If ``path`` exists but is not a directory.
    """
    # mkdir(exist_ok=True) is already idempotent; a separate exists() check
    # only adds a check-then-create race and hides the "path is a file" case.
    path.mkdir(parents=True, exist_ok=True)

In [5]:
def extract_wav(mp4_path: Path, wav_path: Path):
    """Extract the audio of ``mp4_path`` into ``wav_path`` using ffmpeg.

    The audio is converted to mono, resampled to 22050 Hz and encoded as
    16-bit little-endian PCM. If ``wav_path`` already exists nothing is done.

    ffmpeg writes to a temporary sibling file that is renamed to ``wav_path``
    only after ffmpeg succeeded, so a failed or timed-out run never leaves a
    truncated WAV behind that a later run would skip as "already done".

    Args:
        mp4_path: Source video file.
        wav_path: Destination WAV file; its parent directory is created.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        subprocess.TimeoutExpired: If ffmpeg runs longer than 30 seconds.
    """
    ensure_dir(wav_path.parent)
    if wav_path.exists():  # Skip if already exists
        return

    # Keep the original suffix at the end of the temporary name so ffmpeg
    # still infers the output container from the file extension.
    tmp_path = wav_path.with_name(f"{wav_path.stem}.part{wav_path.suffix}")

    cmd = [
        "ffmpeg",
        "-y",                # overwrite (e.g. a stale temporary file)
        "-i", str(mp4_path),
        "-vn",               # no video
        "-ac", "1",          # mono
        "-ar", "22050",      # resample
        "-acodec", "pcm_s16le",
        str(tmp_path)
    ]

    try:
        subprocess.run(
            cmd,
            check=True,
            stdin=subprocess.DEVNULL,   # ffmpeg must not read the kernel's stdin
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30
        )
        tmp_path.replace(wav_path)
    except BaseException:
        # Also covers a kernel interrupt: never leave a partial file around.
        tmp_path.unlink(missing_ok=True)
        raise

In [6]:
def extract_random_jpg(mp4_path: Path, jpg_path: Path):
    """Save one frame of ``mp4_path``, taken at a random time, as ``jpg_path``.

    The clip duration is read with ffprobe. The frame time is drawn uniformly
    from ``[0.5, duration - 0.5]``; for clips of one second or less the
    midpoint of the clip is used instead so the seek never lands past the
    end. If ``jpg_path`` already exists nothing is done.

    ffmpeg writes to a temporary sibling file that is renamed to ``jpg_path``
    only after ffmpeg succeeded, so a failed run cannot leave a broken image
    that a later run would skip.

    If the duration cannot be parsed, or ffmpeg succeeds without writing a
    frame, a message is printed and the function returns without output.

    Args:
        mp4_path: Source video file.
        jpg_path: Destination JPEG file; its parent directory is created.

    Raises:
        subprocess.CalledProcessError: If ffmpeg exits with a non-zero status.
        subprocess.TimeoutExpired: If ffprobe or ffmpeg runs longer than
            30 seconds.
    """
    ensure_dir(jpg_path.parent)
    if jpg_path.exists():  # Skip if already exists
        return

    probe = subprocess.run(
        ["ffprobe", "-v", "error", "-show_entries", "format=duration",
         "-of", "default=noprint_wrappers=1:nokey=1", str(mp4_path)],
        capture_output=True, text=True,
        timeout=30   # same limit as the ffmpeg call; never hang the loop
    )

    try:
        duration = float(probe.stdout.strip())
    except ValueError:
        # Empty or non-numeric ffprobe output (e.g. "N/A").
        print(f"Could not read duration for {mp4_path}")
        return

    margin = 0.5  # seconds kept clear of both ends of the clip
    if duration > 2 * margin:
        random_time = random.uniform(margin, duration - margin)
    else:
        # Clip too short for the margins: take the middle so that the seek
        # position stays inside the clip.
        random_time = max(0.0, duration / 2)

    # Keep the original suffix at the end of the temporary name so ffmpeg
    # still infers the image format from the file extension.
    tmp_path = jpg_path.with_name(f"{jpg_path.stem}.part{jpg_path.suffix}")

    cmd = [
        "ffmpeg",
        "-y",
        "-ss", str(random_time),
        "-i", str(mp4_path),
        "-vframes", "1",     # capture 1 frame
        str(tmp_path)
    ]

    try:
        subprocess.run(
            cmd,
            check=True,
            stdin=subprocess.DEVNULL,   # ffmpeg must not read the kernel's stdin
            stdout=subprocess.DEVNULL,
            stderr=subprocess.DEVNULL,
            timeout=30
        )
        if not tmp_path.exists():
            # ffmpeg reported success but decoded no frame at that position.
            print(f"No frame could be extracted from {mp4_path}")
            return
        tmp_path.replace(jpg_path)
    except BaseException:
        # Also covers a kernel interrupt: never leave a partial file around.
        tmp_path.unlink(missing_ok=True)
        raise

In [7]:
def process_vox2(speaker_id: str):
    """Extract a WAV and one JPG frame for every MP4 of a single speaker.

    All ``*.mp4`` files below ``MP4_DIR / speaker_id`` are visited. Each
    output is written to the same relative location under ``WAV_DIR`` and
    ``JPG_DIR`` with the suffix changed to ``.wav`` / ``.jpg``.

    A progress line is printed for the first file and for every 100th file.
    An exception raised while handling one file is printed and processing
    moves on to the next file.

    Args:
        speaker_id: Name of the speaker folder inside ``MP4_DIR``.
    """
    search_dir = MP4_DIR / speaker_id
    if not search_dir.exists():
        print(f"Speaker folder does not exist: {search_dir}")
        return

    # Collect every clip of this speaker, at any depth.
    mp4_files = [*search_dir.rglob("*.mp4")]
    if not mp4_files:
        print("No MP4 files found in this speaker folder.")
        return
    total = len(mp4_files)

    for i, mp4_path in enumerate(mp4_files, start=1):
        # Mirror the clip's location (relative to MP4_DIR) in both output trees.
        rel_path = mp4_path.relative_to(MP4_DIR)
        wav_target = (WAV_DIR / rel_path).with_suffix(".wav")
        jpg_target = (JPG_DIR / rel_path).with_suffix(".jpg")

        # Keep the log short: report only the first item and every 100th.
        if i == 1 or i % 100 == 0:
            print(f"[{i}/{total}] Processing {mp4_path}")

        try:
            extract_wav(mp4_path, wav_target)
            extract_random_jpg(mp4_path, jpg_target)
        except Exception as e:
            # Report the failure and carry on with the remaining files.
            print(f"ERROR on file {mp4_path} -> {e}")

    print("DONE.")

In [8]:
# Every sub-directory directly under MP4_DIR is treated as one speaker ID.
# The list is sorted because directory listing order is arbitrary; a fixed
# order keeps the "speaker i/N" progress lines comparable between runs.
speaker_ids = sorted(
    entry.name for entry in MP4_DIR.iterdir()
    if entry.is_dir()
)
print(len(speaker_ids), "speakers found.")

118 speakers found.


In [None]:
# Process every speaker in turn. The extractors skip outputs that already
# exist, so re-running this cell continues where an interrupted run stopped.
total_speakers = len(speaker_ids)
for position, speaker_id in enumerate(speaker_ids, start=1):
    print(f"Processing speaker {position}/{total_speakers}: {speaker_id}")
    process_vox2(speaker_id)

Processing speaker 116/118: id08701
[1/500] Processing ..\VoxCeleb2\vox2_test\mp4\id08701\61Al05HARgA\00001.mp4
[100/500] Processing ..\VoxCeleb2\vox2_test\mp4\id08701\AbRetKmm0_8\00100.mp4
[200/500] Processing ..\VoxCeleb2\vox2_test\mp4\id08701\eh_ZROTTUiY\00298.mp4
[300/500] Processing ..\VoxCeleb2\vox2_test\mp4\id08701\nb6GRAMAOQI\00372.mp4
[400/500] Processing ..\VoxCeleb2\vox2_test\mp4\id08701\WSQHS8_XjEw\00218.mp4
[500/500] Processing ..\VoxCeleb2\vox2_test\mp4\id08701\_Ysb9mVibbk\00254.mp4
DONE.
Processing speaker 117/118: id08911
[1/97] Processing ..\VoxCeleb2\vox2_test\mp4\id08911\1R-KmZoaX8A\00001.mp4
DONE.
Processing speaker 118/118: id09017
[1/360] Processing ..\VoxCeleb2\vox2_test\mp4\id09017\1ItrXArzN_A\00001.mp4
[100/360] Processing ..\VoxCeleb2\vox2_test\mp4\id09017\iTefLRE0Y_o\00199.mp4
[200/360] Processing ..\VoxCeleb2\vox2_test\mp4\id09017\PLNK1g5w4FY\00103.mp4
[300/360] Processing ..\VoxCeleb2\vox2_test\mp4\id09017\X8FqYhoc2xU\00144.mp4
DONE.
