# Aim: download a LibriSpeech subset (dev-clean), extract audio samples, and evaluate their quality to build the metadata CSV

## Libraries

In [1]:
!pip install pesq
!pip install pystoi
!pip install librosa
!pip install torchaudio
!pip install pandas
!pip install tqdm
!pip install datasets soundfile


Collecting pesq
  Using cached pesq-0.0.4-cp312-cp312-win_amd64.whl
Installing collected packages: pesq
Successfully installed pesq-0.0.4



[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


Collecting pystoi
  Using cached pystoi-0.4.1-py2.py3-none-any.whl.metadata (4.0 kB)
Using cached pystoi-0.4.1-py2.py3-none-any.whl (8.2 kB)
Installing collected packages: pystoi
Successfully installed pystoi-0.4.1



[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [2]:
import os
import torch  
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from pesq import pesq
from pystoi import stoi
import librosa.core
import soundfile as sf
import torchaudio
import random
import shutil

In [3]:
import subprocess
import sys
import re

def check_ffmpeg():
    """Report whether a working ``ffmpeg`` executable is reachable on PATH.

    Runs ``ffmpeg -version`` and prints the first line of its output. When that
    line contains ``git-`` (a git snapshot build), the function also checks
    that ``torchcodec`` is importable and, if it is not, tries to install it
    with ``pip`` for the current interpreter.

    Returns:
        bool: True when ``ffmpeg -version`` ran and exited with status 0;
        False when the executable is missing or the command failed. The
        outcome of the optional torchcodec installation does not change the
        return value; a failed installation is only reported on stdout.
    """
    # Only the launch of ffmpeg itself can legitimately mean "not on PATH",
    # so the try block is limited to that single call.
    try:
        result = subprocess.run(['ffmpeg', '-version'], capture_output=True, text=True)
    except FileNotFoundError:
        print("FFmpeg not found in PATH")
        return False

    if result.returncode != 0:
        # The executable exists but cannot even report its version.
        print(f"FFmpeg found but 'ffmpeg -version' failed (exit code {result.returncode})")
        return False

    version_str = result.stdout.split('\n')[0]
    print("FFmpeg found:", version_str)

    # Git snapshot builds carry 'git-' in their version banner.
    if 'git-' in version_str:
        print("Using FFmpeg git build")
        try:
            import torchcodec
            print(f"torchcodec version: {torchcodec.__version__}")
        except ImportError:
            print("Installing latest torchcodec...")
            install = subprocess.run([sys.executable, "-m", "pip", "install", "torchcodec"])
            if install.returncode != 0:
                # Surface the failure instead of silently pretending it worked.
                print(f"torchcodec installation failed (pip exit code {install.returncode})")
    return True

# Run the FFmpeg check once, then either ask for an installation or print
# basic interpreter/platform details.
ffmpeg_ok = check_ffmpeg()
if not ffmpeg_ok:
    print("Please install FFmpeg first")
else:
    print("System info:")
    print(f"Python: {sys.version.split()[0]}")
    print(f"OS: {sys.platform}")
    # NOTE(review): this note is printed whenever FFmpeg is found, whether or
    # not check_ffmpeg() actually detected a git build.
    print("\nNote: Using latest torchcodec with git build FFmpeg")

FFmpeg found: ffmpeg version 2025-02-13-git-19a2d26177-essentials_build-www.gyan.dev Copyright (c) 2000-2025 the FFmpeg developers
Using FFmpeg git build
Installing latest torchcodec...
System info:
Python: 3.12.7
OS: win32

Note: Using latest torchcodec with git build FFmpeg


# 1. Load dataset

We are going to use a subset of the LibriSpeech dataset, available via the torchaudio library.

In [4]:
# Download the LibriSpeech dev-clean split and export a fixed random sample of
# its utterances as WAV files under RAW_DIR.
DATA_DIR = "./data/librispeech_subset"
RAW_DIR = os.path.join(DATA_DIR, "raw")
os.makedirs(RAW_DIR, exist_ok=True)

N_SAMPLES = 50      # number of utterances to export (reduced from 100 to 50 for testing)
SAMPLE_SEED = 42    # fixed seed: every run selects the same utterances

try:
    # Downloads dev-clean into ./data on first use.
    dataset = torchaudio.datasets.LIBRISPEECH(
        root="./data",
        url="dev-clean",
        download=True
    )

    total_files = len(dataset)
    # A dedicated seeded generator makes the selection reproducible without
    # touching the global `random` state. Because file names are derived from
    # the dataset index, re-running the cell overwrites the same files instead
    # of adding a new random batch to RAW_DIR.
    sampler = random.Random(SAMPLE_SEED)
    sampled_indices = sampler.sample(range(total_files), min(N_SAMPLES, total_files))

    print(f"Saving {len(sampled_indices)} samples to {RAW_DIR}")
    for idx in tqdm(sampled_indices):
        try:
            # Only the waveform and its sample rate are needed; the remaining
            # fields of the dataset item are ignored.
            waveform, sample_rate, *_ = dataset[idx]

            # Save as WAV, named after the dataset index
            filename = f"librispeech_sample_{idx:04d}.wav"
            filepath = os.path.join(RAW_DIR, filename)
            torchaudio.save(filepath, waveform, sample_rate)

        except Exception as e:
            # Best effort: report the failing item and move on to the next one.
            print(f"Error processing file {idx}: {e}")

    print(f"Dataset prepared in {RAW_DIR}")

except Exception as e:
    print(f"Failed to load dataset: {e}")
    print("Please check your internet connection and try again.")

Saving 50 samples to ./data/librispeech_subset\raw


100%|██████████| 50/50 [00:00<00:00, 67.96it/s]

Dataset prepared in ./data/librispeech_subset\raw





# 2. Prepare audio files

In [5]:
# Search the raw-sample directory for WAV files and report how many were found.
AUDIO_DIR = RAW_DIR
wav_extensions = ["wav"]
files = librosa.util.find_files(AUDIO_DIR, ext=wav_extensions)

print(f"{len(files)} files found.")

  "cipher": algorithms.TripleDES,


50 files found.


  "class": algorithms.Blowfish,
  "class": algorithms.TripleDES,


# 3. Calculate audio quality metrics

In [6]:
def signal_to_noise_ratio(clean, noisy):
    """Return the signal-to-noise ratio of ``noisy`` relative to ``clean``, in dB.

    The noise is taken to be ``noisy - clean``. The ratio of the mean squared
    signal to the mean squared noise is converted to decibels; a small
    constant (1e-8) is added to the noise power so that identical inputs do
    not cause a division by zero.

    Args:
        clean: torch tensor holding the reference signal.
        noisy: torch tensor holding the degraded signal; it is subtracted
            element-wise from ``clean``'s counterpart.

    Returns:
        float: the SNR in decibels.
    """
    residual = noisy - clean
    power_clean = clean.pow(2).mean()
    power_residual = residual.pow(2).mean()
    ratio = power_clean / (power_residual + 1e-8)
    return (10 * torch.log10(ratio)).item()

In [7]:
# Quality thresholds (used by the filtering step that follows this cell)
MIN_DURATION = 2.0    # seconds; shorter clips are skipped
PESQ_MIN = 3.0
STOI_MIN = 0.9
SNR_MIN = 20.0        # dB

TARGET_SR = 16000     # PESQ requires 8000 or 16000 Hz
MAX_FILES = 100       # process at most this many files (testing cap)
NOISE_STD = 0.03      # standard deviation of the Gaussian noise added to each clip
NOISE_SEED = 0        # fixed seed so the synthetic noise, and thus the metrics, are reproducible

# Explicit column order: keeps the DataFrame schema stable even when no file
# produced a result (otherwise the frame would have no columns at all).
RESULT_COLUMNS = ["file", "duration_s", "pesq", "stoi", "snr"]

results = []
torch.manual_seed(NOISE_SEED)

for f in tqdm(files[:MAX_FILES]):
    try:
        # Load as mono and resample to the rate required by PESQ
        y, sr = librosa.load(f, sr=TARGET_SR, mono=True)
        if len(y) < sr * MIN_DURATION:
            continue  # ignore clips that are too short

        # Convert to torch tensor
        y = torch.from_numpy(y).float()

        # Peak normalization: scale so the largest absolute sample is ~1
        # (the epsilon avoids dividing by zero on an all-zero clip)
        y = y / (torch.abs(y).max() + 1e-8)

        # Create a degraded copy by adding Gaussian noise
        noise = torch.randn_like(y) * NOISE_STD
        degraded = y + noise

        # All three metrics compare the normalized clip with its noisy copy,
        # i.e. they score the synthetic degradation applied above.
        clean_np = y.numpy()
        degraded_np = degraded.numpy()
        pesq_score = pesq(TARGET_SR, clean_np, degraded_np, "wb")  # wb mode for 16 kHz
        stoi_score = stoi(clean_np, degraded_np, TARGET_SR, extended=False)
        snr_score = signal_to_noise_ratio(y, degraded)

        # NOTE(review): `f` is stored exactly as returned by find_files; the
        # recorded output shows absolute local paths, which makes the CSV
        # machine-specific.
        results.append({
            "file": f,
            "duration_s": len(y) / TARGET_SR,
            "pesq": pesq_score,
            "stoi": stoi_score,
            "snr": snr_score
        })
    except Exception as e:
        # Best effort: report the failing file and keep going.
        print(f"Error processing {f}: {e}")

df = pd.DataFrame(results, columns=RESULT_COLUMNS)


100%|██████████| 50/50 [00:09<00:00,  5.28it/s]


# 4. Filter and save metadata CSV

In [8]:
# Select every sample that falls below at least one of the quality thresholds.
low_quality_mask = (
    df["pesq"].lt(PESQ_MIN)
    | df["stoi"].lt(STOI_MIN)
    | df["snr"].lt(SNR_MIN)
)
df_filtered = df.loc[low_quality_mask].reset_index(drop=True)

print(f"Low quality samples: {len(df_filtered)} samples kept.")

# Write the selection to metadata.csv inside the dataset directory.
os.makedirs(DATA_DIR, exist_ok=True)
metadata_path = os.path.join(DATA_DIR, "metadata.csv")
df_filtered.to_csv(metadata_path, index=False)
print(f"Metadata saved: {metadata_path}")

# Preview the first rows
print(df_filtered.head())

Low quality samples: 50 samples kept.
Metadata saved: ./data/librispeech_subset\metadata.csv
                                                file  duration_s      pesq  \
0  c:\Users\leami\Documents\projet_git\GaussBuste...       4.815  1.148980   
1  c:\Users\leami\Documents\projet_git\GaussBuste...       3.180  1.138175   
2  c:\Users\leami\Documents\projet_git\GaussBuste...       8.660  1.148772   
3  c:\Users\leami\Documents\projet_git\GaussBuste...       2.155  1.299947   
4  c:\Users\leami\Documents\projet_git\GaussBuste...      15.695  1.146796   

       stoi        snr  
0  0.932619  12.802691  
1  0.882518  11.888739  
2  0.935420  14.071278  
3  0.919859  11.977802  
4  0.896932  11.026851  
