# AIM : Download VoxCeleb1, extract audio samples and evaluate their quality to build the metadata csv

## Libraries

In [16]:
!pip install pesq
!pip install pystoi
!pip install librosa
!pip install torchaudio
!pip install pandas
!pip install tqdm
!pip install datasets soundfile





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip





[notice] A new release of pip is available: 25.0 -> 25.3
[notice] To update, run: python.exe -m pip install --upgrade pip


In [17]:
import os
import torch  
import librosa
import numpy as np
import pandas as pd
from tqdm import tqdm
from pesq import pesq
from pystoi import stoi
import librosa.core
import soundfile as sf
import torchaudio
import random
import shutil

# 1. load dataset

We are going to use a subset of LibriSpeech dataset available via torchaudio library.

In [18]:
# Use dev-clean (smallest subset) and handle the loading more carefully
DATA_DIR = "./data/librispeech_subset"
RAW_DIR = os.path.join(DATA_DIR, "raw")
os.makedirs(RAW_DIR, exist_ok=True)

try:
    # Download dev-clean (smallest subset, ~337MB)
    dataset = torchaudio.datasets.LIBRISPEECH(
        root="./data",
        url="dev-clean",  # smallest subset
        download=True
    )
    
    # Take a small random sample
    N_SAMPLES = 2000
    total_files = len(dataset)
    sampled_indices = random.sample(range(total_files), min(N_SAMPLES, total_files))
    
    print(f"Saving {len(sampled_indices)} samples to {RAW_DIR}")
    for idx in tqdm(sampled_indices):
        try:
            # Load audio
            waveform, sample_rate, _, _, _, _ = dataset[idx]
            
            # Save as WAV
            filename = f"librispeech_sample_{idx:04d}.wav"
            filepath = os.path.join(RAW_DIR, filename)
            torchaudio.save(filepath, waveform, sample_rate)
            
        except Exception as e:
            print(f"Error processing file {idx}: {e}")
            continue
            
    print(f"Dataset prepared in {RAW_DIR}")
    
except Exception as e:
    print(f"Failed to load dataset: {e}")
    print("Please check your internet connection and try again.")

Saving 2000 samples to ./data/librispeech_subset\raw


100%|██████████| 2000/2000 [00:24<00:00, 83.30it/s] 

Dataset prepared in ./data/librispeech_subset\raw





# 2. Prepare audiofiles

In [19]:
AUDIO_DIR = RAW_DIR
files = librosa.util.find_files(AUDIO_DIR, ext=["wav"])

print(f"{len(files)} files found.")

2000 files found.


# 3. Calculate audio quality metrics

In [20]:
def signal_to_noise_ratio(clean, noisy):
    """Calculate Signal-to-Noise Ratio in dB."""
    noise = noisy - clean
    signal_power = torch.mean(clean ** 2)
    noise_power = torch.mean(noise ** 2)
    snr = 10 * torch.log10(signal_power / (noise_power + 1e-8))
    return snr.item()

In [26]:
# thresholds - adjust these to be more reasonable
MIN_DURATION = 2.0    # seconds
PESQ_MIN = 2.5       # To adapt, according to precision of PESQ
STOI_MIN = 0.85       # closer to 1.0 meaning better intelligibility
SNR_MIN = 10.0       # Need higher SNR for better quality

results = []
TARGET_SR = 16000

for f in tqdm(files):
    try:
        # Load and resample to required rate for PESQ
        y, sr = librosa.load(f, sr=TARGET_SR, mono=True)
        if len(y) < sr * MIN_DURATION:
            continue

        # Convert to torch tensor and normalize
        y = torch.from_numpy(y).float()
        y = y / (torch.abs(y).max() + 1e-8)

        # Instead of adding noise, use the original audio
        # Calculate background noise level from silent segments
        frame_length = 2048
        hop_length = 512
        rms = librosa.feature.rms(y=y.numpy(), frame_length=frame_length, hop_length=hop_length)
        noise_floor = np.percentile(rms, 10)  # estimate noise floor from quietest segments
        
        # Calculate SNR using estimated noise floor
        signal_power = torch.mean(y ** 2)
        snr_score = 10 * torch.log10(signal_power / (noise_floor + 1e-8))

        # Calculate PESQ and STOI using clean audio as reference
        # This assumes the original audio is clean enough
        pesq_score = pesq(TARGET_SR, y.numpy(), y.numpy(), "wb")
        stoi_score = stoi(y.numpy(), y.numpy(), TARGET_SR, extended=False)

        results.append({
            "file": f,
            "duration_s": len(y) / TARGET_SR,
            "pesq": pesq_score,
            "stoi": stoi_score,
            "snr": snr_score.item()
        })
    except Exception as e:
        print(f"Error processing {f}: {e}")

df = pd.DataFrame(results)

# Filter to keep only higher quality samples
df_filtered = df[
    (df["pesq"] >= PESQ_MIN) &
    (df["stoi"] >= STOI_MIN) &
    (df["snr"] >= SNR_MIN)
].reset_index(drop=True)

100%|██████████| 2000/2000 [36:54<00:00,  1.11s/it]    


# 4. Filter and save metadata CSV

In [28]:
print(f"Low quality samples: {len(df_filtered)} samples kept.")

os.makedirs(DATA_DIR, exist_ok=True)
metadata_path = os.path.join(DATA_DIR, "metadata.csv")
df_filtered.to_csv(metadata_path, index=False)
print(f"Metadata saved: {metadata_path}")

# Copy selected files to a new directory
AUDIO_DIR_HIGH_QUALITY = os.path.join(DATA_DIR, "high_quality_audio")
os.makedirs(AUDIO_DIR_HIGH_QUALITY, exist_ok=True)
for _, row in df_filtered.iterrows():
    src_path = row["file"]
    filename = os.path.basename(src_path)
    dest_path = os.path.join(AUDIO_DIR_HIGH_QUALITY, filename)
    shutil.copy2(src_path, dest_path)


# Display first few entries
print(df_filtered.head())

Low quality samples: 501 samples kept.
Metadata saved: ./data/librispeech_subset\metadata.csv
                                                file  duration_s      pesq  \
0  c:\Users\leami\Documents\projet_git\GaussBuste...    5.855000  4.643888   
1  c:\Users\leami\Documents\projet_git\GaussBuste...    4.815000  4.643888   
2  c:\Users\leami\Documents\projet_git\GaussBuste...    9.240000  4.643888   
3  c:\Users\leami\Documents\projet_git\GaussBuste...    2.444937  4.643888   
4  c:\Users\leami\Documents\projet_git\GaussBuste...    6.535000  4.643888   

   stoi        snr  
0   1.0  11.052586  
1   1.0  10.639106  
2   1.0  10.487404  
3   1.0  10.181086  
4   1.0  12.612596  
