In [1]:
# Imports

import os
import torchaudio
import torchaudio.transforms as T
import librosa
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from pathlib import Path
import torch

In [52]:
# Locate the project root from where this notebook lives.
# When `__file__` is defined, anchor on that file's directory; when it is not
# (e.g. in an interactive kernel), fall back to the current working directory.
if '__file__' in globals():
    NOTEBOOK_DIR = Path(__file__).resolve().parent
else:
    NOTEBOOK_DIR = Path().resolve()

# The project root is taken to be the directory one level above the notebook.
PROJECT_ROOT = NOTEBOOK_DIR.parent

VCTK_ROOT = PROJECT_ROOT.joinpath("data", "raw", "VCTK-Corpus")


In [53]:
# Constants
# NOTE: VCTK_ROOT is deliberately NOT reassigned here. It is already built from
# PROJECT_ROOT in the path-setup cell; redefining it as a relative string
# silently shadowed that value. SAVE_DIR is anchored on PROJECT_ROOT the same
# way so both locations come from one source.
SAVE_DIR = PROJECT_ROOT / "data" / "processed"  # output folder for the .pt files
SAMPLE_RATE = 22050  # target sample rate in Hz; audio at other rates is resampled
N_MELS = 80          # number of mel bands
HOP_LENGTH = 256     # hop length handed to the mel extractor
WIN_LENGTH = 1024    # handed to the mel extractor as n_fft

# Create save directory
os.makedirs(SAVE_DIR, exist_ok=True)

In [54]:
# Mel-spectrogram extractor
# The transform's settings are gathered in one mapping so the configuration can
# be read at a glance. WIN_LENGTH is supplied as the FFT size; no separate
# win_length is passed, so the library default applies
# (NOTE(review): documented as n_fft — confirm for the installed torchaudio).
_MEL_CONFIG = {
    "sample_rate": SAMPLE_RATE,
    "n_fft": WIN_LENGTH,
    "hop_length": HOP_LENGTH,
    "n_mels": N_MELS,
}
mel_extractor = T.MelSpectrogram(**_MEL_CONFIG)

In [55]:
def extract_speaker_id(path):
    """Return the speaker ID encoded in an audio file's path.

    The ID is the name of the directory that directly contains the file, i.e.
    the second-to-last path component
    (``.../wav48_silence_trimmed/p225/p225_001.flac`` -> ``"p225"``).
    It is not parsed out of the filename itself.

    Args:
        path: Location of the audio file, as a ``pathlib.Path``, a ``str`` or
            any ``os.PathLike``.

    Returns:
        str: The second-to-last component of ``path``.

    Raises:
        IndexError: If ``path`` has fewer than two components.
    """
    # Coerce first so plain strings work too; a str has no ``.parts``.
    return Path(path).parts[-2]

In [56]:
def preprocess_and_save():
    """Convert every utterance under the VCTK corpus into a saved mel-spectrogram.

    Each sub-directory of ``VCTK_ROOT/wav48_silence_trimmed`` is treated as one
    speaker, and its ``.flac`` and ``.wav`` files are processed in turn:

    1. load the audio;
    2. downmix to a single channel if the file has more than one;
    3. resample to ``SAMPLE_RATE`` if the file's rate differs;
    4. compute the mel-spectrogram with ``mel_extractor``;
    5. standardize it per utterance (subtract the mean, divide by the std);
    6. save ``{'mel': ..., 'speaker_id': ...}`` with ``torch.save`` to
       ``SAVE_DIR/{speaker_id}_{file stem}.pt``.

    A file that fails at any step is reported and skipped, so one bad
    recording does not abort the whole run. If the audio directory does not
    exist, an error is printed and nothing is processed.

    Returns:
        None. Progress and the final count of saved examples are printed.
    """
    audio_dir = Path(VCTK_ROOT) / "wav48_silence_trimmed"

    if not audio_dir.exists():
        print(f"[ERROR] Audio directory does not exist: {audio_dir.resolve()}")
        return

    speaker_dirs = sorted(p for p in audio_dir.iterdir() if p.is_dir())
    print(f"✅ Found {len(speaker_dirs)} speaker folders.")

    saved = 0
    for speaker_path in tqdm(speaker_dirs, desc="Processing speakers"):
        speaker_id = speaker_path.name
        audio_files = list(speaker_path.glob("*.flac")) + list(speaker_path.glob("*.wav"))

        if not audio_files:
            print(f"⚠️  No .flac or .wav files found in {speaker_id}")
            continue

        for wav_file in audio_files:
            try:
                waveform, sr = torchaudio.load(wav_file)
                # squeeze(0) below only removes the leading axis when it has
                # size 1, so collapse multi-channel audio to mono first.
                # Mono files are left untouched.
                if waveform.size(0) > 1:
                    waveform = waveform.mean(dim=0, keepdim=True)
                if sr != SAMPLE_RATE:
                    waveform = torchaudio.functional.resample(waveform, sr, SAMPLE_RATE)
                mel = mel_extractor(waveform).squeeze(0)  # shape: [n_mels, time]
                # Per-utterance standardization; the epsilon guards against a
                # zero standard deviation.
                mel = (mel - mel.mean()) / (mel.std() + 1e-6)

                save_path = Path(SAVE_DIR) / f"{speaker_id}_{wav_file.stem}.pt"
                torch.save({'mel': mel, 'speaker_id': speaker_id}, save_path)
                saved += 1
            except Exception as e:
                # Deliberately best-effort: report the failing file and move on.
                print(f"❌ Failed to process {wav_file.name}: {e}")

    print(f"\n✅ Saved {saved} examples to {SAVE_DIR}")

In [57]:
# --- Run the preprocessing ---
# Runs the full pass over the corpus and writes one .pt file per utterance into
# SAVE_DIR. The recorded output of this cell shows the run taking close to six
# minutes for 110 speaker folders.
preprocess_and_save()

✅ Found 110 speaker folders.


Processing speakers: 100%|██████████| 110/110 [05:48<00:00,  3.17s/it]


✅ Saved 88328 examples to ../data/processed



