In [10]:
# Core
from pathlib import Path
import re, json, math, random
import pandas as pd
from tqdm import tqdm

# Audio
import torch
import torchaudio
import torchaudio.transforms as T

# Paths & I/O
DATA_DIR = Path("./data/RAVDESS")
OUT_DIR = Path("./preprocessed_dataset")
AUDIO_OUT_DIR = OUT_DIR / "audio_16k_clean"
AUG_OUT_DIR = OUT_DIR / "audio_16k_white_noise"

# Audio params
TARGET_SR = 16000       # For Wav2Vec2
MONO = True
AUDIO_EXT = ".wav"      # Change to ".flac" if you prefer FLAC

# Fade (usually off)
FADE_IN_MS = 0          # set >0 only if you want a very small fade-in

# Baseline noise (applied to CLEAN output to break initial silence)
BASELINE_NOISE_ENABLE = True
BASELINE_NOISE_SNR_DB = 32.0     # higher = lighter noise
BASELINE_NOISE_START_ONLY = True # only add to the start
BASELINE_NOISE_START_SEC = 0.5   # duration at the clip start to receive noise

# White-noise augmentation (a separate DUPLICATE file)
AUGMENT_WRITE_DUPLICATE = True
AUG_SNR_DB = 28.0                 # slightly stronger than baseline

# Randomness
SEED = 42
random.seed(SEED)
torch.manual_seed(SEED)

# Make dirs
AUDIO_OUT_DIR.mkdir(parents=True, exist_ok=True)
if AUGMENT_WRITE_DUPLICATE:
    AUG_OUT_DIR.mkdir(parents=True, exist_ok=True)

print("Configured.")


Configured.


In [11]:
file_pattern = re.compile(r"^(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})-(\d{2})\.wav$", re.I)

emotion_map = {
    "01": "neutral", "02": "calm", "03": "happy", "04": "sad",
    "05": "angry", "06": "fearful", "07": "disgust", "08": "surprised"
}
intensity_map = {"01": "normal", "02": "strong"}
vocal_map = {"01": "speech", "02": "song"}

label2id = {
    "vocal": {name: i for i, name in enumerate(["speech", "song"])},
    "emotion": {name: i for i, name in enumerate(
        ["neutral","calm","happy","sad","angry","fearful","disgust","surprised"]
    )},
    "intensity": {name: i for i, name in enumerate(["normal","strong"])},
}
label2id


{'vocal': {'speech': 0, 'song': 1},
 'emotion': {'neutral': 0,
  'calm': 1,
  'happy': 2,
  'sad': 3,
  'angry': 4,
  'fearful': 5,
  'disgust': 6,
  'surprised': 7},
 'intensity': {'normal': 0, 'strong': 1}}

In [12]:
rows = []
for actor in sorted(DATA_DIR.glob("Actor_*")):
    for f in actor.glob("*.wav"):
        m = file_pattern.match(f.name)
        if m:
            modality, vocal, emotion, intensity, stmt, rep, actor_id = m.groups()
            actor_num = int(actor_id)
            gender = "male" if actor_num % 2 == 1 else "female"
            rows.append({
                "path": f,
                "actor": actor_id,
                "gender": gender,
                "vocal": vocal_map.get(vocal, vocal),
                "emotion": emotion_map.get(emotion, emotion),
                "intensity": intensity_map.get(intensity, intensity),
            })
        else:
            print("Filename not matching pattern:", f)

df = pd.DataFrame(rows)
print(f"Total files found: {len(df)}")
df.head(3)


Total files found: 1440


Unnamed: 0,path,actor,gender,vocal,emotion,intensity
0,data/RAVDESS/Actor_01/03-01-04-02-01-01-01.wav,1,male,speech,sad,strong
1,data/RAVDESS/Actor_01/03-01-07-02-02-01-01.wav,1,male,speech,disgust,strong
2,data/RAVDESS/Actor_01/03-01-07-02-01-02-01.wav,1,male,speech,disgust,strong


In [13]:
def load_audio(path: Path):
    # Returns float32 tensor in [-1, 1] when possible
    wav, sr = torchaudio.load(str(path))   # [C, T]
    if wav.dtype != torch.float32:
        wav = wav.float() / (2**15)
    return wav, sr

def to_mono(wav: torch.Tensor):
    # [C, T] -> [1, T]
    if wav.ndim == 2 and wav.size(0) > 1:
        wav = wav.mean(dim=0, keepdim=True)
    return wav

def resample(wav: torch.Tensor, sr_in: int, sr_out: int):
    if sr_in == sr_out:
        return wav
    resampler = T.Resample(sr_in, sr_out)
    return resampler(wav)

def apply_fade_in(wav: torch.Tensor, sr: int, fade_ms: int):
    if fade_ms <= 0:
        return wav
    fade_samples = int(sr * fade_ms / 1000.0)
    fade_samples = max(1, min(fade_samples, wav.size(-1)))
    envelope = torch.linspace(0.0, 1.0, fade_samples, dtype=wav.dtype, device=wav.device)
    wav[..., :fade_samples] *= envelope
    return wav


In [14]:
import math

def mix_white_noise(
    wav: torch.Tensor,
    snr_db: float,
    apply_to_start_only: bool = False,
    start_sec: float = 0.5,
    sr: int = 16000,
):
    """
    Add white noise at a target SNR (dB).
    - If apply_to_start_only=True, only first `start_sec` seconds receive noise.
    - Otherwise, noise is added to the entire clip.
    """
    x = wav[0] if wav.ndim == 2 else wav

    # Signal power (avoid zero)
    sig_pow = torch.mean(x**2).item() + 1e-12
    noise_pow = sig_pow / (10.0 ** (snr_db / 10.0))
    noise_std = math.sqrt(noise_pow)
    noise = torch.randn_like(x) * noise_std

    if apply_to_start_only:
        n = min(int(start_sec * sr), x.numel())
        x[:n] = (x[:n] + noise[:n]).clamp(-1.0, 1.0)
    else:
        x = (x + noise).clamp(-1.0, 1.0)

    return x.unsqueeze(0) if wav.ndim == 2 else x


In [15]:
def process_one(row, write_aug=AUGMENT_WRITE_DUPLICATE):
    in_path: Path = row["path"]
    rel = in_path.relative_to(DATA_DIR)

    out_clean = (AUDIO_OUT_DIR / rel).with_suffix(AUDIO_EXT)
    out_clean.parent.mkdir(parents=True, exist_ok=True)

    out_aug = None
    if write_aug:
        out_aug = (AUG_OUT_DIR / rel).with_suffix(AUDIO_EXT)
        out_aug.parent.mkdir(parents=True, exist_ok=True)

    # Load
    wav_raw, sr_in = load_audio(in_path)

    # Mono
    if MONO:
        wav_raw = to_mono(wav_raw)

    # Resample to 16k
    wav_16k = resample(wav_raw, sr_in, TARGET_SR)

    # Optional tiny fade-in (off by default)
    if FADE_IN_MS > 0:
        wav_16k = apply_fade_in(wav_16k, TARGET_SR, FADE_IN_MS)

    # Keep a clean base (no noise) for derived variants
    wav_base = wav_16k.clone()

    # CLEAN: baseline noise (typically start-only, very light) to break pure silence
    if BASELINE_NOISE_ENABLE:
        wav_clean = mix_white_noise(
            wav_base.clone(),
            snr_db=BASELINE_NOISE_SNR_DB,
            apply_to_start_only=BASELINE_NOISE_START_ONLY,
            start_sec=BASELINE_NOISE_START_SEC,
            sr=TARGET_SR,
        )
    else:
        wav_clean = wav_base

    wav_clean = wav_clean.clamp(-1.0, 1.0)
    torchaudio.save(str(out_clean), wav_clean, sample_rate=TARGET_SR, bits_per_sample=16)
    dur_clean = wav_clean.size(-1) / TARGET_SR

    # AUG: full-clip noise (stronger), from the pristine base
    aug_path = None
    dur_aug = None
    if write_aug:
        wav_aug = mix_white_noise(
            wav_base.clone(),
            snr_db=AUG_SNR_DB,
            apply_to_start_only=False,
            sr=TARGET_SR,
        ).clamp(-1.0, 1.0)
        torchaudio.save(str(out_aug), wav_aug, sample_rate=TARGET_SR, bits_per_sample=16)
        aug_path = out_aug
        dur_aug = wav_aug.size(-1) / TARGET_SR

    return {
        "path_clean": out_clean,
        "duration_clean_sec": round(dur_clean, 4),
        "path_aug": aug_path,
        "duration_aug_sec": round(dur_aug, 4) if dur_aug is not None else None,
        "sr_in": sr_in,
        "sr": TARGET_SR,
    }


In [16]:
records = []
for _, row in tqdm(df.iterrows(), total=len(df)):
    meta = process_one(row)
    rec_base = {
        "actor": row["actor"],
        "gender": row["gender"],
        "vocal": row["vocal"],
        "emotion": row["emotion"],
        "intensity": row["intensity"],
        "orig_sr": meta["sr_in"],
        "sr": meta["sr"],
    }
    # Clean entry
    records.append({
        **rec_base,
        "path": str(meta["path_clean"]),
        "duration_sec": meta["duration_clean_sec"],
        "augment": "none",  # note: may include tiny start-only baseline noise by design
    })
    # Augmented entry (if exists)
    if meta["path_aug"] is not None:
        records.append({
            **rec_base,
            "path": str(meta["path_aug"]),
            "duration_sec": meta["duration_aug_sec"],
            "augment": "white_noise",
        })

manifest = pd.DataFrame(records)
print("Total manifest rows:", len(manifest))
manifest.head(4)


  return save_with_torchcodec(
100%|██████████| 1440/1440 [00:38<00:00, 37.75it/s]

Total manifest rows: 2880





Unnamed: 0,actor,gender,vocal,emotion,intensity,orig_sr,sr,path,duration_sec,augment
0,1,male,speech,sad,strong,48000,16000,preprocessed_dataset/audio_16k_clean/Actor_01/...,3.8038,none
1,1,male,speech,sad,strong,48000,16000,preprocessed_dataset/audio_16k_white_noise/Act...,3.8038,white_noise
2,1,male,speech,disgust,strong,48000,16000,preprocessed_dataset/audio_16k_clean/Actor_01/...,4.0374,none
3,1,male,speech,disgust,strong,48000,16000,preprocessed_dataset/audio_16k_white_noise/Act...,4.0374,white_noise


In [17]:
OUT_DIR.mkdir(parents=True, exist_ok=True)
manifest_path_csv = OUT_DIR / "manifest.csv"
manifest.to_csv(manifest_path_csv, index=False)

# Optional Parquet
try:
    manifest_path_parquet = OUT_DIR / "manifest.parquet"
    manifest.to_parquet(manifest_path_parquet, index=False)
except Exception as e:
    print("Parquet save failed (ok to ignore):", e)

with open(OUT_DIR / "label2id.json", "w") as f:
    json.dump(label2id, f, indent=2)

pre_cfg = {
    "target_sr": TARGET_SR,
    "mono": MONO,
    "fade_in_ms": FADE_IN_MS,
    "baseline_noise": {
        "enabled": BASELINE_NOISE_ENABLE,
        "snr_db": BASELINE_NOISE_SNR_DB,
        "start_only": BASELINE_NOISE_START_ONLY,
        "start_sec": BASELINE_NOISE_START_SEC,
    },
    "augmentation_duplicate": {
        "enabled": AUGMENT_WRITE_DUPLICATE,
        "snr_db": AUG_SNR_DB,
    },
    "audio_ext": AUDIO_EXT,
}
with open(OUT_DIR / "preprocess_config.json", "w") as f:
    json.dump(pre_cfg, f, indent=2)

print("Saved:", manifest_path_csv, "and config JSONs.")


Parquet save failed (ok to ignore): Unable to find a usable engine; tried using: 'pyarrow', 'fastparquet'.
A suitable version of pyarrow or fastparquet is required for parquet support.
Trying to import the above resulted in these errors:
 - Missing optional dependency 'pyarrow'. pyarrow is required for parquet support. Use pip or conda to install pyarrow.
 - Missing optional dependency 'fastparquet'. fastparquet is required for parquet support. Use pip or conda to install fastparquet.
Saved: preprocessed_dataset/manifest.csv and config JSONs.


In [18]:
print("Counts by vocal:")
print(manifest.groupby("vocal").size(), "\n")

print("Counts by emotion:")
print(manifest.groupby("emotion").size(), "\n")

print("Counts by augment:")
print(manifest.groupby("augment").size(), "\n")

print("Duration stats (sec):")
print(manifest["duration_sec"].describe())


Counts by vocal:
vocal
speech    2880
dtype: int64 

Counts by emotion:
emotion
angry        384
calm         384
disgust      384
fearful      384
happy        384
neutral      192
sad          384
surprised    384
dtype: int64 

Counts by augment:
augment
none           1440
white_noise    1440
dtype: int64 

Duration stats (sec):
count    2880.000000
mean        3.700687
std         0.336617
min         2.936300
25%         3.470200
50%         3.670400
75%         3.870600
max         5.271900
Name: duration_sec, dtype: float64


In [19]:
for p in manifest.sample(min(5, len(manifest)), random_state=SEED)["path"]:
    p = Path(p)
    print(p, "exists:", p.exists())

preprocessed_dataset/audio_16k_white_noise/Actor_04/03-01-02-01-01-02-04.wav exists: True
preprocessed_dataset/audio_16k_white_noise/Actor_13/03-01-08-02-01-02-13.wav exists: True
preprocessed_dataset/audio_16k_white_noise/Actor_20/03-01-08-01-02-02-20.wav exists: True
preprocessed_dataset/audio_16k_white_noise/Actor_14/03-01-03-02-02-01-14.wav exists: True
preprocessed_dataset/audio_16k_clean/Actor_10/03-01-06-01-01-01-10.wav exists: True


In [20]:
!/home/mahrjose/Hub/Research/HSA/.venv/bin/python -m pip install torchcodec


[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m A new release of pip is available: [0m[31;49m24.2[0m[39;49m -> [0m[32;49m25.3[0m
[1m[[0m[34;49mnotice[0m[1;39;49m][0m[39;49m To update, run: [0m[32;49mpip install --upgrade pip[0m
