In [None]:
# Download in streaming mode so that only the first N examples are fetched
from pathlib import Path

import numpy as np
import pandas as pd
import soundfile as sf
from datasets import load_dataset
from tqdm.auto import tqdm
import os

# Fast, good-quality resampling (preferred)
import torch
import torchaudio.functional as F

# ----------------------------
# Config
# ----------------------------
# Source dataset on the Hugging Face Hub: repository, configuration and split.
REPO = "Cnam-LMSSC/vibravox"
CONFIG = "speech_clean"
SPLIT = "test"

# Number of examples taken from the start of the stream.
N = 1_000

SRC_SR = 48_000   # sample rate every source clip is checked against
TGT_SR = 16_000   # sample rate of the stored clips
SECONDS = 2.0     # fixed duration of every stored clip
# round() instead of int(): with a fractional SECONDS the float product can land
# just below the intended integer, and int() would then drop one sample.
TGT_LEN = round(TGT_SR * SECONDS)  # 32000

# Save space: FLAC (lossless) + int16
AUDIO_EXT = "flac"               # "flac" recommended; "wav" is the alternative
SF_FORMAT = AUDIO_EXT.upper()    # derived from AUDIO_EXT so the two cannot drift apart
SF_SUBTYPE = "PCM_16"

# Which modalities you want to store: short name -> dataset column.
# The short name is used in the output file names and in the "<short>_path"
# metadata columns.
FIELDS = {
    "headset": "audio.headset_microphone",
    "forehead": "audio.forehead_accelerometer",
    "temple": "audio.temple_vibration_pickup",
}

# Output layout: <out_dir>/audio/*.<AUDIO_EXT> plus <out_dir>/metadata.parquet.
# NOTE: the "_test" suffix is a literal, it does not follow SPLIT.
out_dir = Path(f"vibravox_subset_n{N}_sr{TGT_SR}_len{TGT_LEN}_test")
audio_dir = out_dir / "audio"
audio_dir.mkdir(parents=True, exist_ok=True)


# ----------------------------
# Helpers
# ----------------------------
def _to_mono_float32(samples) -> torch.Tensor:
    """
    torchcodec AudioSamples -> mono float32 torch.Tensor [T]
    Handles [C, T] and [T].
    """
    x = samples.data  # torch Tensor
    if x.ndim == 2:
        # average channels to mono (safe default)
        x = x.mean(dim=0)
    x = x.to(dtype=torch.float32, device="cpu").contiguous()
    return x


def _resample_48k_to_16k(x_48k: torch.Tensor) -> torch.Tensor:
    """
    Resample a waveform from SRC_SR to TGT_SR with torchaudio.

    A leading channel axis is added for the resampler and removed again
    afterwards, so the result has the same rank as the input.
    """
    with_channel = x_48k[None]  # [T] -> [1, T], the layout torchaudio expects
    resampled = F.resample(with_channel, SRC_SR, TGT_SR)  # [1, T']
    return resampled[0].contiguous()


def _trim_or_pad(x: torch.Tensor, length: int) -> torch.Tensor:
    n = x.numel()
    if n >= length:
        return x[:length].contiguous()
    # pad with zeros at end
    out = torch.zeros(length, dtype=x.dtype)
    out[:n] = x
    return out


def _write_audio(path: Path, x_16k_len: torch.Tensor):
    """
    Write a waveform tensor to ``path`` with soundfile.

    The file is written at TGT_SR using the container format SF_FORMAT and the
    sample subtype SF_SUBTYPE (int16 FLAC/WAV to keep storage small).
    """
    waveform = x_16k_len.numpy()
    sf.write(str(path), waveform, TGT_SR, subtype=SF_SUBTYPE, format=SF_FORMAT)


# ----------------------------
# Stream + process
# ----------------------------
# streaming=True so the split is iterated instead of downloaded in full.
ds_stream = load_dataset(REPO, CONFIG, split=SPLIT, streaming=True)

rows = []
for ex in tqdm(ds_stream.take(N), total=N):
    speaker_id, sentence_id = ex["speaker_id"], ex["sentence_id"]
    duration = float(ex.get("duration", np.nan))

    # All files of one example share this prefix.
    stem = f"{speaker_id}_{sentence_id}"

    # One metadata row per example; keys are added in the final column order.
    row = {
        "speaker_id": speaker_id,
        "sentence_id": sentence_id,
        "duration": duration,
    }

    for short, field in FIELDS.items():
        samples = ex[field].get_all_samples()  # torchcodec decode, once per field

        # Safety check: the resampler below assumes SRC_SR input.
        sr = int(samples.sample_rate)
        if sr != SRC_SR:
            raise ValueError(f"Unexpected sample_rate={sr} for {field} (expected {SRC_SR})")

        # mono float32 -> TGT_SR -> exactly TGT_LEN samples
        x = _trim_or_pad(_resample_48k_to_16k(_to_mono_float32(samples)), TGT_LEN)

        p = audio_dir / f"{stem}_{short}.{AUDIO_EXT}"
        _write_audio(p, x)
        row[f"{short}_path"] = str(p)

    row["sr"] = TGT_SR
    row["length"] = TGT_LEN
    rows.append(row)

# Metadata: one parquet file next to the audio folder.
df = pd.DataFrame(rows)
metadata_path = out_dir / "metadata.parquet"
df.to_parquet(metadata_path, index=False)

print(f"Saved {len(df)} items to: {out_dir.resolve()}")

In [None]:
# Push the train/test subsets to the Hugging Face Hub
import os
from datasets import load_dataset, Audio, DatasetDict, Value

# Local subset folders, one per split.
# NOTE(review): the extraction settings shown earlier in this notebook (N = 1_000,
# "_test" suffix) only produce the test folder; the train folder is expected to
# exist on disk already.
train_dir = "vibravox_subset_n5000_sr16000_len32000"
test_dir  = "vibravox_subset_n1000_sr16000_len32000_test"

# 1) Read each metadata file as a plain Dataset. The parquet loader files
#    everything under "train", so that split is requested for both.
train = load_dataset("parquet", data_files=f"{train_dir}/metadata.parquet", split="train")
test  = load_dataset("parquet", data_files=f"{test_dir}/metadata.parquet",  split="train")

# 2) Turn the path columns into audio columns. Each column is first cast to a
#    plain string (required), then to Audio.
audio16 = Audio(sampling_rate=16_000)
as_string = Value("string")

for col in ("headset_path", "forehead_path", "temple_path"):
    train = train.cast_column(col, as_string).cast_column(col, audio16)
    test = test.cast_column(col, as_string).cast_column(col, audio16)

ds = DatasetDict({"train": train, "test": test})

# 3) Upload; embed_external_files=True is passed so the referenced audio files
#    are embedded in the upload.
repo_id = "verbreb/vibravox_16k_2s_subset"
ds.push_to_hub(repo_id, embed_external_files=True)

In [None]:
# how to load dataset
from datasets import load_dataset
# Fetch the published subset from the Hub by its repository id (no split requested).
ds = load_dataset(path="verbreb/vibravox_16k_2s_subset")