# Load Datasets

In [8]:
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [9]:
!cp /content/drive/MyDrive/dataset/LibriSpeech_WAV.zip .

In [10]:
!unzip LibriSpeech_WAV.zip

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0008.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0029.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0003.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0019.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0000.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0012.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0020.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0014.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0011.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0022.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0028.wav  
  inflating: LibriSpeech_WAV/train-clean/7517/100442/7517-100442-0006.wav  
  inflating: Li

In [12]:
import os
import random
import shutil
import soundfile as sf
from pathlib import Path

In [13]:
TRAIN_SRC_DIR = "LibriSpeech_WAV/train-clean"
TEST_SRC_DIR = "LibriSpeech_WAV/test-clean"
TRAIN_DST_DIR = "LibriSpeech_final/train-clean"
TEST_DST_DIR = "LibriSpeech_final/test-clean"

TRAIN_TARGET_HOURS = 25
TRAIN_TARGET_SECONDS = TRAIN_TARGET_HOURS * 3600
TEST_TARGET_HOURS = 1
TEST_TARGET_SECONDS = TEST_TARGET_HOURS * 3600

# Define Utility Function

In [15]:
def get_duration(wav_path):
    """
    Calculate duration of a WAV file in seconds using soundfile (no ffmpeg dependency).

    Args:
        wav_path (str): Path to WAV file.

    Returns:
        float: Duration in seconds
    """
    with sf.SoundFile(wav_path) as f:
        frames = f.frames
        sr = f.samplerate
    return frames / sr

def parse_librispeech_split(base_dir):
    """
    Parse LibriSpeech dataset directory to extract all utterances.

    Args:
        base_dir (str): Path to train-clean (or dev/test) folder.

    Returns:
        list[dict]: List of dicts with keys 'id', 'audio', 'text'.
    """
    samples = []

    for speaker in os.listdir(base_dir):
        spk_path = os.path.join(base_dir, speaker)
        if not os.path.isdir(spk_path):
            continue

        for chapter in os.listdir(spk_path):
            chap_path = os.path.join(spk_path, chapter)
            if not os.path.isdir(chap_path):
                continue

            txt_files = [f for f in os.listdir(chap_path) if f.endswith(".trans.txt")]
            if not txt_files:
                continue
            transcript_file = os.path.join(chap_path, txt_files[0])

            transcripts = {}
            with open(transcript_file, "r") as f:
                for line in f:
                    utt_id, text = line.strip().split(" ", 1)
                    transcripts[utt_id] = text.lower()

            for utt_id, text in transcripts.items():
                wav_path = os.path.join(chap_path, utt_id + ".wav")
                if os.path.exists(wav_path):
                    samples.append({
                        "id": utt_id,
                        "audio": wav_path,
                        "text": text,
                        "speaker": speaker,
                        "chapter": chapter
                    })
    return samples

def speaker_balanced_sampling(samples, target_seconds):
    """
    Sample utterances in a speaker-balanced manner until target duration is reached.

    Args:
        samples (list[dict]): List of utterance dictionaries.
        target_seconds (float): Target total duration in seconds.

    Returns:
        list[dict]: List of sampled utterances.
    """

    speaker_map = {}
    for s in samples:
        spk = s["speaker"]
        speaker_map.setdefault(spk, []).append(s)

    for spk in speaker_map:
        random.shuffle(speaker_map[spk])

    selected = []
    total_sec = 0

    while total_sec < target_seconds:
        for spk in speaker_map:
            if not speaker_map[spk]:
                continue
            utt = speaker_map[spk].pop(0)
            dur = get_duration(utt["audio"])
            if total_sec + dur <= target_seconds:
                selected.append(utt)
                total_sec += dur

        if all(len(v) == 0 for v in speaker_map.values()):
            break

    print(f"Selected utterances: {len(selected)}, total duration: {total_sec/3600:.2f} hours")
    return selected

def create_new_dataset_folder(selected_samples, dst_root):
    """
    Copy selected WAV files and transcripts to a new dataset folder
    maintaining the LibriSpeech folder structure.

    Args:
        selected_samples (list[dict]): List of sampled utterances.
        dst_root (str): Root folder for new dataset (e.g., LibriSpeech_25h/train-clean)
    """
    for utt in selected_samples:
        spk = utt["speaker"]
        chap = utt["chapter"]
        dst_chap_dir = os.path.join(dst_root, spk, chap)
        os.makedirs(dst_chap_dir, exist_ok=True)

        # Copy WAV file
        dst_wav_path = os.path.join(dst_chap_dir, utt["id"] + ".wav")
        shutil.copy2(utt["audio"], dst_wav_path)

        # Append transcript to chapter transcript file
        dst_transcript_file = os.path.join(dst_chap_dir, f"{chap}.trans.txt")
        with open(dst_transcript_file, "a") as f:
            f.write(f"{utt['id']} {utt['text']}\n")

# Sampling Datasets

In [16]:
train_samples = parse_librispeech_split(TRAIN_SRC_DIR)
test_samples = parse_librispeech_split(TEST_SRC_DIR)
print(f"Total utterances in original train dataset: {len(train_samples)}")
print(f"Total utterances in original test dataset: {len(test_samples)}")

selected_train_samples = speaker_balanced_sampling(train_samples, TRAIN_TARGET_SECONDS)
selected_test_samples = speaker_balanced_sampling(test_samples, TEST_TARGET_SECONDS)

create_new_dataset_folder(selected_train_samples, TRAIN_DST_DIR)
create_new_dataset_folder(selected_test_samples, TEST_DST_DIR)

print(f"New {TRAIN_TARGET_HOURS}h train dataset created at: {TRAIN_DST_DIR}")
print(f"New {TEST_TARGET_HOURS}h test dataset created at: {TEST_DST_DIR}")

Total utterances in original train dataset: 28539
Total utterances in original test dataset: 2620
Selected utterances: 7051, total duration: 25.00 hours
Selected utterances: 453, total duration: 1.00 hours
New 25h train dataset created at: LibriSpeech_final/train-clean
New 1h test dataset created at: LibriSpeech_final/test-clean


# Export New Datasets

In [17]:
!zip -r LibriSpeech_final.zip LibriSpeech_final

[1;30;43mOutput streaming akan dipotong hingga 5000 baris terakhir.[0m
  adding: LibriSpeech_final/train-clean/4406/ (stored 0%)
  adding: LibriSpeech_final/train-clean/4406/16882/ (stored 0%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0054.wav (deflated 21%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0005.wav (deflated 16%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0011.wav (deflated 21%)
  adding: LibriSpeech_final/train-clean/4406/16882/16882.trans.txt (deflated 54%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0001.wav (deflated 15%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0068.wav (deflated 20%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0052.wav (deflated 17%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0006.wav (deflated 18%)
  adding: LibriSpeech_final/train-clean/4406/16882/4406-16882-0003.wav (deflated 17%)
  adding: LibriSpeech_final/train-c

In [18]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [19]:
!cp LibriSpeech_final.zip /content/drive/MyDrive/dataset