In [49]:
# üß™ Whisperlaz - 1-0001-asr-finetune.ipython

üß™ Whisperlaz - 1-0001-asr-finetune.ipython

In [50]:
# üì¶ Install dependencies
# !uv add pandas tqdm ffmpeg-python numpy datasets webvtt-py

In [4]:
# üìö Imports
import os
import subprocess
import numpy as np
import pandas as pd
from tqdm import tqdm
import webvtt

In [5]:
# üìÇ Paths
manifest_path = "./manifest/asmr-data.csv"
out_dir = "./preprocessed_segments"
os.makedirs(out_dir, exist_ok=True)

In [6]:
# üîâ Audio loader using ffmpeg
def load_audio(path, sr=16000):
    cmd = [
        "ffmpeg", "-i", path,
        "-f", "f32le", "-ac", "1", "-ar", str(sr),
        "-loglevel", "quiet", "pipe:1"
    ]
    out = subprocess.check_output(cmd)
    return np.frombuffer(out, np.float32)

In [7]:
# ‚åõ Convert VTT timestamp to seconds
def to_seconds(ts):
    h, m, s = ts.replace(',', '.').split(':')
    return float(h) * 3600 + float(m) * 60 + float(s)

In [8]:
# üß© Load VTT segments
def load_vtt_segments(path):
    segments = []
    for caption in webvtt.read(path):
        start = to_seconds(caption.start)
        end = to_seconds(caption.end)
        text = caption.text.strip().replace('\n', ' ')
        if text:
            segments.append({"start": start, "end": end, "text": text})
    return segments

In [9]:
# üìÑ Load manifest
df = pd.read_csv(manifest_path)
print(f"Got {df.shape}")
df.head()

Got (85, 8)


Unnamed: 0,channel,video_id,video_idx,audio_path,en_subtitle_path,ja_subtitle_path,checksum,broken_file_check
0,channel_1,mSUlRMOlsrA,1,./data-local/data/channel_1/video_1/mSUlRMOlsr...,./data-local/data/channel_1/video_1/mSUlRMOlsr...,./data-local/data/channel_1/video_1/mSUlRMOlsr...,31b7ff7bec8e55c642dd9d09d3439d2f05c52974159513...,False
1,channel_1,dDE_n3Qzm8k,2,./data-local/data/channel_1/video_2/dDE_n3Qzm8...,./data-local/data/channel_1/video_2/dDE_n3Qzm8...,./data-local/data/channel_1/video_2/dDE_n3Qzm8...,d99b690dc86e1e65bb6b8c00bf0ba8940901355bf17fb9...,False
2,channel_1,iDBDGvdeSac,3,./data-local/data/channel_1/video_3/iDBDGvdeSa...,./data-local/data/channel_1/video_3/iDBDGvdeSa...,./data-local/data/channel_1/video_3/iDBDGvdeSa...,cb10711f115354396633ee2cd5e836f56806e8795fe139...,False
3,channel_1,8HaXefa3bas,4,./data-local/data/channel_1/video_4/8HaXefa3ba...,./data-local/data/channel_1/video_4/8HaXefa3ba...,./data-local/data/channel_1/video_4/8HaXefa3ba...,05b7687ee1c748f57490414819c7d6395d29dd3763ef58...,True
4,channel_1,8A-ieZY265s,6,./data-local/data/channel_1/video_6/8A-ieZY265...,./data-local/data/channel_1/video_6/8A-ieZY265...,./data-local/data/channel_1/video_6/8A-ieZY265...,0e14eb0b2be6775e66582d9dad41ef3c589ec6cfc9347d...,False


In [10]:
df = df[df["broken_file_check"] == False]
df = df[df["en_subtitle_path"].str.len() > 0].reset_index(drop=True)
df = df[df["ja_subtitle_path"].str.len() > 0].reset_index(drop=True)
print(f"Got {df.shape}")
df.head()

Got (65, 8)


Unnamed: 0,channel,video_id,video_idx,audio_path,en_subtitle_path,ja_subtitle_path,checksum,broken_file_check
0,channel_1,mSUlRMOlsrA,1,./data-local/data/channel_1/video_1/mSUlRMOlsr...,./data-local/data/channel_1/video_1/mSUlRMOlsr...,./data-local/data/channel_1/video_1/mSUlRMOlsr...,31b7ff7bec8e55c642dd9d09d3439d2f05c52974159513...,False
1,channel_1,dDE_n3Qzm8k,2,./data-local/data/channel_1/video_2/dDE_n3Qzm8...,./data-local/data/channel_1/video_2/dDE_n3Qzm8...,./data-local/data/channel_1/video_2/dDE_n3Qzm8...,d99b690dc86e1e65bb6b8c00bf0ba8940901355bf17fb9...,False
2,channel_1,iDBDGvdeSac,3,./data-local/data/channel_1/video_3/iDBDGvdeSa...,./data-local/data/channel_1/video_3/iDBDGvdeSa...,./data-local/data/channel_1/video_3/iDBDGvdeSa...,cb10711f115354396633ee2cd5e836f56806e8795fe139...,False
3,channel_1,8A-ieZY265s,6,./data-local/data/channel_1/video_6/8A-ieZY265...,./data-local/data/channel_1/video_6/8A-ieZY265...,./data-local/data/channel_1/video_6/8A-ieZY265...,0e14eb0b2be6775e66582d9dad41ef3c589ec6cfc9347d...,False
4,channel_1,DrMpvutnOno,7,./data-local/data/channel_1/video_7/DrMpvutnOn...,./data-local/data/channel_1/video_7/DrMpvutnOn...,./data-local/data/channel_1/video_7/DrMpvutnOn...,383cee464bc86a3cb1efd99319568538b0abf69be03f78...,False


In [68]:
# üíæ Process EN/JA segments in groups of 5 and save as .npz
index = []
group_size = 1

for _, row in tqdm(df.iterrows(), total=len(df)):
    try:
        audio = load_audio(row.audio_path)

        for lang in ["en", "ja"]:
            vtt_col = f"{lang}_subtitle_path"
            if not isinstance(row[vtt_col], str) or not os.path.exists(row[vtt_col]):
                continue
            segments = load_vtt_segments(row[vtt_col])
            for i in range(0, len(segments), group_size):
                group = segments[i:i + group_size]
                if not group:
                    continue

                start = group[0]["start"]
                end = group[-1]["end"]
                text = " ".join([seg["text"] for seg in group])

                start_sample = int(start * 16000)
                end_sample = int(end * 16000)
                chunk = audio[start_sample:end_sample]
                if len(chunk) == 0:
                    continue

                key = f"{row.channel}_{row.video_id}_{lang}_{i // group_size}"
                out_path = os.path.join(out_dir, key + ".npz")
                np.savez_compressed(out_path, audio=chunk, text=text, lang=lang, start=start, end=end, audio_path=row.audio_path)
                index.append({"key": key, "npz_path": out_path, "lang": lang})
    except Exception as e:
        print(f"Skip: {row.audio_path} ‚Äî {type(e).__name__}: {e}")

100%|‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà‚ñà| 65/65 [10:04<00:00,  9.30s/it]


In [69]:
# üßæ Save segment index
pd.DataFrame(index).to_csv("./manifest/preprocessed-segments-index.csv", index=False)
print(f"‚úÖ Saved {len(index)} grouped segments to '{out_dir}'")

‚úÖ Saved 33116 grouped segments to './preprocessed_segments'


In [11]:
# üß† OPTIONAL: Load as HuggingFace Dataset
from datasets import Dataset, IterableDataset
# üß† Load as lazy HuggingFace IterableDataset
def generator():
    data_manifesto = pd.read_csv("./manifest/preprocessed-segments-index.csv")
    for _, row in data_manifesto.iterrows():
        data = np.load(row.npz_path, allow_pickle=True)
        yield {
            "audio": {"array": data["audio"], "sampling_rate": 16000},
            "text": str(data["text"]),
            "lang": str(data["lang"]),
            "start": float(data["start"]),
            "end": float(data["end"])
        }

hf_dataset = IterableDataset.from_generator(generator)
ds_ja = hf_dataset.filter(lambda x: x["lang"] == "ja")
ds_en = hf_dataset.filter(lambda x: x["lang"] == "en")
print("‚úÖ Lazy IterableDataset ready")

‚úÖ Lazy IterableDataset ready


In [12]:
pd.read_csv("./manifest/preprocessed-segments-index.csv")

Unnamed: 0,key,npz_path,lang
0,channel_1_mSUlRMOlsrA_en_0,./preprocessed_segments/channel_1_mSUlRMOlsrA_...,en
1,channel_1_mSUlRMOlsrA_en_1,./preprocessed_segments/channel_1_mSUlRMOlsrA_...,en
2,channel_1_mSUlRMOlsrA_en_2,./preprocessed_segments/channel_1_mSUlRMOlsrA_...,en
3,channel_1_mSUlRMOlsrA_en_3,./preprocessed_segments/channel_1_mSUlRMOlsrA_...,en
4,channel_1_mSUlRMOlsrA_en_4,./preprocessed_segments/channel_1_mSUlRMOlsrA_...,en
...,...,...,...
33111,channel_1_OqGhuCkr3VE_ja_288,./preprocessed_segments/channel_1_OqGhuCkr3VE_...,ja
33112,channel_1_OqGhuCkr3VE_ja_289,./preprocessed_segments/channel_1_OqGhuCkr3VE_...,ja
33113,channel_1_OqGhuCkr3VE_ja_290,./preprocessed_segments/channel_1_OqGhuCkr3VE_...,ja
33114,channel_1_OqGhuCkr3VE_ja_291,./preprocessed_segments/channel_1_OqGhuCkr3VE_...,ja


In [13]:
# üîä Play a sample
from IPython.display import Audio

for i, sample in enumerate(ds_ja):
    print(f"{sample['lang']}, {sample['start']:.2f}-{sample['end']:.2f}: {sample['text']}")
    display(Audio(sample["audio"]["array"], rate=16000))
    if i == 10:
        break

ja, 18.50-23.56: „ÅäÂá∫„Åã„Åë„Å´Ë°å„Åè„ÅÆ? „Åù„Å£„Åã„Åù„Å£„Åã


ja, 24.57-25.42: „ÅÑ„ÅÑ„Çà


ja, 26.75-32.11: „É°„Ç§„ÇØ„Åó„Å¶„ÅÇ„Åí„Çã„Å≠


ja, 36.64-42.99: „Åß„ÇÇ‰ªä„ÅÆ„Åæ„Åæ„Åß„ÇÇ„Å®„Å£„Å¶„ÇÇÂèØÊÑõ„ÅÑ„Åë„Å©„Å≠


ja, 45.72-52.45: „Åù„Å£„Åã„Åï„Åô„Åå„Å´„Åô„Å£„Å¥„Çì„ÅÆ„Åæ„Åæ„Åò„ÇÉÂ´å„Å†„Çà„Å≠


ja, 52.45-57.31: ÁßÅ„ÅØ„Åô„Å£„Å¥„Åß„ÇÇ„ÇÅ„Å°„ÇÉ„Åè„Å°„ÇÉÂèØÊÑõ„ÅÑ„Å®ÊÄù„ÅÜ„Åë„Å©„Å≠


ja, 60.00-64.98: ÂàÜ„Åã„Å£„ÅüÂàÜ„Åã„Å£„Åü Êó©„Åè„É°„Ç§„ÇØ„Åô„Çã„Çà


ja, 67.14-74.96: ÁßÅ„ÅåÂêõ„ÅÆ„Åì„Å®1Áï™„Çà„ÅèÁü•„Å£„Å¶„Çã„Åã„Çâ Â§öÂàÜ„Åô„Å£„Åî„ÅèÁ¥†Êïµ„Å´‰ªï‰∏ä„Åí„Çâ„Çå„Çã„Å®ÊÄù„ÅÜ


ja, 76.00-77.12: ‰ªª„Åõ„Å¶


ja, 81.88-88.24: „Åò„ÇÉ„ÅÇ„ÄÄ„ÅäÈ°î„ÅØ„Åì„Å£„Å°„Å´Âêë„Åë„Å¶


ja, 89.28-93.09: „Åì„Åì„Åß„Åò„Å£„Å®„Åó„Å¶„Å¶„Å≠


In [42]:
hf_dataset = hf_dataset.filter(lambda sample: sample["lang"].startswith("j"))

In [47]:
from IPython.display import Audio

def play_segment(dataset, index):
    sample = dataset[index]
    print(f"{sample['lang'], sample['text'], sample['start'], sample['end']}")
    return Audio(sample["audio"]["array"], rate=sample["audio"]["sampling_rate"])

play_segment(ds_ja, 1)

NotImplementedError: Subclasses of Dataset should implement __getitem__.