In [10]:
# 📦 Dependencies
# !uv pip install -U transformers datasets evaluate wandb

In [11]:
# 📚 Imports
import os
import numpy as np
import pandas as pd
import torch
from datasets import DatasetDict, IterableDataset
from transformers import WhisperProcessor, WhisperForConditionalGeneration, Seq2SeqTrainer, Seq2SeqTrainingArguments
import evaluate
import wandb
from sklearn.model_selection import train_test_split

In [27]:
# 🔐 WANDB setup
os.environ["WANDB_PROJECT"] = "whisperlaz-asr-ja"
os.environ["WANDB_LOG_MODEL"] = "checkpoint"
!wandb login

[34m[1mwandb[0m: Currently logged in as: [33mhrnph[0m to [32mhttps://api.wandb.ai[0m. Use [1m`wandb login --relogin`[0m to force relogin


In [28]:
# 📂 Load preprocessed segment index
df = pd.read_csv("./manifest/preprocessed-segments-index.csv")
df = df[df.lang == "ja"].reset_index(drop=True)
print(f"Loaded {len(df)} JA training samples")

Loaded 16978 JA training samples


In [29]:
# 🔀 Split into train, val, test
train_df, test_df = train_test_split(df, test_size=0.1, random_state=42)
train_df, val_df = train_test_split(train_df, test_size=0.1, random_state=42)
print(f"Train: {len(train_df)}, Val: {len(val_df)}, Test: {len(test_df)}")

Train: 13752, Val: 1528, Test: 1698


In [30]:
# 🧠 Load model + processor
model_name = "openai/whisper-small"
processor = WhisperProcessor.from_pretrained(model_name)
model = WhisperForConditionalGeneration.from_pretrained(model_name)
model.config.forced_decoder_ids = None
model.config.suppress_tokens = []

In [31]:
# 🔄 Preprocessing
def preprocess(row):
    data = np.load(row.npz_path, allow_pickle=True)
    audio = {"array": data["audio"], "sampling_rate": 16000}
    text = str(data["text"])
    input_features = processor(audio, sampling_rate=16000).input_features[0]
    labels = processor.tokenizer(text).input_ids
    return {"input_features": input_features, "labels": labels}

In [32]:
# 🧠 Dataset generators
def make_generator(df):
    for _, row in df.iterrows():
        try:
            data = np.load(row.npz_path, allow_pickle=True)
            yield {
                "audio": {"array": data["audio"], "sampling_rate": 16000},
                "text": str(data["text"]),
                "start": float(data["start"]),
                "end": float(data["end"])
            }
        except Exception as e:
            print(f"Skip: {row.npz_path} — {type(e).__name__}: {e}")

In [33]:
# 🧱 Build lazy datasets
dataset = DatasetDict({
    "train": IterableDataset.from_generator(lambda: map(preprocess, train_df.to_dict(orient="records"))),
    "val": IterableDataset.from_generator(lambda: map(preprocess, val_df.to_dict(orient="records"))),
    "test": IterableDataset.from_generator(lambda: map(preprocess, test_df.to_dict(orient="records")))
})

In [34]:
# 🧪 Evaluation metric
metric = evaluate.load("wer")

In [35]:
def compute_metrics(pred):
    pred_ids = pred.predictions
    label_ids = pred.label_ids
    pred_str = processor.tokenizer.batch_decode(pred_ids, skip_special_tokens=True)
    label_str = processor.tokenizer.batch_decode(label_ids, skip_special_tokens=True)
    return {"wer": metric.compute(predictions=pred_str, references=label_str)}

In [36]:
training_args = Seq2SeqTrainingArguments(
    output_dir="./whisper-ja-asmr",
    per_device_train_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=1e-5,
    num_train_epochs=10,
    logging_steps=10,
    save_steps=200,
    fp16=torch.cuda.is_available(),
    report_to="wandb",
    predict_with_generate=True
)

In [37]:
# 🏋️ Train
trainer = Seq2SeqTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset["train"],
    eval_dataset=dataset["val"],
    tokenizer=processor.feature_extractor,
    compute_metrics=compute_metrics
)

  trainer = Seq2SeqTrainer(


ValueError: The train_dataset does not implement __len__, max_steps has to be specified. The number of steps needs to be known in advance for the learning rate scheduler.

In [None]:
trainer.train()