In [None]:
import os
from google.colab import userdata

# Repository coordinates. The access token is fetched at runtime through
# google.colab's `userdata` instead of being written into the notebook.
GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME = "tecGrwTechnical"
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
project_dir = f"/content/{REPO_NAME}"

# Reuse an existing checkout when project_dir is already present; otherwise
# clone the repository into it.
if os.path.exists(project_dir):
    print("Updating project repo...")
    # Change into the checkout first so that the pull below runs inside it.
    %cd {project_dir}
    !git pull
else:
    print("Cloning project repo...")
    # NOTE(review): the token is interpolated into the clone URL, so it ends up
    # in the shell command line and, presumably, in the checkout's remote
    # configuration — confirm this is acceptable for this environment.
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}


In [None]:
import os
# Make the repository checkout under /content the working directory.
# Presumably this is what lets later cells import the `src` package and use
# repo-relative paths — confirm.
os.chdir(f"/content/{REPO_NAME}")
print("Working directory:", os.getcwd())


In [None]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml pandas soundfile speechbrain


In [None]:
from pathlib import Path
from src.loader import load_config, get_hf_token
from src.preprocess import run_preprocessing_pipeline

# Load the project configuration via src.loader.load_config. The keys read
# further down in this cell (TRAIN_METADATA, VAL_METADATA, PROCESSED_DIR) are
# looked up in this object.
config = load_config()


def resolve_path(cfg, key):
    """Resolve the filesystem path stored under ``key`` in ``cfg``.

    The configured value is used as-is when it exists on disk. Otherwise the
    file name is re-anchored under ``cfg["PROCESSED_DIR"]``; that fallback is
    returned whether or not it exists, so callers must check it themselves.

    Args:
        cfg: Mapping-like configuration object supporting ``.get``.
        key: Name of the configuration entry holding the path.

    Returns:
        A ``Path``, or ``None`` when the entry is missing, null or empty.
    """
    raw = cfg.get(key)
    # An empty string would otherwise become Path("."), which always exists
    # and would be mistaken for a valid metadata location.
    if not raw:
        return None

    candidate = Path(raw)
    if candidate.exists():
        return candidate

    # `or ""` covers a PROCESSED_DIR that is present but null in the config;
    # Path(None) would raise TypeError.
    processed_dir = cfg.get("PROCESSED_DIR") or ""
    return Path(processed_dir) / candidate.name


def metadata_non_empty(path_obj):
    """Return True only when ``path_obj`` is set, exists and has a size above zero."""
    if path_obj is None or not path_obj.exists():
        return False
    return path_obj.stat().st_size > 0


# Resolve both metadata locations before deciding whether to preprocess.
train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")

# Preprocessing is only triggered when at least one metadata file is absent
# or empty; otherwise the existing files are reported and reused.
if not all(metadata_non_empty(meta) for meta in (train_meta, val_meta)):
    print("Processed metadata missing or empty. Running preprocessing pipeline...")
    token = get_hf_token()
    stats = run_preprocessing_pipeline(config, token)
    print("Preprocessing stats:", stats)
else:
    print(f"Processed metadata found: {train_meta} and {val_meta}")

# resolve_path depends on which files exist on disk, so resolve again now that
# preprocessing may have run, then report the final locations.
train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")
print("TRAIN_METADATA:", train_meta)
print("VAL_METADATA:", val_meta)


In [None]:
import torch
from src.training.speecht5_pipeline import (
    TTSDataCollatorWithPadding,
    build_processed_datasets,
    get_speaker_embedding,
    load_speecht5_components,
    load_train_val_datasets,
    print_preprocessed_batch_debug,
)

# Reload the configuration and select the compute device: CUDA when torch
# reports it as available, CPU otherwise.
config = load_config()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the train/validation datasets described by the config and report sizes.
train_ds, val_ds = load_train_val_datasets(config)
print("Train samples:", len(train_ds))
print("Val samples:", len(val_ds))

# Load the SpeechT5 processor, model and vocoder for the chosen device, and
# obtain a speaker embedding from the training set.
# NOTE(review): the type and shape of `speaker_embedding` are decided inside
# get_speaker_embedding — confirm before reusing it elsewhere.
processor, model, vocoder = load_speecht5_components(device)
speaker_embedding = get_speaker_embedding(train_ds, device)

# Build the processed train/validation datasets from the raw ones; presumably
# this tokenizes text and extracts audio targets — confirm in the pipeline module.
train_proc, val_proc = build_processed_datasets(
    train_ds=train_ds,
    val_ds=val_ds,
    processor=processor,
    model=model,
    speaker_embedding=speaker_embedding,
)

# Create the collator used for batching and print a debug view of the
# processed training data run through it.
data_collator = TTSDataCollatorWithPadding(processor, model)
print_preprocessed_batch_debug(train_proc, data_collator)


In [None]:
from src.training.speecht5_pipeline import build_trainer_bundle

# Assemble the trainer and its output directory from the objects prepared in
# the previous cell.
bundle = build_trainer_bundle(
    model=model,
    processor=processor,
    train_proc=train_proc,
    val_proc=val_proc,
    data_collator=data_collator,
    output_dir="speecht5_finetuned",
)
trainer = bundle.trainer
output_dir = bundle.output_dir

# The evaluation-strategy attribute is looked up under either of two names,
# depending on which one this TrainingArguments object exposes.
training_args = trainer.args
eval_attr = "eval_strategy" if hasattr(training_args, "eval_strategy") else "evaluation_strategy"

# Echo the checkpoint-selection settings before any training happens.
print("TrainingArguments metric_for_best_model:", training_args.metric_for_best_model)
print("TrainingArguments load_best_model_at_end:", training_args.load_best_model_at_end)
print("TrainingArguments save_strategy:", training_args.save_strategy)
print(f"TrainingArguments {eval_attr}:", getattr(training_args, eval_attr))

# Run one evaluation pass up front so a broken eval setup surfaces before
# training starts, and show which metric keys it produces.
print("Running a pre-train evaluate() sanity check...")
eval_metrics = trainer.evaluate()
print("Eval metrics keys:", sorted(eval_metrics))

trainer.train()


In [None]:
from pathlib import Path
import soundfile as sf
from transformers import SpeechT5ForTextToSpeech

# Fixed set of sentences that is synthesized below and reused by the latency
# benchmark in the following cell.
required_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha.",
]

# Prefer the best checkpoint recorded by the trainer; fall back to the output
# directory when none was recorded.
# NOTE(review): the fallback assumes output_dir contains a saved model — confirm.
best_ckpt = trainer.state.best_model_checkpoint or output_dir
print("Using checkpoint:", best_ckpt)
finetuned_model = SpeechT5ForTextToSpeech.from_pretrained(best_ckpt).to(device)

# Directory that receives the synthesized audio files (created if missing).
compare_dir = Path("evaluation/final_required_sentences")
compare_dir.mkdir(parents=True, exist_ok=True)

# Speaker embedding as a float32 tensor on `device`, with a new leading
# dimension added — presumably the batch axis generate_speech expects; confirm.
spk = torch.tensor(speaker_embedding, dtype=torch.float32).unsqueeze(0).to(device)


def synth_to_file(tts_model, text, out_path):
    """Synthesize ``text`` with ``tts_model`` and write the audio to ``out_path``.

    Relies on the notebook-level ``processor``, ``device``, ``spk`` and
    ``vocoder`` objects. The audio is written with a sample rate of 16000.
    """
    token_ids = processor(text=text, return_tensors="pt")["input_ids"].to(device)
    # Inference only: no gradients are needed for generation.
    with torch.no_grad():
        waveform = tts_model.generate_speech(token_ids, spk, vocoder=vocoder)
    sf.write(out_path, waveform.cpu().numpy(), 16000)


# Synthesize every required sentence with the fine-tuned model into a numbered
# file (sentence_01.wav, sentence_02.wav, ...) inside compare_dir.
for i, text in enumerate(required_sentences, start=1):
    out_path = compare_dir / f"sentence_{i:02d}.wav"
    synth_to_file(finetuned_model, text, str(out_path))
    print("Saved:", out_path)


In [None]:
import time
import numpy as np

# Measure end-to-end generation latency (text ids -> waveform) per sentence.
latencies_ms = []
spk = torch.tensor(speaker_embedding, dtype=torch.float32).unsqueeze(0).to(device)

# CUDA work is queued asynchronously, so without an explicit synchronize the
# wall clock can be read before the GPU has finished, under-reporting latency.
sync_cuda = torch.device(device).type == "cuda"

for text in required_sentences:
    # Tokenization and the host-to-device copy are deliberately left outside
    # the timed region, as in the original measurement.
    inputs = processor(text=text, return_tensors="pt")
    input_ids = inputs["input_ids"].to(device)

    if sync_cuda:
        torch.cuda.synchronize()  # drain pending work before starting the clock
    t0 = time.perf_counter()
    with torch.no_grad():
        _speech = finetuned_model.generate_speech(input_ids, spk, vocoder=vocoder)
    if sync_cuda:
        torch.cuda.synchronize()  # wait until the waveform is actually computed
    t1 = time.perf_counter()

    ms = (t1 - t0) * 1000.0
    latencies_ms.append(ms)
    print(f"Latency: {ms:.2f} ms | {text}")

# NaN (no measurements) compares False against the threshold and reports FAIL.
mean_ms = float(np.mean(latencies_ms)) if latencies_ms else float("nan")
print(f"Mean latency: {mean_ms:.2f} ms")
print("Latency target (< 800 ms):", "PASS" if mean_ms < 800.0 else "FAIL")
