In [None]:
import os
from google.colab import userdata

# Repository coordinates; the access token is read through google.colab's
# `userdata.get` rather than being written into the notebook.
GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME = "tecGrwTechnical"
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
project_dir = f"/content/{REPO_NAME}"

# Reuse an existing checkout when the directory is already there; otherwise
# clone it. Only the update branch changes the working directory here.
if os.path.exists(project_dir):
    print("Updating project repo...")
    %cd {project_dir}
    !git pull
else:
    print("Cloning project repo...")
    # NOTE(review): the token is embedded in the clone URL, so git will
    # presumably keep it in the checkout's remote configuration and it may
    # show up in command output on failure -- confirm this is acceptable.
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}


In [None]:
import os
# Switch into the checkout under /content; later cells import from `src.`,
# which presumably relies on this being the working directory.
os.chdir(os.path.join("/content", REPO_NAME))
print("Working directory:", os.getcwd())

In [None]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml pandas soundfile speechbrain


In [None]:
from pathlib import Path
from src.loader import load_config, get_hf_token
from src.preprocess import run_preprocessing_pipeline

# Load the project configuration; the code below reads it with `.get(...)`
# for the TRAIN_METADATA, VAL_METADATA and PROCESSED_DIR keys.
config = load_config()


def resolve_path(cfg, key):
    """Resolve the path stored in the config under ``key``.

    The configured path is returned unchanged when it exists on disk.
    Otherwise its file name is re-rooted under ``cfg["PROCESSED_DIR"]`` and
    that candidate is returned without checking whether it exists.

    Args:
        cfg: Config object supporting ``.get(key)`` / ``.get(key, default)``.
        key: Name of the config entry holding the path.

    Returns:
        A ``Path``, or ``None`` when the entry is missing or empty.
    """
    raw = cfg.get(key)
    # An empty value would turn into Path("."), which always exists and would
    # be reported as a found path, so treat it the same as a missing key.
    if not raw:
        return None

    configured = Path(raw)
    if configured.exists():
        return configured

    # `or ""` also covers a PROCESSED_DIR that is present but set to None,
    # which would otherwise make Path(None) raise TypeError.
    processed_dir = cfg.get("PROCESSED_DIR") or ""
    return Path(processed_dir) / configured.name


def metadata_non_empty(path_obj):
    """Return True only when ``path_obj`` is an existing, non-empty regular file.

    Args:
        path_obj: A ``Path`` or ``None``.

    Returns:
        ``False`` for ``None``, missing paths, directories and zero-byte
        files; ``True`` otherwise.
    """
    # is_file() rather than exists(): a directory also reports a positive
    # st_size on common filesystems and must not count as usable metadata.
    return path_obj is not None and path_obj.is_file() and path_obj.stat().st_size > 0


# First pass: see where the train/validation metadata currently resolve to.
train_meta, val_meta = (
    resolve_path(config, key) for key in ("TRAIN_METADATA", "VAL_METADATA")
)

# Run preprocessing only when at least one of the two files is unusable.
if not (metadata_non_empty(train_meta) and metadata_non_empty(val_meta)):
    print("Processed metadata missing or empty. Running preprocessing pipeline...")
    token = get_hf_token()
    stats = run_preprocessing_pipeline(config, token)
    print("Preprocessing stats:", stats)
else:
    print(f"Processed metadata found: {train_meta} and {val_meta}")

# Second pass: resolve_path checks the disk, so its answer can differ once
# preprocessing has run.
train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")
print("TRAIN_METADATA:", train_meta)
print("VAL_METADATA:", val_meta)


In [None]:
import torch
from src.training.speecht5_pipeline import (
    TTSDataCollatorWithPadding,
    build_processed_datasets,
    get_speaker_embedding,
    load_speecht5_components,
    load_train_val_datasets,
    print_preprocessed_batch_debug,
)

# Reload the config and pick the compute device: CUDA when torch reports it
# as available, CPU otherwise.
config = load_config()
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
print("Device:", device)

# Load both splits and report how many samples each holds.
train_ds, val_ds = load_train_val_datasets(config)
print("Train samples:", len(train_ds))
print("Val samples:", len(val_ds))

# Model components, plus a speaker embedding obtained from the training split.
processor, model, vocoder = load_speecht5_components(device)
speaker_embedding = get_speaker_embedding(train_ds, device)

# Convert the raw splits into their processed counterparts.
train_proc, val_proc = build_processed_datasets(
    train_ds=train_ds,
    val_ds=val_ds,
    processor=processor,
    model=model,
    speaker_embedding=speaker_embedding,
)

# Build the collator and print the debug view of a preprocessed batch.
data_collator = TTSDataCollatorWithPadding(processor, model)
print_preprocessed_batch_debug(train_proc, data_collator)


In [None]:
from src.training.speecht5_pipeline import build_trainer_bundle

# Build the trainer from the objects prepared in the previous cell.
bundle = build_trainer_bundle(
    model=model,
    processor=processor,
    train_proc=train_proc,
    val_proc=val_proc,
    data_collator=data_collator,
    output_dir="speecht5_finetuned",
)
output_dir = bundle.output_dir
trainer = bundle.trainer

# Echo the settings that govern checkpoint selection.
print("TrainingArguments metric_for_best_model:", trainer.args.metric_for_best_model)
print("TrainingArguments load_best_model_at_end:", trainer.args.load_best_model_at_end)
print("TrainingArguments save_strategy:", trainer.args.save_strategy)
# The attribute is looked up under either name, whichever this
# TrainingArguments object exposes.
eval_attr = "eval_strategy" if hasattr(trainer.args, "eval_strategy") else "evaluation_strategy"
print(f"TrainingArguments {eval_attr}:", getattr(trainer.args, eval_attr))

# Evaluate once before training so a missing best-model metric is detected
# now rather than at the first checkpoint.
print("Running a pre-train evaluate() sanity check...")
eval_metrics = trainer.evaluate()
print("Eval metrics keys:", sorted(eval_metrics.keys()))

# metric_for_best_model may be None when no metric is configured; guard that
# case instead of calling .startswith on None.
metric_name = trainer.args.metric_for_best_model
metric_key = None
if metric_name:
    metric_key = metric_name if metric_name.startswith("eval_") else f"eval_{metric_name}"

if metric_key is None:
    print("No metric_for_best_model configured; skipping best-model metric check.")
elif metric_key not in eval_metrics:
    print(f"WARNING: '{metric_key}' missing from eval metrics. Disabling load_best_model_at_end for this run.")
    trainer.args.load_best_model_at_end = False
    trainer.args.metric_for_best_model = None

trainer.train()


In [None]:
from pathlib import Path
from IPython.display import Audio, display
import torch
from src.training.speecht5_inference import (
    ROBUST_FULL_PROFILE,
    configure_generation_for_latency,
    export_int8_deployment_bundle,
    get_directory_size_mb,
    load_finetuned_model,
    save_generation_config,
    synthesize_test_sentences,
)

# Optional GPU tuning: enable cuDNN benchmark mode and request the "high"
# float32 matmul precision setting. Both are skipped entirely without CUDA.
if torch.cuda.is_available():
    torch.backends.cudnn.benchmark = True
    try:
        torch.set_float32_matmul_precision("high")
    except Exception as exc:
        # Still best-effort (the run continues), but report why the setting
        # was skipped instead of hiding the failure.
        print(f"NOTE: could not set float32 matmul precision ({exc!r}); continuing with defaults.")

# Fixed set of sentences that are synthesized and played back below.
required_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha.",
]

# Use the best checkpoint recorded on the trainer state, falling back to the
# trainer output directory when none was recorded.
# NOTE(review): the fallback assumes a loadable model exists directly in
# `output_dir` -- confirm build_trainer_bundle / training guarantees that.
best_ckpt = trainer.state.best_model_checkpoint or output_dir
print("Using checkpoint:", best_ckpt)
finetuned_model = load_finetuned_model(best_ckpt, device)
configure_generation_for_latency(finetuned_model, max_length=900)
# Called with the checkpoint path before the half-precision cast below.
save_generation_config(finetuned_model, best_ckpt)

# On a CUDA device, cast both the fine-tuned model and the vocoder to half
# precision. This rebinds the notebook-level `vocoder` name.
# NOTE(review): `speaker_embedding` is not cast here; presumably the
# synthesis helper aligns dtypes -- confirm.
if getattr(device, "type", "cpu") == "cuda":
    finetuned_model = finetuned_model.half()
    vocoder = vocoder.half()

# Synthesize every required sentence; the result is used below as one audio
# file path per sentence.
audio_paths = synthesize_test_sentences(
    model=finetuned_model,
    processor=processor,
    vocoder=vocoder,
    speaker_embedding=speaker_embedding,
    sentences=required_sentences,
    output_dir="evaluation/final_required_sentences",
    device=device,
    sample_rate=16000,
    profile=ROBUST_FULL_PROFILE,
    retry_for_completeness=True,
    min_duration_per_word=0.24,
)

# Print each sentence with its file path and embed an inline audio player.
# The zip assumes `audio_paths` is in the same order as `required_sentences`
# -- TODO confirm.
for text, audio_path in zip(required_sentences, audio_paths):
    print("Saved:", audio_path)
    print("Text:", text)
    display(Audio(filename=str(audio_path), autoplay=False))

# Export the deployment bundle from the fine-tuned model and measure the
# size of the directory it returns.
final_bundle_dir = export_int8_deployment_bundle(
    model=finetuned_model,
    processor=processor,
    output_dir="speecht5_deployment_int8_bundle",
    base_model_id="microsoft/speecht5_tts",
    source_checkpoint=best_ckpt,
)
bundle_size_mb = get_directory_size_mb(final_bundle_dir)

print(f"Final INT8 deployment bundle size: {bundle_size_mb:.2f} MB")
# The size target is a strict upper bound of 200 MB.
if bundle_size_mb < 200.0:
    print("Final bundle target (< 200 MB):", "PASS")
else:
    print("Final bundle target (< 200 MB):", "FAIL")



In [None]:
from src.training.speecht5_inference import BENCHMARK_FAST_PROFILE, measure_latency

# Single sentence used for every timed run.
benchmark_sentence = "Muraho ndakwifurije umunsi mwiza kandi amahoro menshi uyu munsi rwose."

print("Inference device:", device)
if getattr(device, "type", "cpu") != "cuda":
    print("WARNING: running on CPU will usually exceed the 800 ms latency target.")

# measure_latency returns the per-run latencies plus mean / P50 / P95,
# unpacked here in that order.
latencies_ms, mean_ms, p50_ms, p95_ms = measure_latency(
    model=finetuned_model,
    processor=processor,
    vocoder=vocoder,
    speaker_embedding=speaker_embedding,
    sentence=benchmark_sentence,
    device=device,
    profile=BENCHMARK_FAST_PROFILE,
    warmup_runs=3,
    num_runs=20,
)

# Per-run listing, numbered from 1.
for idx, ms in enumerate(latencies_ms, 1):
    print(f"Run {idx:02d}: {ms:.2f} ms")

print("Benchmark sentence:", benchmark_sentence)
print(f"Mean latency: {mean_ms:.2f} ms")
print(f"P50 latency: {p50_ms:.2f} ms")
print(f"P95 latency: {p95_ms:.2f} ms")
# The pass/fail verdict is based on the mean only.
if mean_ms < 800.0:
    print("Latency target (< 800 ms, mean):", "PASS")
else:
    print("Latency target (< 800 ms, mean):", "FAIL")

