In [None]:
import os
from google.colab import userdata

# Repository coordinates. The access token is read from Colab's user secrets
# instead of being hardcoded in the notebook.
GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME = "tecGrwTechnical"
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
# Local checkout location under /content; the next cell chdirs into the same path.
project_dir = f"/content/{REPO_NAME}"

# Re-running this cell refreshes an existing checkout instead of cloning again.
# Only the "update" branch changes the working directory (via %cd).
if os.path.exists(project_dir):
    print("Updating project repo...")
    %cd {project_dir}
    !git pull
else:
    print("Cloning project repo...")
    # NOTE(review): the token is embedded in the clone URL, so git presumably
    # keeps it in the checkout's remote config and may echo it in error
    # output -- confirm, and clear cell outputs before sharing the notebook.
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}


In [None]:
import os
# Switch to the repository root; presumably this is what lets the `src.*`
# imports in later cells resolve. REPO_NAME is defined in the clone cell, so
# this cell fails on a fresh kernel if that cell has not been run.
os.chdir(f"/content/{REPO_NAME}")
print("Working directory:", os.getcwd())

In [None]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml pandas soundfile speechbrain


In [None]:
from pathlib import Path
from src.loader import load_config, get_hf_token
from src.analytics import run_speaker_analysis
from src.preprocess import run_preprocessing_pipeline

# Load the project configuration and the access token through the project's
# loader helpers (their sources are not visible in this notebook).
config = load_config()
token = get_hf_token()

# Analytics run before preprocessing so the recommended speaker is printed up front.
print("Running speaker analytics before preprocessing...")
speaker_stats, best_speaker = run_speaker_analysis(config, token=token)
print("Recommended speaker from analytics:", best_speaker)
# Only the first ten rows of the stats are shown, not the whole table.
print(speaker_stats.head(10))

# The configured SELECTED_SPEAKER_ID takes precedence; the analytics pick is
# only the fallback when the key is absent. A mismatch is reported as a
# warning and does not stop the run.
# NOTE(review): int() raises if the key is present but null or non-numeric --
# confirm the config schema.
selected = int(config.get("SELECTED_SPEAKER_ID", best_speaker))
if int(best_speaker) != selected:
    print(f"WARNING: config SELECTED_SPEAKER_ID={selected}, analytics recommends {best_speaker}.")

def resolve_path(cfg, key):
    """Resolve the path stored under ``key`` in ``cfg``.

    The configured path is returned unchanged when it exists on disk.
    Otherwise its file name is re-rooted under ``cfg["PROCESSED_DIR"]``; that
    fallback is returned without checking that it exists, so callers must
    verify it themselves.

    Args:
        cfg: Config object supporting ``.get`` (e.g. a dict).
        key: Config key whose value is a path string or path-like object.

    Returns:
        A ``Path``, or ``None`` when the key is missing, null, or blank.
    """
    raw = cfg.get(key)
    # Path("") is ".", which always exists; treat a blank value as "not
    # configured" instead of silently resolving to the current directory.
    if raw is None or str(raw).strip() == "":
        return None
    configured = Path(raw)
    if configured.exists():
        return configured
    # `or ""` also covers a PROCESSED_DIR key that is present but null, which
    # would otherwise make Path(None) raise TypeError.
    processed_dir = cfg.get("PROCESSED_DIR") or ""
    return Path(processed_dir) / configured.name


def metadata_non_empty(path_obj):
    """Return True only when ``path_obj`` is an existing regular file with size > 0.

    Args:
        path_obj: A ``pathlib.Path`` (or ``None`` when no path was configured).

    Returns:
        ``False`` for ``None``, missing paths, directories, and zero-byte
        files; ``True`` otherwise.
    """
    if path_obj is None:
        return False
    # is_file() rather than exists(): a directory reports a non-zero st_size on
    # most filesystems and would otherwise be mistaken for populated metadata.
    return path_obj.is_file() and path_obj.stat().st_size > 0


# Resolve both metadata locations from the config in one pass (train first, then val).
train_meta, val_meta = (
    resolve_path(config, meta_key) for meta_key in ("TRAIN_METADATA", "VAL_METADATA")
)

# Preprocessing is skipped only when both files are already present and populated.
if not (metadata_non_empty(train_meta) and metadata_non_empty(val_meta)):
    print("Processed metadata missing or empty. Running preprocessing pipeline...")
    stats = run_preprocessing_pipeline(config, token)
    print("Preprocessing stats:", stats)
else:
    print(f"Processed metadata found: {train_meta} and {val_meta}")

# Resolve again: the result can differ now if preprocessing just wrote the files.
train_meta, val_meta = (
    resolve_path(config, meta_key) for meta_key in ("TRAIN_METADATA", "VAL_METADATA")
)
print("TRAIN_METADATA:", train_meta)
print("VAL_METADATA:", val_meta)


In [None]:
import torch
from src.training.speecht5_pipeline import (
    TTSDataCollatorWithPadding,
    build_processed_datasets,
    get_speaker_embedding,
    load_speecht5_components,
    load_train_val_datasets,
    print_preprocessed_batch_debug,
)

# Reload the configuration for the training stage. load_config itself is
# imported in the preprocessing cell, so that cell must have run first.
config = load_config()
# Prefer the GPU when one is available; this device is passed to the
# model-loading and speaker-embedding helpers below.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print("Device:", device)

# Load the train/validation splits described by the config and report their sizes.
train_ds, val_ds = load_train_val_datasets(config)
print("Train samples:", len(train_ds))
print("Val samples:", len(val_ds))

# processor, model and vocoder are reused by the training, synthesis,
# benchmarking and export cells further down.
processor, model, vocoder = load_speecht5_components(device)
# The speaker embedding is derived from the training split only.
speaker_embedding = get_speaker_embedding(train_ds, device)

# Convert both splits into model-ready datasets, passing the same speaker embedding for both.
train_proc, val_proc = build_processed_datasets(
    train_ds=train_ds,
    val_ds=val_ds,
    processor=processor,
    model=model,
    speaker_embedding=speaker_embedding,
)

# Build the padding collator and print debug information for a batch
# before any training starts.
data_collator = TTSDataCollatorWithPadding(processor, model)
print_preprocessed_batch_debug(train_proc, data_collator)


In [None]:
from src.training.speecht5_pipeline import build_trainer_bundle, run_stagewise_training

# Assemble the trainer bundle from the objects prepared in the previous cell.
bundle = build_trainer_bundle(
    model=model,
    processor=processor,
    train_proc=train_proc,
    val_proc=val_proc,
    data_collator=data_collator,
    config=config,
    output_dir="speecht5_finetuned",
)
# Later cells read `output_dir` and `trainer` directly, so expose them as globals.
output_dir = bundle.output_dir
trainer = bundle.trainer

# Echo the checkpoint-selection settings so a misconfiguration is visible before training.
print("TrainingArguments metric_for_best_model:", trainer.args.metric_for_best_model)
print("TrainingArguments load_best_model_at_end:", trainer.args.load_best_model_at_end)
print("TrainingArguments save_strategy:", trainer.args.save_strategy)
# The attribute is named `eval_strategy` or `evaluation_strategy` depending on
# the installed transformers version; use whichever one exists.
eval_attr = "eval_strategy" if hasattr(trainer.args, "eval_strategy") else "evaluation_strategy"
print(f"TrainingArguments {eval_attr}:", getattr(trainer.args, eval_attr))
print("Running a pre-train evaluate() sanity check...")
eval_metrics = trainer.evaluate()
print("Eval metrics keys:", sorted(eval_metrics.keys()))

# metric_for_best_model may be None (it is optional in TrainingArguments), so
# guard before calling str methods on it; previously this raised AttributeError.
metric_name = trainer.args.metric_for_best_model
metric_key = None
if metric_name:
    # Evaluation metrics are reported with an "eval_" prefix.
    metric_key = metric_name if metric_name.startswith("eval_") else f"eval_{metric_name}"
    if metric_key not in eval_metrics:
        # Best-checkpoint selection would fail on a metric that evaluation
        # never produces, so turn it off instead of crashing later in training.
        print(f"WARNING: '{metric_key}' missing from eval metrics. Disabling load_best_model_at_end for this run.")
        trainer.args.load_best_model_at_end = False
        trainer.args.metric_for_best_model = None

train_result = run_stagewise_training(bundle)


In [None]:
from pathlib import Path
from IPython.display import Audio, display
from src.training.speecht5_inference import (
    configure_generation_for_latency,
    export_int8_deployment_package,
    export_final_model_package,
    get_directory_size_mb,
    load_finetuned_model,
    save_generation_config,
    synthesize_test_sentences,
)

# Evaluation sentences fed to the synthesizer below and, truncated to ten
# words, to the latency benchmark in the next cell.
# NOTE(review): "guheramu gitondo" in the sixth sentence may be a typo for
# "guhera mu gitondo" -- confirm with a Kinyarwanda speaker before relying on it.
required_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha.",
    # Additional sentences added by the notebook author:
    "Dukora guheramu gitondo kugeza nimugoroba. Kuwa mbere kugeza kuwa gatanu, saa moya za mu gitondo kugeza saa kumi n'ebyiri z'umugoroba.",
    "Tubakirana urugwiro kandi twiteguye kugufasha mu buryo bwose dushoboye.",
    "Murakoze kandi turagushimira kuba waratuganye.",
]


# Use the best checkpoint recorded by the trainer; fall back to the training
# output directory when none was recorded.
best_ckpt = trainer.state.best_model_checkpoint or output_dir
print("Using checkpoint:", best_ckpt)
finetuned_model = load_finetuned_model(best_ckpt, device)
# Apply the generation settings, then save them next to the checkpoint.
# NOTE(review): the meaning of max_length=600 is defined by the project helper -- confirm there.
configure_generation_for_latency(finetuned_model, max_length=600)
save_generation_config(finetuned_model, best_ckpt)

# Synthesize every sentence to evaluation/final_required_sentences.
# NOTE(review): the fast/safe maxlenratio pair and retry_for_completeness are
# interpreted inside the project helper; presumably a fast attempt followed by
# a longer retry -- confirm in speecht5_inference.
audio_paths = synthesize_test_sentences(
    model=finetuned_model,
    processor=processor,
    vocoder=vocoder,
    speaker_embedding=speaker_embedding,
    sentences=required_sentences,
    output_dir="evaluation/final_required_sentences",
    device=device,
    sample_rate=16000,
    fast_maxlenratio=9.0,
    safe_maxlenratio=14.0,
    retry_for_completeness=True,
)

# Pair each sentence with its audio file and embed a player for listening.
# zip() stops at the shorter sequence, so a missing audio path drops the
# trailing sentences silently.
for text, audio_path in zip(required_sentences, audio_paths):
    print("Saved:", audio_path)
    print("Text:", text)
    display(Audio(filename=str(audio_path), autoplay=False))

# Export the INT8 deployment package and report its on-disk size.
# The final cell removes and re-exports this same directory before downloading it.
int8_dir = export_int8_deployment_package(
    finetuned_model,
    processor,
    "speecht5_int8_deployment",
)
int8_size_mb = get_directory_size_mb(int8_dir)

print(f"\nINT8 deployment package size: {int8_size_mb:.2f} MB")

In [None]:
from src.training.speecht5_inference import measure_latency

def to_ten_words(text):
    """Return the first ten whitespace-separated words of ``text``.

    ``text`` is converted with ``str()`` first, and the kept words are joined
    with single spaces, so runs of whitespace collapse.
    """
    leading_words = str(text).split()[:10]
    return " ".join(leading_words)

# Truncate every evaluation sentence to at most ten words and drop any that
# end up empty.
latency_sentences = [
    shortened
    for shortened in (to_ten_words(sentence) for sentence in required_sentences)
    if shortened
]

print("Inference device:", device)
# Fall back to "cpu" when the device object exposes no `.type` attribute.
if getattr(device, "type", "cpu") != "cuda":
    print("WARNING: running on CPU will usually exceed the 800 ms latency target.")

print("Benchmarking with <=10-word sentences:")
for sentence in latency_sentences:
    word_count = len(sentence.split())
    print(f"- {sentence} ({word_count} words)")

# Time the fine-tuned model on the shortened sentences; the helper returns the
# per-sentence latencies together with their mean, both in milliseconds.
latencies_ms, mean_ms = measure_latency(
    model=finetuned_model,
    processor=processor,
    vocoder=vocoder,
    speaker_embedding=speaker_embedding,
    sentences=latency_sentences,
    device=device,
    warmup_runs=2,
    add_leading_prompt=False,
    cache_inputs=True,
)

for elapsed_ms, sentence in zip(latencies_ms, latency_sentences):
    print(f"Latency: {elapsed_ms:.2f} ms | {sentence}")

print(f"Mean latency: {mean_ms:.2f} ms")

In [None]:
from google.colab import files
import shutil
import torch
from pathlib import Path

def directory_size_mb(path):
    """Return the combined size, in MiB, of every regular file under ``path``.

    The directory tree is walked recursively; entries that are not files do
    not count towards the total.
    """
    bytes_per_mb = 1024.0 * 1024.0
    total_bytes = 0
    for entry in Path(path).rglob("*"):
        if entry.is_file():
            total_bytes += entry.stat().st_size
    return total_bytes / bytes_per_mb

# Export clean FP32 inference package
# Any earlier export is removed first so stale files never end up in the archive.
fp32_dir = Path("speecht5_fp32_infer")
if fp32_dir.exists():
    shutil.rmtree(fp32_dir)
export_final_model_package(finetuned_model, processor, fp32_dir)
fp32_zip = shutil.make_archive("speecht5_fp32_infer", "zip", root_dir=str(fp32_dir))
fp32_dir_size_mb = directory_size_mb(fp32_dir)
fp32_zip_size_mb = Path(fp32_zip).stat().st_size / (1024.0 * 1024.0)
print(f"FP32 directory size: {fp32_dir_size_mb:.2f} MB")
# The zip size was computed but never shown; report it since the zip is what gets downloaded.
print(f"FP32 zip size: {fp32_zip_size_mb:.2f} MB")
files.download(fp32_zip)

# Export clean INT8 inference package
# This replaces the INT8 directory written by the synthesis cell with a fresh export.
int8_dir = Path("speecht5_int8_deployment")
if int8_dir.exists():
    shutil.rmtree(int8_dir)
export_int8_deployment_package(finetuned_model, processor, int8_dir)
int8_zip = shutil.make_archive("speecht5_int8_deployment", "zip", root_dir=str(int8_dir))
int8_dir_size_mb = directory_size_mb(int8_dir)
int8_zip_size_mb = Path(int8_zip).stat().st_size / (1024.0 * 1024.0)
print(f"INT8 directory size: {int8_dir_size_mb:.2f} MB")
print(f"INT8 zip size: {int8_zip_size_mb:.2f} MB")
files.download(int8_zip)

# Save the speaker embedding as a detached float32 CPU tensor. torch.as_tensor
# accepts tensors, arrays and sequences alike, so it avoids the copy-construct
# warning torch.tensor() emits for an existing tensor; .cpu() keeps the file
# loadable on a CPU-only host without passing map_location.
speaker_embedding_cpu = torch.as_tensor(speaker_embedding, dtype=torch.float32).detach().cpu()
torch.save(speaker_embedding_cpu, "speaker_embedding.pt")
files.download("speaker_embedding.pt")
