In [None]:
import os
from google.colab import userdata

GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME = "tecGrwTechnical"
GITHUB_TOKEN = userdata.get("GITHUB_TOKEN")
project_dir = f"/content/{REPO_NAME}"

if os.path.exists(project_dir):
    print("Updating project repo...")
    %cd {project_dir}
    !git pull
else:
    print("Cloning project repo...")
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}


In [None]:
import os
# Switch into the repository checkout; later cells import from the `src`
# package, presumably resolved relative to this directory.
os.chdir(f"/content/{REPO_NAME}")
cwd = os.getcwd()
print("Working directory:", cwd)


In [None]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml pandas soundfile


In [None]:
from pathlib import Path
from src.loader import load_config, get_hf_token
from src.preprocess import run_preprocessing_pipeline

# Load the project configuration; the path helpers and the preprocessing call
# further down in this cell all read from it.
config = load_config()


def resolve_path(cfg, key):
    """Resolve the path stored under ``key`` in ``cfg``.

    Args:
        cfg: Config object exposing ``.get`` (e.g. a dict).
        key: Config key whose value is a filesystem path.

    Returns:
        ``None`` when the key is absent or null. The configured path as a
        ``Path`` when it exists on disk. Otherwise a ``Path`` with the same
        file name placed inside ``PROCESSED_DIR``; this fallback is returned
        without checking that it exists.
    """
    raw = cfg.get(key)
    if raw is None:
        return None

    configured = Path(raw)
    if configured.exists():
        return configured

    # A null PROCESSED_DIR (key present, value None) is treated like a missing
    # one instead of letting Path(None) raise TypeError.
    processed_dir = cfg.get("PROCESSED_DIR") or ""
    return Path(processed_dir) / configured.name


def metadata_non_empty(path_obj):
    """Return True only when ``path_obj`` is an existing file-system entry with size > 0.

    ``None`` and paths that do not exist both yield False.
    """
    if path_obj is None or not path_obj.exists():
        return False
    return path_obj.stat().st_size > 0


train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")

# Preprocessing is only run when either metadata file is absent or empty.
if not (metadata_non_empty(train_meta) and metadata_non_empty(val_meta)):
    print("Processed metadata missing or empty. Running preprocessing pipeline...")
    token = get_hf_token()
    stats = run_preprocessing_pipeline(config, token)
    print("Preprocessing stats:", stats)
else:
    print(f"Processed metadata found: {train_meta} and {val_meta}")

# Resolve again after the (possible) preprocessing run and report the
# locations that will be used.
train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")
print("TRAIN_METADATA:", train_meta)
print("VAL_METADATA:", val_meta)


In [None]:
import os
from src.model import load_model_and_tokenizer, get_device
from src.training.dataset import create_dataloader
from src.training.trainer import Trainer

# Reload the configuration and select the device. `load_config` is imported in
# the preprocessing cell above, so that cell must have been run first.
config = load_config()
device = get_device()
print("Device:", device)

# Obtain the model and tokenizer for this config on the selected device.
model, tokenizer = load_model_and_tokenizer(config, device)


def resolve_metadata_path(cfg, key):
    """Resolve the metadata path stored under ``key`` in ``cfg``.

    Args:
        cfg: Config object exposing ``.get`` (e.g. a dict).
        key: Config key whose value is a filesystem path string.

    Returns:
        ``None`` when the key is absent or null. The configured path when it
        exists. Otherwise the same file name inside ``PROCESSED_DIR`` when
        that exists. If neither exists, the configured path is returned
        unchanged so the caller decides how to handle the missing file.
    """
    raw_path = cfg.get(key)
    if raw_path is None:
        return None
    if os.path.exists(raw_path):
        return raw_path

    # A null PROCESSED_DIR (key present, value None) is treated like a missing
    # one instead of letting os.path.join(None, ...) raise TypeError.
    processed_dir = cfg.get("PROCESSED_DIR") or ""
    candidate = os.path.join(processed_dir, os.path.basename(raw_path))
    return candidate if os.path.exists(candidate) else raw_path


train_metadata = resolve_metadata_path(config, "TRAIN_METADATA")
val_metadata = resolve_metadata_path(config, "VAL_METADATA")

# Training loader: shuffled, with its batch-duration cap read from
# MAX_BATCH_DURATION_SEC.
train_loader = create_dataloader(
    train_metadata,
    tokenizer,
    config["TARGET_SAMPLE_RATE"],
    config["BATCH_SIZE"],
    shuffle=True,
    num_workers=int(config.get("NUM_WORKERS", 2)),
    max_batch_duration_sec=config.get("MAX_BATCH_DURATION_SEC"),
)

# Validation is optional: the loader stays None unless the metadata file exists.
val_loader = None
if val_metadata is None or not os.path.exists(val_metadata):
    print("Validation metadata not found, validation disabled.")
else:
    # Unshuffled, with its own cap read from VAL_MAX_BATCH_DURATION_SEC.
    val_loader = create_dataloader(
        val_metadata,
        tokenizer,
        config["TARGET_SAMPLE_RATE"],
        config["BATCH_SIZE"],
        shuffle=False,
        num_workers=int(config.get("NUM_WORKERS", 2)),
        max_batch_duration_sec=config.get("VAL_MAX_BATCH_DURATION_SEC"),
    )
    print("Validation loader enabled.")

# Pull one training batch and show the shapes of its two tensors.
first_batch = next(iter(train_loader))
print("input_ids shape:", tuple(first_batch["input_ids"].shape))
print("waveforms shape:", tuple(first_batch["waveforms"].shape))


In [None]:
# Hand the model, tokenizer, both loaders, the config and the device to the
# trainer; USE_AMP defaults to True when the config does not set it.
use_amp = bool(config.get("USE_AMP", True))
trainer = Trainer(model, tokenizer, train_loader, val_loader, config, device, use_amp=use_amp)

# Schedule used when the config has no STAGES entry.
default_stages = [
    {"stage": 1, "lr": 1e-4, "epochs": 3},
    {"stage": 2, "lr": 5e-5, "epochs": 2},
    {"stage": 3, "lr": 1e-5, "epochs": 1},
]
stages = config.get("STAGES", default_stages)

# Run the stages in the listed order, coercing each field to the expected
# numeric type before passing it on.
for stage_cfg in stages:
    stage_id = int(stage_cfg["stage"])
    learning_rate = float(stage_cfg["lr"])
    num_epochs = int(stage_cfg["epochs"])
    trainer.train_stage(stage=stage_id, lr=learning_rate, epochs=num_epochs)


In [None]:
from pathlib import Path
from src.model import load_checkpoint, synthesize_speech, save_waveform

# Fixed evaluation sentences: the synthesis loop in this cell renders each one
# to a WAV file, and the latency cell times synthesis of the same list.
required_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha.",
]


def get_latest_checkpoint(ckpt_dir):
    """Return the ``checkpoint_<N>`` sub-directory of ``ckpt_dir`` with the largest ``N``.

    Args:
        ckpt_dir: Directory to scan (string or path-like).

    Returns:
        The chosen directory as a string, or ``None`` when ``ckpt_dir`` does
        not exist or holds no directory whose name ends in ``_<integer>``.
    """
    root = Path(ckpt_dir)
    if not root.exists():
        return None

    numbered = []
    for entry in root.glob("checkpoint_*"):
        if not entry.is_dir():
            continue
        suffix = entry.name.rsplit("_", 1)[-1]
        try:
            numbered.append((int(suffix), entry))
        except ValueError:
            # Suffix is not an integer (e.g. "checkpoint_final"): not a step
            # checkpoint, so leave it out.
            continue

    if not numbered:
        return None

    # Stable sort on the step number; the last entry is the highest step.
    _, newest = sorted(numbered, key=lambda pair: pair[0])[-1]
    return str(newest)


# Use the newest saved checkpoint when one exists; otherwise fall back to the
# weights still held by the trainer from the training cell.
latest_ckpt = get_latest_checkpoint(config.get("CHECKPOINT_DIR", "checkpoints"))
if latest_ckpt is None:
    print("No checkpoint found. Using in-memory model weights.")
    infer_model, infer_tokenizer = trainer.model, trainer.tokenizer
else:
    print("Using checkpoint:", latest_ckpt)
    infer_model, infer_tokenizer = load_checkpoint(latest_ckpt, device)

final_out_dir = Path("evaluation/final_required_sentences")
final_out_dir.mkdir(parents=True, exist_ok=True)

# One WAV file per sentence, numbered from 01.
for i, text in enumerate(required_sentences, start=1):
    waveform, sr = synthesize_speech(infer_model, infer_tokenizer, text, device)
    # A 1-D waveform gets a leading dimension before saving
    # (presumably the channel axis save_waveform expects - TODO confirm).
    waveform = waveform.unsqueeze(0) if waveform.dim() == 1 else waveform
    out_path = final_out_dir.joinpath(f"sentence_{i:02d}.wav")
    save_waveform(waveform, sr, str(out_path))
    print(f"Saved: {out_path}")


In [None]:
import time
import numpy as np

# Wall-clock time, in milliseconds, of one synthesis call per required sentence.
# NOTE(review): if synthesize_speech returns before queued GPU work has
# finished, these timings may under-report - confirm it synchronises.
latencies_ms = []

for text in required_sentences:
    started = time.perf_counter()
    _waveform, _sr = synthesize_speech(infer_model, infer_tokenizer, text, device)
    ms = (time.perf_counter() - started) * 1000.0
    latencies_ms.append(ms)
    print(f"Latency: {ms:.2f} ms | {text}")

# With no measurements the mean is NaN, which compares as not-less-than the
# target and therefore reports FAIL.
if latencies_ms:
    mean_ms = float(np.mean(latencies_ms))
else:
    mean_ms = float("nan")
print(f"Mean latency: {mean_ms:.2f} ms")
verdict = "PASS" if mean_ms < 800.0 else "FAIL"
print("Latency target (< 800 ms):", verdict)
