In [None]:
import os
from google.colab import userdata

GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME       = "TTS_tecGrw"
GITHUB_TOKEN    = userdata.get('GITHUB_TOKEN')
project_dir     = f"/content/{REPO_NAME}"

if os.path.exists(project_dir):
    print("ðŸ”„ Updating project repo...")
    %cd {project_dir}
    !git pull
else:
    print("ðŸ“¥ Cloning project repo...")
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}

In [None]:
# Later cells import from `src` and use repo-relative paths, so work from the checkout root.
os.chdir(project_dir)

In [None]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml

### Speaker analysis

In [None]:
from src.loader import load_config, get_hf_token
from src.analytics import run_speaker_analysis

# Project configuration shared by the cells below.
config = load_config()

# get_hf_token is expected to read the token from Colab secrets (key: HF_TOKEN).
token = get_hf_token()

# The analysis returns per-speaker statistics together with a recommended speaker.
analysis_result = run_speaker_analysis(config, token)
stats, best_speaker = analysis_result

print(stats)
print("Recommended speaker:", best_speaker)

## Dataset splits

In [None]:
from src.loader import load_all_splits, combine_splits

# Load every split, report the size of each one, then the size of their combination.
splits = load_all_splits(config, token)

for split_name in ("train", "validation", "test"):
    print(len(splits[split_name]))

combined = combine_splits(splits)
print("Total:", len(combined))

### Preprocessing

In [None]:
from src.loader import load_config, get_hf_token
from src.preprocess import run_preprocessing_pipeline

# Reload the config and token so this cell can be run on its own.
config, token = load_config(), get_hf_token()

# Run the preprocessing pipeline and keep whatever statistics it reports.
stats = run_preprocessing_pipeline(config, token)

## Pre-trained model

In [None]:
from src.loader import load_config, ensure_dir
from src.model import (
    load_model_and_tokenizer,
    synthesize_speech,
    save_waveform
)

from IPython.display import Audio, display

config = load_config()

# Directory that receives the synthesized audio files.
synth_dir = "evaluation/synthesized_wavs"
ensure_dir(synth_dir)

# Load the pre-trained model named in the config (the original note says facebook/mms-tts-kin).
model, tokenizer = load_model_and_tokenizer(config)

# Synthesize a single sentence as a smoke test and write it to disk.
text = "Muraho, nagufasha gute uyu munsi?"
waveform, sr = synthesize_speech(model, tokenizer, text)

output_path = f"{synth_dir}/pretrained_test.wav"
save_waveform(waveform, sr, output_path)

print(
    f"Saved to: {output_path}\n"
    f"Sample rate: {sr}\n"
    f"Waveform shape: {waveform.shape}"
)

# Inline audio player for the file that was just written.
display(Audio(output_path))

In [None]:
# Report the parameter count of the model loaded in the previous cell.
from src.model import count_parameters

param_summary = count_parameters(model)
print(param_summary)

## Pre-trained model test

In [None]:
# Sentences used to listen to the pre-trained model's output.
test_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha."
]

# Synthesize each sentence, save it under a numbered file name and play it inline.
pretrained_wav_dir = "evaluation/synthesized_wavs"
for sample_idx, sentence in enumerate(test_sentences):
    waveform, sr = synthesize_speech(model, tokenizer, sentence)
    path = f"{pretrained_wav_dir}/pretrained_{sample_idx}.wav"
    save_waveform(waveform, sr, path)
    print(path)
    display(Audio(path))

### Training

In [None]:
import os
from src.loader import load_config
from src.model import load_model_and_tokenizer, get_device
from src.training.dataset import create_dataloader
from src.training.trainer import Trainer

def resolve_metadata_path(config, key):
    """Resolve a metadata file path from the config, with a PROCESSED_DIR fallback.

    Args:
        config: Mapping-like configuration object that supports ``.get``.
        key: Config key whose value is the metadata path (e.g. "TRAIN_METADATA").

    Returns:
        ``None`` when the key is absent or null. Otherwise the configured path if
        it exists on disk; else ``<PROCESSED_DIR>/<basename of the path>`` if that
        exists; else the configured path unchanged.
    """
    raw_path = config.get(key)
    if raw_path is None:
        return None
    if os.path.exists(raw_path):
        return raw_path
    # A PROCESSED_DIR entry that is present but null would make os.path.join raise
    # TypeError, so treat it like a missing entry (empty directory prefix).
    processed_dir = config.get("PROCESSED_DIR") or ""
    candidate = os.path.join(processed_dir, os.path.basename(raw_path))
    if os.path.exists(candidate):
        return candidate
    return raw_path

config = load_config()
device = get_device()
print("Device:", device)

model, tokenizer = load_model_and_tokenizer(config, device)

# Metadata locations, resolved with the PROCESSED_DIR fallback.
train_metadata = resolve_metadata_path(config, "TRAIN_METADATA")
val_metadata = resolve_metadata_path(config, "VAL_METADATA")


def _build_loader(metadata_path, shuffle, max_batch_duration_sec):
    """Create a dataloader using the settings shared by the train and validation loaders."""
    return create_dataloader(
        metadata_path,
        tokenizer,
        config["TARGET_SAMPLE_RATE"],
        config["BATCH_SIZE"],
        shuffle=shuffle,
        num_workers=int(config.get("NUM_WORKERS", 2)),
        max_batch_duration_sec=max_batch_duration_sec
    )


train_loader = _build_loader(
    train_metadata,
    True,
    config.get("MAX_BATCH_DURATION_SEC"),
)

# Validation is optional: it is only enabled when its metadata file is present.
if val_metadata is None or not os.path.exists(val_metadata):
    val_loader = None
    print("Validation metadata not found, validation disabled.")
else:
    val_loader = _build_loader(
        val_metadata,
        False,
        config.get("VAL_MAX_BATCH_DURATION_SEC"),
    )
    print("Validation loader enabled.")

# Mixed precision defaults to on unless USE_AMP in the config says otherwise.
amp_enabled = bool(config.get("USE_AMP", True))
trainer = Trainer(
    model,
    tokenizer,
    train_loader,
    val_loader,
    config,
    device,
    use_amp=amp_enabled
)


In [None]:
# Training schedule used when the config does not define STAGES.
default_stages = [
    {"stage": 1, "lr": 1e-4, "epochs": 3},
    {"stage": 2, "lr": 5e-5, "epochs": 2},
    {"stage": 3, "lr": 1e-5, "epochs": 1},
]
stages = config.get("STAGES", default_stages)

# Run the stages in order; values are coerced because config entries may not be typed.
for stage_cfg in stages:
    stage_id = int(stage_cfg["stage"])
    learning_rate = float(stage_cfg["lr"])
    num_epochs = int(stage_cfg["epochs"])
    trainer.train_stage(stage=stage_id, lr=learning_rate, epochs=num_epochs)


In [None]:
import os
import re
from pathlib import Path

from src.loader import load_config
from src.model import (
    get_device,
    load_model_and_tokenizer,
    load_checkpoint,
)

# Reload the config and device so the evaluation cells can run independently of training.
config, device = load_config(), get_device()
print("Device:", device)

def find_latest_checkpoint(checkpoint_dir):
    """Locate the ``checkpoint_<step>`` sub-directory with the highest step number.

    Args:
        checkpoint_dir: Directory that contains the ``checkpoint_<step>`` folders.

    Returns:
        A ``(step, path)`` tuple: the step as an ``int`` and the folder path as a ``str``.

    Raises:
        FileNotFoundError: If ``checkpoint_dir`` does not exist, or if it contains
            no sub-directory named ``checkpoint_<digits>``.
    """
    root = Path(checkpoint_dir)
    if not root.exists():
        raise FileNotFoundError(f"Checkpoint directory not found: {checkpoint_dir}")

    name_pattern = re.compile(r"checkpoint_(\d+)$")
    best_step, best_dir = None, None
    for entry in root.iterdir():
        # Only directories count; plain files with a matching name are ignored.
        if not entry.is_dir():
            continue
        hit = name_pattern.match(entry.name)
        if hit is None:
            continue
        step = int(hit.group(1))
        # ">=" lets a later entry win a tie, like taking the last item of a stable sort.
        if best_step is None or step >= best_step:
            best_step, best_dir = step, entry

    if best_dir is None:
        raise FileNotFoundError(f"No checkpoint_<step> folders found in: {checkpoint_dir}")

    return best_step, str(best_dir)

# Pick the newest checkpoint from the configured directory (default: "checkpoints").
checkpoint_root = config.get("CHECKPOINT_DIR", "checkpoints")
latest_step, latest_ckpt_path = find_latest_checkpoint(checkpoint_root)
print(f"Latest checkpoint: {latest_ckpt_path} (step={latest_step})")

# Load both models for comparison: the pre-trained baseline and the latest checkpoint.
pretrained_model, pretrained_tokenizer = load_model_and_tokenizer(config, device)
final_model, final_tokenizer = load_checkpoint(latest_ckpt_path, device)

print("Models loaded.")


In [None]:
import pandas as pd

from src.model import synthesize_speech, save_waveform

# Sentences synthesized with both models for a side-by-side comparison.
test_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha."
]

compare_dir = "evaluation/compare_pretrained_vs_final"
os.makedirs(compare_dir, exist_ok=True)

rows = []
for idx, sentence in enumerate(test_sentences):
    # Synthesize with both models first, then write both files.
    base_wav, base_sr = synthesize_speech(pretrained_model, pretrained_tokenizer, sentence, device)
    tuned_wav, tuned_sr = synthesize_speech(final_model, final_tokenizer, sentence, device)

    stem = f"{compare_dir}/sentence_{idx:02d}"
    base_path, tuned_path = f"{stem}_pretrained.wav", f"{stem}_final.wav"

    save_waveform(base_wav, base_sr, base_path)
    save_waveform(tuned_wav, tuned_sr, tuned_path)

    # Duration = size of the last waveform axis / sample rate
    # (assumes the last axis indexes samples — TODO confirm).
    rows.append(dict(
        idx=idx,
        text=sentence,
        pretrained_path=base_path,
        final_path=tuned_path,
        pretrained_duration_sec=base_wav.shape[-1] / base_sr,
        final_duration_sec=tuned_wav.shape[-1] / tuned_sr,
    ))

compare_df = pd.DataFrame(rows)
compare_df


In [None]:
from IPython.display import Audio, display, Markdown

# For every compared sentence, show the text, both audio players and the durations.
separator = "-" * 70
for row in compare_df.itertuples(index=False):
    display(Markdown(f"### Sentence {int(row.idx)}\n`{row.text}`"))
    print("Pre-trained")
    display(Audio(row.pretrained_path))
    print("Final fine-tuned")
    display(Audio(row.final_path))
    print(f"Durations (s) -> pre: {row.pretrained_duration_sec:.2f}, final: {row.final_duration_sec:.2f}")
    print(separator)
