In [None]:
import os
from google.colab import userdata

GITHUB_USERNAME = "MUKAMAFrancois"
REPO_NAME       = "tecGrwTechnical"
GITHUB_TOKEN    = userdata.get('GITHUB_TOKEN')
project_dir     = f"/content/{REPO_NAME}"

if os.path.exists(project_dir):
    print("ðŸ”„ Updating project repo...")
    %cd {project_dir}
    !git pull
else:
    print("ðŸ“¥ Cloning project repo...")
    repo_url = f"https://{GITHUB_TOKEN}@github.com/{GITHUB_USERNAME}/{REPO_NAME}.git"
    !git clone {repo_url} {project_dir}

In [None]:
# Switch the working directory to the cloned repository (presumably so that the
# `src` package imported below and any relative paths resolve — confirm).
os.chdir(f"/content/{REPO_NAME}")

In [None]:
!pip install -q --upgrade pip
!pip install -q "transformers>=4.56,<5" "tokenizers>=0.22,<0.24" accelerate tqdm torchaudio datasets pyyaml

In [None]:
from pathlib import Path
from src.loader import load_config, get_hf_token
from src.preprocess import run_preprocessing_pipeline

# Load the project configuration through src.loader.load_config (no arguments).
config = load_config()

def resolve_path(config, key):
    """Locate the file configured under ``key``.

    Args:
        config: Mapping that holds path settings (read with ``.get``).
        key: Name of the setting whose value is a file path.

    Returns:
        ``None`` when ``key`` is missing or set to ``None``. Otherwise a
        ``Path``: the configured path itself if it exists, else a path with the
        same file name under ``config["PROCESSED_DIR"]`` (``""`` when unset).
        The fallback path is returned without checking that it exists.
    """
    configured = config.get(key)
    if configured is None:
        return None

    candidate = Path(configured)
    if not candidate.exists():
        # Fall back to the same file name inside the processed-data directory.
        candidate = Path(config.get("PROCESSED_DIR", "")) / candidate.name
    return candidate

train_meta = resolve_path(config, "TRAIN_METADATA")
val_meta = resolve_path(config, "VAL_METADATA")

# Preprocessing is only run when at least one of the two metadata files is
# unconfigured or absent on disk.
if any(meta is None or not meta.exists() for meta in (train_meta, val_meta)):
    print("Processed metadata missing. Running preprocessing pipeline...")
    token = get_hf_token()
    stats = run_preprocessing_pipeline(config, token)
    print("Preprocessing stats:", stats)
else:
    print(f"Processed metadata found: {train_meta} and {val_meta}")


In [None]:
import os
from pathlib import Path
import numpy as np
import pandas as pd
import torch
import soundfile as sf

from datasets import Dataset, Audio, load_dataset
from transformers import (
    SpeechT5Processor,
    SpeechT5ForTextToSpeech,
    SpeechT5HifiGan,
    Seq2SeqTrainer,
    Seq2SeqTrainingArguments,
)

from src.loader import load_config
from IPython.display import Audio as IPyAudio, display, Markdown

# Reload the configuration for this cell and pick the compute device:
# CUDA when PyTorch reports it as available, otherwise the CPU.
config = load_config()
device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
print("Device:", device)

def resolve_metadata_path(config, key):
    """Return the metadata file path configured under ``key`` as a string.

    Args:
        config: Mapping that holds path settings (read with ``.get``).
        key: Name of the setting whose value is a file path.

    Returns:
        ``None`` when ``key`` is missing or set to ``None``. Otherwise a
        ``str``: the configured path if it exists, else the same file name
        placed under ``config["PROCESSED_DIR"]`` (``""`` when unset). The
        fallback is not checked for existence.
    """
    configured = config.get(key)
    if configured is None:
        return None

    original = Path(configured)
    resolved = (
        original
        if original.exists()
        else Path(config.get("PROCESSED_DIR", "")) / original.name
    )
    return str(resolved)

def load_tts_dataset(metadata_path, target_sr=16000):
    """Build an audio/text ``Dataset`` from a pipe-separated metadata file.

    The file is read without a header as three columns:
    ``audio | text | speaker_id``. Rows are dropped when the audio path or the
    text is missing, when the text is empty after stripping whitespace, or
    when the audio path does not exist on disk.

    Args:
        metadata_path: Path of the ``|``-separated metadata file.
        target_sr: Sampling rate given to the ``Audio`` feature that the
            ``audio`` column is cast to.

    Returns:
        A ``Dataset`` with the columns ``audio`` and ``text``.
    """
    raw = pd.read_csv(
        metadata_path, sep="|", header=None,
        names=["audio", "text", "speaker_id"]
    )

    # Drop rows with a missing audio path too: os.path.exists() raises TypeError
    # on NaN. Work on an explicit copy so the column assignment below does not
    # write into a slice of `raw` (SettingWithCopyWarning).
    rows = raw.dropna(subset=["audio", "text"]).copy()
    rows["text"] = rows["text"].astype(str).str.strip()

    has_text = rows["text"].ne("")
    # astype(bool) keeps the mask boolean even when no rows are left.
    has_audio = rows["audio"].map(os.path.exists).astype(bool)
    rows = rows.loc[has_text & has_audio, ["audio", "text"]]

    ds = Dataset.from_pandas(rows, preserve_index=False)
    return ds.cast_column("audio", Audio(sampling_rate=target_sr))

# Resolve both metadata files, then load the train split followed by the
# validation split at the configured sample rate.
train_meta = resolve_metadata_path(config, "TRAIN_METADATA")
val_meta = resolve_metadata_path(config, "VAL_METADATA")

train_ds, val_ds = (
    load_tts_dataset(meta, target_sr=int(config["TARGET_SAMPLE_RATE"]))
    for meta in (train_meta, val_meta)
)

print("Train samples:", len(train_ds))
print("Val samples:", len(val_ds))

In [None]:
# Pretrained SpeechT5 processor, acoustic model and HiFi-GAN vocoder; both
# networks are moved to the selected device.
processor = SpeechT5Processor.from_pretrained("microsoft/speecht5_tts")

model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts")
model = model.to(device)

vocoder = SpeechT5HifiGan.from_pretrained("microsoft/speecht5_hifigan")
vocoder = vocoder.to(device)

# One fixed x-vector (the first validation entry) is used as the speaker
# embedding for every training example and for synthesis.
# NOTE(review): confirm this dataset repo still loads with the installed
# `datasets` version.
xvec_ds = load_dataset("Matthijs/cmu-arctic-xvectors", split="validation")
speaker_embedding = np.array(
    xvec_ds[0]["xvector"],
    dtype=np.float32,
)

def prepare_example(batch):
    """Convert one dataset row into SpeechT5 training features.

    Args:
        batch: A single (non-batched) row with a ``text`` string and an
            ``audio`` dict providing ``array`` and ``sampling_rate``.

    Returns:
        A dict with ``input_ids`` (the full token id sequence for the text),
        ``labels`` (the first entry of the processor's ``labels`` output) and
        ``speaker_embeddings`` (the shared module-level ``speaker_embedding``).
    """
    audio = batch["audio"]
    out = processor(
        text=batch["text"],
        audio_target=audio["array"],
        sampling_rate=audio["sampling_rate"],
        return_attention_mask=False,
    )

    # For a single string the tokenizer returns a flat list of token ids, so
    # indexing it with [0] would keep only the first token. Strip a leading
    # batch dimension only when one is actually present.
    input_ids = out["input_ids"]
    if len(input_ids) > 0 and hasattr(input_ids[0], "__len__"):
        input_ids = input_ids[0]

    return {
        "input_ids": input_ids,
        # The audio target comes back with a leading batch dimension (as in the
        # Hugging Face SpeechT5 fine-tuning guide); keep the single entry.
        "labels": out["labels"][0],
        "speaker_embeddings": speaker_embedding,
    }

# Featurize both splits; the original columns are dropped so only the keys
# returned by prepare_example remain.
train_proc, val_proc = (
    split.map(prepare_example, remove_columns=split.column_names)
    for split in (train_ds, val_ds)
)

class TTSDataCollatorWithPadding:
    """Pad a list of featurized TTS examples into one training batch.

    Args:
        processor: Object whose ``pad(input_ids=..., labels=...,
            return_tensors="pt")`` returns a mapping with ``input_ids``,
            ``labels`` and ``decoder_attention_mask``.
        model: Object exposing ``config.reduction_factor``.
    """

    def __init__(self, processor, model):
        self.processor = processor
        self.model = model

    def __call__(self, features):
        """Collate ``features`` (dicts with ``input_ids``, ``labels`` and
        ``speaker_embeddings``) and return the padded batch mapping.

        Padded label frames are set to ``-100``; ``decoder_attention_mask`` is
        removed from the batch; ``speaker_embeddings`` is added as a float32
        tensor with one row per example.
        """
        input_ids = [{"input_ids": f["input_ids"]} for f in features]
        label_features = [{"input_values": f["labels"]} for f in features]

        batch = self.processor.pad(
            input_ids=input_ids,
            labels=label_features,
            return_tensors="pt",
        )

        decoder_attention_mask = batch.pop("decoder_attention_mask")
        # The mask has one entry per frame while the labels carry an extra
        # trailing feature axis, so add that axis before broadcasting.
        # Without it masked_fill fails on mismatched shapes.
        padding = decoder_attention_mask.unsqueeze(-1).ne(1)
        batch["labels"] = batch["labels"].masked_fill(padding, -100)

        reduction_factor = self.model.config.reduction_factor
        if reduction_factor > 1:
            # Round every target length down to a multiple of the reduction
            # factor and trim the batch to the longest rounded length.
            lengths = decoder_attention_mask.sum(dim=1)
            lengths = lengths - lengths % reduction_factor
            max_len = int(lengths.max().item())
            batch["labels"] = batch["labels"][:, :max_len]

        # Convert per example and stack: works for lists and numpy arrays
        # without building a tensor from a list of arrays.
        batch["speaker_embeddings"] = torch.stack(
            [torch.as_tensor(f["speaker_embeddings"], dtype=torch.float32) for f in features]
        )
        return batch

# Collator instance handed to the trainer; it reads the reduction factor from
# `model.config` and pads through `processor`.
data_collator = TTSDataCollatorWithPadding(processor, model)


In [None]:
# Directory that receives the training checkpoints.
output_dir = "speecht5_finetuned"

args = Seq2SeqTrainingArguments(
    output_dir=output_dir,
    per_device_train_batch_size=4,
    per_device_eval_batch_size=4,
    gradient_accumulation_steps=2,
    learning_rate=2e-5,
    warmup_steps=500,
    num_train_epochs=10,
    # `eval_strategy` is the current name of this option; the older
    # `evaluation_strategy` spelling was deprecated and is not accepted by the
    # transformers versions this notebook installs.
    eval_strategy="steps",
    # Evaluation and saving share the same interval, which
    # load_best_model_at_end relies on.
    eval_steps=500,
    save_steps=500,
    logging_steps=50,
    save_total_limit=2,
    load_best_model_at_end=True,
    metric_for_best_model="eval_loss",
    greater_is_better=False,
    fp16=torch.cuda.is_available(),
    report_to="none",
    # The collator needs the raw feature keys, so the trainer must not drop them.
    remove_unused_columns=False,
)

trainer = Seq2SeqTrainer(
    model=model,
    args=args,
    train_dataset=train_proc,
    eval_dataset=val_proc,
    data_collator=data_collator,
    # `processing_class` replaces the deprecated `tokenizer` argument of
    # Trainer; the same tokenizer object is passed as before.
    processing_class=processor.tokenizer,
)

# Run fine-tuning with the arguments defined above.
trainer.train()


In [None]:
best_ckpt = trainer.state.best_model_checkpoint
if not best_ckpt:
    # No checkpoint was recorded as best (e.g. training ended before the first
    # evaluation). The trainer writes checkpoints into sub-directories, so save
    # the in-memory model to output_dir to make the fallback loadable.
    trainer.save_model(output_dir)
    best_ckpt = output_dir
print("Using checkpoint:", best_ckpt)

# Load a fresh copy of the base model next to the fine-tuned weights so both
# can be compared on the same sentences.
pretrained_model = SpeechT5ForTextToSpeech.from_pretrained("microsoft/speecht5_tts").to(device)
finetuned_model = SpeechT5ForTextToSpeech.from_pretrained(best_ckpt).to(device)

# Sentences synthesized with both models for the side-by-side comparison.
test_sentences = [
    "Muraho, nagufasha gute uyu munsi?",
    "Niba ufite ibibazo bijyanye n'ubuzima bwawe, twagufasha.",
    "Ni ngombwa ko ubonana umuganga vuba.",
    "Twabanye nawe kandi tuzakomeza kukwitaho.",
    "Ushobora kuduhamagara igihe cyose ukeneye ubufasha.",
]

# Output folder for the generated WAV files (created with parents if needed).
compare_dir = Path("evaluation/compare_speecht5")
os.makedirs(compare_dir, exist_ok=True)

# Speaker embedding as a float32 tensor with a leading batch axis, on `device`.
spk = torch.tensor(speaker_embedding, dtype=torch.float32, device=device).unsqueeze(0)

def synth_to_file(tts_model, text, out_path):
    """Synthesize ``text`` with ``tts_model`` and write it to ``out_path``.

    Uses the module-level ``processor``, ``spk`` speaker embedding and
    ``vocoder``. Generation runs without gradient tracking and the result is
    written with ``soundfile`` at 16000 Hz.

    Args:
        tts_model: Model exposing ``generate_speech(input_ids, speaker, vocoder=...)``.
        text: Sentence to synthesize.
        out_path: Destination file path.
    """
    token_ids = processor(text=text, return_tensors="pt")["input_ids"].to(device)

    with torch.no_grad():
        waveform = tts_model.generate_speech(token_ids, spk, vocoder=vocoder)

    # Move the audio back to the CPU as a NumPy array before writing it out.
    sf.write(out_path, waveform.cpu().numpy(), 16000)

# For every sentence: synthesize it with both models, then show the text and
# the two audio players (pretrained first, fine-tuned second).
for i, text in enumerate(test_sentences):
    pre_path = compare_dir / f"{i:02d}_pretrained.wav"
    fin_path = compare_dir / f"{i:02d}_finetuned.wav"
    renderings = (
        ("Pretrained SpeechT5", pretrained_model, pre_path),
        ("Fine-tuned SpeechT5", finetuned_model, fin_path),
    )

    # Generate both files before anything is displayed.
    for label, tts_model, wav_path in renderings:
        synth_to_file(tts_model, text, str(wav_path))

    display(Markdown(f"### Sentence {i}\n`{text}`"))
    for label, tts_model, wav_path in renderings:
        print(label)
        display(IPyAudio(str(wav_path)))
    print("-" * 70)
