# Sauti Ya Kenya - TTS Training

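This notebook runs the full training pipeline for our Swahili TTS model on Google Colab: it sets up the environment, prepares and augments the dataset, trains and evaluates the model, and saves the results to Google Drive.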

## Setup Steps:
1. Clone repository and install dependencies
2. Upload and prepare dataset
3. Run data augmentation
4. Train model
5. Evaluate and save model

In [None]:
# Mount Google Drive for persistent storage
# The mount point /content/drive is the root of the path that the final cell
# writes the trained model and evaluation results to
# (/content/drive/MyDrive/Sauti-Ya-Kenya/models), so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone repository
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya

# Install dependencies
!pip install -r requirements.txt

In [None]:
# Download Mozilla Common Voice dataset
# Note: You'll need to manually upload your local dataset
!mkdir -p cv-corpus-21.0-delta-2025-03-14/sw/clips

# Upload your local dataset files here
from google.colab import files
print("Please upload your dataset files:")
print("1. validated_sentences.tsv")
print("2. clip_durations.tsv")
print("3. clips/ directory (as zip)")

uploaded = files.upload()

# Extract clips
!unzip clips.zip -d cv-corpus-21.0-delta-2025-03-14/sw/clips/

In [None]:
# Process dataset
# Runs the repository's src/prepare_local_dataset.py on the uploaded Common
# Voice Swahili files and writes its output to processed_data/, the directory
# the augmentation cell reads as --data_dir.
# NOTE(review): --clips_path, --sentences_file and --durations_file are bare
# names; presumably the script resolves them relative to --dataset_path --
# confirm against src/prepare_local_dataset.py.
!python src/prepare_local_dataset.py \
    --dataset_path "cv-corpus-21.0-delta-2025-03-14/sw" \
    --output_dir "processed_data" \
    --clips_path "clips" \
    --sentences_file "validated_sentences.tsv" \
    --durations_file "clip_durations.tsv"

In [None]:
# Run data augmentation
# Runs src/augment_data.py over processed_data/ and writes to augmented_data/,
# the directory used as train_data_dir in the training config and, via
# augmented_data/test, as the evaluation input.
# NOTE(review): --num_augmentations 3 presumably requests three augmented
# variants per clip -- confirm against src/augment_data.py.
!python src/augment_data.py \
    --data_dir "processed_data" \
    --output_dir "augmented_data" \
    --num_augmentations 3

In [None]:
# Training configuration: build the settings dict and write it to config.json,
# the file handed to src/train.py through --config in the training cell.
import json

import torch

# Prefer the GPU when PyTorch can see one; otherwise fall back to the CPU.
if torch.cuda.is_available():
    _device = "cuda"
else:
    _device = "cpu"

config = dict(
    train_data_dir="augmented_data",  # output directory of the augmentation cell
    batch_size=32,
    learning_rate=0.001,
    num_epochs=100,
    save_every=10,  # NOTE(review): presumably a checkpoint interval in epochs -- confirm
    device=_device,
    max_len=10000,  # Based on our previous configuration
    checkpoint_dir="checkpoints",
)

# Serialise first, then write the text in one go.
config_text = json.dumps(config, indent=2)
with open('config.json', 'w') as config_file:
    config_file.write(config_text)

print(f"Using device: {config['device']}")

In [None]:
# Start training
# Launches src/train.py with the config.json written by the configuration
# cell, so that cell must have been run in this working directory first.
# NOTE(review): the evaluation and export cells read checkpoints/best.pt;
# presumably train.py writes it into the configured checkpoint_dir -- confirm.
!python src/train.py --config config.json

In [None]:
# Run evaluation
# Runs src/evaluation.py on checkpoints/best.pt and writes its output to
# evaluation_results/, which the final cell copies to Google Drive.
# NOTE(review): the test data is read from augmented_data/test; presumably the
# augmentation step produces that split -- confirm it exists and is held out
# from the training data.
!python src/evaluation.py \
    --model_path "checkpoints/best.pt" \
    --test_data "augmented_data/test" \
    --output_dir "evaluation_results"

In [None]:
# Save model to Google Drive
# Copies the best checkpoint and the evaluation output from the working
# directory into a models folder on the Drive mounted at /content/drive.
import os
import shutil

drive_path = "/content/drive/MyDrive/Sauti-Ya-Kenya/models"
os.makedirs(drive_path, exist_ok=True)  # create the destination folder if absent

# Copy best model
model_destination = os.path.join(drive_path, "best.pt")
shutil.copy("checkpoints/best.pt", model_destination)

# Save evaluation results
# dirs_exist_ok=True lets a re-run copy into a folder that already exists.
results_destination = os.path.join(drive_path, "evaluation_results")
shutil.copytree("evaluation_results", results_destination, dirs_exist_ok=True)

print("Model and evaluation results saved to Google Drive")