<a href="https://colab.research.google.com/github/Msingi-AI/Sauti-Ya-Kenya/blob/main/colab/train_on_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sauti Ya Kenya - Training on Google Colab

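This notebook sets up and runs the training process for the Kenyan Swahili TTS model using Google Colab's free GPU resources. It expects a trained tokenizer (`tokenizer/tokenizer.model`) and preprocessed data (`data/processed/`) to already exist under `MyDrive/sauti_ya_kenya/` in your Google Drive; the verification cell below stops the run if either is missing.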

In [None]:
# Mount Google Drive
# The mount point /content/drive is what the later setup cell builds its
# DRIVE_BASE path on (/content/drive/MyDrive/sauti_ya_kenya), so this cell
# has to run before any cell that touches data, checkpoints or the tokenizer.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone the repository and set up environment
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya

# Add src to Python path properly
import os
import sys
module_path = os.path.abspath(os.path.join('src'))
if module_path not in sys.path:
    sys.path.insert(0, module_path)

# Install dependencies
!pip install -r requirements.txt

In [None]:
# Set up directory structure and symlinks
import os
import shutil
from pathlib import Path

# Define directories
DRIVE_BASE = "/content/drive/MyDrive/sauti_ya_kenya"
DRIVE_DATA = f"{DRIVE_BASE}/data"
DRIVE_CHECKPOINTS = f"{DRIVE_BASE}/checkpoints"
DRIVE_TOKENIZER = f"{DRIVE_BASE}/tokenizer"

# Create directories in Drive
for d in [DRIVE_DATA, DRIVE_CHECKPOINTS, DRIVE_TOKENIZER]:
    os.makedirs(d, exist_ok=True)

# Create symlinks (local path -> Drive target).
# Order matters: 'data' must be linked before 'data/tokenizer', because the
# latter is created *through* the 'data' link and therefore ends up as an
# entry inside DRIVE_DATA.
symlinks = {
    'data': DRIVE_DATA,
    'checkpoints': DRIVE_CHECKPOINTS,
    'data/tokenizer': DRIVE_TOKENIZER
}

for local_path, drive_path in symlinks.items():
    # Test islink() first: os.path.exists() follows links and is False for a
    # dangling one, which would leave the stale link in place and make
    # os.symlink() below fail with FileExistsError.
    if os.path.islink(local_path):
        os.unlink(local_path)
    elif os.path.isdir(local_path):
        # NOTE(review): for 'data/tokenizer' this path resolves inside Drive
        # (via the 'data' link). A real directory there is deleted from Drive.
        shutil.rmtree(local_path)
    elif os.path.lexists(local_path):
        os.remove(local_path)
    os.symlink(drive_path, local_path)

In [None]:
# Copy src directory to make imports work
# Copies everything under src/ into the current directory (the repository root
# after the clone cell), so the modules can be imported by bare name.
# NOTE(review): the clone cell already inserts src/ into sys.path; presumably the
# copy is also what places train.py at the root for the training cell — confirm.
!cp -r src/* .

# Verify data and tokenizer
# SwahiliTokenizer is not used anywhere in this cell; presumably the import acts
# as a smoke test that the project modules are importable — TODO confirm.
from preprocessor import SwahiliTokenizer
from pathlib import Path

def verify_setup(tokenizer_path='data/tokenizer/tokenizer.model', data_path='data/processed'):
    """Check that the training prerequisites are present on disk.

    Args:
        tokenizer_path: Location of the trained tokenizer model. Accepts a
            str or Path; defaults to the path used by this notebook.
        data_path: Directory holding the preprocessed data. Accepts a str or
            Path; defaults to the path used by this notebook.

    Returns:
        True when the tokenizer exists and ``data_path`` is a non-empty
        directory. Otherwise prints which prerequisite is missing and
        returns False.
    """
    # Check tokenizer
    tokenizer_path = Path(tokenizer_path)
    if not tokenizer_path.exists():
        print("Tokenizer not found! Please train tokenizer first using train_tokenizer.py")
        return False

    # Check processed data
    data_path = Path(data_path)
    # is_dir() rather than exists(): iterdir() raises NotADirectoryError when
    # the path exists but is a regular file.
    if not data_path.is_dir() or not any(data_path.iterdir()):
        print("Processed data not found! Please run preprocessing first")
        return False

    return True

# Stop the notebook here when a prerequisite is missing, so the training
# cell is never reached with an incomplete setup.
setup_ok = verify_setup()
if not setup_ok:
    raise RuntimeError(
        "Setup verification failed! Please ensure data is preprocessed and tokenizer is trained."
    )

In [None]:
# Configure GPU and memory settings
import torch

# Drop any cached CUDA allocations held by this kernel before checking the device
torch.cuda.empty_cache()

# Fail fast: the rest of the notebook is pointless without a GPU runtime
if not torch.cuda.is_available():
    raise RuntimeError("No GPU available!")

device = torch.device('cuda')

# NOTE(review): these backend flags are set in the notebook kernel only. The
# training cell launches train.py with `!python`, i.e. in a separate process,
# so they do not carry over — confirm train.py sets its own if they are needed.
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True

print(f"Using GPU: {torch.cuda.get_device_name()}")

In [None]:
# Start training with memory optimizations
# Runs train.py from the current directory as a separate Python process.
# `checkpoints` and `data` are the symlinks created in the setup cell, so
# checkpoints are written to, and data is read from, Google Drive.
# NOTE(review): presumably batch_size 8 x grad_acc_steps 4 gives an effective
# batch of 32, and save_every is counted in epochs — confirm against train.py.
!python train.py \
    --batch_size 8 \
    --grad_acc_steps 4 \
    --checkpoint_dir checkpoints \
    --data_dir data/processed \
    --epochs 100 \
    --save_every 5