<a href="https://colab.research.google.com/github/Msingi-AI/Sauti-Ya-Kenya/blob/main/notebooks/train_on_colab.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sauti Ya Kenya - TTS Model Training

This notebook trains the Kenyan Swahili TTS model using Google Colab's GPU resources.

In [None]:
# Mount Google Drive
from google.colab import drive
drive.mount('/content/drive')

# Create project directories in Drive
!mkdir -p /content/drive/MyDrive/Sauti-Ya-Kenya
!mkdir -p /content/drive/MyDrive/Sauti-Ya-Kenya/data
!mkdir -p /content/drive/MyDrive/Sauti-Ya-Kenya/checkpoints

In [None]:
# Clean up any existing symlinks or directories
%cd /content
!rm -rf Sauti-Ya-Kenya

# Clone fresh repository
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya

# Remove default directories that we'll replace with symlinks
!rm -rf data checkpoints

# Create symlinks to Drive
!ln -s /content/drive/MyDrive/Sauti-Ya-Kenya/data data
!ln -s /content/drive/MyDrive/Sauti-Ya-Kenya/checkpoints checkpoints

# Install dependencies
!pip install -r requirements.txt

# Verify setup
!ls -la

In [None]:
import os

# Check for existing data
drive_data = '/content/drive/MyDrive/Sauti-Ya-Kenya/data'
if not os.listdir(drive_data):
    print("No data found in Drive. Please upload data.zip when prompted.")
    from google.colab import files
    uploaded = files.upload()
    
    # Extract to Drive
    !unzip data.zip -d {drive_data}
    print("\nData files:")
    !ls -R {drive_data}
else:
    print(f"Found existing data in Drive:")
    !ls -R {drive_data}

In [None]:
# Report whether CUDA is usable and, if so, the name and memory use of device 0
import torch

has_cuda = torch.cuda.is_available()
print(f"CUDA available: {has_cuda}")
if has_cuda:
    bytes_per_mb = 1024**2
    device_name = torch.cuda.get_device_name(0)
    allocated_mb = torch.cuda.memory_allocated(0) / bytes_per_mb
    reserved_mb = torch.cuda.memory_reserved(0) / bytes_per_mb
    print(f"GPU: {device_name}")
    print(f"Memory allocated: {allocated_mb:.1f}MB")
    print(f"Memory cached: {reserved_mb:.1f}MB")

In [None]:
# Preprocess the data if needed
import os

if not os.path.exists('processed_data'):
    print("Preprocessing data...")
    !python -m src.preprocess_data
else:
    print("Found preprocessed data directory")

In [None]:
# Find latest checkpoint
import glob
import os  # imported here so this cell does not rely on an earlier cell

CHECKPOINT_DIR = "/content/drive/MyDrive/Sauti-Ya-Kenya/checkpoints"


def find_checkpoints(checkpoint_dir):
    """Return the ``*.pt`` files directly inside ``checkpoint_dir``.

    The list is ordered oldest to newest by modification time, so the last
    element is the most recently written checkpoint. Modification time is
    used rather than ``getctime`` because on Unix ctime is the time of the
    last metadata change, not of the last write.

    Args:
        checkpoint_dir: Directory to search (not searched recursively).

    Returns:
        A list of matching paths; empty when nothing matches.
    """
    pattern = os.path.join(checkpoint_dir, "*.pt")
    return sorted(glob.glob(pattern), key=os.path.getmtime)


checkpoints = find_checkpoints(CHECKPOINT_DIR)
resume_path = checkpoints[-1] if checkpoints else None

if resume_path:
    print(f"Found checkpoint: {resume_path}")
    print("\nAll checkpoints:")
    for cp in checkpoints:
        print(f"- {os.path.basename(cp)}")
else:
    print("No existing checkpoints found. Starting fresh training.")

In [None]:
# Start/resume training
cmd = "python -m src.train"
if resume_path:
    cmd += f" --resume {resume_path}"
!{cmd}