<a href="https://colab.research.google.com/github/Msingi-AI/Sauti-Ya-Kenya/blob/main/colab/train_tts_fixed_v3.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Sauti Ya Kenya - Training on Google Colab

## Setup Instructions
1. Create this folder structure at the top level of "My Drive" in your Google Drive (the notebook reads from `/content/drive/MyDrive/sauti_ya_kenya`):
```
sauti_ya_kenya/
├── data/
│   └── processed/  <- Copy your processed data here
└── tokenizer/      <- Copy your tokenizer.model here
```

2. Run all cells in sequence

In [None]:
# Attach Google Drive to the Colab filesystem.
# Later cells read from and write to paths under this mount point.
from google.colab import drive

drive.mount(mountpoint='/content/drive')

In [None]:
# Fix NumPy version issue first
!pip uninstall -y numpy
!pip install 'numpy<2.0.0'

# Clean any existing repo
!rm -rf /content/Sauti-Ya-Kenya

# Clone repository and install dependencies
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya
!pip install -r requirements.txt

In [None]:
# Set up directory structure
import os
import shutil
from pathlib import Path

# Create data structure
!mkdir -p data/processed data/tokenizer

# Copy data from Drive
DRIVE_PATH = "/content/drive/MyDrive/sauti_ya_kenya"

# Copy processed data
!cp -r "{DRIVE_PATH}/data/processed/*" data/processed/

# Copy tokenizer
!cp -r "{DRIVE_PATH}/tokenizer/*" data/tokenizer/

# Verify files
print("\nChecking data structure:")
!ls -R data/

In [None]:
# Create __init__.py to make src a package
!touch src/__init__.py

# Add src to Python path
import sys
sys.path.insert(0, os.path.abspath('src'))

# Fix imports in train.py
with open('src/train.py', 'r') as f:
    content = f.read()

# Fix relative imports
content = content.replace('from .model', 'from model')
content = content.replace('from .preprocessor', 'from preprocessor')

with open('src/train.py', 'w') as f:
    f.write(content)

print("Fixed imports in train.py")

In [None]:
# GPU configuration: require a CUDA device, enable cuDNN autotuning and TF32
# matmuls, then report which device is in use.
import torch

torch.cuda.empty_cache()

# Fail fast when the runtime has no CUDA device
if not torch.cuda.is_available():
    raise RuntimeError("No GPU available!")

device = torch.device('cuda')
torch.backends.cudnn.benchmark = True
torch.backends.cuda.matmul.allow_tf32 = True

print(f"Using GPU: {torch.cuda.get_device_name()}")

# total_memory is in bytes; divide by 1024**3 to express it in GiB
total_memory_gib = torch.cuda.get_device_properties(0).total_memory / 1024**3
print(f"GPU Memory: {total_memory_gib:.1f}GB")

In [None]:
# Start training
# Runs src/train.py as a separate Python process (via the `!` shell escape),
# so it does not share this kernel's variables or sys.path.
# Paths are relative to the current working directory.
# NOTE(review): flag meanings below are inferred from their names only;
# confirm against the argument parser in src/train.py.
#   --batch_size 8 / --grad_accum 4 : presumably 8 samples per step with
#       gradients accumulated over 4 steps — TODO confirm
#   --checkpoint_dir checkpoints    : presumably where checkpoints are written
#   --data_dir data/processed       : the folder filled from Drive earlier
#   --save_every 5                  : presumably checkpoint interval in epochs
#       — TODO confirm
!python src/train.py \
    --batch_size 8 \
    --grad_accum 4 \
    --checkpoint_dir checkpoints \
    --data_dir data/processed \
    --epochs 100 \
    --save_every 5

In [None]:
# Copy checkpoints back to Drive
!mkdir -p "{DRIVE_PATH}/checkpoints"
!cp -r checkpoints/* "{DRIVE_PATH}/checkpoints/"
print("\nSaved checkpoints to Drive:")
!ls -l "{DRIVE_PATH}/checkpoints/"