# Train Swahili Tokenizer on Colab

This notebook trains the tokenizer for Kenyan Swahili TTS with code-switching support.

In [None]:
# Attach Google Drive to the Colab filesystem.
# Later cells read training data from, and write the tokenizer to, paths
# under this mount point.
from google.colab import drive

drive.mount("/content/drive")

In [None]:
# Clone the repository
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya

In [None]:
# Install requirements
!pip install -r requirements.txt

In [None]:
# Expose the Drive data directory inside the repo checkout as ./data
import os

DRIVE_DATA_DIR = "/content/drive/MyDrive/Sauti-Ya-Kenya/data"
LOCAL_DATA_DIR = "data"

# Create data directory in Drive if it doesn't exist
os.makedirs(DRIVE_DATA_DIR, exist_ok=True)

# A dangling symlink makes os.path.exists() return False, but creating a new
# link on top of it still fails with "File exists". Remove it first.
if os.path.islink(LOCAL_DATA_DIR) and not os.path.exists(LOCAL_DATA_DIR):
    os.unlink(LOCAL_DATA_DIR)

# Create symlink (lexists is also True for symlinks, so this is re-run safe).
# os.symlink avoids passing unquoted paths through the shell.
if not os.path.lexists(LOCAL_DATA_DIR):
    os.symlink(DRIVE_DATA_DIR, LOCAL_DATA_DIR)
elif not os.path.islink(LOCAL_DATA_DIR):
    # A real file/directory named "data" is already present; it is left
    # untouched, which means the Drive directory is NOT what ./data refers to.
    print(
        f"Warning: '{LOCAL_DATA_DIR}' already exists and is not a symlink; "
        f"it was not linked to {DRIVE_DATA_DIR}."
    )

In [None]:
# Tokenizer artifacts go to Drive rather than the local Colab disk.
# Relies on `os` having been imported by an earlier cell.
DRIVE_OUTPUT_DIR = os.path.join(
    "/content/drive/MyDrive", "Sauti-Ya-Kenya", "tokenizer"
)
# exist_ok=True keeps this cell safe to re-run.
os.makedirs(DRIVE_OUTPUT_DIR, exist_ok=True)

In [None]:
# Train tokenizer
!python src/train_tokenizer.py \
    --data-dir data/text \
    --output-dir $DRIVE_OUTPUT_DIR \
    --vocab-size 8000 \
    --min-length 3

In [None]:
# Smoke-test the trained tokenizer on a few sample sentences.
# This import resolves against the repo checkout, so it must run after the
# cell that changes into the repository directory.
from src.preprocessor import SwahiliTokenizer, TextPreprocessor

# Load the model file from the Drive output directory.
tokenizer_model_path = f"{DRIVE_OUTPUT_DIR}/tokenizer.model"
tokenizer = SwahiliTokenizer.load(tokenizer_model_path)
preprocessor = TextPreprocessor(tokenizer)

# The first sample mixes Swahili and English; the other two are Swahili only.
test_texts = [
    "Habari yako! How are you doing leo?",
    "Niko sawa sana, asante.",
    "Tutaonana kesho asubuhi.",
]

# Print each input with the token ids and language tags from the preprocessor.
for text in test_texts:
    tokens = preprocessor.process_text(text)

    print(f"\nInput: {text}")
    print(f"Tokens: {tokens.token_ids.tolist()}")
    print(f"Languages: {tokens.languages}")