# Train Swahili Tokenizer on Colab GPU

This notebook:
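1. Checks that a Colab GPU runtime is available
2. Extracts the `.txt` files from `archive.zip` (expected on Google Drive at `MyDrive/Sauti-Ya-Kenya/data/archive.zip`)
3. Trains the tokenizer
4. Saves the model to Drive (`MyDrive/Sauti-Ya-Kenya/tokenizer`) and runs a quick test on code-switched sentences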

In [None]:
# Verify GPU is available.
# nvidia-smi lists the NVIDIA devices visible to this VM; if the command is
# missing or reports no devices, switch the Colab runtime type to GPU
# (Runtime > Change runtime type) and rerun from the top.
!nvidia-smi

In [None]:
# Mount Google Drive at /content/drive.
# Later cells read the training archive from, and write the trained tokenizer
# to, paths under /content/drive/MyDrive, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Clone the repository
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya

In [None]:
# Install requirements.
# %pip (rather than !pip) installs into the environment of the running kernel.
# requirements.txt is a relative path, so this relies on the clone cell having
# changed the working directory to the repository root.
%pip install -r requirements.txt

In [None]:
# Set up directories
import os

# Persistent locations on Google Drive: the input archive is read from
# DRIVE_ARCHIVE and the trained tokenizer is written to DRIVE_OUTPUT.
DRIVE_ROOT = "/content/drive/MyDrive/Sauti-Ya-Kenya"
DRIVE_DATA = os.path.join(DRIVE_ROOT, "data")
DRIVE_ARCHIVE = os.path.join(DRIVE_DATA, "archive.zip")
DRIVE_OUTPUT = os.path.join(DRIVE_ROOT, "tokenizer")

# Working locations, relative to the current directory.
LOCAL_DATA = "data"
LOCAL_TEXT = os.path.join(LOCAL_DATA, "text")

# Make sure every directory exists; exist_ok keeps the cell safe to re-run.
for directory in (DRIVE_DATA, DRIVE_OUTPUT, LOCAL_TEXT):
    os.makedirs(directory, exist_ok=True)

print("Directory structure ready!")

In [None]:
# Extract archive
import zipfile
import shutil

def extract_text_files(zip_path, extract_to):
    """Extract the text (.txt) members of a zip archive into a directory.

    Members keep their relative path from the archive, so ``corpus/a.txt`` is
    written to ``<extract_to>/corpus/a.txt``. The extension match is
    case-insensitive (``.TXT`` is accepted).

    Archives created with macOS Finder carry metadata entries (a ``__MACOSX/``
    folder and ``._<name>`` AppleDouble files) that reuse the name of the file
    they describe, including its ``.txt`` suffix, but hold binary data. Those
    entries are skipped so they are not mistaken for corpus text.

    Args:
        zip_path: Path to the zip archive to read.
        extract_to: Directory to extract into; created if it does not exist.

    Returns:
        List of the archive member names that were extracted, in archive order.

    Raises:
        FileNotFoundError: If ``zip_path`` does not exist.
        zipfile.BadZipFile: If ``zip_path`` is not a valid zip archive.
    """
    def is_text_member(name):
        # Zip member names always use '/' as the separator.
        parts = name.split('/')
        if '__MACOSX' in parts or parts[-1].startswith('._'):
            return False
        return name.lower().endswith('.txt')

    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        txt_files = [name for name in zip_ref.namelist() if is_text_member(name)]
        zip_ref.extractall(extract_to, members=txt_files)

    print(f"Extracted {len(txt_files)} text files")
    return txt_files

# Check for the archive before clearing anything: if it is missing (not
# uploaded yet, or Drive not mounted) the previous local extract is left intact
# and the error says where the file is expected.
if not os.path.isfile(DRIVE_ARCHIVE):
    raise FileNotFoundError(
        f"Archive not found at {DRIVE_ARCHIVE}. Upload archive.zip to that "
        "location and make sure Google Drive is mounted, then re-run this cell."
    )

# Clear previous extracts so files from an older archive cannot leak into
# this run's training data.
if os.path.exists(LOCAL_TEXT):
    shutil.rmtree(LOCAL_TEXT)
os.makedirs(LOCAL_TEXT)

# Extract new files
extracted_files = extract_text_files(DRIVE_ARCHIVE, LOCAL_TEXT)

# Show first few files as a quick sanity check of what was extracted
print("\nFirst few extracted files:")
for f in extracted_files[:5]:
    print(f"- {f}")

In [None]:
# Train tokenizer by running the repository's training script in a subprocess.
# IPython replaces $LOCAL_TEXT and $DRIVE_OUTPUT with the values of the Python
# variables of the same name before the shell runs the command, so the script
# is pointed at the extracted text files and at the Drive output directory.
# The path src/train_tokenizer.py is relative: the working directory must be
# the repository root.
# NOTE(review): --vocab-size and --min-length are interpreted by the script;
# presumably the target vocabulary size and a minimum text length filter --
# confirm in src/train_tokenizer.py.
!python src/train_tokenizer.py \
    --data-dir $LOCAL_TEXT \
    --output-dir $DRIVE_OUTPUT \
    --vocab-size 8000 \
    --min-length 3

In [None]:
# Smoke-test the trained tokenizer on mixed Swahili/English sentences.
# The import is repo-relative, so the working directory must be the repo root.
from src.preprocessor import SwahiliTokenizer, TextPreprocessor

# Load the model file from the Drive output directory and wrap it in the
# project's text preprocessor.
tokenizer = SwahiliTokenizer.load(f"{DRIVE_OUTPUT}/tokenizer.model")
preprocessor = TextPreprocessor(tokenizer)

# Each sentence switches between Swahili and English.
test_texts = [
    "Habari yako! How are you doing leo?",
    "Niko sawa sana, asante.",
    "Tutaonana kesho asubuhi at 9 AM.",
    "The weather ni nzuri sana today!",
]

print("Testing tokenizer on code-switched sentences:\n")
for text in test_texts:
    tokens = preprocessor.process_text(text)
    print(f"Input: {text}")
    token_id_list = tokens.token_ids.tolist()
    print(f"Tokens: {token_id_list}")
    print(f"Languages: {tokens.languages}\n")

In [None]:
# Verify saved files: list the Drive output directory (long format, human-
# readable sizes) to confirm the training step wrote its files there.
# $DRIVE_OUTPUT is expanded by IPython from the Python variable of that name.
print("Files saved in Google Drive:")
!ls -lh $DRIVE_OUTPUT