# Train Swahili Tokenizer on Colab GPU

## Setup Instructions:
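1. Upload your `archive.zip` using the file upload widget below. The file must be named exactly `archive.zip` and contain UTF-8 encoded `.txt` files
2. The trained tokenizer will be saved to Google Drive, under `MyDrive/Sauti-Ya-Kenya/tokenizer`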

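Note: This notebook uses Colab's T4 GPU for faster training. Select a GPU runtime (Runtime → Change runtime type) before running; the first cell checks that a GPU is available.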

In [None]:
# Verify GPU is available
# nvidia-smi reports the GPUs visible to this runtime; if the command fails,
# the notebook is presumably not running on a GPU runtime.
!nvidia-smi

In [None]:
# Mount Google Drive
# The Drive is mounted at /content/drive; the output paths configured later in
# this notebook live below that mount point, so this cell must run first.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Upload archive.zip
from google.colab import files
import os

# Colab does not overwrite an existing file: uploading archive.zip a second
# time stores it as "archive (1).zip" and leaves the old archive.zip in place.
# Remove any stale copy first so re-running this cell picks up the new upload.
if os.path.exists('archive.zip'):
    os.remove('archive.zip')

print("Please upload your archive.zip file when prompted...")
# Blocks until the upload finishes; the result is keyed by uploaded file name
uploaded = files.upload()

# The rest of the notebook looks for this exact file name
if 'archive.zip' not in uploaded:
    raise ValueError("Please upload a file named 'archive.zip'")

print("\nUpload successful!")

In [None]:
# Clone the repository
!git clone https://github.com/Msingi-AI/Sauti-Ya-Kenya.git
%cd Sauti-Ya-Kenya

In [None]:
# Install requirements
# requirements.txt is resolved relative to the current directory, which the
# clone cell switched to the repository checkout. %pip (rather than !pip)
# installs into the environment of the running kernel.
%pip install -r requirements.txt

In [None]:
# Set up directories
import os

# Drive paths for output (Google Drive must already be mounted at /content/drive)
DRIVE_ROOT = "/content/drive/MyDrive/Sauti-Ya-Kenya"
DRIVE_OUTPUT = os.path.join(DRIVE_ROOT, "tokenizer")

# Local paths, relative to the repository root (the current directory after %cd)
LOCAL_DATA = "data"
LOCAL_TEXT = os.path.join(LOCAL_DATA, "text")
# The archive was uploaded *before* `%cd Sauti-Ya-Kenya`, so it sits in Colab's
# default working directory (/content), not inside the cloned repository. A
# bare "archive.zip" would be looked up in the repo and not be found.
LOCAL_ARCHIVE = "/content/archive.zip"

# Create directories
os.makedirs(DRIVE_OUTPUT, exist_ok=True)
os.makedirs(LOCAL_TEXT, exist_ok=True)

print("Directory structure ready!")

In [None]:
# Extract archive
import zipfile
import shutil

def extract_text_files(zip_path, extract_to):
    """Extract the .txt members of a zip archive into a target directory.

    Members that macOS adds next to the real files (the ``__MACOSX/`` tree and
    ``._<name>`` sidecar files) are skipped: they keep the ``.txt`` extension
    but are not text, so they would pollute the corpus and can fail to decode
    as UTF-8.

    Args:
        zip_path: Path of the zip archive to read.
        extract_to: Directory to extract into; members keep their
            archive-relative paths below it.

    Returns:
        The archive member names that were extracted, in archive order.
        Join each one with ``extract_to`` to get its on-disk path.

    Raises:
        ValueError: If the archive contains no usable .txt members.
    """
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        # Keep only real text members
        txt_files = [
            name for name in zip_ref.namelist()
            if name.endswith('.txt') and not _is_macos_metadata(name)
        ]

        if not txt_files:
            raise ValueError("No .txt files found in archive.zip!")

        # Extract them, preserving any sub-directories stored in the archive
        for txt_file in txt_files:
            zip_ref.extract(txt_file, extract_to)

        print(f"Extracted {len(txt_files)} text files")
        return txt_files


def _is_macos_metadata(member_name):
    """Return True for zip members that macOS stores alongside the real files."""
    # Zip member names always use "/" as the separator
    parts = member_name.split('/')
    return '__MACOSX' in parts or parts[-1].startswith('._')

# Start from an empty text directory so files from an earlier run don't linger
if os.path.exists(LOCAL_TEXT):
    shutil.rmtree(LOCAL_TEXT)
os.makedirs(LOCAL_TEXT)


def _count_words(path):
    """Return the number of whitespace-separated words in a UTF-8 text file."""
    with open(path, 'r', encoding='utf-8') as handle:
        return len(handle.read().split())


try:
    # Unpack the uploaded archive into the (now empty) text directory
    extracted_files = extract_text_files(LOCAL_ARCHIVE, LOCAL_TEXT)

    # Preview at most five of the extracted names
    print("\nFirst few extracted files:")
    for name in extracted_files[:5]:
        print(f"- {name}")

    # Rough corpus size: whitespace-separated words summed over every file
    total_tokens = sum(
        _count_words(os.path.join(LOCAL_TEXT, name)) for name in extracted_files
    )

    print(f"\nTotal approximate tokens: {total_tokens:,}")

except Exception as e:
    # Show the likely causes, then re-raise so the cell is marked as failed
    print(f"Error: {str(e)}")
    for hint in (
        "\nPlease make sure:",
        "1. You uploaded archive.zip using the file upload widget above",
        "2. The archive contains .txt files",
        "3. The text files are encoded in UTF-8",
    ):
        print(hint)
    raise

In [None]:
# Train tokenizer
# `!` runs the script in a shell. IPython substitutes $LOCAL_TEXT and
# $DRIVE_OUTPUT with the Python variables of the same name before running it,
# so the set-up cell must have been executed first. The output directory is on
# Google Drive. A failing script does not raise an exception in the notebook,
# so check the printed output before continuing.
!python src/train_tokenizer.py \
    --data-dir $LOCAL_TEXT \
    --output-dir $DRIVE_OUTPUT \
    --vocab-size 8000 \
    --min-length 3

In [None]:
# Test the trained tokenizer
from src.preprocessor import SwahiliTokenizer, TextPreprocessor

try:
    # Load the model from the Drive output directory used by the training cell
    tokenizer = SwahiliTokenizer.load(f"{DRIVE_OUTPUT}/tokenizer.model")
    preprocessor = TextPreprocessor(tokenizer)

    # Sentences mixing Swahili and English
    test_texts = [
        "Habari yako! How are you doing leo?",
        "Niko sawa sana, asante.",
        "Tutaonana kesho asubuhi at 9 AM.",
        "The weather ni nzuri sana today!"
    ]

    print("Testing tokenizer on code-switched sentences:\n")
    for text in test_texts:
        tokens = preprocessor.process_text(text)
        print(f"Input: {text}")
        print(f"Tokens: {tokens.token_ids.tolist()}")
        print(f"Languages: {tokens.languages}\n")

    print("✅ Tokenizer trained and saved successfully!")
    print(f"📁 Model saved to: {DRIVE_OUTPUT}/tokenizer.model")

except Exception as e:
    print(f"❌ Error testing tokenizer: {str(e)}")
    print("Please check the training output above for errors")
    # Re-raise after the hint, as the extraction cell does, so the cell is
    # marked as failed (with a traceback) and "Run all" stops here instead of
    # reporting a broken tokenizer as a successful run.
    raise

In [None]:
# Verify saved files
# Lists the Drive output directory; IPython fills in $DRIVE_OUTPUT from the
# Python variable defined in the set-up cell before running the command.
print("Files saved in Google Drive:")
!ls -lh $DRIVE_OUTPUT