# Msingi1 - Swahili Language Model Training

This notebook trains the Msingi1 model on Google Colab using GPU/TPU.

In [None]:
# Check if we're using GPU
# `!` hands the rest of the line to the system shell; `nvidia-smi` is NVIDIA's
# status tool and lists the GPU(s) visible to this runtime.
# NOTE(review): on a CPU-only or TPU runtime this command is expected to fail
# instead of listing a device — confirm, since the intro also mentions TPU.
!nvidia-smi

In [None]:
# Install required packages
!pip install -q wandb tokenizers torch transformers datasets tqdm

In [None]:
# Clone the repository
!git clone https://github.com/Msingi-AI/msingi1.git
%cd msingi1

In [None]:
# Upload the dataset and tokenizer
from google.colab import files
import os

DATASET_PATH = "data/archive.zip"
TOKENIZER_DIR = "tokenizer"
TOKENIZER_FILES = ("tokenizer.json", "vocab.json", "merges.txt")

# Create necessary directories
os.makedirs("data", exist_ok=True)
os.makedirs(TOKENIZER_DIR, exist_ok=True)

print("Please upload your archive.zip (dataset)...")
uploaded = files.upload()  # mapping of file name -> uploaded bytes

# Pick the archive from what was actually uploaded instead of globbing the
# working directory, so stale or additional *.zip files cannot be picked up.
zip_names = [name for name in uploaded if name.lower().endswith(".zip")]
if len(zip_names) != 1:
    raise ValueError(
        f"Expected exactly one .zip dataset upload, got {len(zip_names)}: {sorted(uploaded)}"
    )
# Write from the returned bytes so the result does not depend on the name the
# upload was saved under. The uploaded copy in the working directory is left
# in place.
with open(DATASET_PATH, "wb") as dataset_file:
    dataset_file.write(uploaded[zip_names[0]])
print(f"Saved dataset to {DATASET_PATH}")

print("\nPlease upload your tokenizer files (tokenizer.json, vocab.json, merges.txt)...")
uploaded = files.upload()

missing_files = []
for file_name in TOKENIZER_FILES:
    target_path = os.path.join(TOKENIZER_DIR, file_name)
    if file_name in uploaded:
        with open(target_path, "wb") as tokenizer_file:
            tokenizer_file.write(uploaded[file_name])
    elif os.path.isfile(file_name):
        # Not part of this upload but already present in the working
        # directory: move it, as the original `mv` would have done.
        os.replace(file_name, target_path)
    else:
        missing_files.append(file_name)

if missing_files:
    # Reported rather than raised: which of these files are strictly required
    # depends on how the training code loads the tokenizer.
    print(f"Warning: tokenizer file(s) not provided: {', '.join(missing_files)}")

In [None]:
# Configure WandB (optional)
# wandb.login() authenticates this runtime with Weights & Biases.
# NOTE(review): the training call in this notebook enables wandb logging
# (use_wandb), so this step is presumably only optional if that flag is
# turned off — confirm.
import wandb
wandb.login()

In [None]:
import random

import torch
import os
from src.model import Msingi1, MsingiConfig
from src.train import train
from src.data_processor import extract_dataset

SEED = 42           # fixes the train/val split and torch's RNG across reruns
VAL_FRACTION = 0.1  # share of the texts held out for validation

# Seed the RNGs up front so that a Restart & Run All gives the same split.
# NOTE(review): assumes train() does not re-seed internally — confirm.
random.seed(SEED)
torch.manual_seed(SEED)

# Load dataset
texts = extract_dataset("data/archive.zip")
print(f"Loaded {len(texts)} texts")
if len(texts) < 2:
    # With fewer than two texts one of the two splits would be empty.
    raise ValueError(
        f"Need at least 2 texts for a train/val split, got {len(texts)}; "
        "check data/archive.zip"
    )

# Split into train/val
# Shuffle a copy first: taking the head of the corpus as validation data is
# biased whenever the archive is ordered (by source, date, topic, ...).
shuffled_texts = list(texts)
random.Random(SEED).shuffle(shuffled_texts)
val_size = max(1, int(len(shuffled_texts) * VAL_FRACTION))  # never an empty val set
train_texts = shuffled_texts[val_size:]
val_texts = shuffled_texts[:val_size]
print(f"Train: {len(train_texts)} texts, val: {len(val_texts)} texts")

# Initialize config with larger model for GPU/TPU
# NOTE(review): vocab_size is hard-coded; it presumably has to match the
# uploaded tokenizer's vocabulary — verify.
config = MsingiConfig(
    vocab_size=50000,
    max_position_embeddings=2048,
    hidden_size=768,
    num_hidden_layers=12,
    num_attention_heads=12,
    intermediate_size=3072,
    gradient_checkpointing=True  # Enable for memory efficiency
)

# Training parameters optimized for Colab
train(
    config=config,
    train_texts=train_texts,
    val_texts=val_texts,
    num_epochs=10,
    batch_size=8,  # Larger batch size for GPU
    gradient_accumulation_steps=8,  # Still accumulate for stability
    learning_rate=3e-4,
    max_length=1024,
    warmup_steps=1000,
    save_steps=1000,
    eval_steps=500,
    use_wandb=True
)

In [None]:
# Download the trained model
from google.colab import files
import os
import shutil

CHECKPOINT_DIR = "checkpoints"

# Fail early with a clear message instead of a confusing download error when
# there is nothing to package.
if not os.path.isdir(CHECKPOINT_DIR):
    raise FileNotFoundError(
        f"'{CHECKPOINT_DIR}/' not found in {os.getcwd()}; run the training cell first"
    )

# Zip the checkpoints directory
# make_archive rebuilds trained_model.zip from scratch on every run, so
# entries from an earlier run cannot linger in the archive. Paths inside the
# archive keep the leading "checkpoints/" component.
archive_path = shutil.make_archive(
    "trained_model", "zip", root_dir=".", base_dir=CHECKPOINT_DIR
)

# Download the zipped file
files.download(archive_path)