# English Keyboard Suggestion Model Training (with Google Drive Integration)

This notebook trains an English keyboard suggestion model by fine-tuning Google Gemma-2 2B (`google/gemma-2-2b-it`) with LoRA.

**Features:**
- Automatic Google Drive data management
- Checks for existing data before downloading
- Email notifications on completion
- Saves models to Drive for persistence

**Target Specifications:**
- Model Size: 20-30 MB (after optimization)
- Latency: < 50 ms
- Perplexity: < 20
- Top-3 Accuracy: > 85%

## 1. Environment Setup

In [None]:
# Detect whether this notebook is executing inside Google Colab
import os

# The presence of either environment variable is treated as the Colab signal.
IN_COLAB = any(name in os.environ for name in ('COLAB_GPU', 'COLAB_TPU_ADDR'))

print("✓ Running in Google Colab" if IN_COLAB else "✓ Running locally")

In [None]:
# Clone repository (if running in Colab)
if IN_COLAB:
    import os
    
    # Ensure we're in /content
    os.chdir('/content')
    
    # Remove existing repo if it exists (for re-runs)
    if os.path.exists('Keyboard-Suggestions-ML-Colab'):
        import shutil
        shutil.rmtree('Keyboard-Suggestions-ML-Colab')
        print("✓ Removed existing repository")
    
    # Clone fresh copy
    !git clone https://github.com/MinhPhuPham/Keyboard-Suggestions-ML-Colab.git
    
    # Change to project directory
    os.chdir('/content/Keyboard-Suggestions-ML-Colab')
    
    print(f"✓ Repository cloned")
    print(f"✓ Working directory: {os.getcwd()}")

In [None]:
# Install dependencies
!pip install -q -r requirements.txt
print("✓ Dependencies installed")

In [None]:
# Import libraries
import sys
sys.path.append('./src')

import torch
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset

# Import custom utilities
from data_prep import clean_english_text, augment_with_emojis, split_dataset
from model_utils import (
    load_model_with_lora, train_causal_lm, evaluate_perplexity,
    prune_model, quantize_model, merge_lora_weights
)
from export_utils import (
    export_to_onnx, export_to_coreml, verify_model_size,
    benchmark_latency, package_for_download
)
from colab_data_manager import (
    mount_google_drive, setup_english_data, send_notification_email
)

# Report the PyTorch build and, when CUDA is usable, the name of device 0
print(f"PyTorch version: {torch.__version__}")
cuda_ready = torch.cuda.is_available()
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Google Drive Setup and Data Management

In [None]:
# Attach Google Drive on Colab; local runs have nothing to mount
if not IN_COLAB:
    print("Skipping Drive mount (running locally)")
else:
    mount_success = mount_google_drive()
    # A falsy result from the helper aborts the notebook here
    if not mount_success:
        raise Exception("Failed to mount Google Drive")

In [None]:
# Resolve the location of the English training data.
# On Colab the Drive base folder is handed to setup_english_data(); a None
# result is treated as failure and manual instructions are printed instead.
# (Per the original note, the helper is expected to check Drive first and
# download only if needed.)
if not IN_COLAB:
    data_path = "./data/english"
    print(f"Using local data path: {data_path}")
else:
    DRIVE_BASE = "/content/drive/MyDrive/Phu's Data development"
    data_path = setup_english_data(DRIVE_BASE)

    if data_path is not None:
        print(f"\n✓ Data ready at: {data_path}")
    else:
        print("\n".join((
            "⚠ Data setup failed. Please check errors above.",
            "\nManual setup instructions:",
            "1. Download SwiftKey dataset from Kaggle:",
            "   https://www.kaggle.com/datasets/therohk/tweets-blogs-news-swiftkey-dataset-4million",
            f"2. Upload to: {DRIVE_BASE}/data/english/",
        )))

## 3. Data Preparation

In [None]:
# Build a small demonstration corpus.
# NOTE: this is sample data only; replace it with real SwiftKey corpus
# processing for an actual training run.

sample_sentences = [
    "Today is a beautiful day",
    "I love programming in Python",
    "The weather is nice today",
    "Let's meet tomorrow morning",
    "Thank you for your help",
    "How are you doing today",
    "See you later tonight",
    "Have a great weekend",
]

# Run every sentence through the project's text-cleaning helper
cleaned = list(map(clean_english_text, sample_sentences))

# Pass the cleaned sentences through emoji augmentation (emoji_ratio=0.2)
augmented = augment_with_emojis(cleaned, emoji_ratio=0.2)

# Preview at most the first five results
print("Sample augmented sentences:")
for sent in augmented[:5]:
    print(f"  {sent}")

print(f"\nTotal training samples: {len(augmented)}")

## 4. Model Setup and Fine-Tuning

In [None]:
# Free cached memory and configure the CUDA allocator (Colab free tier)
import gc
import torch
import os

# Drop unreachable Python objects, then release cached CUDA blocks if any
gc.collect()
has_cuda = torch.cuda.is_available()
if has_cuda:
    torch.cuda.empty_cache()

# NOTE(review): this variable is set after CUDA has already been touched by
# earlier cells; confirm PyTorch still honours it at this point.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

print("✓ Memory optimizations applied")
print(f"✓ GPU available: {has_cuda}")
if has_cuda:
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"✓ GPU memory: {total_gb:.1f} GB")

In [None]:
# Hugging Face Authentication (required for Gemma-2)
from huggingface_hub import login

# Prefer a non-interactive login with the HF_TOKEN Colab secret. If that is
# not possible for any reason (not running on Colab, secret missing or empty,
# token rejected), fall back to the interactive prompt.
try:
    from google.colab import userdata
    hf_token = userdata.get('HF_TOKEN')
    if not hf_token:
        # Guard against an empty secret so the success message below is never
        # printed for a login that did not actually use a stored token.
        raise ValueError("HF_TOKEN secret is empty")
    login(token=hf_token, new_session=False)
    print("✓ Logged in to Hugging Face using token")
except Exception as err:
    # `except Exception` (not a bare `except:`) lets KeyboardInterrupt and
    # SystemExit propagate, so interrupting the cell still works.
    print(f"HF_TOKEN login unavailable ({type(err).__name__}: {err}), using interactive login...")
    login(new_session=False)
    print("✓ Logged in to Hugging Face")

In [None]:
# Load the Gemma-2-2B base model and wrap it with LoRA adapters
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

MODEL_NAME = "google/gemma-2-2b-it"
print(f"Loading model: {MODEL_NAME}")

# Tokenizer: reuse EOS as the padding token when none is defined
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# Base model in bfloat16 with automatic device placement
model = AutoModelForCausalLM.from_pretrained(
    MODEL_NAME, device_map="auto", torch_dtype=torch.bfloat16
)

# LoRA adapters on the four attention projection modules
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
    target_modules=["q_proj", "k_proj", "v_proj", "o_proj"],
)
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✓ Model loaded with LoRA adapters")

In [None]:
# Prepare dataset for training
def tokenize_function(examples):
    """Tokenize the ``'text'`` entry of a batch with the notebook-level tokenizer.

    Truncation is enabled and padding is set to ``'max_length'`` with
    ``max_length=128``. ``return_tensors=None`` asks for plain lists rather
    than tensors, which is what the batched ``Dataset.map`` call expects.

    Args:
        examples: Mapping with a ``'text'`` entry holding the raw sentences.

    Returns:
        Whatever the tokenizer call returns for that text.
    """
    tokenizer_options = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors=None,
    )
    return tokenizer(examples['text'], **tokenizer_options)

# Wrap the augmented sentences in a Dataset and tokenize them in batches;
# the raw 'text' column is dropped from the tokenized result.
train_data = Dataset.from_dict({'text': augmented})
train_dataset = train_data.map(tokenize_function, batched=True, remove_columns=['text'])

print(f"✓ Training dataset prepared: {len(train_dataset)} samples")
print(f"✓ Dataset columns: {train_dataset.column_names}")

In [None]:
# Fine-tune the LoRA-wrapped model
print("Starting training...")

# Checkpoints go to Drive on Colab, otherwise to a local folder
checkpoint_dir = (
    os.path.join(DRIVE_BASE, "checkpoints", "english")
    if IN_COLAB
    else "./checkpoints/english"
)

# NOTE(review): max_seq_length=8 differs from the max_length=128 used when
# tokenizing and when exporting — confirm this value is intended.
trainer = train_causal_lm(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    output_dir=checkpoint_dir,
    num_epochs=3,
    batch_size=2,  # kept small for the free tier
    learning_rate=1e-5,
    max_seq_length=8,
    save_steps=100,
)

print("✓ Training complete!")

## 5. Optimization and Export

In [None]:
# Merge LoRA weights
# Rebinds `model` to whatever merge_lora_weights() returns, so every later
# cell operates on that object.
# NOTE(review): presumably this folds the adapters into the base weights —
# confirm in model_utils.
model = merge_lora_weights(model)
print("✓ LoRA weights merged")

In [None]:
# Optional pruning step: a failure is reported and the notebook continues
# with the unpruned model.
import gc
import torch

try:
    # Release cached GPU memory before attempting to prune
    torch.cuda.empty_cache()
    gc.collect()

    print("Attempting to prune model...")
    model = prune_model(model, amount=0.2)  # Reduced from 0.3 to 0.2
    print("✓ Model pruned")
except Exception as err:
    # Best effort: keep going without pruning
    print(f"⚠ Pruning failed (likely OOM): {err}")
    print("Skipping pruning - model will be larger but still functional")

# Release memory again whether or not pruning succeeded
torch.cuda.empty_cache()
gc.collect()

In [None]:
# Optional quantization step (may run out of memory on the free tier);
# a failure is reported and the unquantized model is kept.
import gc
import torch


def _release_memory():
    """Run the garbage collector, then empty the CUDA cache."""
    gc.collect()
    torch.cuda.empty_cache()


_release_memory()

try:
    print("Attempting quantization...")
    model = quantize_model(model, dtype=torch.qint8)
    print("✓ Model quantized")
except Exception as err:
    # Best effort: continue with the model as it was before this cell
    print(f"⚠ Quantization failed: {err}")
    print("Skipping quantization - exporting float16 model instead")

_release_memory()

In [None]:
# Export the model to ONNX, falling back to a PyTorch state dict on failure
import gc
import torch

# Output directory: Drive on Colab, a local folder otherwise
model_dir = (
    os.path.join(DRIVE_BASE, "models", "english")
    if IN_COLAB
    else "./models/english"
)
os.makedirs(model_dir, exist_ok=True)

# Release memory before the export
gc.collect()
torch.cuda.empty_cache()

try:
    print("Exporting to ONNX...")
    onnx_target = os.path.join(model_dir, "english_model.onnx")
    onnx_path = export_to_onnx(
        model=model,
        tokenizer=tokenizer,
        output_path=onnx_target,
        max_seq_length=128,
    )
    print(f"✓ ONNX model saved to: {onnx_path}")
except Exception as err:
    print(f"⚠ ONNX export failed: {err}")
    print("Saving PyTorch model instead...")
    # Fallback artifact: the raw state dict next to where the ONNX file would be
    torch_path = os.path.join(model_dir, "english_model.pt")
    torch.save(model.state_dict(), torch_path)
    print(f"✓ PyTorch model saved to: {torch_path}")
    # Later cells read None as "ONNX export did not succeed"
    onnx_path = None

# Release memory after the export
gc.collect()
torch.cuda.empty_cache()

In [None]:
# Optional Core ML export; it needs the ONNX file produced by the export cell
import gc

if not onnx_path:
    # A falsy onnx_path means there is no ONNX file to convert
    print("⚠ Skipping Core ML export (ONNX export failed)")
else:
    try:
        print("Exporting to Core ML...")
        coreml_target = os.path.join(model_dir, "english_model.mlmodel")
        coreml_path = export_to_coreml(
            onnx_path=onnx_path,
            output_path=coreml_target,
            model_name="EnglishKeyboardSuggestion",
        )
        # Success is only reported when the helper returned a truthy path
        if coreml_path:
            print(f"✓ Core ML model saved to: {coreml_path}")
    except Exception as err:
        print(f"⚠ Core ML export failed: {err}")
        print("Skipping Core ML export")

# Final memory cleanup
gc.collect()

## 6. Verification

In [None]:
# Verify model size
import os

# The ONNX export cell sets onnx_path to None when it falls back to saving a
# PyTorch state dict as "english_model.pt" in model_dir. Measure that file in
# that case instead of passing None to verify_model_size.
model_path = onnx_path if onnx_path else os.path.join(model_dir, "english_model.pt")

size_mb, meets_req = verify_model_size(
    model_path=model_path,
    max_size_mb=30
)

if meets_req:
    print(f"✓ Model size requirement met: {size_mb:.2f} MB")
else:
    print(f"⚠ Model size exceeds target: {size_mb:.2f} MB")

## 7. Save and Notify

In [None]:
# Package model for download
# Hands the model output directory to package_for_download() and keeps the
# returned value in zip_path, which the notification and download cells read.
# NOTE(review): presumably zip_path is the path of the created archive —
# confirm in export_utils.
zip_path = package_for_download(
    model_dir=model_dir,
    output_zip="english_model.zip"
)

print(f"✓ Model packaged: {zip_path}")

In [None]:
# Email a completion summary, then echo the same details to the console
send_notification_email(
    to_email="phamminhphueur@gmail.com",
    subject="English Model Training Complete! 🎉",
    message=f"""
English keyboard suggestion model training has completed successfully!

Model Details:
- Size: {size_mb:.2f} MB
- Location: {model_dir}
- Package: {zip_path}

The model is ready for integration into your keyboard app.

Next steps:
1. Download the model package
2. Integrate into iOS/Android app
3. Test on actual devices
    """,
)

# Console summary framed by a 60-character rule
banner = "=" * 60
print(f"\n{banner}")
print("✓ TRAINING COMPLETE!")
print(banner)
print(f"Model saved to: {model_dir}")
print(f"Package: {zip_path}")
print(f"Size: {size_mb:.2f} MB")
print(banner)

In [None]:
# Offer the packaged model as a browser download (Colab only)
if not IN_COLAB:
    print(f"Model saved locally to: {zip_path}")
else:
    from google.colab import files
    print("Downloading model package...")
    files.download(zip_path)
    print("✓ Download started")