# English Keyboard Suggestion Model Training (with Google Drive Integration)

This notebook trains an English keyboard suggestion model using Qwen2.5-1.5B-Instruct with LoRA fine-tuning.

**Features:**
- Automatic Google Drive data management
- Checks for existing data before downloading
- Email notifications on completion
- Saves models to Drive for persistence

**Target Specifications:**
- Model Size: 20-30 MB (after optimization)
- Latency: < 50 ms
- Perplexity: < 20
- Top-3 Accuracy: > 85%

## 1. Environment Setup

In [None]:
# Decide whether this is a Colab runtime: the presence of either marker
# variable in the environment is treated as "running in Colab".
import os

_COLAB_MARKERS = ('COLAB_GPU', 'COLAB_TPU_ADDR')
IN_COLAB = any(marker in os.environ for marker in _COLAB_MARKERS)

print("✓ Running in Google Colab" if IN_COLAB else "✓ Running locally")

In [None]:
# Clone repository (if running in Colab)
if IN_COLAB:
    import os
    
    # Ensure we're in /content
    os.chdir('/content')
    
    # Remove existing repo if it exists (for re-runs)
    if os.path.exists('Keyboard-Suggestions-ML-Colab'):
        import shutil
        shutil.rmtree('Keyboard-Suggestions-ML-Colab')
        print("✓ Removed existing repository")
    
    # Clone fresh copy
    !git clone https://github.com/MinhPhuPham/Keyboard-Suggestions-ML-Colab.git
    
    # Change to project directory
    os.chdir('/content/Keyboard-Suggestions-ML-Colab')
    
    print(f"✓ Repository cloned")
    print(f"✓ Working directory: {os.getcwd()}")

In [None]:
# Install dependencies
# The `!` shell escape runs pip quietly (-q) against requirements.txt, which is
# resolved relative to the current working directory.
!pip install -q -r requirements.txt
# NOTE(review): this message is printed unconditionally — presumably even when
# pip reports an error above; check the pip output before continuing.
print("✓ Dependencies installed")

## 2. Google Drive Setup and Data Management

In [None]:
# Mount Google Drive when running in Colab; stop the notebook if the mount
# helper reports failure.
# NOTE(review): mount_google_drive is not defined or imported in any cell shown
# here — confirm the project helper is imported before this cell runs.
if not IN_COLAB:
    print("Skipping Drive mount (running locally)")
else:
    mount_success = mount_google_drive()
    if not mount_success:
        raise Exception("Failed to mount Google Drive")

In [None]:
# Locate the English training data.
# In Colab, setup_english_data receives the Drive base folder; a None result is
# treated as failure. Locally, a fixed relative path is used instead.
if not IN_COLAB:
    data_path = "./data/english"
    print(f"Using local data path: {data_path}")
else:
    DRIVE_BASE = "/content/drive/MyDrive/Phu's Data development"
    data_path = setup_english_data(DRIVE_BASE)

    if data_path is not None:
        print(f"\n✓ Data ready at: {data_path}")
    else:
        # Setup failed: show manual recovery steps rather than raising
        manual_steps = (
            "⚠ Data setup failed. Please check errors above.",
            "\nManual setup instructions:",
            "1. Download SwiftKey dataset from Kaggle:",
            "   https://www.kaggle.com/datasets/therohk/tweets-blogs-news-swiftkey-dataset-4million",
            f"2. Upload to: {DRIVE_BASE}/data/english/",
        )
        for step in manual_steps:
            print(step)

## 3. Data Preparation

In [None]:
# Build a small demonstration training set.
# These sentences are placeholder data; replace them with actual SwiftKey
# corpus processing for a real training run.
sample_sentences = [
    "Today is a beautiful day",
    "I love programming in Python",
    "The weather is nice today",
    "Let's meet tomorrow morning",
    "Thank you for your help",
    "How are you doing today",
    "See you later tonight",
    "Have a great weekend",
]

# Run every sentence through the project's text cleaner
cleaned = list(map(clean_english_text, sample_sentences))

# Pass the cleaned sentences to the emoji augmenter (emoji_ratio=0.2)
augmented = augment_with_emojis(cleaned, emoji_ratio=0.2)

# Preview at most the first five augmented sentences
print("Sample augmented sentences:")
for preview in augmented[:5]:
    print(f"  {preview}")

print(f"\nTotal training samples: {len(augmented)}")

## 4. Model Setup and Fine-Tuning

In [None]:
# Memory optimization for Colab free tier
import gc
import os

# Configure the CUDA caching allocator BEFORE torch is imported and before any
# CUDA call is made. The allocator reads PYTORCH_CUDA_ALLOC_CONF when it is
# initialised, so setting the variable after CUDA has already been touched can
# leave the option silently unapplied.
os.environ['PYTORCH_CUDA_ALLOC_CONF'] = 'expandable_segments:True'

import torch

# Drop unreachable Python objects, then release cached GPU blocks
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("✓ Memory optimizations applied")
print(f"✓ GPU available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    # total_memory is reported in bytes; shown here in units of 1e9 bytes
    print(f"✓ GPU memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

In [None]:
# Load the base model and tokenizer, then attach LoRA adapters
# Base checkpoint: Qwen2.5-1.5B-Instruct (ungated, no token needed)
MODEL_NAME = "Qwen/Qwen2.5-1.5B-Instruct"

print(f"Loading model: {MODEL_NAME}")

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
from peft import LoraConfig, get_peft_model

# Tokenizer: fall back to the EOS token for padding when no pad token is set
tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token

# LoRA adapters are attached to the four attention projection layers
attention_projections = ["q_proj", "k_proj", "v_proj", "o_proj"]
lora_config = LoraConfig(
    task_type="CAUSAL_LM",
    target_modules=attention_projections,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    bias="none",
)

# Base model: weights are loaded in bfloat16 with automatic device placement
base_model_options = dict(device_map="auto", torch_dtype=torch.bfloat16)
model = AutoModelForCausalLM.from_pretrained(MODEL_NAME, **base_model_options)

# Wrap the base model so that only the adapter weights are trainable
model = get_peft_model(model, lora_config)
model.print_trainable_parameters()

print("✓ Model loaded with LoRA adapters")

In [None]:
# Prepare dataset for training
def tokenize_function(examples):
    """Tokenize the 'text' entries of a batch into fixed-length encodings.

    Uses the module-level ``tokenizer``. Every sequence is truncated or padded
    to exactly 128 tokens, and plain Python lists are returned
    (``return_tensors=None``) rather than tensors.

    Args:
        examples: Mapping with a 'text' entry holding the text(s) to encode.

    Returns:
        Whatever the tokenizer returns for those inputs.
    """
    encode_options = dict(
        truncation=True,
        padding='max_length',
        max_length=128,
        return_tensors=None,
    )
    return tokenizer(examples['text'], **encode_options)

# Wrap the augmented sentences in a Dataset with a single 'text' column
# NOTE(review): `Dataset` is not imported in any cell shown here — presumably
# it comes from the Hugging Face `datasets` package; confirm the import exists.
train_data = Dataset.from_dict({'text': augmented})

# Tokenize in batches and drop the raw 'text' column from the result
map_options = dict(batched=True, remove_columns=['text'])
train_dataset = train_data.map(tokenize_function, **map_options)

print(f"✓ Training dataset prepared: {len(train_dataset)} samples")
print(f"✓ Dataset columns: {train_dataset.column_names}")

In [None]:
# Train model
print("Starting training...")

# IMPORTANT: Save checkpoints locally first (Drive can cause connection errors)
checkpoint_dir = "./checkpoints/english"

trainer = train_causal_lm(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    output_dir=checkpoint_dir,
    num_epochs=3,
    batch_size=2,  # Reduced for free tier
    learning_rate=1e-5,
    # Keep in sync with the max_length=128 used when the dataset was tokenized.
    # The previous value of 8 contradicted that length.
    # NOTE(review): how train_causal_lm uses this value is not visible here —
    # presumably a truncation limit; confirm against the helper.
    max_seq_length=128,
    save_steps=100
)

print("✓ Training complete!")

# Copy final checkpoint to Drive (optional, best effort)
if IN_COLAB:
    import shutil
    drive_checkpoint_dir = os.path.join(DRIVE_BASE, "checkpoints", "english")
    try:
        print(f"Copying checkpoint to Drive: {drive_checkpoint_dir}")
        # Create the parent folder only; copytree creates the final directory
        os.makedirs(os.path.dirname(drive_checkpoint_dir), exist_ok=True)
        # copytree refuses to write into an existing directory, so remove any
        # copy left by an earlier run first
        if os.path.exists(drive_checkpoint_dir):
            shutil.rmtree(drive_checkpoint_dir)
        shutil.copytree(checkpoint_dir, drive_checkpoint_dir)
        print("✓ Checkpoint copied to Drive")
    except Exception as e:
        # A failed Drive copy must not abort the notebook: the local
        # checkpoint is still available
        print(f"⚠ Could not copy to Drive: {e}")
        print("  Checkpoint saved locally only")

## 5. Optimization and Export

In [None]:
# Merge LoRA weights into base model
print("Merging LoRA weights...")

# Use PEFT's built-in merge method. Once merged, `model` is the plain base
# model and no longer offers merge_and_unload, so re-running this cell would
# raise AttributeError; the guard makes the cell safe to re-run.
if hasattr(model, "merge_and_unload"):
    model = model.merge_and_unload()
    print("✓ LoRA weights merged into base model")
else:
    print("✓ LoRA weights already merged (nothing to do)")

In [None]:
# Export the merged model to ONNX (unquantized; quantization is a separate step)
import torch
import gc

# Set model output directory (Drive in Colab, local folder otherwise)
if IN_COLAB:
    model_dir = os.path.join(DRIVE_BASE, "models", "english")
else:
    model_dir = "./models/english"

os.makedirs(model_dir, exist_ok=True)

# Clear memory before the export
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

print("Exporting to ONNX...")

model.eval()

# Prepare dummy input on the SAME device as the model. The model was loaded
# with device_map="auto", so its weights may be on the GPU, while the tokenizer
# returns CPU tensors; tracing with mismatched devices fails.
export_device = next(model.parameters()).device
dummy_input = tokenizer("Hello", return_tensors="pt", padding=True, max_length=128)
dummy_input_ids = dummy_input['input_ids'].to(export_device)

# Export to ONNX
onnx_path = os.path.join(model_dir, "english_model.onnx")

try:
    torch.onnx.export(
        model,
        (dummy_input_ids,),
        onnx_path,
        input_names=['input_ids'],
        output_names=['logits'],
        # Batch and sequence length vary at inference time, for the input and
        # for the logits computed from it
        dynamic_axes={
            'input_ids': {0: 'batch', 1: 'sequence'},
            'logits': {0: 'batch', 1: 'sequence'},
        },
        opset_version=14,
        do_constant_folding=True
    )
    print(f"✓ ONNX model exported: {onnx_path}")

    # Check size
    # NOTE(review): this measures the .onnx file only — if the exporter writes
    # the weights as separate external-data files, they are not counted; confirm.
    size_mb = os.path.getsize(onnx_path) / 1e6
    print(f"  Model size: {size_mb:.1f} MB")

except Exception as e:
    # Best effort: report the failure and mark the export as unavailable so
    # the following cells can skip their ONNX-dependent work
    print(f"⚠ ONNX export failed: {e}")
    onnx_path = None

# Release memory held by the export
gc.collect()
if torch.cuda.is_available():
    torch.cuda.empty_cache()

In [None]:
# Dynamically quantize the exported ONNX model (8-bit unsigned weights)
if not onnx_path:
    print("⚠ Skipping quantization (ONNX export failed)")
else:
    try:
        from onnxruntime.quantization import quantize_dynamic, QuantType

        print("Quantizing ONNX model...")

        # The quantized file is written next to the original with a
        # "_quantized" suffix
        quantized_path = onnx_path.replace('.onnx', '_quantized.onnx')
        quantize_options = dict(
            model_input=onnx_path,
            model_output=quantized_path,
            weight_type=QuantType.QUInt8,
        )
        quantize_dynamic(**quantize_options)

        print(f"✓ Quantized ONNX model: {quantized_path}")

        # Report file sizes (in units of 1e6 bytes) and the relative reduction
        original_size = os.path.getsize(onnx_path) / 1e6
        quantized_size = os.path.getsize(quantized_path) / 1e6
        reduction = (1 - quantized_size/original_size) * 100

        for label, value, unit in (
            ("Original", original_size, " MB"),
            ("Quantized", quantized_size, " MB"),
            ("Reduction", reduction, "%"),
        ):
            print(f"  {label}: {value:.1f}{unit}")

        # From here on the quantized file is the model used by later cells
        onnx_path = quantized_path

    except Exception as e:
        # Quantization is optional: on failure onnx_path keeps pointing at the
        # unquantized model
        print(f"⚠ ONNX quantization failed: {e}")
        print("  Using non-quantized ONNX model")

In [None]:
# Smoke-test the ONNX model with a single inference run
if not onnx_path:
    print("⚠ Skipping validation (no ONNX model)")
else:
    try:
        import onnxruntime as ort
        import numpy as np

        print("Validating ONNX model...")

        # Open an inference session on the exported file
        session = ort.InferenceSession(onnx_path)

        # Encode a short prompt and feed its token ids as a NumPy array
        test_input = tokenizer("Hello world", return_tensors="pt", padding=True, max_length=128)
        feed = {"input_ids": test_input['input_ids'].numpy()}
        # None requests every output; only the first one is kept
        onnx_output = session.run(None, feed)[0]

        print("✓ ONNX model validated")
        print(f"  Output shape: {onnx_output.shape}")
        print(f"  Sample logits: {onnx_output[0, 0, :5]}")

    except Exception as e:
        # Validation is informational only; report and continue
        print(f"⚠ ONNX validation failed: {e}")

In [None]:
# Optional Core ML export (can be skipped when memory is tight)
import gc

if not onnx_path:
    print("⚠ Skipping Core ML export (ONNX export failed)")
else:
    try:
        print("Exporting to Core ML...")
        coreml_target = os.path.join(model_dir, "english_model.mlmodel")
        coreml_path = export_to_coreml(
            onnx_path=onnx_path,
            output_path=coreml_target,
            model_name="EnglishKeyboardSuggestion"
        )
        # Only report success when the helper returned a truthy path
        if coreml_path:
            print(f"✓ Core ML model saved to: {coreml_path}")
    except Exception as e:
        # Core ML is optional: report the failure and carry on
        print(f"⚠ Core ML export failed: {e}")
        print("Skipping Core ML export")

# Final memory cleanup
gc.collect()

## 6. Verification

In [None]:
# Verify model size against the 30 MB target
# onnx_path is None when the ONNX export failed. The other post-export cells
# skip their work in that case, and this one now does the same rather than
# passing None to verify_model_size.
if onnx_path:
    size_mb, meets_req = verify_model_size(
        model_path=onnx_path,
        max_size_mb=30
    )

    if meets_req:
        print(f"✓ Model size requirement met: {size_mb:.2f} MB")
    else:
        print(f"⚠ Model size exceeds target: {size_mb:.2f} MB")
else:
    # Keep both names defined so later cells that format size_mb still run;
    # NaN renders as "nan" and makes the missing measurement obvious.
    size_mb, meets_req = float("nan"), False
    print("⚠ Skipping size verification (no ONNX model)")

## 7. Save and Notify

In [None]:
# Bundle the model directory into a zip archive for download
archive_name = "english_model.zip"
zip_path = package_for_download(model_dir=model_dir, output_zip=archive_name)

print(f"✓ Model packaged: {zip_path}")

In [None]:
# Email a completion notice, then print a summary banner
# The message body embeds the model size, output directory and package path.
notification_body = f"""
English keyboard suggestion model training has completed successfully!

Model Details:
- Size: {size_mb:.2f} MB
- Location: {model_dir}
- Package: {zip_path}

The model is ready for integration into your keyboard app.

Next steps:
1. Download the model package
2. Integrate into iOS/Android app
3. Test on actual devices
    """

send_notification_email(
    subject="English Model Training Complete! 🎉",
    message=notification_body,
    to_email="phamminhphueur@gmail.com"
)

# Console summary framed by 60-character rules
banner = "=" * 60
print("\n" + banner)
print("✓ TRAINING COMPLETE!")
print(banner)
print(f"Model saved to: {model_dir}")
print(f"Package: {zip_path}")
print(f"Size: {size_mb:.2f} MB")
print(banner)

In [None]:
# Offer the packaged model as a browser download (Colab only)
if not IN_COLAB:
    print(f"Model saved locally to: {zip_path}")
else:
    # google.colab exists only inside Colab, so it is imported on this branch
    from google.colab import files
    print("Downloading model package...")
    files.download(zip_path)
    print("✓ Download started")