# Japanese Keyboard Suggestion Model Training (with Google Drive Integration)

This notebook trains a Japanese keyboard suggestion model by fine-tuning Qwen2-1.5B-Instruct (`Qwen/Qwen2-1.5B-Instruct`) with LoRA.

**Features:**
- Automatic Google Drive data management
- Checks for existing data before downloading
- Email notifications on completion
- Saves models to Drive for persistence

**Target Specifications:**
- Model Size: 40-60 MB (after optimization)
- Latency: < 80 ms
- Perplexity: < 20
- Top-3 Accuracy: > 80%
- IME Support: Romaji → Kanji conversion

# Clone repository (if running in Colab)
if IN_COLAB:
    # Clone repo
    !git clone https://github.com/MinhPhuPham/Keyboard-Suggestions-ML-Colab.git
    
    # Change to project directory
    import os
    os.chdir('/content/Keyboard-Suggestions-ML-Colab')
    
    print(f"✓ Repository cloned")
    print(f"✓ Working directory: {os.getcwd()}")
else:
    print("✓ Running locally")

In [None]:
# Install dependencies
# Runs pip quietly (-q) against requirements.txt via an IPython shell escape.
# The path is relative, so the working directory must be the repository root.
# NOTE: the success message below is printed unconditionally; pip's exit
# status is not checked.
!pip install -q -r requirements.txt
print("✓ Dependencies installed")

In [None]:
# Import libraries
import sys
import os

# Locate the project's src/ directory: fixed clone location in Colab,
# otherwise ./src resolved against the current working directory.
src_path = (
    '/content/Keyboard-Suggestions-ML-Colab/src'
    if IN_COLAB
    else os.path.abspath('./src')
)

# Register it once, ahead of everything else on the import path
if src_path not in sys.path:
    sys.path.insert(0, src_path)
    print(f"✓ Added {src_path} to Python path")

import torch
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
import fugashi


# Project helpers (resolved through the src/ entry added above)
from data_prep import (
    clean_japanese_text,
    prepare_japanese_data,
)
from model_utils import (
    load_model_with_lora,
    train_causal_lm,
    evaluate_perplexity,
    prune_model,
    quantize_model,
    merge_lora_weights,
)
from export_utils import (
    export_to_onnx,
    export_to_coreml,
    verify_model_size,
    benchmark_latency,
    package_for_download,
)
from colab_data_manager import (
    mount_google_drive,
    setup_japanese_data,
    send_notification_email,
)

# Report the runtime: torch build and, when present, the first CUDA device
cuda_ready = torch.cuda.is_available()
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Import libraries
import sys
import os

# Add src to Python path
# In Colab, first make sure the working directory is the cloned repository.
if IN_COLAB and not os.getcwd().endswith('Keyboard-Suggestions-ML-Colab'):
    os.chdir('/content/Keyboard-Suggestions-ML-Colab')

src_path = os.path.join(os.getcwd(), 'src') if IN_COLAB else os.path.abspath('./src')

if src_path not in sys.path:
    sys.path.insert(0, src_path)

print(f"✓ Python path: {src_path}")

import torch
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
import fugashi

# Import custom utilities
from data_prep import clean_japanese_text, prepare_japanese_data
from model_utils import (
    load_model_with_lora,
    train_causal_lm,
    evaluate_perplexity,
    prune_model,
    quantize_model,
    merge_lora_weights,
)
from export_utils import (
    export_to_onnx,
    export_to_coreml,
    verify_model_size,
    benchmark_latency,
    package_for_download,
)
from colab_data_manager import (
    mount_google_drive,
    setup_japanese_data,
    send_notification_email,
)

# Runtime report; without a GPU, print the steps to enable one in Colab
gpu_available = torch.cuda.is_available()
print(f"\nPyTorch version: {torch.__version__}")
print(f"CUDA available: {gpu_available}")
if not gpu_available:
    for hint in (
        "\n⚠️ WARNING: GPU not available!",
        "To enable GPU in Colab:",
        "1. Runtime → Change runtime type",
        "2. Hardware accelerator → GPU",
        "3. Save",
        "4. Restart runtime and re-run cells",
    ):
        print(hint)
else:
    print(f"GPU: {torch.cuda.get_device_name(0)}")

In [None]:
# Download UniDic for Japanese morphological analysis
# Invokes `python -m unidic download` through an IPython shell escape.
# NOTE: the confirmation below is printed unconditionally; the command's
# exit status is not checked.
!python -m unidic download
print("✓ UniDic downloaded")

In [None]:
# Import libraries
import os
import sys

# Register src/ as an absolute path, once, and at the front of sys.path.
# A relative './src' entry stops resolving as soon as the working directory
# changes, and a plain append adds a duplicate entry on every re-run.
src_path = os.path.abspath('./src')
if src_path not in sys.path:
    sys.path.insert(0, src_path)

import torch
from transformers import AutoTokenizer
from datasets import load_dataset, Dataset
import fugashi

# Import custom utilities (project modules found through the src/ entry)
from data_prep import clean_japanese_text, prepare_japanese_data
from model_utils import (
    load_model_with_lora, train_causal_lm, evaluate_perplexity,
    prune_model, quantize_model, merge_lora_weights
)
from export_utils import (
    export_to_onnx, export_to_coreml, verify_model_size,
    benchmark_latency, package_for_download
)
from colab_data_manager import (
    mount_google_drive, setup_japanese_data, send_notification_email
)

# Report the torch build and, when CUDA is available, the first GPU's name
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name(0)}")

## 2. Google Drive Setup and Data Management

In [None]:
# Mount Google Drive
if IN_COLAB:
    # mount_google_drive() signals success with a truthy return value.
    mount_success = mount_google_drive()
    if not mount_success:
        # Later cells place data, checkpoints and models under the Drive
        # folder, so stop here. RuntimeError is a subclass of Exception, so
        # existing `except Exception` handlers still catch it.
        raise RuntimeError("Failed to mount Google Drive")
else:
    print("Skipping Drive mount (running locally)")

In [None]:
# Set up Japanese training data
# Locally a fixed relative path is used. In Colab, setup_japanese_data() is
# given the Drive base folder and returns the data path, or None on failure
# (presumably it reuses data already on Drive and downloads otherwise;
# confirm in colab_data_manager).
if not IN_COLAB:
    data_path = "./data/japanese"
    print(f"Using local data path: {data_path}")
else:
    DRIVE_BASE = "/content/drive/MyDrive/Phu's Data development"
    data_path = setup_japanese_data(DRIVE_BASE)

    if data_path is not None:
        print(f"\n✓ Data ready at: {data_path}")
    else:
        # Setup failed: show the troubleshooting hints, one per line
        for hint in (
            "⚠ Data setup failed. Please check errors above.",
            "\nManual setup instructions:",
            "1. CC100 Japanese will be downloaded automatically from Hugging Face",
            "2. If download fails, check internet connection and try again",
        ):
            print(hint)

## 3. Data Preparation

In [None]:
# Initialize Japanese morphological analyzer
# The '-Owakati' option string is passed straight through to fugashi.Tagger.
tagger = fugashi.Tagger('-Owakati')

# Smoke test: show a sentence next to the tagger's parse of it
test_text = "今日は昨日より良い日だ"
print(f"Original: {test_text}")
parsed_text = tagger.parse(test_text)
print(f"Morphemes: {parsed_text}")

In [None]:
# Prepare sample training data
# Demonstration only: a handful of fixed sentences stand in for the real
# corpus. Replace with actual CC100 processing.

sample_sentences = [
    "今日は良い天気ですね",
    "明日会議があります",
    "ありがとうございます",
    "お疲れ様でした",
    "よろしくお願いします",
    "おはようございます",
    "こんにちは",
    "さようなら",
]

# Run every sentence through the project's text cleaner
cleaned = list(map(clean_japanese_text, sample_sentences))

# Preview the first five cleaned sentences, then report the total
print("Sample sentences:")
for preview in cleaned[:5]:
    print(f"  {preview}")

sample_count = len(cleaned)
print(f"\nTotal training samples: {sample_count}")

## 4. Model Setup and Fine-Tuning

In [None]:
# Load model with LoRA
MODEL_NAME = "Qwen/Qwen2-1.5B-Instruct"

# LoRA hyperparameters gathered in one place and forwarded as keyword args
lora_settings = dict(
    lora_r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=["q_proj", "k_proj", "v_proj"],  # More modules for Japanese
)

print(f"Loading model: {MODEL_NAME}")
model, tokenizer = load_model_with_lora(model_name=MODEL_NAME, **lora_settings)

print("✓ Model loaded with LoRA adapters")

In [None]:
# Prepare dataset for training
def tokenize_function(examples):
    """Tokenize the 'text' field of *examples* with the notebook's tokenizer.

    Truncation is enabled with a maximum length of 8. The tokenizer's output
    is returned unchanged.
    """
    texts = examples['text']
    encoded = tokenizer(texts, truncation=True, max_length=8)
    return encoded

# Create dataset
# Wrap the cleaned sentences in a single-column ('text') Dataset, then
# tokenize it in batches.
text_column = {'text': cleaned}
train_data = Dataset.from_dict(text_column)
train_dataset = train_data.map(tokenize_function, batched=True)

num_train_samples = len(train_dataset)
print(f"✓ Training dataset prepared: {num_train_samples} samples")

In [None]:
# Train model
print("Starting training...")

# Checkpoints go under the Drive base folder in Colab, otherwise to a local
# directory.
checkpoint_dir = (
    os.path.join(DRIVE_BASE, "checkpoints", "japanese")
    if IN_COLAB
    else "./checkpoints/japanese"
)

# Training hyperparameters, forwarded to train_causal_lm as keyword args
training_settings = dict(
    num_epochs=3,
    batch_size=8,  # Adjust based on GPU memory
    learning_rate=5e-6,  # Lower LR for Japanese
    max_seq_length=8,
    save_steps=100,
)

trainer = train_causal_lm(
    model=model,
    tokenizer=tokenizer,
    train_dataset=train_dataset,
    output_dir=checkpoint_dir,
    **training_settings,
)

print("✓ Training complete!")

## 5. Optimization and Export

In [None]:
# Merge LoRA weights
# Rebinds `model` to whatever merge_lora_weights() returns; the prune,
# quantize and export cells that follow all operate on this rebound object.
model = merge_lora_weights(model)
print("✓ LoRA weights merged")

In [None]:
# Prune model (more aggressive for larger model)
# Rebinds `model` to the result of prune_model().
# NOTE(review): amount=0.4 is presumably the fraction of weights pruned;
# confirm against prune_model in model_utils.
model = prune_model(model, amount=0.4)
print("✓ Model pruned")

In [None]:
# Quantize model
# Rebinds `model` to the result of quantize_model() with dtype torch.qint8.
# NOTE(review): this quantized object is what the next cell hands to
# export_to_onnx; whether an int8-quantized model is exportable depends on
# helpers not shown here, so confirm before relying on the pipeline.
model = quantize_model(model, dtype=torch.qint8)
print("✓ Model quantized")

In [None]:
# Set model output directory to Drive if in Colab
model_dir = (
    os.path.join(DRIVE_BASE, "models", "japanese")
    if IN_COLAB
    else "./models/japanese"
)

# Create the directory (and any missing parents); fine if it already exists
os.makedirs(model_dir, exist_ok=True)

# Export to ONNX
onnx_target = os.path.join(model_dir, "japanese_model.onnx")
onnx_path = export_to_onnx(
    model=model,
    tokenizer=tokenizer,
    output_path=onnx_target,
    max_seq_length=8,
)

print(f"✓ ONNX model saved to: {onnx_path}")

In [None]:
# Export to Core ML (for iOS)
# The conversion starts from the ONNX file written by the previous cell.
coreml_target = os.path.join(model_dir, "japanese_model.mlmodel")
coreml_path = export_to_coreml(
    onnx_path=onnx_path,
    output_path=coreml_target,
    model_name="JapaneseKeyboardSuggestion",
)

# Only a truthy result is reported; a falsy one is passed over silently
if coreml_path:
    print(f"✓ Core ML model saved to: {coreml_path}")

## 6. Verification

In [None]:
# Verify model size
# verify_model_size() returns the measured size in MB and a flag saying
# whether it meets the 60 MB limit passed in.
size_mb, meets_req = verify_model_size(model_path=onnx_path, max_size_mb=60)

size_report = (
    f"✓ Model size requirement met: {size_mb:.2f} MB"
    if meets_req
    else f"⚠ Model size exceeds target: {size_mb:.2f} MB"
)
print(size_report)

In [None]:
# Test IME functionality (romaji → kanji)
# Placeholder only: no conversion is performed here. Each romaji input is
# echoed with a marker noting that an IME layer is still required.
test_inputs = [
    "kyouha",  # 今日は
    "arigatou",  # ありがとう
]

print("IME Test (requires additional IME layer):")
for romaji in test_inputs:
    placeholder_line = f"  {romaji} → [IME conversion needed]"
    print(placeholder_line)

## 7. Save and Notify

In [None]:
# Package model for download
# Bundles the model directory into a zip; the helper returns the archive path.
archive_name = "japanese_model.zip"
zip_path = package_for_download(model_dir=model_dir, output_zip=archive_name)

print(f"✓ Model packaged: {zip_path}")

In [None]:
# Send completion notification
# Build the e-mail body first (size, location and package come from the
# earlier verification and packaging cells), then hand it to the mail helper.
notification_body = f"""
Japanese keyboard suggestion model training has completed successfully!

Model Details:
- Size: {size_mb:.2f} MB
- Location: {model_dir}
- Package: {zip_path}

The model is ready for integration into your keyboard app.

Next steps:
1. Download the model package
2. Add IME layer for romaji → kanji conversion
3. Integrate into iOS/Android app
4. Test on actual devices with Japanese input
    """

send_notification_email(
    subject="Japanese Model Training Complete! 🎉",
    message=notification_body,
    to_email="phamminhphueur@gmail.com",
)

# Console summary framed by 60-character rules
rule = "=" * 60
print("\n" + rule)
print("✓ TRAINING COMPLETE!")
print(rule)
print(f"Model saved to: {model_dir}")
print(f"Package: {zip_path}")
print(f"Size: {size_mb:.2f} MB")
print(rule)

In [None]:
# Download (Colab only)
# Outside Colab there is nothing to download; just report where the package is.
if not IN_COLAB:
    print(f"Model saved locally to: {zip_path}")
else:
    # google.colab exists only inside Colab, hence the branch-local import
    from google.colab import files
    print("Downloading model package...")
    files.download(zip_path)
    print("✓ Download started")