# GRU Keyboard Prediction Model (Hybrid)

Train a GRU model for keyboard suggestions with **hybrid prediction**.

**Supports 3 Tasks:**

| Input Format | Method | Example |
|--------------|--------|----------|
| `text + space` | GRU Model | "How are " → you, they, we |
| `partial word` | Vocabulary Filter | "Hel" → Hello, Help, Hell |
| `typo word` | Edit Distance | "Thers" → There, These |

**Why Hybrid?**
- ✅ GRU model: Best for context-aware next-word prediction
- ✅ Vocabulary filter: Instant, accurate word completion
- ✅ Edit distance: Handles any typo without training

**Model Specifications:**
- Architecture: GRU (Gated Recurrent Unit)
- Parameters: ~10M
- Model Size: 30-40MB (Keras), 15-20MB (TFLite)
- Training Time: 5-10 minutes on GPU
- Inference: <10ms on mobile

---

**Instructions:**
1. Runtime → Change runtime type → GPU (T4)
2. Set `TESTING_MODE = True` for quick test
3. Set `TESTING_MODE = False` for full training
4. Run all cells in order
5. Download TFLite for mobile

## 1. Environment Setup

In [None]:
# Mount Google Drive and setup directories
from google.colab import drive
import os

drive.mount('/content/drive')

# Define directories
DRIVE_DIR = '/content/drive/MyDrive/Keyboard-Suggestions-ML-Colab'
os.makedirs(f"{DRIVE_DIR}/models/gru_keyboard", exist_ok=True)

print(f"✓ Google Drive mounted")
print(f"✓ Project directory: {DRIVE_DIR}")

In [None]:
# Install dependencies
!pip install -q tensorflow keras nltk pandas numpy scikit-learn tqdm
print("✓ Dependencies installed")

In [None]:
# ============================================================
# CONFIGURATION - OPTIMIZED FOR T4 GPU
# ============================================================

TESTING_MODE = True  # ← Change to False for full training

if TESTING_MODE:
    print("⚠️  TESTING MODE")
    print("   - Dataset: keyboard_training_data.txt")
    print("   - Epochs: 2 (quick verification)")
    print("   - Time: ~1 min")
    NUM_EPOCHS = 2
    BATCH_SIZE = 512  # Optimized for T4 GPU
    VOCAB_SIZE_LIMIT = 25000  # Limit vocab for speed
    SEQUENCE_LENGTH = 10  # Better context
else:
    print("✓ FULL TRAINING MODE")
    print("   - Dataset: Fake.csv + True.csv + 1661-0.txt")
    print("   - Epochs: 20")
    print("   - Time: ~8-10 min (with optimizations)")
    NUM_EPOCHS = 20
    BATCH_SIZE = 512  # Optimized for T4 GPU
    VOCAB_SIZE_LIMIT = 25000  # Smaller model, faster inference
    SEQUENCE_LENGTH = 10  # Better predictions

print(f"\nOptimizations:")
print(f"  - Batch size: {BATCH_SIZE} (maximizes GPU)")
print(f"  - Vocab limit: {VOCAB_SIZE_LIMIT:,} (reduces model size)")
print(f"  - Sequence length: {SEQUENCE_LENGTH} (better context)")
print("="*60)

## 2. Verify Datasets in Google Drive

**Required datasets in Google Drive:**
- `{DRIVE_DIR}/datasets/Fake.csv` - Fake news dataset
- `{DRIVE_DIR}/datasets/True.csv` - True news dataset
- `{DRIVE_DIR}/datasets/1661-0.txt` - Next-word prediction corpus

Upload these files to your Google Drive before running.

In [None]:
import os

print("Checking datasets in Google Drive...")
print("="*60)

# Define dataset paths
FAKE_NEWS_PATH = f"{DRIVE_DIR}/datasets/Fake.csv"
TRUE_NEWS_PATH = f"{DRIVE_DIR}/datasets/True.csv"
CORPUS_PATH = f"{DRIVE_DIR}/datasets/1661-0.txt"

# Check each dataset
datasets_ok = True

for name, path in [("Fake.csv", FAKE_NEWS_PATH), 
                    ("True.csv", TRUE_NEWS_PATH),
                    ("1661-0.txt", CORPUS_PATH)]:
    if os.path.exists(path):
        size = os.path.getsize(path) / (1024 * 1024)
        print(f"✓ {name}: {size:.2f}MB")
    else:
        print(f"✗ Missing: {name}")
        print(f"   Expected at: {path}")
        datasets_ok = False

if not datasets_ok:
    print("\n⚠️  Please upload missing datasets to Google Drive!")
    print(f"   Upload to: {DRIVE_DIR}/datasets/")
    raise FileNotFoundError("Required datasets not found in Google Drive")
else:
    print("\n✅ All datasets found!")

In [None]:
import pandas as pd
import numpy as np

print("Loading datasets from Google Drive...")
print("="*60)

all_text = []

if TESTING_MODE:
    # Testing mode: Use keyboard_training_data.txt (smaller, faster)
    print("⚠️  TESTING MODE: Using keyboard_training_data.txt")
    
    CORPUS_PATH = f"{DRIVE_DIR}/datasets/keyboard_training_data.txt"
    
    if not os.path.exists(CORPUS_PATH):
        print(f"\n✗ Missing: keyboard_training_data.txt")
        print(f"   Expected at: {CORPUS_PATH}")
        raise FileNotFoundError("keyboard_training_data.txt not found")
    
    with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
        corpus_text = f.read()
    
    all_text.append(corpus_text)
    print(f"✓ Loaded: {len(corpus_text):,} characters")
    
else:
    # Full training mode: Use Fake.csv + True.csv + 1661-0.txt
    print("✓ FULL TRAINING: Using Fake.csv + True.csv + 1661-0.txt")
    
    FAKE_NEWS_PATH = f"{DRIVE_DIR}/datasets/Fake.csv"
    TRUE_NEWS_PATH = f"{DRIVE_DIR}/datasets/True.csv"
    CORPUS_PATH = f"{DRIVE_DIR}/datasets/1661-0.txt"
    
    # Check files exist
    for name, path in [("Fake.csv", FAKE_NEWS_PATH), 
                        ("True.csv", TRUE_NEWS_PATH),
                        ("1661-0.txt", CORPUS_PATH)]:
        if not os.path.exists(path):
            print(f"\n✗ Missing: {name}")
            print(f"   Expected at: {path}")
            raise FileNotFoundError(f"{name} not found")
    
    # Load fake news
    fake_df = pd.read_csv(FAKE_NEWS_PATH)
    true_df = pd.read_csv(TRUE_NEWS_PATH)
    
    print(f"✓ Loaded {len(fake_df):,} fake news articles")
    print(f"✓ Loaded {len(true_df):,} true news articles")
    
    all_text.extend(fake_df['text'].tolist())
    all_text.extend(true_df['text'].tolist())
    
    # Load corpus
    with open(CORPUS_PATH, 'r', encoding='utf-8') as f:
        corpus_text = f.read()
        all_text.append(corpus_text)
    
    print(f"✓ Loaded corpus: {len(corpus_text):,} characters")

# Combine and clean
combined_text = ' '.join(all_text).lower()
combined_text = combined_text.replace('\n', ' ')
combined_text = ' '.join(combined_text.split())

print(f"\n✓ Total: {len(combined_text):,} characters")
print(f"✓ Sample: {combined_text[:200]}...")
print("="*60)

## 4. Tokenize and Create Sequences

In [None]:
from tensorflow.keras.preprocessing.text import Tokenizer
from tensorflow.keras.preprocessing.sequence import pad_sequences
import tensorflow as tf
import numpy as np

print("Tokenizing with vocabulary limit...")
print("="*60)

# Tokenize with vocab limit
tokenizer = Tokenizer(num_words=VOCAB_SIZE_LIMIT)
tokenizer.fit_on_texts([combined_text])

vocab_size = min(len(tokenizer.word_index) + 1, VOCAB_SIZE_LIMIT)

print(f"✓ Total unique words: {len(tokenizer.word_index):,}")
print(f"✓ Vocabulary size (limited): {vocab_size:,}")

# Convert to sequences
sequences = tokenizer.texts_to_sequences([combined_text])[0]

print(f"\nCreating optimized tf.data pipeline...")

sequences_array = np.array(sequences)

# Create dataset using timeseries for next-word prediction
dataset = tf.keras.utils.timeseries_dataset_from_array(
    data=sequences_array[:-1],
    targets=sequences_array[SEQUENCE_LENGTH:],
    sequence_length=SEQUENCE_LENGTH,
    sequence_stride=1,
    shuffle=True,
    batch_size=BATCH_SIZE,
    seed=42
)

# Calculate total steps
total_sequences = len(sequences) - SEQUENCE_LENGTH
total_steps = total_sequences // BATCH_SIZE

# Split: 90% train, 10% validation
val_steps = max(1, total_steps // 10)
train_steps = total_steps - val_steps

# Split dataset with proper steps
train_dataset = dataset.take(train_steps).prefetch(tf.data.AUTOTUNE)
val_dataset = dataset.skip(train_steps).take(val_steps).prefetch(tf.data.AUTOTUNE)

print(f"✓ Total sequences: {total_sequences:,}")
print(f"✓ Total steps: {total_steps:,}")
print(f"✓ Train steps: {train_steps:,} (90%)")
print(f"✓ Val steps: {val_steps:,} (10%)")
print(f"✓ Batch size: {BATCH_SIZE}")
print(f"✓ Prefetching: Enabled")
print("="*60)

print("\n📝 Note: GRU model trains on NEXT-WORD prediction only.")
print("   Word completion & typo correction use vocabulary + edit distance.")

## 5. Build GRU Model

In [None]:
from tensorflow.keras import mixed_precision
import tensorflow as tf

# Enable Mixed Precision for T4 GPU (2x faster training)
policy = mixed_precision.Policy('mixed_float16')
mixed_precision.set_global_policy(policy)

print("="*60)
print("PERFORMANCE OPTIMIZATIONS")
print("="*60)
print("✓ Mixed Precision enabled (FP16)")
print("  - Training speed: ~2x faster")
print("  - Memory usage: ~40% less")
print("  - Accuracy: Same as FP32")
print("="*60)

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Input, Embedding, GRU, Dense, Dropout
from tensorflow.keras.optimizers import AdamW

print("Building GRU model (Functional API + Mixed Precision)...")
print("="*60)

# Input layer
inputs = Input(shape=(SEQUENCE_LENGTH,), name='input')

# Embedding layer
x = Embedding(
    input_dim=vocab_size,
    output_dim=128,
    name='embedding'
)(inputs)

# GRU layer
x = GRU(
    units=256,
    dropout=0.2,
    recurrent_dropout=0.2,
    name='gru'
)(x)

# Dropout
x = Dropout(0.3, name='dropout')(x)

# Output layer (dtype=float32 for numerical stability with mixed precision)
outputs = Dense(vocab_size, activation='softmax', dtype='float32', name='output')(x)

# Create model
model = Model(inputs=inputs, outputs=outputs, name='gru_keyboard')

# Compile
model.compile(
    optimizer=AdamW(
        learning_rate=1e-3,  # 0.001 (higher than Adam's default)
        weight_decay=1e-4    # Decoupled weight decay for better regularization
    ),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()

params = model.count_params()
size_mb = (params * 4) / (1024 * 1024)

print("\n" + "="*60)
print("MODEL INFO")
print("="*60)
print(f"✓ Parameters: {params:,}")
print(f"✓ Size: {size_mb:.2f}MB (FP32), {size_mb/2:.2f}MB (FP16)")
print("✓ Architecture: Functional API")
print("✓ Optimizer: AdamW (lr=1e-3, weight_decay=1e-4)")
print("✓ Mixed Precision: Enabled")
print("="*60)

## 6. Train Model

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

callbacks = [
    ModelCheckpoint(
        f'{DRIVE_DIR}/models/gru_keyboard/best_model.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    ),
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True,
        verbose=1
    ),
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=3,
        verbose=1
    )
]

print("="*60)
print("TRAINING (OPTIMIZED)")
print("="*60)
print(f"Epochs: {NUM_EPOCHS}")
print(f"Batch size: {BATCH_SIZE}")
print(f"Mixed Precision: FP16")
print(f"Data Pipeline: tf.data (prefetched)")
print("="*60)

history = model.fit(
    train_dataset,
    epochs=NUM_EPOCHS,
    steps_per_epoch=train_steps,
    validation_data=val_dataset,
    validation_steps=val_steps,
    callbacks=callbacks,
    verbose=1
)

print("\n✓ Training complete!")
if TESTING_MODE:
    print("\n⚠️  This was TESTING mode")
    print("   Set TESTING_MODE = False for full training")

## 7. Visualize Training

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(14, 5))

ax1.plot(history.history['loss'], label='Train')
ax1.plot(history.history['val_loss'], label='Val')
ax1.set_title('Loss')
ax1.legend()
ax1.grid(True)

ax2.plot(history.history['accuracy'], label='Train')
ax2.plot(history.history['val_accuracy'], label='Val')
ax2.set_title('Accuracy')
ax2.legend()
ax2.grid(True)

plt.show()

val_acc = history.history['val_accuracy'][-1]
val_loss = history.history['val_loss'][-1]
print(f"\nFinal: Val Loss={val_loss:.4f}, Val Acc={val_acc*100:.2f}%")

## 8. Save Model

In [None]:
import pickle

model.save(f'{DRIVE_DIR}/models/gru_keyboard/gru_model.keras')

with open(f'{DRIVE_DIR}/models/gru_keyboard/tokenizer.pkl', 'wb') as f:
    pickle.dump(tokenizer, f)

config = {'vocab_size': vocab_size, 'sequence_length': SEQUENCE_LENGTH}
with open(f'{DRIVE_DIR}/models/gru_keyboard/config.pkl', 'wb') as f:
    pickle.dump(config, f)

print("✓ Saved: gru_model.keras, tokenizer.pkl, config.pkl")

## 9. Test Predictions

In [None]:
# ============================================================
# HYBRID PREDICTION SYSTEM
# ============================================================
# - GRU model: next-word prediction (ends with space)
# - Vocabulary filter: word completion (partial word)
# - Edit distance: typo correction (misspelled word)
# ============================================================

def edit_distance(s1, s2):
    """Calculate Levenshtein distance between two strings"""
    if len(s1) < len(s2):
        return edit_distance(s2, s1)
    if len(s2) == 0:
        return len(s1)
    
    previous_row = range(len(s2) + 1)
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row
    
    return previous_row[-1]


# Build vocabulary list with word frequencies
vocab_list = sorted(tokenizer.word_index.items(), key=lambda x: x[1])[:VOCAB_SIZE_LIMIT]
vocab_words = [word for word, idx in vocab_list]
print(f"✓ Vocabulary loaded: {len(vocab_words):,} words")


def predict_next_word(context, top_k=5):
    """Use GRU model for next-word prediction"""
    seq = tokenizer.texts_to_sequences([context.lower()])[0]
    seq = seq[-SEQUENCE_LENGTH:]
    seq = pad_sequences([seq], maxlen=SEQUENCE_LENGTH, padding='pre')
    preds = model.predict(seq, verbose=0)[0]
    top_idx = np.argsort(preds)[-top_k:][::-1]
    return [(tokenizer.index_word.get(i, ''), preds[i]*100) for i in top_idx if i in tokenizer.index_word]


def complete_word(partial, top_k=5):
    """Use vocabulary filter for word completion"""
    partial = partial.lower()
    candidates = []
    
    for word in vocab_words:
        if word.startswith(partial) and word != partial:
            # Score by word frequency (lower index = more common)
            idx = tokenizer.word_index.get(word, 999999)
            score = 100 / (idx + 1)  # Higher score for common words
            candidates.append((word, score))
    
    # Sort by score (word frequency)
    candidates.sort(key=lambda x: x[1], reverse=True)
    return candidates[:top_k]


def correct_typo(typo, top_k=5):
    """Use edit distance for typo correction"""
    typo = typo.lower()
    candidates = []
    
    # Only check words with similar length
    for word in vocab_words[:10000]:  # Top 10k words for speed
        if abs(len(word) - len(typo)) <= 2:
            dist = edit_distance(word, typo)
            if dist <= 2 and word != typo:  # Max 2 character difference
                # Score: lower distance = higher score
                idx = tokenizer.word_index.get(word, 999999)
                score = (100 / (dist + 1)) * (100 / (idx + 1))
                candidates.append((word, score, dist))
    
    # Sort by score
    candidates.sort(key=lambda x: x[1], reverse=True)
    return [(w, s, d) for w, s, d in candidates[:top_k]]


def predict_hybrid(input_text, top_k=5):
    """
    Hybrid prediction system:
    - "text " (ends with space) → GRU next-word prediction
    - "tex" (partial word) → Vocabulary completion
    - "txet" (typo) → Edit distance correction
    """
    results = []
    
    if input_text.endswith(' '):
        # NEXT-WORD PREDICTION (GRU model)
        task = 'next_word'
        preds = predict_next_word(input_text.strip(), top_k)
        results = [(word, prob, task) for word, prob in preds]
    
    else:
        words = input_text.split()
        if len(words) > 1:
            # Context + partial/typo: "How are ther"
            context = ' '.join(words[:-1])
            partial = words[-1]
        else:
            # Just partial: "Hel"
            context = ""
            partial = input_text
        
        # Try COMPLETION first
        completions = complete_word(partial, top_k)
        
        if completions:
            # Found completions
            task = 'completion'
            results = [(word, score, task) for word, score in completions]
        else:
            # No completions, try TYPO CORRECTION
            task = 'typo'
            corrections = correct_typo(partial, top_k)
            results = [(word, score, task) for word, score, dist in corrections]
    
    return results


# ============================================================
# TEST HYBRID PREDICTIONS
# ============================================================

test_cases = [
    # Next-word prediction (with space) → GRU model
    ("How are ", "Next-word (GRU)"),
    ("Thank ", "Next-word (GRU)"),
    ("I want to ", "Next-word (GRU)"),
    ("Good morning ", "Next-word (GRU)"),
    
    # Word completion (partial word) → Vocabulary filter
    ("Hel", "Completion (Vocab)"),
    ("Tha", "Completion (Vocab)"),
    ("Goo", "Completion (Vocab)"),
    ("Mor", "Completion (Vocab)"),
    ("bea", "Completion (Vocab)"),
    
    # Typo correction (misspelled) → Edit distance
    ("thers", "Typo (Edit Dist)"),
    ("teh", "Typo (Edit Dist)"),
    ("helo", "Typo (Edit Dist)"),
    ("recieve", "Typo (Edit Dist)"),
    
    # Combined context + partial/typo
    ("How are yo", "Context + Completion"),
    ("I want to goe", "Context + Typo"),
]

print("\n" + "="*60)
print("HYBRID PREDICTION TESTS")
print("="*60)

for input_text, test_type in test_cases:
    print(f"\n📝 Input: '{input_text}' ({test_type})")
    predictions = predict_hybrid(input_text, top_k=5)
    
    if not predictions:
        print("   (no predictions)")
        continue
    
    for i, (word, score, task) in enumerate(predictions, 1):
        if score > 50:
            emoji = "🟢"
        elif score > 10:
            emoji = "🟡"
        else:
            emoji = "🔴"
        
        print(f"  {i}. {word:15s} {emoji} {score:5.1f} [{task}]")

print("\n" + "="*60)
print("\nSummary:")
print("  - Next-word: Uses trained GRU model (context-aware)")
print("  - Completion: Uses vocabulary filter (instant, accurate)")
print("  - Typo: Uses edit distance (handles any typo)")
print("="*60)

## 10. Export to TFLite

In [None]:
import tensorflow as tf

print("Converting to TFLite (GRU-compatible)...")
print("="*60)

# Create converter
converter = tf.lite.TFLiteConverter.from_keras_model(model)

# GRU/LSTM requires SELECT_TF_OPS for dynamic tensor lists
converter.target_spec.supported_ops = [
    tf.lite.OpsSet.TFLITE_BUILTINS,  # Standard TFLite ops
    tf.lite.OpsSet.SELECT_TF_OPS     # TensorFlow ops (for GRU)
]

# Disable tensor list lowering (required for GRU)
converter._experimental_lower_tensor_list_ops = False

# Optimize for size
converter.optimizations = [tf.lite.Optimize.DEFAULT]

# Convert
print("Converting model (this may take a minute)...")
tflite_model = converter.convert()

# Save
tflite_path = f'{DRIVE_DIR}/models/gru_keyboard/gru_model.tflite'
with open(tflite_path, 'wb') as f:
    f.write(tflite_model)

size_mb = len(tflite_model) / (1024 * 1024)

print("="*60)
print(f"✓ TFLite model saved: {size_mb:.2f}MB")
print(f"✓ Path: {tflite_path}")
print("\n⚠️  Note: Model uses SELECT_TF_OPS for GRU support")
print("   This is normal and required for RNN layers")
print("\n🎉 Training complete! Download from Google Drive.")
print("="*60)