# Next Word Predictor v2 — Top-K Suggestions

**Supports**: Google Colab & Kaggle (Multi-GPU)

**Task**: Predict top-K next words from Japanese word context
- Input: `[ありがとう]` → Top-K: `ございます`, `ございました`, `ね`
- Input: `[今日, は]` → Top-K: `天気`, `良い`, `とても`

**Same approach as English keyboard model**:
- 1 model call → top-K next words (instant, no looping)
- Fast on mobile: single forward pass

**Architecture**: Bi-GRU + Self-Attention + Context GRU

**Testing workflow**:
1. Set `TESTING_MODE = True` → 100K samples, 10 epochs (quality check)
2. Train → verify loss ↓ and accuracy ↑
3. Verify with real meaningful sentences from dataset
4. Set `TESTING_MODE = False` → full training (5M samples)

In [None]:
import os
import gc

# Work out where the notebook is running and pick the output root to match.
# The Colab test comes first because, per the original note, Colab also has
# a /kaggle directory.
if 'COLAB_RELEASE_TAG' in os.environ:
    PLATFORM = 'Colab'
    from google.colab import drive

    drive.mount('/content/drive')
    DRIVE_DIR = '/content/drive/MyDrive/Keyboard-Suggestions-ML-Colab'
elif os.path.exists('/kaggle/working'):
    PLATFORM, DRIVE_DIR = 'Kaggle', '/kaggle/working'
else:
    PLATFORM, DRIVE_DIR = 'Local', './output'

# Model artifacts and the preprocessing cache both live under DRIVE_DIR.
MODEL_DIR = f"{DRIVE_DIR}/models/gru_japanese_next_word"
CACHE_DIR = f"{DRIVE_DIR}/cache/nwp"
for _required_dir in (MODEL_DIR, CACHE_DIR):
    os.makedirs(_required_dir, exist_ok=True)

print(f"‚úÖ Platform: {PLATFORM}")
print(f"üìÅ Model: {MODEL_DIR}")
print(f"üíæ Cache: {CACHE_DIR}")

In [None]:
!pip install -q tensorflow keras datasets numpy tqdm fugashi unidic-lite

In [None]:
import tensorflow as tf

# ===========================================================
# MULTI-GPU + MIXED PRECISION
# ===========================================================
# MirroredStrategy replicates the model across the visible devices; the
# replica count is what the rest of the notebook treats as the GPU count.
strategy = tf.distribute.MirroredStrategy()
NUM_GPUS = strategy.num_replicas_in_sync
print(f"üî• GPUs available: {NUM_GPUS}")

# Compute in float16 globally (the original note targets T4 Tensor Cores).
_mixed_precision = tf.keras.mixed_precision
_mixed_precision.set_global_policy('mixed_float16')
print(f"‚ö° Mixed precision: {_mixed_precision.global_policy().name}")

In [None]:
# ===========================================================
# CONFIGURATION
# ===========================================================
# TESTING_MODE = True  -> quality check (100K samples, 10 epochs).
# TESTING_MODE = False -> full production training (5M samples, 15 epochs).
TESTING_MODE = True

# Per-mode settings: sample cap, pair cap, epochs, cache-file suffix, banner.
MAX_SAMPLES, MAX_NWP_PAIRS, NUM_EPOCHS, CACHE_SUFFIX, _mode_banner = (
    (100_000, 500_000, 10, '_test', "‚ö†Ô∏è TESTING MODE: 100K samples, 10 epochs")
    if TESTING_MODE else
    (5_000_000, 8_000_000, 15, '', "üöÄ FULL TRAINING: 5M samples, 15 epochs")
)
print(_mode_banner)

# Global batch grows with the replica count (512 per GPU).
BATCH_SIZE = 512 * NUM_GPUS
FORCE_REBUILD_CACHE = False

# Word-level model hyper-parameters.
WORD_VOCAB_SIZE = 6000
MAX_WORD_CONTEXT = 10  # context window in words; shorter contexts are left-padded
EMBEDDING_DIM = 96
GRU_UNITS = 192

# Special tokens occupy the first vocabulary IDs, so <PAD> is index 0.
SPECIAL_TOKENS = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']
PAD_IDX = 0

print(f"Config: epochs={NUM_EPOCHS}, batch={BATCH_SIZE} ({BATCH_SIZE//NUM_GPUS}/GPU)")
print(f"Model: vocab={WORD_VOCAB_SIZE}, embed={EMBEDDING_DIM}, GRU={GRU_UNITS}")

## 0. Shared Utilities

In [None]:
import fugashi

# One shared fugashi (MeCab) tagger, created once and reused by every call.
tagger = fugashi.Tagger()

# Part-of-speech label (feature.pos1) that marks whitespace tokens.
# This must be the real characters: a mis-decoded literal never matches,
# which silently disables the whitespace filter.
_WHITESPACE_POS = '空白'


def tokenize_words(text):
    """Split text into word surfaces using the shared fugashi tagger.

    Args:
        text: String to tokenize; may be empty or None.

    Returns:
        List of token surface strings in order, with tokens whose
        ``feature.pos1`` is the whitespace category removed. Returns an
        empty list for falsy input.
    """
    if not text:
        return []
    return [
        token.surface
        for token in tagger(text)
        if token.feature.pos1 != _WHITESPACE_POS
    ]

def encode_words(words, vocab, pad_id, unk_id, max_len=None):
    """Encode a word list as a fixed-length, left-padded list of integer IDs.

    Args:
        words: Sequence of word strings (oldest first).
        vocab: Mapping from word to integer ID.
        pad_id: ID used to fill positions on the left when there are fewer
            than ``max_len`` words.
        unk_id: ID substituted for words missing from ``vocab``.
        max_len: Output length. Defaults to the module-level
            ``MAX_WORD_CONTEXT`` when None.

    Returns:
        A list of exactly ``max_len`` IDs holding the last ``max_len`` words,
        or an empty list when ``max_len`` is zero or negative.
    """
    if max_len is None:
        max_len = MAX_WORD_CONTEXT
    # Guard explicitly: ids[-0:] would return the whole list, not an empty one.
    if max_len <= 0:
        return []
    ids = [vocab.get(w, unk_id) for w in words][-max_len:]  # keep last N tokens
    return [pad_id] * (max_len - len(ids)) + ids  # left-pad up to max_len

# Quick sanity check on a real Japanese sentence. The input must be genuine
# Japanese text; a mis-decoded literal would only exercise the tokenizer on
# unrelated symbols.
test_words = tokenize_words('今日はとても暑いですね')
print(f"‚úì Tokenize test: {test_words}")
print(f"  ({len(test_words)} words)")

## 1. Load or Build Cache

Uses `left_context + output` combined for full sentence context.

Testing mode uses separate cache files (`_test` suffix).

In [None]:
import json
import numpy as np
from tqdm.auto import tqdm

# Cache file locations. CACHE_SUFFIX keeps testing-mode and full-mode caches
# in separate files so one run never overwrites the other.
VOCAB_CACHE = f"{CACHE_DIR}/word_vocab_v2{CACHE_SUFFIX}.json"
NWP_X_CACHE = f"{CACHE_DIR}/nwp_x_v2{CACHE_SUFFIX}.npy"
NWP_Y_CACHE = f"{CACHE_DIR}/nwp_y_v2{CACHE_SUFFIX}.npy"
TEST_CASES_CACHE = f"{CACHE_DIR}/test_cases_nwp{CACHE_SUFFIX}.json"


def cache_exists():
    """Return True only when the vocab file and both pair arrays are on disk.

    The test-case file is not part of the check.
    """
    for required_path in (VOCAB_CACHE, NWP_X_CACHE, NWP_Y_CACHE):
        if not os.path.exists(required_path):
            return False
    return True

if FORCE_REBUILD_CACHE or not cache_exists():
    # Nothing usable on disk, or a rebuild was requested: later cells build it.
    print("üî® Building from scratch (will save to drive)...")
    CACHE_LOADED = False
else:
    print("üì¶ Loading from cache (memory-mapped)...")

    with open(VOCAB_CACHE, 'r', encoding='utf-8') as vocab_file:
        vocab_data = json.load(vocab_file)
    word_to_idx = vocab_data['word_to_idx']
    # JSON object keys are always strings, so restore the integer IDs.
    idx_to_word = {}
    for raw_id, word in vocab_data['idx_to_word'].items():
        idx_to_word[int(raw_id)] = word
    vocab_size = len(word_to_idx)

    # mmap_mode='r' maps the arrays read-only instead of reading them into RAM.
    x_mmap = np.load(NWP_X_CACHE, mmap_mode='r')
    y_mmap = np.load(NWP_Y_CACHE, mmap_mode='r')

    print(f"‚úì Vocab: {vocab_size:,} words")
    print(f"‚úì Pairs: {len(x_mmap):,} (memory-mapped)")
    CACHE_LOADED = True

In [None]:
# Load the raw dataset and build the word vocabulary.
# Skipped entirely when the vocab/pair cache was loaded.
if not CACHE_LOADED:
    from datasets import load_dataset
    from collections import Counter

    print("üì• Loading zenz dataset...")
    dataset = load_dataset(
        "Miwa-Keita/zenz-v2.5-dataset",
        data_files="train_wikipedia.jsonl",
        split="train"
    )
    print(f"‚úì Raw: {len(dataset):,} items")

    # Pass 1: count words over left_context + output (concatenated).
    print(f"\nüìù Building word vocabulary (limit: {MAX_SAMPLES:,} items)...")
    word_counts = Counter()
    # Tokenized sentences are kept so the pair-building step can reuse them.
    all_sentences = []  # [(words_list, sentence_text), ...]
    processed = 0

    for item in tqdm(dataset, desc="Counting words"):
        # Either field may be missing or None; treat both as empty text.
        text = (item.get('left_context', '') or '') + (item.get('output', '') or '')
        if not text.strip():
            continue
        words = tokenize_words(text)
        if len(words) < 3:  # Skip very short fragments
            continue
        word_counts.update(words)
        all_sentences.append((words, text))
        processed += 1
        # A falsy MAX_SAMPLES (0 / None) means "no limit".
        if MAX_SAMPLES and processed >= MAX_SAMPLES:
            break

    print(f"\n‚úì Found {len(word_counts):,} unique words from {processed:,} items")
    print(f"  Top 15: {[w for w, c in word_counts.most_common(15)]}")

    # Special tokens take the first IDs, then words by descending frequency
    # until the vocabulary reaches WORD_VOCAB_SIZE entries.
    word_to_idx = {tok: i for i, tok in enumerate(SPECIAL_TOKENS)}
    for word, _ in word_counts.most_common(WORD_VOCAB_SIZE - len(SPECIAL_TOKENS)):
        word_to_idx[word] = len(word_to_idx)

    idx_to_word = {v: k for k, v in word_to_idx.items()}
    vocab_size = len(word_to_idx)
    UNK_IDX = word_to_idx['<UNK>']

    # Coverage = share of counted tokens whose word made it into the vocab.
    # Guard the division: if no item survived filtering there are zero tokens.
    total_tokens = sum(word_counts.values())
    covered_tokens = sum(c for w, c in word_counts.items() if w in word_to_idx)
    coverage_pct = covered_tokens / total_tokens * 100 if total_tokens else 0.0
    print(f"‚úì Vocab size: {vocab_size:,}")
    print(f"  Coverage: {coverage_pct:.1f}% of tokens in vocab")

    # Persist both directions; JSON keys must be strings, hence str(k).
    with open(VOCAB_CACHE, 'w', encoding='utf-8') as f:
        json.dump({
            'word_to_idx': word_to_idx,
            'idx_to_word': {str(k): v for k, v in idx_to_word.items()}
        }, f, ensure_ascii=False)
    print(f"‚úì Vocab saved to {VOCAB_CACHE}")

    # The counter is no longer needed once the vocabulary is fixed.
    del word_counts
    gc.collect()

In [None]:
# Create (context -> next word) training pairs and save a small set of
# clean test cases. Skipped entirely when the cache was loaded.
if not CACHE_LOADED:
    print(f"\nüî¢ Creating training pairs (limit: {MAX_NWP_PAIRS:,})...")

    PAD = word_to_idx['<PAD>']
    UNK = word_to_idx['<UNK>']

    # Punctuation that is never accepted as a test-case target. These must be
    # the real characters (a mis-decoded literal never matches a token); the
    # set is built once instead of once per word.
    _TARGET_PUNCTUATION = frozenset('、。・（）「」！？')
    _MAX_TEST_CASES = 50

    # Pre-allocate at the cap and trim afterwards.
    X = np.zeros((MAX_NWP_PAIRS, MAX_WORD_CONTEXT), dtype=np.int32)
    y = np.zeros(MAX_NWP_PAIRS, dtype=np.int32)
    pair_idx = 0

    # Test cases come only from sentences whose words are ALL in the vocab.
    test_cases_to_save = []

    for words, _ in tqdm(all_sentences, desc="Creating pairs"):
        if len(words) < 2:
            continue

        # Sliding window: every in-vocab word after the first becomes a target,
        # with up to MAX_WORD_CONTEXT preceding words as its context.
        for i in range(1, len(words)):
            next_word = words[i]
            if next_word not in word_to_idx:
                continue  # out-of-vocab targets are skipped

            context = words[max(0, i - MAX_WORD_CONTEXT):i]
            X[pair_idx] = encode_words(context, word_to_idx, PAD, UNK)
            y[pair_idx] = word_to_idx[next_word]
            pair_idx += 1

            if pair_idx >= MAX_NWP_PAIRS:
                break

        # Cheap checks first: the all-in-vocab scan only runs while test cases
        # are still being collected and the sentence is long enough.
        if (len(test_cases_to_save) < _MAX_TEST_CASES
                and len(words) >= 4
                and all(w in word_to_idx for w in words)):
            # Start from the 3rd word so the target has at least two words of
            # context. Every word here is already known to be in the vocab.
            for i in range(2, len(words)):
                next_word = words[i]
                if next_word in _TARGET_PUNCTUATION:
                    continue
                context = words[max(0, i - MAX_WORD_CONTEXT):i]
                test_cases_to_save.append({
                    'context': context,
                    'expected': next_word,
                    'sentence': ''.join(words),
                })
                break  # One test case per sentence

        if pair_idx >= MAX_NWP_PAIRS:
            break

    # Trim to actual size
    X = X[:pair_idx]
    y = y[:pair_idx]
    print(f"\n‚úì Created {pair_idx:,} training pairs")
    print(f"  Avg pairs/item: {pair_idx / max(len(all_sentences), 1):.1f}")

    # Show sample pairs (padding stripped for readability)
    print("\nüìù Sample training pairs:")
    for i in range(min(10, pair_idx)):
        ctx = [idx_to_word.get(int(idx), '?') for idx in X[i] if idx != PAD]
        tgt = idx_to_word.get(int(y[i]), '?')
        print(f"  [{', '.join(ctx)}] ‚Üí {tgt}")

    # Save test cases for the verification cell
    with open(TEST_CASES_CACHE, 'w', encoding='utf-8') as f:
        json.dump(test_cases_to_save, f, ensure_ascii=False, indent=2)
    print(f"\nüíæ Saved {len(test_cases_to_save)} meaningful test cases ‚Üí {TEST_CASES_CACHE}")
    print("\nüìù Sample test cases:")
    for tc in test_cases_to_save[:5]:
        print(f"  {''.join(tc['context'])} ‚Üí {tc['expected']}  (from: {tc['sentence'][:30]}...)")

    # Save as .npy and release the in-memory copies
    np.save(NWP_X_CACHE, X)
    np.save(NWP_Y_CACHE, y)
    del X, y, test_cases_to_save, all_sentences
    gc.collect()

    # Release dataset
    del dataset
    gc.collect()
    print("üßπ Saved cache, released memory")

    # Re-open the saved arrays memory-mapped for training
    x_mmap = np.load(NWP_X_CACHE, mmap_mode='r')
    y_mmap = np.load(NWP_Y_CACHE, mmap_mode='r')
    print(f"‚úì Loaded as mmap: X={x_mmap.shape}, y={y_mmap.shape}")

print(f"\nüìä Total pairs: {len(x_mmap):,}")

## 2. Create Dataset

In [None]:
# 90/10 train/validation split over a random permutation of the pair indices.
n_samples = len(x_mmap)
split = int(n_samples * 0.9)

# The permutation is unseeded, so the split differs from run to run.
indices = np.random.permutation(n_samples).astype(np.int32)
train_idx, val_idx = indices[:split], indices[split:]

def make_generator(x, y_arr, idx_arr):
    """Return a zero-argument generator function over selected (x, y) rows.

    Args:
        x: Indexable collection of inputs.
        y_arr: Indexable collection of targets, aligned with ``x``.
        idx_arr: Iterable of positions to visit, in order.

    Returns:
        A callable that, each time it is called, yields ``(x[i], y_arr[i])``
        for every ``i`` in ``idx_arr``. Rows are looked up lazily, one at a
        time.
    """
    def gen():
        yield from ((x[pos], y_arr[pos]) for pos in idx_arr)
    return gen

# Element spec: one fixed-length context row and one scalar target ID.
output_sig = (
    tf.TensorSpec(shape=(MAX_WORD_CONTEXT,), dtype=tf.int32),
    tf.TensorSpec(shape=(), dtype=tf.int32),
)


def _batched_pipeline(idx_arr):
    """Build the repeating, batched, prefetched dataset for one index subset."""
    source = tf.data.Dataset.from_generator(
        make_generator(x_mmap, y_mmap, idx_arr),
        output_signature=output_sig
    )
    # repeat() keeps the stream from running dry: training is driven by
    # explicit step counts rather than by dataset exhaustion.
    return source.repeat().batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


train_ds = _batched_pipeline(train_idx)
val_ds = _batched_pipeline(val_idx)

print(f"Train: {len(train_idx):,}, Val: {len(val_idx):,}")
print(f"üí° Data loaded via mmap + generator (near-zero RAM)")

## 3. Build Model (Bi-GRU + Self-Attention)

Model is built inside `strategy.scope()` for multi-GPU training.

In [None]:
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, GRU, Dense, Dropout,
    Bidirectional, Attention, Concatenate, LayerNormalization
)

# Build and compile the model inside the distribution strategy's scope.
with strategy.scope():
    # One row of MAX_WORD_CONTEXT word IDs per example.
    inputs = Input(shape=(MAX_WORD_CONTEXT,), name='input')

    # Embedding: word ID -> EMBEDDING_DIM vector.
    # NOTE(review): mask_zero is not set, so <PAD> positions (index 0) pass
    # through the GRUs and attention like any other token.
    x = Embedding(vocab_size, EMBEDDING_DIM, name='embedding')(inputs)

    # Bidirectional GRU; return_sequences=True keeps one output per position
    # so attention can operate over the whole sequence.
    encoder_out = Bidirectional(
        GRU(GRU_UNITS, return_sequences=True, dropout=0.2),
        name='bi_gru'
    )(x)

    # Self-attention: the encoder output is passed as both query and value
    # to the Keras dot-product Attention layer.
    attention_out = Attention(use_scale=True, name='attention')(
        [encoder_out, encoder_out]
    )

    # Concatenate encoder output with its attended version, then normalize.
    combined = Concatenate()([encoder_out, attention_out])
    combined = LayerNormalization()(combined)

    # Context GRU: return_sequences is left at its default, so the sequence
    # is reduced to a single vector per example.
    context = GRU(GRU_UNITS, name='context_gru')(combined)
    context = Dropout(0.3)(context)

    # Output: probability distribution over the vocabulary for the next word.
    # dtype='float32' keeps this layer in FP32 under the mixed-precision policy.
    outputs = Dense(vocab_size, activation='softmax', name='output', dtype='float32')(context)

    model = Model(inputs, outputs, name='next_word_lm_v2')

    # Adam with gradient-norm clipping (clipnorm=1.0). Targets are integer
    # class IDs, hence the sparse categorical cross-entropy loss.
    model.compile(
        optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0),
        loss='sparse_categorical_crossentropy',
        metrics=['accuracy']
    )

# Report architecture and a rough size estimate (bytes per parameter x count).
model.summary()
params = model.count_params()
print(f"\nüìä Parameters: {params:,}")
print(f"   FP32: ~{params * 4 / 1024 / 1024:.1f} MB")
print(f"   FP16: ~{params * 2 / 1024 / 1024:.1f} MB")

## 4. Train

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Step counts are given explicitly because the generator-backed datasets
# repeat forever and have no known length. Both are clamped to at least 1 so
# a split smaller than one batch still trains and validates instead of
# producing a zero-step epoch.
steps_per_epoch = max(1, len(train_idx) // BATCH_SIZE)
validation_steps = max(1, len(val_idx) // BATCH_SIZE)

callbacks = [
    # Keep the checkpoint with the best validation accuracy.
    ModelCheckpoint(
        f'{MODEL_DIR}/best_v2.keras',
        monitor='val_accuracy',
        save_best_only=True,
        verbose=1
    ),
    # Stop after 5 epochs without val_loss improvement and roll back to the
    # best weights seen.
    EarlyStopping(
        monitor='val_loss',
        patience=5,
        restore_best_weights=True
    ),
    # Halve the learning rate after 2 stagnant epochs, down to 1e-6.
    ReduceLROnPlateau(
        monitor='val_loss',
        factor=0.5,
        patience=2,
        min_lr=1e-6,
        verbose=1
    )
]

print(f"Steps/epoch: {steps_per_epoch}, Val steps: {validation_steps}")

history = model.fit(
    train_ds,
    epochs=NUM_EPOCHS,
    steps_per_epoch=steps_per_epoch,
    validation_data=val_ds,
    validation_steps=validation_steps,
    callbacks=callbacks
)

In [None]:
import matplotlib.pyplot as plt

# Side-by-side curves: loss on the left, accuracy on the right.
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))
for _axis, _train_key, _val_key, _title in (
    (ax1, 'loss', 'val_loss', 'Loss'),
    (ax2, 'accuracy', 'val_accuracy', 'Accuracy'),
):
    _axis.plot(history.history[_train_key], label='Train')
    _axis.plot(history.history[_val_key], label='Val')
    _axis.set_title(_title)
    _axis.legend()

plt.savefig(f'{MODEL_DIR}/training_v2.png')
plt.show()

# Sanity check: compare the first and last epoch of the training metrics.
losses = history.history['loss']
accs = history.history['accuracy']
_loss_verdict = '‚úÖ decreasing' if losses[-1] < losses[0] else '‚ùå NOT decreasing'
_acc_verdict = '‚úÖ increasing' if accs[-1] > accs[0] else '‚ùå NOT increasing'
print(f"\nüìä Training Summary:")
print(f"  Loss:     {losses[0]:.4f} ‚Üí {losses[-1]:.4f} ({_loss_verdict})")
print(f"  Accuracy: {accs[0]*100:.2f}% ‚Üí {accs[-1]*100:.2f}% ({_acc_verdict})")
print(f"  Best val accuracy: {max(history.history['val_accuracy'])*100:.2f}%")

## 5. Save & Export

In [None]:
# Persist the trained model, both vocabulary mappings and the model config.
_keras_path = f'{MODEL_DIR}/model.keras'
model.save(_keras_path)

with open(f'{MODEL_DIR}/word_to_idx.json', 'w', encoding='utf-8') as _out:
    json.dump(word_to_idx, _out, ensure_ascii=False)

# JSON keys must be strings, so the integer IDs are stringified on the way out.
_idx_to_word_json = {str(idx): word for idx, word in idx_to_word.items()}
with open(f'{MODEL_DIR}/idx_to_word.json', 'w', encoding='utf-8') as _out:
    json.dump(_idx_to_word_json, _out, ensure_ascii=False)

# Everything a consumer needs to rebuild inputs for this model.
_export_config = {
    'vocab_size': vocab_size,
    'max_context_len': MAX_WORD_CONTEXT,
    'embedding_dim': EMBEDDING_DIM,
    'gru_units': GRU_UNITS,
    'architecture': 'BiGRU_SelfAttention_ContextGRU',
    'special_tokens': SPECIAL_TOKENS,
    'version': 'v2'
}
with open(f'{MODEL_DIR}/config.json', 'w') as _out:
    json.dump(_export_config, _out, indent=2)

keras_size = os.path.getsize(_keras_path)
print(f"‚úì Model saved: {keras_size / 1024 / 1024:.2f} MB")

In [None]:
# Export two TFLite files from the in-memory model: a default conversion and
# an FP16 variant. The export is best-effort; any failure is printed and the
# notebook carries on.
try:
    converter = tf.lite.TFLiteConverter.from_keras_model(model)
    # Allow TensorFlow ops as a fallback for anything without a TFLite builtin.
    converter.target_spec.supported_ops = [
        tf.lite.OpsSet.TFLITE_BUILTINS,
        tf.lite.OpsSet.SELECT_TF_OPS
    ]
    # NOTE(review): private converter flag; presumably needed for the GRU
    # layers' tensor-list ops. Confirm it is still required on the TF version
    # in use.
    converter._experimental_lower_tensor_list_ops = False
    
    tflite = converter.convert()
    with open(f'{MODEL_DIR}/model.tflite', 'wb') as f:
        f.write(tflite)
    print(f"‚úì model.tflite ({len(tflite)/(1024*1024):.2f} MB)")
    
    # FP16 version: the same converter object is reused, so the settings
    # above still apply. Accuracy of this variant is not measured here.
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    converter.target_spec.supported_types = [tf.float16]
    tflite16 = converter.convert()
    with open(f'{MODEL_DIR}/model_fp16.tflite', 'wb') as f:
        f.write(tflite16)
    print(f"‚úì model_fp16.tflite ({len(tflite16)/(1024*1024):.2f} MB)")
    
except Exception as e:
    # Deliberately broad: a failed export should not abort the notebook.
    print(f"‚ö† TFLite export failed: {e}")

## 6. Verification — Real Meaningful Test Cases

Test cases are picked **during data prep** from sentences where:
- ✅ ALL words are in vocabulary (no `<UNK>` contamination)
- ✅ Sentence has ≥ 4 words (meaningful context)
- ✅ Target is a real word (not punctuation)

**What to check**:
- Expected word appears in top-5? → model learned context
- Predictions are diverse? → model not collapsed to frequency

In [None]:
# Banner for the verification section.
_banner_rule = "=" * 60
print(_banner_rule)
print("VERIFICATION: Real Test Cases from Dataset")
print(_banner_rule)

# Re-read the padding / unknown IDs so this cell also works when the cache
# was loaded and the pair-building cell never ran.
PAD, UNK = word_to_idx['<PAD>'], word_to_idx['<UNK>']

def predict_top_k(context_words, top_k=5):
    """Predict the most likely next words for a context with one model call.

    Args:
        context_words: List of context word strings (oldest first).
        top_k: Maximum number of suggestions to return.

    Returns:
        List of ``(word, probability)`` tuples in descending probability,
        with special tokens removed. Empty when ``top_k`` is zero or negative.
    """
    # Guard explicitly: a slice ending in [-0:] would select every index.
    if top_k <= 0:
        return []

    encoded = np.array([encode_words(context_words, word_to_idx, PAD, UNK)])
    probs = model.predict(encoded, verbose=0)[0]

    # Special tokens are filtered out below, so take enough extra candidates
    # to still fill top_k even if every special token ranks at the top.
    n_candidates = top_k + len(SPECIAL_TOKENS)
    ranked = np.argsort(probs)[::-1][:n_candidates]

    results = []
    for idx in ranked:
        idx = int(idx)  # plain int for the idx_to_word lookup
        word = idx_to_word.get(idx, '<UNK>')
        if word in SPECIAL_TOKENS:
            continue
        results.append((word, float(probs[idx])))
        if len(results) >= top_k:
            break
    return results


# ==========================================================
# Load the test cases saved during data prep (sentences whose words are all
# in the vocabulary), or fall back to sampling the validation split.
# ==========================================================
if os.path.exists(TEST_CASES_CACHE):
    with open(TEST_CASES_CACHE, 'r', encoding='utf-8') as f:
        test_cases = json.load(f)
    print(f"\n‚úì Loaded {len(test_cases)} meaningful test cases")
else:
    # Fallback: pick from validation data, rejecting contexts with <UNK>.
    print("\n‚ö†Ô∏è No saved test cases, building from validation set...")

    # Same punctuation set as the data-prep step, including the full-width
    # exclamation and question marks. These must be the real characters: a
    # mis-decoded literal never matches a token.
    _FALLBACK_PUNCTUATION = frozenset('、。・（）「」！？')

    test_cases = []
    np.random.seed(42)  # fixed seed so the fallback sample is repeatable
    sample_indices = np.random.choice(val_idx, size=min(200, len(val_idx)), replace=False)
    seen = set()  # one test case per distinct expected word
    for i in sample_indices:
        x_row = x_mmap[i]
        y_val = int(y_mmap[i])
        expected = idx_to_word.get(y_val, '<UNK>')
        if expected in SPECIAL_TOKENS or expected in seen:
            continue
        if expected in _FALLBACK_PUNCTUATION:
            continue
        ctx = [idx_to_word.get(int(idx), '<UNK>') for idx in x_row if idx != PAD]
        if '<UNK>' in ctx or len(ctx) < 2:
            continue
        test_cases.append({'context': ctx, 'expected': expected, 'sentence': ''.join(ctx) + expected})
        seen.add(expected)
        if len(test_cases) >= 15:
            break

# ==========================================================
# Run predictions
# ==========================================================
# Use at most 20 test cases
test_subset = test_cases[:20]

print(f"\nüìù Testing {len(test_subset)} cases (clean sentences, no UNK):")
print("-" * 60)

correct_top1 = 0
correct_top5 = 0
all_predictions = set()  # Track prediction diversity

for tc in test_subset:
    preds = predict_top_k(tc['context'], top_k=5)
    pred_words = [w for w, _ in preds]
    all_predictions.update(pred_words)

    in_top1 = bool(pred_words) and pred_words[0] == tc['expected']
    in_top5 = tc['expected'] in pred_words

    if in_top1: correct_top1 += 1
    if in_top5: correct_top5 += 1

    # Test the stricter condition first: a top-1 hit is always a top-5 hit,
    # so checking top-5 first would make the middle status unreachable.
    status = '‚úÖ' if in_top1 else ('üü°' if in_top5 else '‚ùå')
    ctx_str = ''.join(tc['context'][-5:])  # Show last 5 context words
    pred_str = ', '.join(pred_words[:5])
    print(f"  {status} {ctx_str} ‚Üí expected: {tc['expected']}")
    print(f"       top5: [{pred_str}]")

n = len(test_subset)
# With no test cases the ratios are reported as 0 instead of dividing by zero.
top1_ratio = correct_top1 / n if n else 0.0
top5_ratio = correct_top5 / n if n else 0.0

print(f"\n" + "="*60)
print(f"üìä Results:")
if n == 0:
    print("  No test cases available; accuracy figures below are not meaningful.")
print(f"  Top-1 accuracy: {correct_top1}/{n} ({top1_ratio*100:.1f}%)")
print(f"  Top-5 accuracy: {correct_top5}/{n} ({top5_ratio*100:.1f}%)")
print(f"  Unique predictions across all tests: {len(all_predictions)}")
print(f"    (should be >> 5. If ‚â§ 5 = model collapsed to frequency only)")

if TESTING_MODE:
    print("\n‚ö†Ô∏è TESTING MODE (100K samples).")
    if top5_ratio >= 0.1:
        print("   ‚úÖ Model shows learning signal! Ready for full training.")
        print("   ‚Üí Set TESTING_MODE = False, FORCE_REBUILD_CACHE = True")
    else:
        print("   üü° Low accuracy is expected with 100K samples.")
        print("   ‚úÖ Check: loss decreased? accuracy improved? no crashes?")
        print("   ‚Üí If yes, set TESTING_MODE = False for production training.")

print("\n" + "="*60)
print("‚úÖ VERIFICATION COMPLETE")

In [None]:
# Print every regular file in MODEL_DIR with its size, using MB above 1 MiB
# and KB otherwise.
print(f"\nüì¶ Files ({PLATFORM}):")
for f in sorted(os.listdir(MODEL_DIR)):
    p = f'{MODEL_DIR}/{f}'
    if not os.path.isfile(p):
        continue  # skip sub-directories
    s = os.path.getsize(p)
    if s > 1024*1024:
        print(f"  {f}: {s/(1024*1024):.2f} MB")
        continue
    print(f"  {f}: {s/1024:.1f} KB")