# Model 1: Kana-Kanji Converter (Seq2Seq)

**Task:** Convert a katakana reading into its written (mixed kanji–kana) form
- Input: `オセワ`
- Output: `お世話`

**Architecture:** Bi-GRU Encoder + Luong Attention + GRU Decoder

**Target:** ~2MB, 90%+ accuracy, <5ms inference

## 1. Setup

In [None]:
from google.colab import drive
import os

# Mount Google Drive; the model directory below lives on it.
drive.mount('/content/drive')

# Every artifact produced by this notebook is written under MODEL_DIR.
DRIVE_DIR = '/content/drive/MyDrive/Keyboard-Suggestions-ML-Colab'
MODEL_DIR = DRIVE_DIR + "/models/gru_japanese_kana_kanji"
if not os.path.isdir(MODEL_DIR):
    os.makedirs(MODEL_DIR, exist_ok=True)
print(f"‚úì Model: {MODEL_DIR}")

In [None]:
# Install the notebook's third-party dependencies quietly (-q).
# NOTE(review): matplotlib is imported by the plotting cell but is not listed
# here - presumably it is preinstalled on Colab; confirm for other runtimes.
!pip install -q tensorflow keras datasets numpy tqdm

In [None]:
# ============================================================
# CONFIGURATION
# ============================================================

# True  -> short run on fewer samples; False -> full-length training run.
TESTING_MODE = True

# The batch size is identical in both modes; only the number of epochs and
# the number of dataset rows differ.
BATCH_SIZE = 256
if TESTING_MODE:
    NUM_EPOCHS = 4
    MAX_SAMPLES = 250000
else:
    NUM_EPOCHS = 30
    MAX_SAMPLES = 500000

# Model specs (optimized for size + accuracy)
CHAR_VOCAB_SIZE = 3000   # upper bound on vocabulary size, special tokens included
MAX_SEQ_LENGTH = 30      # fixed (padded) length of every encoder/decoder sequence
EMBEDDING_DIM = 64
ENCODER_UNITS = 128      # per direction of the bidirectional encoder
# The decoder GRU is built with ENCODER_UNITS * 2 units so that it can be
# initialised from the concatenated forward/backward encoder states. Derive
# the constant from ENCODER_UNITS so the printed summary and the saved config
# report the real width (a hard-coded 128 here disagreed with the model).
DECODER_UNITS = ENCODER_UNITS * 2

# Order matters: the list position of each token becomes its vocabulary id.
SPECIAL_TOKENS = ['<PAD>', '<UNK>', '<BOS>', '<EOS>']

print(f"Config: epochs={NUM_EPOCHS}, samples={MAX_SAMPLES:,}")
print(f"Model: Embed={EMBEDDING_DIM}, Encoder={ENCODER_UNITS}, Decoder={DECODER_UNITS}")

## 2. Load Dataset

In [None]:
from datasets import load_dataset

print("Loading zenz-v2.5-dataset...")

# Preferred source: only the train_wikipedia.jsonl file, limited to the first
# MAX_SAMPLES rows of its train split.
try:
    dataset = load_dataset(
        "Miwa-Keita/zenz-v2.5-dataset",
        data_files="train_wikipedia.jsonl",
        split=f"train[:{MAX_SAMPLES}]"
    )
except Exception as err:
    # Catch Exception instead of using a bare `except:` so that a kernel
    # interrupt (KeyboardInterrupt) or SystemExit is not swallowed, and report
    # why the fallback path is being taken.
    print(f"Could not load train_wikipedia.jsonl ({type(err).__name__}: {err}); "
          "falling back to the default train split")
    dataset = load_dataset(
        "Miwa-Keita/zenz-v2.5-dataset",
        split=f"train[:{MAX_SAMPLES}]"
    )

print(f"‚úì Loaded {len(dataset):,} samples")
# Each row is read through its 'input' and 'output' fields; show the first
# 15 characters of each for a quick sanity check.
print(f"Sample: {dataset[0]['input'][:15]} ‚Üí {dataset[0]['output'][:15]}")

## 3. Build Character Vocabulary

In [None]:
from collections import Counter
from tqdm import tqdm

print("Building character vocabulary...")

# Character frequencies across both fields ('input' and 'output') of every row.
char_counts = Counter()
for sample in tqdm(dataset, desc="Counting chars"):
    for field in ('input', 'output'):
        char_counts.update(list(sample.get(field, '')))

print(f"‚úì Found {len(char_counts):,} unique chars")

# Special tokens take the first ids, in SPECIAL_TOKENS order.
char_to_idx = {token: position for position, token in enumerate(SPECIAL_TOKENS)}

# The remaining slots go to the most frequent characters; each one receives
# the next free id.
regular_slots = CHAR_VOCAB_SIZE - len(SPECIAL_TOKENS)
for symbol, _count in char_counts.most_common(regular_slots):
    char_to_idx[symbol] = len(char_to_idx)

# Reverse lookup (id -> character) and the final vocabulary size.
idx_to_char = dict(zip(char_to_idx.values(), char_to_idx.keys()))
vocab_size = len(char_to_idx)

print(f"‚úì Vocab size: {vocab_size:,}")

## 4. Create Training Data

In [None]:
import numpy as np

print("Creating training data...")

def encode_seq(text, max_len, add_bos=False, add_eos=False):
    """Encode ``text`` as a list of exactly ``max_len`` vocabulary ids.

    Every character is looked up in the module-level ``char_to_idx``;
    characters missing from it map to the ``<UNK>`` id. ``<BOS>`` is
    prepended when ``add_bos`` is true and ``<EOS>`` appended when
    ``add_eos`` is true. Sequences shorter than ``max_len`` are right-padded
    with the ``<PAD>`` id; longer ones are cut to ``max_len`` (which can drop
    a trailing ``<EOS>``).
    """
    prefix = ['<BOS>'] if add_bos else []
    suffix = ['<EOS>'] if add_eos else []
    symbols = prefix + list(text) + suffix

    ids = [char_to_idx.get(sym, char_to_idx['<UNK>']) for sym in symbols]

    shortfall = max_len - len(ids)
    if shortfall > 0:
        ids += [char_to_idx['<PAD>']] * shortfall
    return ids[:max_len]

# Three parallel lists: row i of each one comes from the same dataset pair.
encoder_inputs, decoder_inputs, decoder_targets = [], [], []

# Raw text on either side may use at most MAX_SEQ_LENGTH - 2 characters.
char_limit = MAX_SEQ_LENGTH - 2

for record in tqdm(dataset, desc="Processing"):
    source_text = record.get('input', '').strip()
    target_text = record.get('output', '').strip()

    # Drop pairs with an empty side, and pairs where either side is too long.
    if not (source_text and target_text):
        continue
    if max(len(source_text), len(target_text)) > char_limit:
        continue

    encoded = (
        encode_seq(source_text, MAX_SEQ_LENGTH),                # encoder: kana input
        encode_seq(target_text, MAX_SEQ_LENGTH, add_bos=True),  # decoder input: <BOS> + kanji
        encode_seq(target_text, MAX_SEQ_LENGTH, add_eos=True),  # decoder target: kanji + <EOS>
    )
    for bucket, row in zip((encoder_inputs, decoder_inputs, decoder_targets), encoded):
        bucket.append(row)

# Convert the accumulated rows into NumPy arrays.
encoder_inputs = np.array(encoder_inputs)
decoder_inputs = np.array(decoder_inputs)
decoder_targets = np.array(decoder_targets)

print(f"\n‚úì {len(encoder_inputs):,} training pairs")
print(f"‚úì Encoder shape: {encoder_inputs.shape}")
print(f"‚úì Decoder shape: {decoder_inputs.shape}")

In [None]:
import tensorflow as tf

# Shuffle: one random permutation is applied to all three arrays so that
# their rows stay aligned.
order = np.random.permutation(len(encoder_inputs))
encoder_inputs, decoder_inputs, decoder_targets = [
    arr[order] for arr in (encoder_inputs, decoder_inputs, decoder_targets)
]

# Split: the first 90% of the shuffled rows train, the remainder validates.
split = int(len(encoder_inputs) * 0.9)


def _batched(rows):
    """Return a batched, prefetching tf.data pipeline over the given row slice."""
    features = {
        'encoder_input': encoder_inputs[rows],
        'decoder_input': decoder_inputs[rows],
    }
    pairs = tf.data.Dataset.from_tensor_slices((features, decoder_targets[rows]))
    return pairs.batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)


train_ds = _batched(slice(None, split))
val_ds = _batched(slice(split, None))

print(f"‚úì Train: {split:,}, Val: {len(encoder_inputs)-split:,}")

## 5. Build Seq2Seq Model

In [None]:
from tensorflow.keras import mixed_precision
from tensorflow.keras.models import Model
from tensorflow.keras.layers import (
    Input, Embedding, GRU, Dense, Dropout,
    Bidirectional, Attention, Concatenate, LayerNormalization
)

# Enable Keras mixed precision globally; this affects every layer created
# below unless the layer sets its own dtype (see the output layer).
mixed_precision.set_global_policy('mixed_float16')

print("Building Seq2Seq with Luong Attention...")

# Shared embedding: the same layer instance embeds both the encoder input and
# the decoder input.
# NOTE(review): the layer is created without mask_zero and no sample weights
# are passed at compile/fit time, so padded positions appear to contribute to
# both the loss and the reported accuracy - confirm this is intended.
embedding = Embedding(vocab_size, EMBEDDING_DIM, name='embedding')

# ============================================================
# Encoder
# ============================================================
encoder_input = Input(shape=(MAX_SEQ_LENGTH,), name='encoder_input')
encoder_embed = embedding(encoder_input)

# Bidirectional GRU. return_sequences=True yields the per-timestep outputs
# used by attention; return_state=True additionally yields the final forward
# and backward states.
encoder_gru = Bidirectional(
    GRU(ENCODER_UNITS, return_sequences=True, return_state=True, dropout=0.2),
    name='encoder'
)
encoder_outputs, forward_h, backward_h = encoder_gru(encoder_embed)
# Concatenate both final states into one vector used to initialise the decoder.
encoder_state = Concatenate()([forward_h, backward_h])

# ============================================================
# Decoder
# ============================================================
decoder_input = Input(shape=(MAX_SEQ_LENGTH,), name='decoder_input')
decoder_embed = embedding(decoder_input)

# NOTE(review): the decoder width is ENCODER_UNITS * 2; the DECODER_UNITS
# constant from the configuration cell is not read here.
decoder_gru = GRU(
    ENCODER_UNITS * 2,  # Must equal the size of the concatenated encoder state
    return_sequences=True,
    dropout=0.2,
    name='decoder'
)
decoder_outputs = decoder_gru(decoder_embed, initial_state=encoder_state)

# ============================================================
# Luong Attention
# ============================================================
# Query = decoder outputs, value = encoder outputs.
attention = Attention(use_scale=True, name='attention')
context = attention([decoder_outputs, encoder_outputs])

# Combine each decoder output with its attention context, then normalise and
# regularise before the output projection.
combined = Concatenate()([decoder_outputs, context])
combined = LayerNormalization()(combined)
combined = Dropout(0.3)(combined)

# Output: per-position softmax over the vocabulary. dtype='float32' overrides
# the global mixed-precision policy for this layer.
output = Dense(vocab_size, activation='softmax', dtype='float32', name='output')(combined)

model = Model(
    inputs=[encoder_input, decoder_input],
    outputs=output,
    name='kana_kanji_seq2seq'
)

# Adam with gradient-norm clipping; the targets are integer ids, hence the
# sparse categorical cross-entropy loss.
model.compile(
    optimizer=tf.keras.optimizers.Adam(learning_rate=1e-3, clipnorm=1.0),
    loss='sparse_categorical_crossentropy',
    metrics=['accuracy']
)

model.summary()
print(f"\n‚úì Parameters: {model.count_params():,}")
# Size estimate assumes 4 bytes per parameter.
print(f"‚úì Estimated size: {model.count_params() * 4 / 1024 / 1024:.2f} MB")

## 6. Train

In [None]:
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping, ReduceLROnPlateau

# Save the model to best.keras whenever validation accuracy improves.
checkpoint_cb = ModelCheckpoint(
    f'{MODEL_DIR}/best.keras',
    monitor='val_accuracy',
    save_best_only=True,
    verbose=1,
)

# Stop after 5 epochs without val_loss improvement and restore the best weights.
early_stop_cb = EarlyStopping(
    monitor='val_loss',
    patience=5,
    restore_best_weights=True,
)

# Halve the learning rate after 2 epochs without val_loss improvement,
# never going below 1e-6.
reduce_lr_cb = ReduceLROnPlateau(
    monitor='val_loss',
    factor=0.5,
    patience=2,
    min_lr=1e-6,
)

callbacks = [checkpoint_cb, early_stop_cb, reduce_lr_cb]

fit_options = dict(
    epochs=NUM_EPOCHS,
    validation_data=val_ds,
    callbacks=callbacks,
    verbose=1,
)
history = model.fit(train_ds, **fit_options)

In [None]:
import matplotlib.pyplot as plt

fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(12, 4))

# One panel per metric: (axis, training key, validation key, panel title).
panels = (
    (ax1, 'loss', 'val_loss', 'Loss'),
    (ax2, 'accuracy', 'val_accuracy', 'Accuracy'),
)
for axis, train_key, val_key, title in panels:
    axis.plot(history.history[train_key], label='Train')
    axis.plot(history.history[val_key], label='Val')
    axis.set_title(title)
    axis.legend()

# Save the figure next to the model artifacts, then display it inline.
plt.savefig(f'{MODEL_DIR}/training.png')
plt.show()

# Reports the validation accuracy of the last recorded epoch.
print(f"\n‚úì Final Val Accuracy: {history.history['val_accuracy'][-1]*100:.2f}%")

## 7. Save Model

In [None]:
import json

# Save the trained Keras model.
model.save(f'{MODEL_DIR}/model.keras')

# Vocabulary in both directions. ensure_ascii=False writes the characters
# themselves rather than \u escapes, hence the explicit UTF-8 encoding.
with open(f'{MODEL_DIR}/char_to_idx.json', 'w', encoding='utf-8') as f:
    json.dump(char_to_idx, f, ensure_ascii=False)

# JSON object keys are strings, so the integer ids are stringified explicitly.
with open(f'{MODEL_DIR}/idx_to_char.json', 'w', encoding='utf-8') as f:
    json.dump({str(k): v for k, v in idx_to_char.items()}, f, ensure_ascii=False)

config = {
    'vocab_size': vocab_size,
    'max_seq_length': MAX_SEQ_LENGTH,
    'embedding_dim': EMBEDDING_DIM,
    'encoder_units': ENCODER_UNITS,
    # Record the width the 'decoder' layer was actually built with. The model
    # definition sizes it as ENCODER_UNITS * 2 and never reads DECODER_UNITS,
    # so writing that constant here could describe a model that was not built.
    'decoder_units': model.get_layer('decoder').units,
    'architecture': 'Seq2Seq_BiGRU_LuongAttention',
    'task': 'kana_to_kanji',
    'special_tokens': SPECIAL_TOKENS
}
# Explicit encoding, consistent with the two vocabulary files above.
with open(f'{MODEL_DIR}/config.json', 'w', encoding='utf-8') as f:
    json.dump(config, f, indent=2)

print("‚úì Saved model and config")

## 8. Export TFLite

In [None]:
print("Exporting TFLite...")

# Best-effort export: any failure is reported and the notebook carries on.
try:
    converter = tf.lite.TFLiteConverter.from_keras_model(model)

    # Permit both builtin TFLite ops and selected TensorFlow ops.
    allowed_ops = [tf.lite.OpsSet.TFLITE_BUILTINS, tf.lite.OpsSet.SELECT_TF_OPS]
    converter.target_spec.supported_ops = allowed_ops
    converter._experimental_lower_tensor_list_ops = False

    # Pass 1: plain conversion.
    tflite_model = converter.convert()
    with open(f'{MODEL_DIR}/model.tflite', 'wb') as out_file:
        out_file.write(tflite_model)
    print(f"‚úì model.tflite ({len(tflite_model)/(1024*1024):.2f}MB)")

    # Pass 2 (FP16): reuse the same converter with default optimizations and
    # float16 as a supported type.
    converter.target_spec.supported_types = [tf.float16]
    converter.optimizations = [tf.lite.Optimize.DEFAULT]
    tflite_fp16 = converter.convert()
    with open(f'{MODEL_DIR}/model_fp16.tflite', 'wb') as out_file:
        out_file.write(tflite_fp16)
    print(f"‚úì model_fp16.tflite ({len(tflite_fp16)/(1024*1024):.2f}MB)")
except Exception as e:
    print(f"‚ö† Error: {e}")

## 9. Verification

In [None]:
print("="*60)
print("VERIFICATION: Kana ‚Üí Kanji Conversion")
print("="*60)

def convert_kana_to_kanji(kana_text, max_len=30):
    """Convert a katakana string to its written form by greedy decoding.

    The input is encoded once; the decoder input starts as ``<BOS>`` followed
    by padding. At each step the whole model is run, the most probable token
    at the current position is taken (argmax - this is not beam search) and
    written into the next decoder position.

    Args:
        kana_text: Text to convert.
        max_len: Maximum number of decoding steps. Values larger than
            ``MAX_SEQ_LENGTH`` are capped, because the model only produces
            ``MAX_SEQ_LENGTH`` output positions.

    Returns:
        The decoded string without special tokens; ``''`` for empty input.
    """
    # Nothing to convert; avoid decoding from an all-padding encoder input.
    if not kana_text:
        return ''

    # Encode input
    enc_input = np.array([encode_seq(kana_text, MAX_SEQ_LENGTH)])

    # Start with <BOS>; the remaining positions are 0.
    dec_input = np.zeros((1, MAX_SEQ_LENGTH), dtype=np.int32)
    dec_input[0, 0] = char_to_idx['<BOS>']

    # predictions has MAX_SEQ_LENGTH positions per sequence; indexing beyond
    # that would raise IndexError, so cap the number of steps.
    steps = min(max_len, MAX_SEQ_LENGTH)

    result = []
    for i in range(steps):
        predictions = model.predict(
            {'encoder_input': enc_input, 'decoder_input': dec_input},
            verbose=0
        )

        # Most probable token at position i, as a plain int for the dict lookup.
        next_idx = int(np.argmax(predictions[0, i]))
        next_char = idx_to_char.get(next_idx, '<UNK>')

        if next_char == '<EOS>':
            break
        # Other special tokens are fed back to the decoder but not emitted.
        if next_char not in SPECIAL_TOKENS:
            result.append(next_char)

        # Update decoder input
        if i + 1 < MAX_SEQ_LENGTH:
            dec_input[0, i + 1] = next_idx

    return ''.join(result)

# Test cases: katakana input, with the expected written form noted in the
# trailing comment (for reference only; nothing is asserted).
# The literals are real katakana. The previous mis-encoded byte sequences
# contained no Japanese characters, so they could not exercise the model.
tests = [
    'アリガトウ',      # → 有難う
    'ゴザイマス',      # → ございます
    'オセワ',          # → お世話
    'シンジュク',      # → 新宿
    'トウキョウ',      # → 東京
    'ニホン',          # → 日本
    'コンニチハ',      # → こんにちは
]

print("\nüìù Conversion Results:")
print("-" * 40)
for kana in tests:
    result = convert_kana_to_kanji(kana)
    print(f"  {kana} ‚Üí {result}")

print("\n" + "="*60)
print("‚úÖ VERIFICATION COMPLETE")

In [None]:
# List exports: every regular file in MODEL_DIR, in name order, with its size
# shown in MB when larger than 1 MiB and in KB otherwise.
print("\nExported files:")
for f in sorted(os.listdir(MODEL_DIR)):
    path = f'{MODEL_DIR}/{f}'
    if not os.path.isfile(path):
        continue
    size = os.path.getsize(path)
    if size <= 1024*1024:
        print(f"  {f}: {size/1024:.1f} KB")
    else:
        print(f"  {f}: {size/(1024*1024):.1f} MB")