In [5]:
import json
import numpy as np
import re
import os
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import Dataset, DataLoader
import pytorch_lightning as pl
from pytorch_lightning.callbacks import ModelCheckpoint, EarlyStopping, LearningRateMonitor

# Configuration
# Central settings read by the cells below (file paths, model size, token ids).
CONFIG = {
    'vocab_path': 'word2id_daily.json',
    'id_path': 'id2word_daily.json',
    'emb_path': 'embedding_matrix_daily.npz',
    'batch_size': 512,
    'hidden_dim': 512,       # GRU hidden size; passed to the model when the checkpoint is loaded
    'num_layers': 3,         # Number of stacked GRU layers
    'learning_rate': 0.001,
    'max_seq_len': 30,
    'pad_token_id': 0,
    'dropout': 0.3,
    'unk_token_id': 0        # Placeholder, will update after loading vocab
}

# Check for GPU
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(f"Running on: {device}")

Running on: cuda


In [6]:
# Attention over the GRU outputs
class Attention(nn.Module):
    """Additive-style attention that scores each time step against one query state.

    The query and every encoder output are summed, passed through a linear
    layer and tanh, then projected to a single score per time step. Scores are
    normalised with a softmax over the sequence axis.
    """

    def __init__(self, hidden_dim):
        super().__init__()
        self.attn = nn.Linear(hidden_dim, hidden_dim)
        self.v = nn.Linear(hidden_dim, 1, bias=False)

    def forward(self, hidden, encoder_outputs, mask=None):
        """Return attention weights of shape [batch, seq_len].

        Args:
            hidden: [batch, hidden_dim] query state (the final state of the GRU).
            encoder_outputs: [batch, seq_len, hidden_dim] states for all time steps.
            mask: optional [batch, seq_len] tensor; positions equal to 0 are
                pushed to a large negative score before the softmax.
        """
        # [batch, 1, hidden_dim] broadcasts across the sequence axis on addition
        query = hidden.unsqueeze(1)
        energy = torch.tanh(self.attn(query + encoder_outputs))

        # One scalar score per time step
        scores = self.v(energy).squeeze(2)

        if mask is not None:
            # -1e4 rather than -inf: finite, yet small enough to vanish after softmax
            scores = scores.masked_fill(mask == 0, -1e4)

        return F.softmax(scores, dim=1)

In [7]:
class NextWordGRU(pl.LightningModule):
    """GRU encoder with attention that predicts one token id per input sequence.

    Pipeline: frozen pretrained embeddings -> multi-layer GRU -> attention over
    all GRU outputs (queried by the top layer's final hidden state) -> linear
    layer over [context ; final hidden] producing vocabulary logits.

    Args:
        embedding_matrix: pretrained embedding weights, one row per token id.
        hidden_dim: GRU hidden size.
        vocab_size: number of output classes of the final linear layer.
        lr: Adam learning rate.
        pad_idx: padding token id; masked in attention and ignored by the loss.
        num_layers: number of stacked GRU layers. Defaults to CONFIG['num_layers'].
        dropout: dropout between GRU layers. Defaults to CONFIG['dropout'].
    """

    def __init__(self, embedding_matrix, hidden_dim, vocab_size, lr, pad_idx,
                 num_layers=None, dropout=None):
        super().__init__()
        # Fall back to the notebook-level CONFIG so existing call sites (and
        # checkpoints that do not store these values) keep working unchanged.
        if num_layers is None:
            num_layers = CONFIG['num_layers']
        if dropout is None:
            dropout = CONFIG['dropout']
        self.save_hyperparameters(ignore=['embedding_matrix'])

        # 1. Embedding Layer
        # Frozen: the pretrained vectors are not updated during training
        self.embedding = nn.Embedding.from_pretrained(
            embedding_matrix,
            freeze=True,
            padding_idx=pad_idx
        )

        # 2. GRU Layer
        self.gru = nn.GRU(
            input_size=embedding_matrix.shape[1],
            hidden_size=hidden_dim,
            num_layers=num_layers,
            batch_first=True,
            # nn.GRU only applies dropout between layers, so disable it for a single layer
            dropout=dropout if num_layers > 1 else 0
        )

        # 3. Attention Layer
        self.attention = Attention(hidden_dim)

        # 4. Dense Output (input is context and final hidden state concatenated)
        self.fc = nn.Linear(hidden_dim * 2, vocab_size)

        self.loss_fn = nn.CrossEntropyLoss(ignore_index=pad_idx)
        self.lr = lr

    def forward(self, x):
        """Map a batch of token-id sequences to vocabulary logits (one row per sequence)."""
        mask = (x != self.hparams.pad_idx)
        embedded = self.embedding(x)

        # GRU Output
        outputs, hidden = self.gru(embedded)

        # Take the hidden state of the LAST layer
        # NOTE(review): the GRU is run over the full padded sequence, so if inputs are
        # right-padded this state has also consumed pad tokens - confirm the padding side.
        final_hidden = hidden[-1]

        # Calculate Attention (pad positions are masked out)
        attn_weights = self.attention(final_hidden, outputs, mask)
        context = torch.bmm(attn_weights.unsqueeze(1), outputs).squeeze(1)

        # Combine Context and Hidden
        combined = torch.cat((context, final_hidden), dim=1)
        logits = self.fc(combined)
        return logits

    def training_step(self, batch, batch_idx):
        """Compute and log the cross-entropy loss for one (inputs, targets) batch."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        self.log('train_loss', loss, prog_bar=True, on_step=True, on_epoch=True)
        return loss

    def validation_step(self, batch, batch_idx):
        """Log validation loss and top-1 accuracy for one (inputs, targets) batch."""
        x, y = batch
        logits = self(x)
        loss = self.loss_fn(logits, y)
        predictions = torch.argmax(logits, dim=1)
        # NOTE(review): accuracy averages over every target, including any equal to
        # pad_idx, whereas the loss ignores those targets.
        accuracy = (predictions == y).float().mean()
        self.log('val_loss', loss, prog_bar=True, on_epoch=True)
        self.log('val_accuracy', accuracy, prog_bar=True, on_epoch=True)
        return loss

    def configure_optimizers(self):
        """Adam plus a ReduceLROnPlateau scheduler that halves the LR when val_loss stalls."""
        optimizer = torch.optim.Adam(self.parameters(), lr=self.lr)
        # `verbose` is deprecated in torch.optim.lr_scheduler and therefore not passed.
        scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
            optimizer, mode='min', factor=0.5, patience=1
        )
        return {
            "optimizer": optimizer,
            "lr_scheduler": {
                "scheduler": scheduler,
                "monitor": "val_loss"
            }
        }

In [8]:
# Load vocabulary and embedding matrix
print("Loading vocabulary and embedding matrix...")
# Read the JSON files as UTF-8 explicitly so non-ASCII tokens decode the same on
# every platform instead of depending on the locale's default encoding.
with open(CONFIG['vocab_path'], 'r', encoding='utf-8') as f:
    word2id = json.load(f)

# NOTE: if this file holds a JSON object, its keys are strings after json.load.
with open(CONFIG['id_path'], 'r', encoding='utf-8') as f:
    id2word = json.load(f)

vocab_size = len(word2id)
print(f"Vocabulary size: {vocab_size}")

# Load embedding matrix; the context manager closes the .npz archive once the
# array has been read into memory.
with np.load(CONFIG['emb_path']) as embedding_data:
    embedding_matrix = torch.from_numpy(embedding_data['embedding_matrix']).float()
embedding_dim = embedding_matrix.shape[1]
print(f"Embedding matrix shape: {embedding_matrix.shape}")

# Update unknown token id in config (falls back to 0 when '<unk>' is absent)
CONFIG['unk_token_id'] = word2id.get('<unk>', 0)

Loading vocabulary and embedding matrix...
Vocabulary size: 20003
Embedding matrix shape: torch.Size([20003, 300])


In [9]:
# Load checkpoint
checkpoint_path = 'checkpoints/daily-epoch=07-val_loss=4.41.ckpt'
print(f"Loading checkpoint from: {checkpoint_path}")

# Load the Lightning module from checkpoint.
# map_location remaps the stored tensors onto the current device, so a checkpoint
# written during a GPU run can also be restored on a CPU-only machine.
# embedding_matrix must be supplied here because the model excludes it from its
# saved hyperparameters.
model = NextWordGRU.load_from_checkpoint(
    checkpoint_path,
    map_location=device,
    embedding_matrix=embedding_matrix,
    hidden_dim=CONFIG['hidden_dim'],
    vocab_size=vocab_size,
    lr=CONFIG['learning_rate'],
    pad_idx=CONFIG['pad_token_id']
)

# Inference mode (disables dropout), then place the model on the selected device
model.eval()
model = model.to(device)
print("Model loaded and set to eval mode")

Loading checkpoint from: checkpoints/daily-epoch=07-val_loss=4.41.ckpt
Model loaded and set to eval mode
Model loaded and set to eval mode


In [10]:
# Save as .pth (PyTorch state dict + metadata)
pth_save_path = 'model_daily_gru_attention.pth'
print(f"Saving model to: {pth_save_path}")

# Move every tensor to CPU before saving so the file can be loaded on machines
# without CUDA (tensors saved from a GPU otherwise need map_location on load).
# state_dict() returns a fresh mapping, so the live model is left untouched.
portable_state_dict = model.state_dict()
for param_name, tensor in portable_state_dict.items():
    portable_state_dict[param_name] = tensor.cpu()

# Save the model state dict along with embedding matrix and config
checkpoint_data = {
    'model_state_dict': portable_state_dict,
    'embedding_matrix': embedding_matrix,
    'vocab_size': vocab_size,
    'vocab': word2id,
    'id2word': id2word,
    'config': dict(CONFIG)  # snapshot, decoupled from later edits to CONFIG
}

torch.save(checkpoint_data, pth_save_path)
print(f"✓ Model saved as .pth: {pth_save_path}")
print(f"  File size: {os.path.getsize(pth_save_path) / (1024**2):.2f} MB")

Saving model to: model_daily_gru_attention.pth
✓ Model saved as .pth: model_daily_gru_attention.pth
  File size: 143.00 MB
✓ Model saved as .pth: model_daily_gru_attention.pth
  File size: 143.00 MB


In [11]:
# Lightning-free wrapper around the trained model, used for ONNX export
class NextWordGRUForONNX(nn.Module):
    """Plain nn.Module that reuses the trained layers of a NextWordGRU.

    The sub-modules are shared with the wrapped model (not copied), and the
    forward pass mirrors the wrapped model's forward pass.
    """

    def __init__(self, model: NextWordGRU):
        super().__init__()
        # Borrow the trained layers from the Lightning module
        self.embedding, self.gru = model.embedding, model.gru
        self.attention, self.fc = model.attention, model.fc
        # Padding id is read from the hyperparameters stored on the Lightning module
        self.pad_idx = model.hparams.pad_idx

    def forward(self, x):
        """Return vocabulary logits for a batch of token-id sequences."""
        not_pad = x != self.pad_idx

        # Run the GRU over the embedded tokens
        states, last_states = self.gru(self.embedding(x))

        # Final hidden state of the top GRU layer serves as the attention query
        top_state = last_states[-1]

        # Attention-weighted sum of all GRU outputs
        weights = self.attention(top_state, states, not_pad)
        context = torch.bmm(weights.unsqueeze(1), states).squeeze(1)

        # Classify from the context vector joined with the query state
        return self.fc(torch.cat((context, top_state), dim=1))

# Wrap the loaded model, switch to inference mode, and place it on the target device
onnx_model = NextWordGRUForONNX(model).eval().to(device)

print("Created ONNX-compatible model wrapper")

Created ONNX-compatible model wrapper


In [14]:
# Export to ONNX format for edge deployment
onnx_save_path = 'model_daily_gru_attention.onnx'
print(f"Exporting model to ONNX: {onnx_save_path}")

# Create a dummy input for tracing: one sequence of max_seq_len random token ids
dummy_input = torch.randint(0, vocab_size, (1, CONFIG['max_seq_len'])).to(device)

try:
    # Export the model; batch size and sequence length are declared dynamic so
    # the exported graph is not tied to the dummy input's shape.
    torch.onnx.export(
        onnx_model,
        dummy_input,
        onnx_save_path,
        input_names=['input_ids'],
        output_names=['logits'],
        dynamic_axes={
            'input_ids': {0: 'batch_size', 1: 'sequence_length'},
            'logits': {0: 'batch_size'}
        },
        opset_version=14,
        do_constant_folding=True,
        verbose=False
    )
    print(f"✓ Model exported to ONNX: {onnx_save_path}")
    print(f"  File size: {os.path.getsize(onnx_save_path) / (1024**2):.2f} MB")
except Exception as e:
    print(f"✗ Error during ONNX export: {e}")
    print("  Note: Some layers might not be fully supported in ONNX")
    # Re-raise so a failed export stops the run instead of letting later cells
    # report success against a missing or stale .onnx file.
    raise

Exporting model to ONNX: model_daily_gru_attention.onnx
✓ Model exported to ONNX: model_daily_gru_attention.onnx
  File size: 118.91 MB
✓ Model exported to ONNX: model_daily_gru_attention.onnx
  File size: 118.91 MB


In [15]:
# Verification: confirm both export files exist and that the in-memory model
# produces logits of the expected shape. (The saved files are not reloaded here.)
print("\n=== Verification ===")
pth_exists = os.path.exists(pth_save_path)
onnx_exists = os.path.exists(onnx_save_path)
# Only show a check mark when the file is really there
print(f"{'✓' if pth_exists else '✗'} .pth file exists: {pth_exists}")
print(f"{'✓' if onnx_exists else '✗'} ONNX file exists: {onnx_exists}")
assert pth_exists, f"Missing .pth export: {pth_save_path}"
assert onnx_exists, f"Missing ONNX export: {onnx_save_path}"

# Test inference with dummy input (batch of 2 random token-id sequences)
print("\nTesting inference...")
test_input = torch.randint(0, vocab_size, (2, CONFIG['max_seq_len'])).to(device)

with torch.no_grad():
    output = model(test_input)
    print(f"✓ Model output shape: {output.shape}")
    print(f"  Expected: torch.Size([2, {vocab_size}])")

# Fail loudly instead of relying on a visual comparison of the two printed shapes
assert tuple(output.shape) == (2, vocab_size), (
    f"Unexpected output shape: {tuple(output.shape)}"
)

print("\n=== Export Summary ===")
print(f"1. PyTorch model (.pth): {pth_save_path}")
print("   - Contains: model weights + embedding matrix + vocabulary + config")
print(f"2. ONNX model: {onnx_save_path}")
print("   - Format: ONNX opset 14 (compatible with TensorRT, ONNX Runtime)")
print("\nBoth models are ready for deployment!")


=== Verification ===
✓ .pth file exists: True
✓ ONNX file exists: True

Testing inference...
✓ Model output shape: torch.Size([2, 20003])
  Expected: torch.Size([2, 20003])

=== Export Summary ===
1. PyTorch model (.pth): model_daily_gru_attention.pth
   - Contains: model weights + embedding matrix + vocabulary + config
2. ONNX model: model_daily_gru_attention.onnx
   - Format: ONNX opset 14 (compatible with TensorRT, ONNX Runtime)

Both models are ready for deployment!
