<a href="https://colab.research.google.com/github/Jarvisss1/MCCN-ReMGU_Text_Generator/blob/main/MCCN_ReMGU.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader, Dataset
from collections import Counter
import numpy as np
import random
from nltk.tokenize import word_tokenize
import nltk

# Fetch the NLTK "punkt_tab" tokenizer data (downloads on first run).
# NOTE(review): word_tokenize is imported above but is not called in any cell
# visible here -- confirm the download is still required.
nltk.download('punkt_tab')

# Seed torch, Python's `random`, and NumPy's global RNG with the same value
# for reproducibility.
seed = 42
torch.manual_seed(seed)
random.seed(seed)

np.random.seed(seed)

# Helper functions
def read_words(filename):
    """Return the whitespace-separated tokens of a UTF-8 text file.

    Each newline character is turned into a standalone ``<eos>`` token
    before the text is split.
    """
    with open(filename, "r", encoding="utf-8") as handle:
        text = handle.read()
    marked = text.replace("\n", " <eos> ")
    return marked.split()

def build_vocab(filename):
    """Build a word -> integer-ID vocabulary from a dataset file.

    Tokens are read with ``read_words`` and ranked by descending frequency,
    with ties broken alphabetically, so the most frequent token gets ID 0.

    Args:
        filename: Path of the text file to scan.

    Returns:
        dict mapping each distinct token to its rank. An empty file yields an
        empty dict (the previous ``zip(*pairs)`` unpacking raised ValueError
        in that case).
    """
    counts = Counter(read_words(filename))
    ranked = sorted(counts.items(), key=lambda pair: (-pair[1], pair[0]))
    return {word: idx for idx, (word, _) in enumerate(ranked)}

def file_to_word_ids(filename, word_to_id):
    """Map every token of ``filename`` to its vocabulary ID.

    Tokens missing from ``word_to_id`` fall back to the ID of ``"<unk>"``,
    or to 1 when the vocabulary has no ``"<unk>"`` entry either.
    """
    ids = []
    for token in read_words(filename):
        unknown_id = word_to_id.get("<unk>", 1)
        ids.append(word_to_id.get(token, unknown_id))
    return ids

def load_ptb_dataset(train_path, valid_path, test_path):
    """Load the three PTB splits as lists of token IDs.

    The vocabulary is built from the training file only and then used to
    encode all three splits.

    Returns:
        (train_ids, valid_ids, test_ids, vocab_size, word_to_id)
    """
    vocab = build_vocab(train_path)
    train_ids, valid_ids, test_ids = (
        file_to_word_ids(path, vocab)
        for path in (train_path, valid_path, test_path)
    )
    return train_ids, valid_ids, test_ids, len(vocab), vocab


class PTBDataset(Dataset):
    """Sliding-window next-word dataset over a flat sequence of token IDs.

    Sample ``i`` is the pair ``(data[i : i + seq_length], data[i + seq_length])``:
    a window of ``seq_length`` IDs and the single ID that follows it.

    Args:
        data: Indexable sequence of integer token IDs.
        seq_length: Number of context tokens per sample.
    """

    def __init__(self, data, seq_length):
        self.data = data
        self.seq_length = seq_length

    def __len__(self):
        # Clamp at zero: a negative value makes len() raise ValueError when the
        # data is shorter than one full window.
        return max(0, len(self.data) - self.seq_length)

    def __getitem__(self, idx):
        """Return ``(context, target)`` as int64 tensors for sample ``idx``.

        Negative indices count from the end. Out-of-range indices raise
        IndexError instead of silently returning a truncated window.
        """
        size = len(self)
        if idx < 0:
            idx += size
        if idx < 0 or idx >= size:
            raise IndexError("PTBDataset index out of range")
        x = self.data[idx:idx + self.seq_length]
        y = self.data[idx + self.seq_length]
        return torch.tensor(x, dtype=torch.long), torch.tensor(y, dtype=torch.long)

[nltk_data] Downloading package punkt_tab to /root/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt_tab.zip.


In [None]:
# Locations of the three PTB splits. NOTE(review): these are absolute
# "/content/..." paths (presumably a Colab runtime) -- adjust when running elsewhere.
train_path = "/content/ptb.train.txt"
valid_path = "/content/ptb.valid.txt"
test_path = "/content/ptb.test.txt"

# Load dataset: token-ID lists for each split plus the vocabulary built from the training file
train_data, valid_data, test_data, vocab_size, word_to_id = load_ptb_dataset(train_path, valid_path, test_path)
# Reverse mapping (ID -> word), used when decoding generated IDs back to text
id_to_word = {id_: word for word, id_ in word_to_id.items()}
print(f"Vocabulary size: {vocab_size}")
print(f"Sample id_to_word: {dict(list(id_to_word.items())[:10])}")  # First 10 entries of the mapping

# Parameters
seq_length = 20  # context tokens per training sample
batch_size = 32
reduced_data_size = 300000  # number of leading training tokens to keep

# Reduce dataset size for faster experimentation (keeps only the first reduced_data_size tokens)
train_data = train_data[:reduced_data_size]

# DataLoader for training. drop_last=True discards the final partial batch.
# NOTE: the training cell further down rebuilds train_loader without num_workers.
train_dataset = PTBDataset(train_data, seq_length)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True, num_workers=4)

print(f"Number of training batches: {len(train_loader)}")


import torch
import torch.nn as nn
import torch.nn.functional as F

class MGUCell(nn.Module):
    """Single-gate recurrent cell.

    With ``c = [x, h_prev]`` (concatenated on the feature axis):

        f = sigmoid(W_f c)
        a = tanh(W_a c)
        h = f * h_prev + (1 - f) * a

    Args:
        input_size: Feature size of ``x``.
        hidden_size: Feature size of the hidden state.
    """

    def __init__(self, input_size, hidden_size):
        super(MGUCell, self).__init__()
        self.W_f = nn.Linear(input_size + hidden_size, hidden_size)
        self.W_a = nn.Linear(input_size + hidden_size, hidden_size)

    def forward(self, x, h_prev):
        """Advance one time step.

        Args:
            x: (batch, input_size) input for this step.
            h_prev: (batch, hidden_size) previous hidden state.

        Returns:
            (batch, hidden_size) new hidden state.
        """
        combined = torch.cat([x, h_prev], dim=1)
        f_t = torch.sigmoid(self.W_f(combined))
        a_t = torch.tanh(self.W_a(combined))
        h_t = f_t * h_prev + (1 - f_t) * a_t
        return h_t


class MGU(nn.Module):
    """Stack of ``MGUCell`` layers unrolled over a time-major sequence.

    Layer 0 consumes the input features; every higher layer consumes the
    hidden state produced by the layer below it at the same time step.

    Args:
        input_size: Feature size of the input sequence.
        hidden_size: Hidden size shared by all layers.
        num_layers: Number of stacked cells.
    """

    def __init__(self, input_size, hidden_size, num_layers):
        super(MGU, self).__init__()
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.cells = nn.ModuleList([MGUCell(input_size if i == 0 else hidden_size, hidden_size) for i in range(num_layers)])

    def forward(self, x, h_0):
        """Run the stack over the whole sequence.

        Args:
            x: (seq_len, batch, input_size) input sequence.
            h_0: Sequence of ``num_layers`` tensors, each (batch, hidden_size),
                giving the initial state per layer. It is not modified.

        Returns:
            outputs: (seq_len, batch, hidden_size) top-layer state at every step.
            h: list with the final hidden state of each layer.
        """
        seq_len = x.size(0)
        # Work on a copy so the caller's initial-state container is left intact.
        h = list(h_0)
        outputs = []
        for t in range(seq_len):
            layer_input = x[t]
            for i, cell in enumerate(self.cells):
                h[i] = cell(layer_input, h[i])
                # Feed this layer's new state to the next layer. Passing x[t]
                # to every layer was wrong for num_layers > 1 and crashed when
                # input_size != hidden_size.
                layer_input = h[i]
            outputs.append(layer_input)
        outputs = torch.stack(outputs, dim=0)
        return outputs, h

class ResidualMGUNetwork(nn.Module):
    """Next-word model: embedding -> multi-kernel Conv1d -> MGU -> residual MLP -> vocabulary scores.

    Args:
        vocab_size: Number of rows in the embedding table.
        embed_size: Embedding dimension (input channels of the convolutions).
        cnn_channels: Total convolution output channels, split evenly across
            the kernel sizes; must be divisible by the number of kernels.
        hidden_size: Hidden size of the MGU and of the residual layer.
        num_layers: Number of stacked MGU layers.
        output_size: Size of the final projection (number of classes).
    """

    def __init__(self, vocab_size, embed_size, cnn_channels, hidden_size, num_layers, output_size):
        super(ResidualMGUNetwork, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_size)

        # Multi-window convolution: kernel sizes 3, 5, 7
        self.kernel_sizes = [3, 5,7]
        assert cnn_channels % len(self.kernel_sizes) == 0, "cnn_channels must be divisible by number of kernels"
        self.per_kernel_channels = cnn_channels // len(self.kernel_sizes)

        # One Conv1d branch per kernel size; branch outputs are concatenated on
        # the channel axis in forward().
        self.convs = nn.ModuleList([
            nn.Conv1d(
                in_channels=embed_size,
                out_channels=self.per_kernel_channels,
                kernel_size=k,
                padding=k // 2  # Same-length output (holds for odd k at the default stride of 1)
            )
            for k in self.kernel_sizes
        ])

        self.mgu = MGU(cnn_channels, hidden_size, num_layers)
        self.residual_fc = nn.Linear(hidden_size, hidden_size)
        self.dropout = nn.Dropout(0.5)
        self.fc = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        """Return log-probabilities over the output classes for the token after each sequence.

        Args:
            x: (batch_size, seq_length) integer token IDs.

        Returns:
            (batch_size, output_size) tensor of log-softmax scores.
        """
        # x: (batch_size, seq_length)
        embedded = self.embedding(x)  # (batch, seq_len, embed_size)
        embedded = embedded.permute(0, 2, 1)  # (batch, embed_size, seq_len) -- Conv1d expects channels first

        # Apply each convolution and ReLU
        conv_outputs = [F.relu(conv(embedded)) for conv in self.convs]  # List of (batch, C, seq_len)
        cnn_out = torch.cat(conv_outputs, dim=1)  # (batch, cnn_channels, seq_len)

        # Prepare for RNN: (seq_len, batch, cnn_channels)
        cnn_out = cnn_out.permute(2, 0, 1)

        # Zero initial hidden state, one tensor per MGU layer, on the input's device
        batch_size = cnn_out.size(1)
        h_0 = [torch.zeros(batch_size, self.mgu.hidden_size, device=x.device) for _ in range(self.mgu.num_layers)]

        mgu_out, _ = self.mgu(cnn_out, h_0)  # (seq_len, batch, hidden_size)
        last_mgu_out = mgu_out[-1]  # (batch, hidden_size) -- only the final time step is used

        # Residual connection
        residual_out = F.relu(self.residual_fc(last_mgu_out)) + last_mgu_out
        residual_out = self.dropout(residual_out)
        output = self.fc(residual_out)
        # Output is already log-softmax. nn.CrossEntropyLoss (used by the
        # training cell) applies log-softmax again, which leaves normalised
        # log-probabilities unchanged; nn.NLLLoss would be the direct match.
        return F.log_softmax(output, dim=1)


# Example usage: shape smoke test on random token IDs (no training involved).
# NOTE(review): this assignment overwrites the vocab_size returned by
# load_ptb_dataset earlier in the same cell with a hard-coded value; they agree
# for the recorded run (10000) but would silently diverge for another corpus.
vocab_size = 10000
embed_size = 128
cnn_channels = 150  # Example value
assert cnn_channels % 3 == 0, "Make sure this is divisible by 3"

hidden_size = 150
num_layers = 1
output_size = vocab_size

model = ResidualMGUNetwork(vocab_size, embed_size, cnn_channels, hidden_size, num_layers, output_size)
# 20 sequences of 35 random token IDs each, i.e. (batch=20, seq_len=35)
sample_input = torch.randint(0, vocab_size, (20, 35))
output = model(sample_input)
print(output.shape)  # Should be (Batch, Output Size)


Vocabulary size: 10000
Sample id_to_word: {0: 'the', 1: '<unk>', 2: '<eos>', 3: 'N', 4: 'of', 5: 'to', 6: 'a', 7: 'in', 8: 'and', 9: "'s"}
Number of training batches: 9374




torch.Size([20, 10000])


In [None]:
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
from tqdm import tqdm
import time

# Initialize the model (fresh weights; replaces the `model` built in the example-usage code above)
vocab_size = 10000
embed_size = 128
cnn_channels = 150
hidden_size = 150
num_layers = 1
output_size = vocab_size

model = ResidualMGUNetwork(vocab_size, embed_size, cnn_channels, hidden_size, num_layers, output_size)

# Training Parameters
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model = model.to(device)
# The model already returns log-softmax scores; CrossEntropyLoss re-applies
# log-softmax, which does not change already-normalised log-probabilities.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)

# Learning Rate Scheduler
scheduler = torch.optim.lr_scheduler.ExponentialLR(optimizer, gamma=0.9)  # Multiplies the LR by 0.9 (10% decay) on every scheduler.step()

num_epochs = 15

def calculate_perplexity(loss):
    """Return ``exp(loss)`` as a 0-dim torch tensor.

    ``loss`` is expected to be a Python number (e.g. a mean cross-entropy).
    """
    loss_tensor = torch.tensor(loss)
    return loss_tensor.exp()

# DataLoader (rebuilt here with the default num_workers; replaces the loader created earlier)
train_loader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True, drop_last=True)

# Training Loop: one optimizer step per batch, one scheduler step per epoch
for epoch in range(num_epochs):
    model.train()  # enables dropout; the running accuracy below is therefore measured with dropout active
    total_loss = 0
    correct = 0
    total = 0
    start_time = time.time()

    print(f"Epoch {epoch + 1}/{num_epochs} - Learning Rate: {scheduler.get_last_lr()[0]:.6f}")
    epoch_iterator = tqdm(train_loader, desc=f"Training Epoch {epoch + 1}")

    for batch_idx, (inputs, targets) in enumerate(epoch_iterator):
        inputs, targets = inputs.to(device), targets.to(device)

        optimizer.zero_grad()
        outputs = model(inputs)
        loss = criterion(outputs, targets)
        loss.backward()

        # Gradient Clipping (total gradient norm capped at 5.0)
        torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm=5.0)

        optimizer.step()

        # Running statistics for the progress bar
        total_loss += loss.item()
        _, predicted = torch.max(outputs, dim=1)  # arg-max class per sample
        correct += (predicted == targets).sum().item()
        total += targets.size(0)

        avg_loss = total_loss / (batch_idx + 1)  # mean of per-batch losses so far
        perplexity = calculate_perplexity(avg_loss)
        # Samples processed per second; each sample has exactly one target word
        speed = (batch_idx + 1) * batch_size / (time.time() - start_time)

        epoch_iterator.set_postfix({
            "Batch Loss": f"{loss.item():.4f}",
            "Perplexity": f"{perplexity:.2f}",
            "Speed (words/s)": f"{speed:.0f}",
            "Accuracy": f"{100 * correct / total:.2f}%",
            "LR": f"{scheduler.get_last_lr()[0]:.6f}"
        })

    # Step the scheduler (once per epoch)
    scheduler.step()

    # Epoch summary over all training batches
    accuracy = 100 * correct / total
    avg_loss = total_loss / len(train_loader)
    perplexity = calculate_perplexity(avg_loss)
    print(f"Epoch {epoch + 1} Completed: Loss: {avg_loss:.4f}, Perplexity: {perplexity:.2f}, Accuracy: {accuracy:.2f}%")

Epoch 1/15 - Learning Rate: 0.001000


Training Epoch 1: 100%|██████████| 9374/9374 [03:20<00:00, 46.83it/s, Batch Loss=6.2617, Perplexity=504.57, Speed (words/s)=1499, Accuracy=13.18%, LR=0.001000]


Epoch 1 Completed: Loss: 6.2237, Perplexity: 504.57, Accuracy: 13.18%
Epoch 2/15 - Learning Rate: 0.000900


Training Epoch 2: 100%|██████████| 9374/9374 [03:15<00:00, 48.03it/s, Batch Loss=5.7313, Perplexity=321.79, Speed (words/s)=1537, Accuracy=16.16%, LR=0.000900]


Epoch 2 Completed: Loss: 5.7739, Perplexity: 321.79, Accuracy: 16.16%
Epoch 3/15 - Learning Rate: 0.000810


Training Epoch 3: 100%|██████████| 9374/9374 [03:25<00:00, 45.72it/s, Batch Loss=6.1427, Perplexity=266.47, Speed (words/s)=1463, Accuracy=17.59%, LR=0.000810]


Epoch 3 Completed: Loss: 5.5852, Perplexity: 266.47, Accuracy: 17.59%
Epoch 4/15 - Learning Rate: 0.000729


Training Epoch 4: 100%|██████████| 9374/9374 [03:24<00:00, 45.86it/s, Batch Loss=5.0688, Perplexity=233.35, Speed (words/s)=1468, Accuracy=18.58%, LR=0.000729]


Epoch 4 Completed: Loss: 5.4526, Perplexity: 233.35, Accuracy: 18.58%
Epoch 5/15 - Learning Rate: 0.000656


Training Epoch 5: 100%|██████████| 9374/9374 [03:21<00:00, 46.47it/s, Batch Loss=5.9631, Perplexity=209.36, Speed (words/s)=1487, Accuracy=19.55%, LR=0.000656]


Epoch 5 Completed: Loss: 5.3440, Perplexity: 209.36, Accuracy: 19.55%
Epoch 6/15 - Learning Rate: 0.000590


Training Epoch 6: 100%|██████████| 9374/9374 [03:20<00:00, 46.71it/s, Batch Loss=6.2092, Perplexity=190.74, Speed (words/s)=1495, Accuracy=20.35%, LR=0.000590]


Epoch 6 Completed: Loss: 5.2509, Perplexity: 190.74, Accuracy: 20.35%
Epoch 7/15 - Learning Rate: 0.000531


Training Epoch 7: 100%|██████████| 9374/9374 [03:23<00:00, 46.15it/s, Batch Loss=5.9597, Perplexity=175.30, Speed (words/s)=1477, Accuracy=21.04%, LR=0.000531]


Epoch 7 Completed: Loss: 5.1665, Perplexity: 175.30, Accuracy: 21.04%
Epoch 8/15 - Learning Rate: 0.000478


Training Epoch 8: 100%|██████████| 9374/9374 [03:24<00:00, 45.78it/s, Batch Loss=4.8221, Perplexity=162.50, Speed (words/s)=1465, Accuracy=21.79%, LR=0.000478]


Epoch 8 Completed: Loss: 5.0907, Perplexity: 162.50, Accuracy: 21.79%
Epoch 9/15 - Learning Rate: 0.000430


Training Epoch 9: 100%|██████████| 9374/9374 [03:24<00:00, 45.93it/s, Batch Loss=6.3412, Perplexity=150.96, Speed (words/s)=1470, Accuracy=22.38%, LR=0.000430]


Epoch 9 Completed: Loss: 5.0170, Perplexity: 150.96, Accuracy: 22.38%
Epoch 10/15 - Learning Rate: 0.000387


Training Epoch 10: 100%|██████████| 9374/9374 [03:19<00:00, 47.05it/s, Batch Loss=5.0232, Perplexity=141.52, Speed (words/s)=1506, Accuracy=22.95%, LR=0.000387]


Epoch 10 Completed: Loss: 4.9525, Perplexity: 141.52, Accuracy: 22.95%
Epoch 11/15 - Learning Rate: 0.000349


Training Epoch 11: 100%|██████████| 9374/9374 [03:21<00:00, 46.43it/s, Batch Loss=5.2873, Perplexity=133.09, Speed (words/s)=1486, Accuracy=23.45%, LR=0.000349]


Epoch 11 Completed: Loss: 4.8910, Perplexity: 133.09, Accuracy: 23.45%
Epoch 12/15 - Learning Rate: 0.000314


Training Epoch 12: 100%|██████████| 9374/9374 [03:17<00:00, 47.50it/s, Batch Loss=4.7629, Perplexity=126.00, Speed (words/s)=1520, Accuracy=24.02%, LR=0.000314]


Epoch 12 Completed: Loss: 4.8363, Perplexity: 126.00, Accuracy: 24.02%
Epoch 13/15 - Learning Rate: 0.000282


Training Epoch 13: 100%|██████████| 9374/9374 [03:18<00:00, 47.16it/s, Batch Loss=5.9654, Perplexity=119.13, Speed (words/s)=1509, Accuracy=24.51%, LR=0.000282]


Epoch 13 Completed: Loss: 4.7802, Perplexity: 119.13, Accuracy: 24.51%
Epoch 14/15 - Learning Rate: 0.000254


Training Epoch 14: 100%|██████████| 9374/9374 [03:22<00:00, 46.33it/s, Batch Loss=4.6739, Perplexity=113.45, Speed (words/s)=1483, Accuracy=24.95%, LR=0.000254]


Epoch 14 Completed: Loss: 4.7313, Perplexity: 113.45, Accuracy: 24.95%
Epoch 15/15 - Learning Rate: 0.000229


Training Epoch 15: 100%|██████████| 9374/9374 [03:23<00:00, 45.99it/s, Batch Loss=5.3813, Perplexity=108.51, Speed (words/s)=1472, Accuracy=25.31%, LR=0.000229]

Epoch 15 Completed: Loss: 4.6869, Perplexity: 108.51, Accuracy: 25.31%





In [None]:
import torch

# Save the model: only the state_dict is written, so loading requires
# re-creating ResidualMGUNetwork with the same constructor arguments.
# The path is relative to the current working directory.
model_path = "residual_mgu_model.pth"
torch.save(model.state_dict(), model_path)
print(f"Model saved to {model_path}")

Model saved to residual_mgu_model.pth


In [None]:
# Assuming valid_data and test_data are already loaded and preprocessed
# (both are produced by load_ptb_dataset in the data-loading cell)

# Parameters (same values as used for training)
seq_length = 20
batch_size = 32

# Create DataLoaders: fixed order (shuffle=False); drop_last=True discards the
# final partial batch, so a few trailing samples are not evaluated.
valid_dataset = PTBDataset(valid_data, seq_length)
valid_loader = DataLoader(valid_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

test_dataset = PTBDataset(test_data, seq_length)
test_loader = DataLoader(test_dataset, batch_size=batch_size, shuffle=False, drop_last=True)

# Evaluation Function
def evaluate(model, data_loader, criterion):
    """Compute mean loss and top-1 accuracy of ``model`` over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the device of the model's own parameters, so the
    function does not depend on a notebook-global ``device`` variable.

    Args:
        model: Module mapping an input batch to (batch, num_classes) scores.
        data_loader: Sized iterable of ``(inputs, targets)`` tensor pairs.
        criterion: Loss callable ``criterion(outputs, targets)``.

    Returns:
        (avg_loss, accuracy): mean of the per-batch losses and accuracy in percent.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(eval_device), targets.to(eval_device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)

    if total == 0:
        # E.g. drop_last=True with fewer samples than one batch.
        raise ValueError("data_loader produced no batches to evaluate")

    avg_loss = total_loss / len(data_loader)
    accuracy = 100 * correct / total
    return avg_loss, accuracy

# Evaluate on Validation Set (uses the two-value evaluate() defined just above)
val_loss, val_accuracy = evaluate(model, valid_loader, criterion)
print(f"Validation Loss: {val_loss:.4f}, Validation Accuracy: {val_accuracy:.2f}%")

# Evaluate on Test Set
test_loss, test_accuracy = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Test Accuracy: {test_accuracy:.2f}%")

Validation Loss: 5.7652, Validation Accuracy: 19.75%
Test Loss: 5.6367, Test Accuracy: 19.53%


In [None]:
import torch
import torch.nn.functional as F

def apply_temperature(logits, temperature=1.0):
    """Convert scores to probabilities with temperature scaling.

    Args:
        logits: Tensor of scores; softmax is taken over the last dimension.
        temperature: Positive scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        Tensor of the same shape as ``logits`` whose last dimension sums to 1.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. Zero would
            divide by zero and a negative value would invert the ranking.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    logits = logits / temperature
    probabilities = F.softmax(logits, dim=-1)
    return probabilities

def generate_text(model, start_sequence, word_to_id, id_to_word, num_words=10, temperature=1.0, context_length=None):
    """Sample ``num_words`` continuation words after ``start_sequence``.

    At every step the model scores the current context window, the scores are
    temperature-scaled into probabilities, and the next word ID is drawn with
    ``torch.multinomial`` (so output is stochastic).

    Args:
        model: Module mapping a (1, seq_len) ID tensor to (1, vocab) scores.
        start_sequence: Sequence of prompt words.
        word_to_id: Word -> ID mapping; unknown words use the ``"<unk>"`` ID
            (0 if the mapping has no ``"<unk>"``).
        id_to_word: ID -> word mapping for decoding.
        num_words: Number of words to generate.
        temperature: Passed to ``apply_temperature``.
        context_length: Number of most recent token IDs kept as model input.
            ``None`` uses the model's largest convolution kernel when the
            model exposes ``kernel_sizes`` (the old hard-coded 5 was stale),
            otherwise 5. Pass the training ``seq_length`` to match the
            window size seen during training.

    Returns:
        list: the prompt words followed by the generated words.

    Raises:
        ValueError: If ``context_length`` is smaller than 1.
    """
    model.eval()
    device = next(model.parameters()).device  # run on the model's own device
    generated_sequence = list(start_sequence)  # accepts any sequence, not only lists

    # Convert start_sequence to IDs
    unk_id = word_to_id.get("<unk>", 0)
    input_ids = [word_to_id.get(word, unk_id) for word in start_sequence]

    if context_length is None:
        context_length = max(getattr(model, "kernel_sizes", None) or [5])
    if context_length < 1:
        raise ValueError(f"context_length must be at least 1, got {context_length}")

    # Left-pad short prompts up to the window size.
    # NOTE(review): the pad value 0 is a real vocabulary ID, not a dedicated
    # padding token -- confirm this is acceptable.
    if len(input_ids) < context_length:
        input_ids = [0] * (context_length - len(input_ids)) + input_ids

    with torch.no_grad():
        for _ in range(num_words):
            input_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)
            logits = model(input_tensor)  # [1, vocab_size]
            probabilities = apply_temperature(logits[0], temperature)

            next_word_id = torch.multinomial(probabilities, 1).item()
            generated_sequence.append(id_to_word.get(next_word_id, "<unk>"))

            # Slide the window: append the new ID and keep the most recent ones
            input_ids.append(next_word_id)
            input_ids = input_ids[-context_length:]

    return generated_sequence


In [None]:
# Sample 20 words after a three-word prompt at temperature 0.8.
# Sampling uses torch.multinomial, so the text changes between runs unless the
# torch RNG is re-seeded immediately before the call.
start_sequence = ["investors", "always", "dont"]
generated_sequence = generate_text(model, start_sequence, word_to_id, id_to_word, num_words=20, temperature=0.8)
print("Generated Sequence:", " ".join(generated_sequence))


Generated Sequence: investors always dont the <unk> of him <eos> the adviser have received the company <eos> it is just of the u.s. aircraft all


In [None]:
# Same sampling settings as the previous demo, with a different prompt.
start_sequence = ["washington", "is", "in"]
generated_sequence = generate_text(model, start_sequence, word_to_id, id_to_word, num_words=20, temperature=0.8)
print("Generated Sequence:", " ".join(generated_sequence))

Generated Sequence: washington is in ruling <eos> the mexico is <unk> an <unk> <unk> cut <eos> the department paul <unk> <unk> <eos> they <unk> <eos>


In [None]:
def top_k_sampling(probabilities, k=10):
    """Sample one index from the ``k`` most probable entries.

    Args:
        probabilities: 1-D tensor of non-negative weights.
        k: Number of top entries to sample from. Values larger than the number
            of entries are clamped (``torch.topk`` would otherwise raise).

    Returns:
        int: index into ``probabilities`` of the sampled entry.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    k = min(k, probabilities.size(-1))
    top_probs, top_indices = torch.topk(probabilities, k)
    top_probs = top_probs / top_probs.sum()  # Renormalize over the kept entries
    choice = torch.multinomial(top_probs, 1).item()  # position within the top-k list
    return top_indices[choice].item()


In [None]:
import torch
import torch.nn.functional as F

def apply_temperature(logits, temperature=1.0):
    """Convert scores to probabilities with temperature scaling.

    Args:
        logits: Tensor of scores; softmax is taken over the last dimension.
        temperature: Positive scaling factor. Values below 1 sharpen the
            distribution, values above 1 flatten it.

    Returns:
        Tensor of the same shape as ``logits`` whose last dimension sums to 1.

    Raises:
        ValueError: If ``temperature`` is not strictly positive. Zero would
            divide by zero and a negative value would invert the ranking.
    """
    if temperature <= 0:
        raise ValueError(f"temperature must be positive, got {temperature}")
    logits = logits / temperature
    return F.softmax(logits, dim=-1)

def top_k_sampling(probabilities, k=10):
    """Sample one index from the ``k`` most probable entries.

    Args:
        probabilities: 1-D tensor of non-negative weights.
        k: Number of top entries to sample from. Values larger than the number
            of entries are clamped (``torch.topk`` would otherwise raise).

    Returns:
        int: index into ``probabilities`` of the sampled entry.

    Raises:
        ValueError: If ``k`` is smaller than 1.
    """
    if k < 1:
        raise ValueError(f"k must be at least 1, got {k}")
    k = min(k, probabilities.size(-1))
    top_probs, top_indices = torch.topk(probabilities, k)
    top_probs = top_probs / top_probs.sum()  # Renormalize over the kept entries
    choice = torch.multinomial(top_probs, 1).item()  # position within the top-k list
    return top_indices[choice].item()

def generate_text(model, start_sequence, word_to_id, id_to_word, num_words=10, temperature=1.0, top_k=None, context_length=None):
    """Sample ``num_words`` continuation words after ``start_sequence``.

    At every step the model scores the current context window, the scores are
    temperature-scaled into probabilities, and the next word ID is sampled --
    from the ``top_k`` most probable words when ``top_k`` is given, otherwise
    from the full distribution. Output is stochastic.

    Args:
        model: Module mapping a (1, seq_len) ID tensor to (1, vocab) scores.
        start_sequence: Sequence of prompt words.
        word_to_id: Word -> ID mapping; unknown words use the ``"<unk>"`` ID
            (0 if the mapping has no ``"<unk>"``).
        id_to_word: ID -> word mapping for decoding.
        num_words: Number of words to generate.
        temperature: Passed to ``apply_temperature``.
        top_k: If truthy, restrict sampling to this many top words.
        context_length: Number of most recent token IDs kept as model input.
            ``None`` uses the model's largest convolution kernel when the
            model exposes ``kernel_sizes`` (the old hard-coded 5 was stale),
            otherwise 5. Pass the training ``seq_length`` to match the
            window size seen during training.

    Returns:
        list: the prompt words followed by the generated words.

    Raises:
        ValueError: If ``context_length`` is smaller than 1.
    """
    model.eval()
    device = next(model.parameters()).device  # run on the model's own device
    generated_sequence = list(start_sequence)  # accepts any sequence, not only lists

    # Convert start words to IDs
    unk_id = word_to_id.get("<unk>", 0)
    input_ids = [word_to_id.get(word, unk_id) for word in start_sequence]

    if context_length is None:
        context_length = max(getattr(model, "kernel_sizes", None) or [5])
    if context_length < 1:
        raise ValueError(f"context_length must be at least 1, got {context_length}")

    # Left-pad short prompts up to the window size.
    # NOTE(review): the pad value 0 is a real vocabulary ID, not a dedicated
    # padding token -- confirm this is acceptable.
    if len(input_ids) < context_length:
        input_ids = [0] * (context_length - len(input_ids)) + input_ids

    with torch.no_grad():
        for _ in range(num_words):
            input_tensor = torch.tensor(input_ids, dtype=torch.long, device=device).unsqueeze(0)  # (1, seq_len)
            logits = model(input_tensor)  # (1, vocab_size)
            probs = apply_temperature(logits[0], temperature)  # (vocab_size)

            if top_k:
                next_word_id = top_k_sampling(probs, top_k)
            else:
                next_word_id = torch.multinomial(probs, 1).item()

            generated_sequence.append(id_to_word.get(next_word_id, "<unk>"))

            # Slide the window: append the new ID and keep the most recent ones
            input_ids.append(next_word_id)
            input_ids = input_ids[-context_length:]

    return generated_sequence


In [None]:
# Top-k demo: same prompt as before, now sampling only among the 10 most
# probable words at each step. Output is stochastic.
start_sequence = ["washington", "is", "in"]
# start_sequence = ["the", "film", "was"]
generated = generate_text(
    model,
    start_sequence,
    word_to_id,
    id_to_word,
    num_words=20,
    temperature=0.8,   # Lower = more confident, higher = more random
    top_k=10           # Optional: sample only from top 10 likely words
)

print("Generated:", " ".join(generated))


Generated: washington is in the country as the <unk> of the company 's the <unk> of the <unk> and <unk> <unk> and <unk> <unk>


In [None]:
# Top-k demo with a finance prompt, generating 15 words.
start_sequence = ["stock", "trading", "in"]
generated_sequence = generate_text(
    model,
    start_sequence,
    word_to_id,
    id_to_word,
    num_words=15,
    temperature=0.8,
    top_k=10  # Only sample from top 10 words
)

print("Generated:", " ".join(generated_sequence))


Generated: stock trading in recent weeks <eos> the company has been sold <eos> in the u.s. and other <unk>


In [None]:
# Same prompt and sampling settings as the previous cell, but generating 20
# words instead of 15.
start_sequence = ["stock", "trading", "in"]
generated_sequence = generate_text(
    model,
    start_sequence,
    word_to_id,
    id_to_word,
    num_words=20,
    temperature=0.8,
    top_k=10  # Only sample from top 10 words
)

print("Generated:", " ".join(generated_sequence))


Generated: stock trading in N the new york court in the u.s. <eos> it said it is <unk> the <unk> <unk> and <unk> and


In [None]:
import torch
import math

def evaluate(model, data_loader, criterion):
    """Compute mean loss, top-1 accuracy and perplexity over ``data_loader``.

    The model is switched to eval mode and run without gradient tracking.
    Batches are moved to the device of the model's own parameters, so the
    function does not depend on a notebook-global ``device`` variable.

    Args:
        model: Module mapping an input batch to (batch, num_classes) scores.
        data_loader: Sized iterable of ``(inputs, targets)`` tensor pairs.
        criterion: Loss callable ``criterion(outputs, targets)``.

    Returns:
        (avg_loss, accuracy, perplexity): mean of the per-batch losses,
        accuracy in percent, and ``exp(avg_loss)``.

    Raises:
        ValueError: If ``data_loader`` yields no batches.
    """
    model.eval()
    first_param = next(model.parameters(), None)
    eval_device = first_param.device if first_param is not None else torch.device("cpu")

    total_loss = 0
    correct = 0
    total = 0

    with torch.no_grad():
        for inputs, targets in data_loader:
            inputs, targets = inputs.to(eval_device), targets.to(eval_device)
            outputs = model(inputs)
            loss = criterion(outputs, targets)
            total_loss += loss.item()
            _, predicted = torch.max(outputs, dim=1)
            correct += (predicted == targets).sum().item()
            total += targets.size(0)

    if total == 0:
        # E.g. drop_last=True with fewer samples than one batch.
        raise ValueError("data_loader produced no batches to evaluate")

    avg_loss = total_loss / len(data_loader)
    accuracy = 100 * correct / total
    perplexity = math.exp(avg_loss)  # perplexity = exp(mean cross-entropy)
    return avg_loss, accuracy, perplexity


In [None]:
# Re-run evaluation with the three-value evaluate() defined in the previous
# cell, which additionally reports perplexity = exp(average loss).
val_loss, val_accuracy, val_ppl = evaluate(model, valid_loader, criterion)
print(f"Validation Loss: {val_loss:.4f}, Accuracy: {val_accuracy:.2f}%, Perplexity: {val_ppl:.2f}")

test_loss, test_accuracy, test_ppl = evaluate(model, test_loader, criterion)
print(f"Test Loss: {test_loss:.4f}, Accuracy: {test_accuracy:.2f}%, Perplexity: {test_ppl:.2f}")


Validation Loss: 5.7652, Accuracy: 19.75%, Perplexity: 319.00
Test Loss: 5.6367, Accuracy: 19.53%, Perplexity: 280.54


In [None]:
# Hyperparameters must match the ones used when the checkpoint was saved,
# otherwise load_state_dict fails on mismatched parameter shapes.
vocab_size = 10000
embed_size = 128
cnn_channels = 150
hidden_size = 150
num_layers = 1
output_size = vocab_size

# Instantiate the model
model = ResidualMGUNetwork(vocab_size, embed_size, cnn_channels, hidden_size, num_layers, output_size)

# Move to device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

# Load the saved weights
# NOTE(review): torch.load unpickles the file; only load checkpoints you
# trust, and consider weights_only=True where the installed torch supports it.
model_path = "residual_mgu_model.pth"
model.load_state_dict(torch.load(model_path, map_location=device))
model.eval()  # disable dropout for inference
print("✅ Model loaded successfully!")


✅ Model loaded successfully!


In [None]:
# Generate with the model re-created from the saved checkpoint, as a check
# that the reloaded weights produce sensible text. Output is stochastic.
start_sequence = ["stock", "trading", "in"]
generated_sequence = generate_text(
    model,
    start_sequence,
    word_to_id,
    id_to_word,
    num_words=20,
    temperature=0.8,
    top_k=10  # Optional
)

print("Generated Sequence:", " ".join(generated_sequence))


Generated Sequence: stock trading in the u.s. <eos> the <unk> <unk> is a <unk> to be <unk> on the <unk> business <eos> it 's <unk>
