# Types of Unconditioned Prediction in Neural Networks
### Author: Mohammed HAMDAN © 2025

This notebook provides practical implementations and visualizations of different types of unconditioned prediction in neural networks. We'll explore:
1. Left-to-right Autoregressive Prediction
2. Left-to-right Markov Chain
3. Independent Prediction
4. Bidirectional Prediction

In [ ]:
# Install required packages
!pip install torch torchvision matplotlib seaborn numpy

In [ ]:
# Import necessary libraries
import torch
import torch.nn as nn
import torch.nn.functional as F
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from typing import List, Tuple

# Seed the global PyTorch and NumPy random number generators with a fixed
# value so that randomly initialised weights and sampled inputs are repeatable.
torch.manual_seed(42)
np.random.seed(42)

## 1. Autoregressive Prediction
The autoregressive model predicts each token based on all previous tokens, similar to how humans read text from left to right.

In [ ]:
class AutoregressivePredictor(nn.Module):
    """Left-to-right predictor: token embedding -> GRU -> vocabulary logits.

    Because the GRU consumes the sequence in order, the logits emitted at
    position ``t`` depend only on the tokens at positions ``0..t``.
    """

    def __init__(self, vocab_size: int, hidden_size: int):
        super().__init__()
        # Lookup table mapping each token ID to a dense vector.
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Recurrent layer; batch_first=True means inputs are (batch, seq, feature).
        self.rnn = nn.GRU(hidden_size, hidden_size, batch_first=True)
        # Projects every hidden state onto one score per vocabulary entry.
        self.output = nn.Linear(hidden_size, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-position logits for a batch of token IDs ``x``."""
        # The GRU returns (all hidden states, final hidden state); only the
        # per-position hidden states are needed here.
        hidden_states = self.rnn(self.embedding(x))[0]
        return self.output(hidden_states)

## 2. Markov Chain Prediction
The Markov model uses a fixed-size window of previous tokens, trading long-range understanding for computational efficiency.

In [ ]:
class MarkovPredictor(nn.Module):
    """Fixed-window predictor: logits at position ``t`` see only a short context.

    The context for position ``t`` is the embeddings of tokens
    ``t - context_size + 1 .. t`` (inclusive); positions before the start of
    the sequence are filled with zero vectors.

    Args:
        vocab_size: Number of distinct token IDs.
        hidden_size: Embedding dimension of each token.
        context_size: Number of tokens in the window (must be at least 1).

    Raises:
        ValueError: If ``context_size`` is smaller than 1.
    """

    def __init__(self, vocab_size: int, hidden_size: int, context_size: int = 3):
        super().__init__()
        if context_size < 1:
            # Fail early with a clear message instead of an opaque error
            # from the window construction inside forward().
            raise ValueError(f"context_size must be >= 1, got {context_size}")
        self.context_size = context_size
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Consumes the concatenated embeddings of the whole context window.
        self.fc = nn.Linear(context_size * hidden_size, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return logits of shape (batch, seq, vocab) for token IDs ``x`` of shape (batch, seq)."""
        embedded = self.embedding(x)
        batch_size, seq_len, hidden_size = embedded.shape
        # Left-pad the sequence axis with zeros so every position, including
        # the first ones, has a full window of context_size vectors.
        padded = F.pad(embedded, (0, 0, self.context_size - 1, 0))
        # Slice i holds, for each position t, the token at offset
        # t - (context_size - 1) + i; stacking yields (batch, seq, context, hidden).
        windows = torch.stack(
            [padded[:, i:i + seq_len] for i in range(self.context_size)], dim=2
        )
        # Spell out the flattened width instead of using -1: reshape cannot
        # infer -1 for a tensor with zero elements (empty batch or sequence).
        flattened = windows.reshape(
            batch_size, seq_len, self.context_size * hidden_size
        )
        return self.fc(flattened)

## 3. Independent Prediction
The independent model predicts each position from that position's own token alone, ignoring all surrounding context, much like a unigram model.

In [ ]:
class IndependentPredictor(nn.Module):
    """Context-free predictor: each position is scored from its own token only.

    The logits at a position are a linear function of that position's
    embedding, so no information flows between positions.
    """

    def __init__(self, vocab_size: int, hidden_size: int):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Maps a single token embedding straight to vocabulary scores.
        self.fc = nn.Linear(hidden_size, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-position logits for a batch of token IDs ``x``."""
        return self.fc(self.embedding(x))

## 4. Bidirectional Prediction
The bidirectional model lets every position attend to both past and future tokens, the same kind of two-sided context that BERT exploits in masked language modeling.

In [ ]:
class BidirectionalPredictor(nn.Module):
    """Full-context predictor: embedding -> one Transformer encoder layer -> logits.

    No attention mask is passed to the encoder layer, so the logits at every
    position can depend on every token in the sequence, past and future.

    NOTE(review): no positional encoding is added to the embeddings, so the
    encoder layer receives no explicit information about token order.

    Args:
        vocab_size: Number of distinct token IDs.
        hidden_size: Embedding / model dimension. Must be divisible by
            ``nhead`` (enforced by the underlying attention module).
        nhead: Number of attention heads. Defaults to 4, the value that was
            previously hard-coded.
    """

    def __init__(self, vocab_size: int, hidden_size: int, nhead: int = 4):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, hidden_size)
        # Self-attention layer; batch_first=True means (batch, seq, feature).
        self.transformer = nn.TransformerEncoderLayer(
            d_model=hidden_size,
            nhead=nhead,
            dim_feedforward=hidden_size * 4,
            batch_first=True,
        )
        self.output = nn.Linear(hidden_size, vocab_size)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Return per-position logits for a batch of token IDs ``x``."""
        embedded = self.embedding(x)
        encoded = self.transformer(embedded)
        return self.output(encoded)

## Visualization Functions
These functions help us understand how each model uses context for predictions.

In [ ]:
def visualize_dependencies(sequence_length: int = 6, context_size: int = 3):
    """Draw a 2x2 grid of dependency heatmaps, one per prediction type.

    In each panel, row ``i`` is an output position and column ``j`` an input
    position; a dark cell means output ``i`` may depend on input ``j``.

    Args:
        sequence_length: Number of positions shown along each axis.
        context_size: Window width used for the Markov panel (current token
            plus the ``context_size - 1`` tokens before it).

    Returns:
        The matplotlib figure containing the four panels.
    """
    size = (sequence_length, sequence_length)

    # Markov: each row is dark only inside a window ending at the diagonal.
    markov_matrix = np.zeros(size)
    for i in range(sequence_length):
        start = max(0, i - context_size + 1)
        markov_matrix[i, start:i + 1] = 1

    patterns = [
        # Lower triangle: every position sees itself and everything before it.
        ('Autoregressive', np.tril(np.ones(size))),
        ('Markov Chain', markov_matrix),
        # Diagonal only: every position sees just itself.
        ('Independent', np.eye(sequence_length)),
        # Full matrix: every position sees every other position.
        ('Bidirectional', np.ones(size)),
    ]

    fig, axes = plt.subplots(2, 2, figsize=(12, 12))
    for ax, (title, matrix) in zip(axes.ravel(), patterns):
        # Pin the colour range to [0, 1]. With an automatic range, a constant
        # matrix (the all-ones bidirectional panel) has vmin == vmax and is
        # drawn in the lightest colour, making it look empty.
        sns.heatmap(matrix, ax=ax, cmap='Blues', cbar=False, vmin=0, vmax=1,
                    xticklabels=False, yticklabels=False)
        ax.set_title(title)

    plt.tight_layout()
    return fig

## Demonstration
Let's see how each model makes predictions on a sample sequence.

In [ ]:
def demonstrate_models():
    """Run each predictor once on the same random sequence.

    Returns:
        A dict mapping the model name to a tensor of predicted token IDs
        (the argmax over the vocabulary) with shape (batch, sequence).
    """
    # Set up parameters
    vocab_size = 100
    hidden_size = 64
    sequence_length = 10
    batch_size = 1

    # Create sample data: one batch of random token IDs.
    x = torch.randint(0, vocab_size, (batch_size, sequence_length))

    # Initialize models (untrained, randomly initialised weights).
    models = {
        'Autoregressive': AutoregressivePredictor(vocab_size, hidden_size),
        'Markov': MarkovPredictor(vocab_size, hidden_size),
        'Independent': IndependentPredictor(vocab_size, hidden_size),
        'Bidirectional': BidirectionalPredictor(vocab_size, hidden_size)
    }

    # Make predictions
    results = {}
    for name, model in models.items():
        # Switch to inference mode. torch.no_grad() alone does not disable
        # dropout, and the Transformer encoder layer applies dropout by
        # default, which would make its predictions random from call to call.
        model.eval()
        with torch.no_grad():
            logits = model(x)
            # Softmax is monotonic, so the argmax of the logits is already
            # the most probable token; no need to normalise first.
            results[name] = torch.argmax(logits, dim=-1)

    return results

# Run demonstration
results = demonstrate_models()

# Visualize dependencies.
# visualize_dependencies() builds its own figure with plt.subplots(), so no
# separate plt.figure() call is made here; one would only leave an extra,
# empty figure to be shown alongside the real one.
visualize_dependencies()
plt.show()

# Print results summary: output shape and number of distinct predicted tokens.
print("\nPrediction Analysis Summary")
print("=" * 50)
for name, preds in results.items():
    print(f"\n{name} Model:")
    print(f"- Prediction shape: {preds.shape}")
    print(f"- Unique predictions: {len(preds.unique())}")

print("\nMohammed HAMDAN © 2025 • An analysis of prediction architectures in neural networks")