In [7]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig
import math
from typing import Tuple, Optional

## Configuration of the Model

In [3]:
class HoloConfig(PretrainedConfig):
    model_type = "holo"

    def __init__(
        self,
        vocab_size=50257,        # Default to GPT-2 tokenizer size
        hidden_size=768,         # This is 'd_model'
        hd_dim=2048,             # The Holographic Bus (Key/Value expansion)
        num_hidden_layers=12,    # Depth
        expansion_factor=4,      # MLP expansion (usually 4x hidden_size)
        max_position_embeddings=8192,
        layer_norm_eps=1e-5,
        initializer_range=0.02,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False, # Whether to tie input/output embeddings
        **kwargs,
    ):
        """
        Configuration class for HoloGPT.
        
        Args:
            hd_dim (int): The dimension of the holographic binding space. 
                          Ideally 2x-4x larger than hidden_size to reduce 
                          superposition noise (crosstalk).
        """
        self.vocab_size = vocab_size
        self.d_model = hidden_size
        self.hd_dim = hd_dim
        self.num_hidden_layers = num_hidden_layers
        self.expansion_factor = expansion_factor
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.tie_word_embeddings = tie_word_embeddings

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )


### Displaying the configuration

In [4]:
config_small = HoloConfig(
    vocab_size = 50257,
    hidden_size = 768,       # Standard small width
    hd_dim = 1536,           # 2x expansion for memory clarity
    num_hidden_layers = 12,  # Standard depth
    expansion_factor = 4     # Standard MLP width
)
print(config_small)

HoloConfig {
  "bos_token_id": 1,
  "d_model": 768,
  "eos_token_id": 2,
  "expansion_factor": 4,
  "hd_dim": 1536,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "holo",
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "vocab_size": 50257
}



In [5]:
config_medium = HoloConfig(
    vocab_size=50257,
    hidden_size=1024,       # Increased width
    hd_dim=3072,            # 3x expansion (High fidelity memory)
    num_hidden_layers=24,   # Deeper network for complex reasoning
    expansion_factor=4
)
print(config_medium)

HoloConfig {
  "bos_token_id": 1,
  "d_model": 1024,
  "eos_token_id": 2,
  "expansion_factor": 4,
  "hd_dim": 3072,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "holo",
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "vocab_size": 50257
}



In [6]:
config_large = HoloConfig(
    vocab_size=50257,
    hidden_size=1600,       # GPT-2 XL width
    hd_dim=4096,            # Massive holographic bus
    num_hidden_layers=48,   # Very deep
    expansion_factor=4
)
print(config_large)

HoloConfig {
  "bos_token_id": 1,
  "d_model": 1600,
  "eos_token_id": 2,
  "expansion_factor": 4,
  "hd_dim": 4096,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "holo",
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "vocab_size": 50257
}



### Debugging the Model Configuration

file functional.py

1. The Problem: "Local Blur" in Standard RoPE:
   - In standard Rotary Positional Embeddings (RoPE), the frequencies are generated using a geometric progression (decreasing frequencies).
   - Low Frequencies: Many dimensions in standard RoPE have very small frequencies (rotations).
   - The Consequence: If the rotation speed is slow, the rotor for position $t$ is almost identical to the rotor for position $t+1$.
   - Local Blur: When the model calculates attention (similarity) between tokens, nearby tokens (e.g., pos 50 and pos 51) have very high "positional overlap." The model struggles to distinguish precise ordering among neighbors.

In [10]:
from typing import Tuple, Optional

def generate_random_phasors(hd_dim: int, scaling_factor: float = 10.0, 
                            device: torch.device = None) -> torch.Tensor:
    """
    Generates fixed orthogonal high-frequency phases for the Holographic Key.
    Replaces Standard RoPE to fix the 'Local Blur' issue.
    
    Args:
        hd_dim: The holographic dimension (must be even if using view_as_complex logic later, 
                but here we assume complex64 tensors).
        scaling_factor: The scaling factor to force the frequencies to spin faster
                        (Preventing the Local Blur problem)
    
    Returns:
        freqs: A tensor of shape (hd_dim,) containing random frequencies.
    """
    # Drawn from a wide uniform distribution to ensure orthogonality across the spectrum
    # High frequencies (> 1.0) are critical for "Needle in a Haystack" precision.
    return torch.randn(hd_dim, device=device) * scaling_factor

In [9]:
def compute_rotors(
    seq_len: int, 
    freqs: torch.Tensor, 
    offset: int = 0
) -> torch.Tensor:
    """
    Creates the Holographic Rotors (Positional Encodings) in the Complex Plane.
    Formula: Rotor_t = exp(i * t * theta)
    
    Args:
        seq_len: Length of the sequence.
        freqs: The fixed random frequencies (hd_dim,).
        offset: Starting position index (for cache/inference steps).
        
    Returns:
        rotors: Complex tensor (1, seq_len, hd_dim)
    """
    # Create position indices [0, 1, 2, ...]
    t = torch.arange(seq_len, device=freqs.device, dtype=torch.float32) + offset
    
    # Outer product: positions * frequencies
    # Shape: (seq_len, hd_dim)
    angles = torch.outer(t, freqs)
    
    # Polar to Rectangular: exp(i * theta) = cos(theta) + i*sin(theta)
    rotors = torch.polar(torch.ones_like(angles), angles)
    
    return rotors.unsqueeze(0) # Add batch dim for broadcasting
