In [7]:
import torch 
import torch.nn as nn
import torch.nn.functional as F
from transformers import PretrainedConfig
import math
from typing import Tuple, Optional

## Configuration of the Model

In [17]:
class HoloConfig(PretrainedConfig):
    model_type = "holo"

    def __init__(
        self,
        vocab_size=50257,        # Default to GPT-2 tokenizer size
        hidden_size=768,         # This is 'd_model'
        hd_dim=2048,             # The Holographic Bus (Key/Value expansion)
        num_hidden_layers=12,    # Depth
        expansion_factor=4,      # MLP expansion (usually 4x hidden_size)
        max_position_embeddings=8192,
        layer_norm_eps=1e-5,
        initializer_range=0.02,
        dropout = 0.0,
        pad_token_id=0,
        bos_token_id=1,
        eos_token_id=2,
        tie_word_embeddings=False, # Whether to tie input/output embeddings
        **kwargs,
    ):
        """
        Configuration class for HoloGPT.
        
        Args:
            hd_dim (int): The dimension of the holographic binding space. 
                          Ideally 2x-4x larger than hidden_size to reduce 
                          superposition noise (crosstalk).
        """
        self.vocab_size = vocab_size
        self.d_model = hidden_size
        self.hd_dim = hd_dim
        self.num_hidden_layers = num_hidden_layers
        self.expansion_factor = expansion_factor
        self.max_position_embeddings = max_position_embeddings
        self.layer_norm_eps = layer_norm_eps
        self.initializer_range = initializer_range
        self.tie_word_embeddings = tie_word_embeddings
        self.dropout = dropout

        super().__init__(
            pad_token_id=pad_token_id,
            bos_token_id=bos_token_id,
            eos_token_id=eos_token_id,
            **kwargs,
        )


### Displaying the configuration

In [4]:
config_small = HoloConfig(
    vocab_size = 50257,
    hidden_size = 768,       # Standard small width
    hd_dim = 1536,           # 2x expansion for memory clarity
    num_hidden_layers = 12,  # Standard depth
    expansion_factor = 4     # Standard MLP width
)
print(config_small)

HoloConfig {
  "bos_token_id": 1,
  "d_model": 768,
  "eos_token_id": 2,
  "expansion_factor": 4,
  "hd_dim": 1536,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "holo",
  "num_hidden_layers": 12,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "vocab_size": 50257
}



In [5]:
config_medium = HoloConfig(
    vocab_size=50257,
    hidden_size=1024,       # Increased width
    hd_dim=3072,            # 3x expansion (High fidelity memory)
    num_hidden_layers=24,   # Deeper network for complex reasoning
    expansion_factor=4
)
print(config_medium)

HoloConfig {
  "bos_token_id": 1,
  "d_model": 1024,
  "eos_token_id": 2,
  "expansion_factor": 4,
  "hd_dim": 3072,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "holo",
  "num_hidden_layers": 24,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "vocab_size": 50257
}



In [6]:
config_large = HoloConfig(
    vocab_size=50257,
    hidden_size=1600,       # GPT-2 XL width
    hd_dim=4096,            # Massive holographic bus
    num_hidden_layers=48,   # Very deep
    expansion_factor=4
)
print(config_large)

HoloConfig {
  "bos_token_id": 1,
  "d_model": 1600,
  "eos_token_id": 2,
  "expansion_factor": 4,
  "hd_dim": 4096,
  "initializer_range": 0.02,
  "layer_norm_eps": 1e-05,
  "max_position_embeddings": 8192,
  "model_type": "holo",
  "num_hidden_layers": 48,
  "pad_token_id": 0,
  "transformers_version": "4.57.3",
  "vocab_size": 50257
}



### Debugging the Model Configuration

file functional.py

1. The Problem: "Local Blur" in Standard RoPE:
   - In standard Rotary Positional Embeddings (RoPE), the frequencies are generated using a geometric progression (decreasing frequencies).
   - Low Frequencies: Many dimensions in standard RoPE have very small frequencies (rotations).
   - The Consequence: If the rotation speed is slow, the rotor for position $t$ is almost identical to the rotor for position $t+1$.
   - Local Blur: When the model calculates attention (similarity) between tokens, nearby tokens (e.g., pos 50 and pos 51) have very high "positional overlap." The model struggles to distinguish precise ordering among neighbors.

In [19]:
from typing import Tuple, Optional, Union

def generate_random_phasors(hd_dim: int, scaling_factor: float = 10.0, 
                            device: torch.device = None) -> torch.Tensor:
    """
    Generates fixed orthogonal high-frequency phases for the Holographic Key.
    Replaces Standard RoPE to fix the 'Local Blur' issue.
    
    Args:
        hd_dim: The holographic dimension (must be even if using view_as_complex logic later, 
                but here we assume complex64 tensors).
        scaling_factor: The scaling factor to force the frequencies to spin faster
                        (Preventing the Local Blur problem)
    
    Returns:
        freqs: A tensor of shape (hd_dim,) containing random frequencies.
    """
    # Drawn from a wide uniform distribution to ensure orthogonality across the spectrum
    # High frequencies (> 1.0) are critical for "Needle in a Haystack" precision.
    return torch.randn(hd_dim, device=device) * scaling_factor

In [9]:
def compute_rotors(
    seq_len: int, 
    freqs: torch.Tensor, 
    offset: int = 0
) -> torch.Tensor:
    """
    Creates the Holographic Rotors (Positional Encodings) in the Complex Plane.
    Formula: Rotor_t = exp(i * t * theta)
    
    Args:
        seq_len: Length of the sequence.
        freqs: The fixed random frequencies (hd_dim,).
        offset: Starting position index (for cache/inference steps).
        
    Returns:
        rotors: Complex tensor (1, seq_len, hd_dim)
    """
    # Create position indices [0, 1, 2, ...]
    t = torch.arange(seq_len, device=freqs.device, dtype=torch.float32) + offset
    
    # Outer product: positions * frequencies
    # Shape: (seq_len, hd_dim)
    angles = torch.outer(t, freqs)
    
    # Polar to Rectangular: exp(i * theta) = cos(theta) + i*sin(theta)
    rotors = torch.polar(torch.ones_like(angles), angles)
    
    return rotors.unsqueeze(0) # Add batch dim for broadcasting


In [11]:
def holo_bind_and_accumulate(
    v: torch.Tensor, 
    rotors: torch.Tensor
) -> Tuple[torch.Tensor, torch.Tensor]:
    """
    The Core Holographic Memory Operation (The 'Write' Head).
    Performs Binding -> Accumulation with Mixed Precision stability.
    
    Args:
        v: Value tensor (Batch, Seq, HD_Dim) [Complex64]
        rotors: Positional Rotors (1, Seq, HD_Dim) [Complex64]
        
    Returns:
        memory_trace: The cumulative sum of bound states.
    """
    # 1. Holographic Binding (Element-wise Rotation)
    # This encodes the "Position" into the "Value"
    bound_state = v * rotors
    
    # 2. Linear Memory Accumulation (The Scan)
    # CRITICAL: Force FP32 for the cumulative sum to prevent "Swamping" 
    # (where small new tokens vanish in long contexts).
    # We cast to complex128 (Float64 real/imag) or complex64 (Float32 real/imag).
    # If input is BF16/FP16, this step must upgrade precision.
    bound_state_fp32 = bound_state.to(torch.complex64) 
    
    memory_trace = torch.cumsum(bound_state_fp32, dim=1)
    
    return memory_trace



In [12]:
def holo_retrieve(
    memory_trace: torch.Tensor, 
    rotors: torch.Tensor
) -> torch.Tensor:
    """
    The Holographic Retrieval Operation (The 'Read' Head).
    Performs Unbinding -> Normalization.
    
    Args:
        memory_trace: Accumulated memory (Batch, Seq, HD_Dim)
        rotors: Positional Rotors (1, Seq, HD_Dim)
        
    Returns:
        retrieved: The decoded signal (Real-valued projection ready for output).
    """
    B, T, D = memory_trace.shape
    
    # 1. Unbinding (Derotation)
    # Multiply by the CONJUGATE of the rotor. 
    # If we bound with exp(i*theta), we unbind with exp(-i*theta).
    # This cancels the phase for the target position, leaving the signal at DC (0 freq).
    raw_retrieval = memory_trace * torch.conj(rotors)
    
    # 2. Normalization (The Scaling Law Fix)
    # The magnitude of a random walk grows by sqrt(T). 
    # We divide by sqrt(T) to keep the signal variance roughly 1.0 for the MLP.
    scale = torch.sqrt(
        torch.arange(1, T + 1, device=memory_trace.device, dtype=torch.float32)
    ).view(1, T, 1)
    
    # Avoid div by zero (sanity check)
    scale = torch.clamp(scale, min=1.0)
    
    normalized_retrieval = raw_retrieval / scale
    
    # 3. Project to Real
    # The information is stored in the Magnitude/Real alignment.
    # We return the real part. The Imaginary part contains the "crosstalk noise" 
    # from other positions, which the MLP will filter out.
    return normalized_retrieval.real


file "layers.py** file

This file defines the mechansim of the Attention (HoloAttention) 

In [23]:
class HoloAttention(nn.Module):
    """
    The Holographic 'Attention' Mechanism.
    Replaces N^2 Softmax Attention with O(N) Complex-Valued Recurrence.
    """
    def __init__(self, config):
        super().__init__()
        self.d_model = config.d_model
        self.hd_dim = config.hd_dim 

        # 1. Projections (Real -> Complex)
        # We project inputs into the Holographic "Hyper-Dimension"
        self.k_proj = nn.Linear(config.d_model, config.hd_dim, bias=False)
        self.v_proj = nn.Linear(config.d_model, config.hd_dim, bias=False)
        self.o_proj = nn.Linear(config.d_model, config.d_model, bias=False) # Output is Real

        # 2. Fixed Random Phasors (The "Keys")
        # Registered as buffer so they save with the model don't update via GD
        self.register_buffer("freqs", generate_random_phasors(config.hd_dim))

        # 3. Residual Dropout
        self.resid_dropout = nn.Dropout(config.dropout)


    def forward(self, x): 
        B, T, C = x.shape

        # --- Step 1: Project to Holographic Space --- 
        # k, v shape: (B, T, hd_dim)
        # We cast to complex64 immediately to enable phase operations
        k_real = self.k_proj(x)
        v_real = self.v_proj(x)

        # In a full implementation, K determines *which* frequency to write to.
        # For this version (Linear Associative Memory), we use V as the content
        # and implicit position as the key.
        # Future improvement: Use K to modulate the frequencies (Data-Dependent).
        v = v_real.to(torch.complex64)

        # --- Step 2: Generate Positional Rotors ---
        # Rotors shape: (1, T, hd_dim)
        rotors = compute_rotors(T, self.freqs)

        # --- Step 3: Bind & Accumulate (The O(N) Magic) ---
        # This replaces the Attention Matrix calculation
        memory_trace = holo_bind_and_accumulate(v, rotors)

        # --- Step 4: Retrieve (Derotate) --- 
        # This replaces the Attention * Value calculation 
        output_complex = holo_retrieve(memory_trace, rotors)
        output_real = output_complex.real

        projected = self.o_proj(output_real)
        
        # --- Step 5: Project Output ---
        # We take the Real part (Magnitude/Phase alignment)
        return self.resid_dropout(projected)

In [24]:
class HoloBlock(nn.Module):
    """
    Standard Transformer Block structure, but swapping Self-Attention 
    for HoloAttention.
    Structure: Input -> LN -> Holo -> Add -> LN -> MLP -> Add
    """
    def __init__(self, config):
        super().__init__()
        self.ln1 = nn.LayerNorm(config.d_model)
        self.attn = HoloAttention(config)

        
        self.ln2 = nn.LayerNorm(config.d_model)
        self.mlp = nn.Sequential(
            nn.Linear(config.d_model, config.d_model * config.expansion_factor),
            nn.GELU(),
            nn.Linear(config.d_model * config.expansion_factor, config.d_model),
            nn.Dropout(config.dropout)
        )

    def forward(self, x):
        # 1. Holographic Mixer Path
        residual = x
        x = self.ln1(x)
        x = residual + self.attn(x)
        
        # 2. MLP Path (The "Denoising" Step)
        residual = x
        x = self.ln2(x)
        x = residual + self.mlp(x)
        
        return x

SyntaxError: invalid syntax. Perhaps you forgot a comma? (139791767.py, line 17)

file **modeling_holo.py** file

Creating a Model with all of the contents take the inspiration from HuggingFace

In [15]:
from transformers import PreTrainedModel, AutoModelForCausalLM
from transformers.modeling_outputs import BaseModelOutputWithPast, \
                                            CausalLMOutputWithPast

In [16]:
class HoloPreTrainedModel(PreTrainedModel):
    """
    Base class for Holo-Transformer weights initialization and utilities
    """
    config_class = HoloConfig
    base_model_prefix = "holo"
    supports_gradient_checkpointing = True
    _no_split_modules = ["HoloBlock"]

    def _init_weights(self, module):
        """
        Standard GPT-style initialization.
        """
        std = self.config.initializer_range
        if isinstance(module, nn.Linear):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.bias is not None:
                module.bias.data.zero_()
        elif isinstance(module, nn.Embedding):
            module.weight.data.normal_(mean=0.0, std=std)
            if module.padding_idx is not None:
                module.weight.data[module.padding_idx].zero_()
        elif isinstance(module, nn.LayerNorm):
            module.bias.data.zero_()
            module.weight.data.fill_(1.0)


In [20]:
class HoloModel(HoloPreTrainedModel):
    """
    The bare Holo-Transformer backbone (Embeddings + Layers).
    """
    def __init__(self, config: HoloConfig):
        super().__init__(config)
        self.config = config

        # 1. Embeddings
        # Note: We do NOT use Positional Embeddings here. 
        # The Holographic Layer handles position via complex rotation internally.
        self.wte = nn.Embedding(config.vocab_size, config.d_model)
        self.drop = nn.Dropout(config.dropout) # Usually 0 for LLMs, but kept for interface

        # 2. The Stack
        self.h = nn.ModuleList([HoloBlock(config) for _ in range(config.num_hidden_layers)])
        
        # 3. Final Norm
        self.ln_f = nn.LayerNorm(config.d_model, eps=config.layer_norm_eps)

        self.post_init()

    def get_input_embeddings(self):
        return self.wte

    def set_input_embeddings(self, value):
        self.wte = value

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs # Catch-all for past_key_values if added later
    ) -> Union[Tuple, BaseModelOutputWithPast]:
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 1. Prepare Input
        if inputs_embeds is None:
            inputs_embeds = self.wte(input_ids)
        
        hidden_states = inputs_embeds
        hidden_states = self.drop(hidden_states)

        # 2. Run Layers
        all_hidden_states = () if return_dict else None
        
        for block in self.h:
            if return_dict:
                all_hidden_states = all_hidden_states + (hidden_states,)
            
            # The Magic happens here
            hidden_states = block(hidden_states)

        # 3. Finalize
        hidden_states = self.ln_f(hidden_states)

        if not return_dict:
            return (hidden_states,)

        return BaseModelOutputWithPast(
            last_hidden_state=hidden_states,
            hidden_states=all_hidden_states,
        )


Defining the HoloBlock :))) 

In [None]:
class HoloForCausalLM(HoloPreTrainedModel):
    """
    The End-to-End Language Model (Backbone + LM Head).
    Use this for training.
    """
    def __init__(self, config: HoloConfig):
        super().__init__(config)
        self.holo = HoloModel(config)
        
        # LM Head (Projects back to Vocab)
        self.lm_head = nn.Linear(config.d_model, config.vocab_size, bias=False)

        # Weight Tying (Optional but standard)
        if config.tie_word_embeddings:
            self.lm_head.weight = self.holo.wte.weight

        self.post_init()

    def get_output_embeddings(self):
        return self.lm_head

    def set_output_embeddings(self, new_embeddings):
        self.lm_head = new_embeddings

    def forward(
        self,
        input_ids: torch.LongTensor = None,
        labels: Optional[torch.LongTensor] = None,
        inputs_embeds: Optional[torch.FloatTensor] = None,
        return_dict: Optional[bool] = None,
        **kwargs
    ) -> Union[Tuple, CausalLMOutputWithPast]:
        
        return_dict = return_dict if return_dict is not None else self.config.use_return_dict

        # 1. Run Backbone
        outputs = self.holo(
            input_ids=input_ids,
            inputs_embeds=inputs_embeds,
            return_dict=return_dict,
            **kwargs
        )
        hidden_states = outputs[0]

        # 2. Compute Logits
        logits = self.lm_head(hidden_states)

        # 3. Compute Loss (if labels provided)
        loss = None
        if labels is not None:
            # Shift so that tokens < n predict n
            shift_logits = logits[..., :-1, :].contiguous()
            shift_labels = labels[..., 1:].contiguous()
            
            # Flatten tokens
            loss_fct = nn.CrossEntropyLoss()
            loss = loss_fct(shift_logits.view(-1, self.config.vocab_size), shift_labels.view(-1))

        if not return_dict:
            output = (logits,) + outputs[1:]
            return ((loss,) + output) if loss is not None else output

        return CausalLMOutputWithPast(
            loss=loss,
            logits=logits,
            hidden_states=outputs.hidden_states,
        )
