# 🧠 Cortex-1 Research Lab

Bienvenido al entorno de desarrollo de **Cortex-1**, una arquitectura híbrida diseñada para superar a los Transformers tradicionales.

## 🧬 La Arquitectura: Hybrid Mamba-MoE
En lugar de usar solo Atención (como GPT), usamos un enfoque biológico:
1.  **Mamba (SSM)**: Actúa como el "Hipocampo", proporcionando memoria de largo plazo infinita y lineal.
2.  **Atención**: Actúa como la "Corteza Prefrontal", razonando sobre la información inmediata.
3.  **MoE (Mixture of Experts)**: Regiones especializadas del cerebro que se activan solo cuando es necesario.
4.  **Byte-Level**: Sin tokenizador. El modelo lee bytes crudos (0-255), entendiendo el "ADN" de la información.

### Diagrama del Sistema
```mermaid
graph TD
    A[Input Bytes] --> B(Byte Embedding)
    B --> C{Backbone Loop}
    C -->|Layer 1, 3...| D[Mamba Block <br> Long-Term Memory]
    C -->|Layer 2, 4...| E[Attention Block <br> Reasoning]
    D --> C
    E --> C
    C --> F[MoE Router]
    F --> G[Expert 1: Logic]
    F --> H[Expert 2: Code]
    F --> I[Expert 3: Arts]
    G & H & I --> J(Layer Norm)
    J --> K[Output Logits]
```

In [None]:
# 1. Configuración del Entorno
import torch
import torch.nn as nn
import torch.nn.functional as F
import math
import time
import matplotlib.pyplot as plt
from IPython.display import clear_output
import urllib.request
import xml.etree.ElementTree as ET
import os

# Device selection: use the GPU when CUDA is available, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"üöÄ Cortex-1 Engine running on: {device.upper()}")

# Model and training hyperparameters (the "genome" of the run).
config = dict(
    vocab_size=256,       # byte-level vocabulary
    d_model=384,          # embedding dimension
    n_layers=6,           # depth of the backbone
    n_experts=4,          # number of MoE experts
    top_k=2,              # experts active per token
    block_size=256,       # context length for the attention part
    dropout=0.1,
    learning_rate=3e-4,
    batch_size=32,
    max_iters=1000,
)

## 📚 Fase 1: Universal Curriculum (Data Scraper)
Vamos a descargar papers "disruptivos" en tiempo real para entrenar al modelo.

In [None]:
# 2. El Recolector de Conocimiento (Scraper)
def download_disruptive_papers():
    """Fetch a few arXiv abstracts and return them as one training text.

    Queries the arXiv Atom API for at most three entries matching
    "transformer" and "attention", prints one progress line per entry and
    concatenates the title/abstract pairs.

    Returns:
        str: Every entry rendered as a "Title: ..." line, an "Abstract: ..."
        line and a blank line, all joined together. Empty when the feed
        contains no usable entries.

    Raises:
        OSError: If the HTTP request fails or times out
            (``urllib.error.URLError`` is a subclass).
        xml.etree.ElementTree.ParseError: If the response is not valid XML.
    """
    print("üì° Escaneando ArXiv en busca de conocimiento disruptivo...")
    # Simplified query, for demonstration purposes.
    url = 'http://export.arxiv.org/api/query?search_query=all:transformer+AND+all:attention&start=0&max_results=3'

    # The context manager closes the connection even if read() fails; the
    # timeout keeps the notebook from hanging forever on a stalled request.
    with urllib.request.urlopen(url, timeout=30) as response:
        payload = response.read().decode('utf-8')

    # NOTE(review): ElementTree is not hardened against malicious XML; this
    # is acceptable only as long as the feed source is trusted.
    root = ET.fromstring(payload)
    ns = {'atom': 'http://www.w3.org/2005/Atom'}

    texts = []
    for entry in root.findall('atom:entry', ns):
        # findtext() returns the default for a missing element and '' for an
        # empty one, so a malformed entry no longer raises AttributeError.
        title = entry.findtext('atom:title', default='', namespaces=ns).strip()
        summary = entry.findtext('atom:summary', default='', namespaces=ns).strip()
        if not title and not summary:
            continue  # nothing to learn from an empty entry
        print(f"   üìÑ Ingestando: {title[:50]}...")
        texts.append(f"Title: {title}\nAbstract: {summary}\n\n")

    return "".join(texts)

# Download a small sample corpus to experiment with (this always hits the
# network; no local cache is consulted).
raw_text = download_disruptive_papers()
print(f"\n‚úÖ Dataset cargado: {len(raw_text)} caracteres.")

# Byte-level preprocessing (no tokenizer): every UTF-8 byte becomes one id.
data_tensor = torch.tensor(list(raw_text.encode('utf-8')), dtype=torch.long)

# First 90% of the bytes are used for training, the remainder for validation.
n = int(0.9 * len(data_tensor))
train_data, val_data = data_tensor[:n], data_tensor[n:]

def get_batch(split):
    """Sample a random batch of input/target byte windows.

    Args:
        split: ``'train'`` selects ``train_data``; any other value selects
            ``val_data``.

    Returns:
        tuple: ``(x, y)`` tensors of shape ``(batch_size, block_size)`` moved
        to ``device``. ``y`` holds the same windows as ``x`` shifted one
        position forward in the data, i.e. the next-byte targets.

    Raises:
        ValueError: If the selected split has fewer than ``block_size + 1``
            bytes, so not even one (input, target) window fits.
    """
    data = train_data if split == 'train' else val_data
    block_size = config['block_size']

    # A window starting at i needs data[i : i + block_size + 1], so the number
    # of valid start offsets is len(data) - block_size.
    n_starts = len(data) - block_size
    if n_starts <= 0:
        raise ValueError(
            f"split {split!r} has {len(data)} bytes but at least "
            f"{block_size + 1} are needed for block_size={block_size}"
        )

    ix = torch.randint(n_starts, (config['batch_size'],))
    x = torch.stack([data[i:i + block_size] for i in ix])
    y = torch.stack([data[i + 1:i + block_size + 1] for i in ix])
    return x.to(device), y.to(device)

## 🧠 Fase 2: Implementación de Cortex-1
Aquí reside la magia: **Mamba Block** (memoria) + **MoE Layer** (especialización).

In [None]:
# 3. Componentes de la Arquitectura

class MambaBlock(nn.Module):
    """Simplified stand-in for a State Space Model block (demo only).

    The block projects the input to twice its width, splits the result into
    a value half and a residual half, gates the value with a sigmoid of its
    own projection, multiplies by the SiLU of the residual half and projects
    back to ``d_model``. It keeps no recurrent state.
    """

    def __init__(self, config):
        super().__init__()
        width = config['d_model']
        self.in_proj = nn.Linear(width, 2 * width)
        self.out_proj = nn.Linear(width, width)
        self.dt_proj = nn.Linear(width, width)
        # A real implementation would rely on optimized CUDA kernels here.

    def forward(self, x):
        """Apply the gated projection to a 3-D input and keep its shape."""
        # Unpacking three dims rejects inputs that are not 3-D.
        _batch, _length, _width = x.shape

        projected = self.in_proj(x)
        value, residual = torch.chunk(projected, 2, dim=-1)

        # Simple bypass for the demo (true Mamba needs a compiled scan).
        selective_gate = torch.sigmoid(self.dt_proj(value))
        value = value * selective_gate
        return self.out_proj(value * F.silu(residual))

class Expert(nn.Module):
    """A single expert: a position-wise feed-forward network.

    Expands ``d_model`` to four times its width, applies GELU, projects back
    to ``d_model`` and finishes with dropout.
    """

    def __init__(self, config):
        super().__init__()
        d_model = config['d_model']
        hidden = 4 * d_model
        layers = [
            nn.Linear(d_model, hidden),
            nn.GELU(),
            nn.Linear(hidden, d_model),
            nn.Dropout(config['dropout']),
        ]
        self.net = nn.Sequential(*layers)

    def forward(self, x):
        """Run ``x`` through the feed-forward stack."""
        return self.net(x)

class MoELayer(nn.Module):
    """Mixture-of-Experts layer with top-k gating.

    A linear gate scores every token against every expert. Each token is
    sent to its ``top_k`` best-scoring experts and the layer returns the
    sum of those experts' outputs, weighted by a softmax over the selected
    gate scores.

    Config keys read here: ``'n_experts'``, ``'top_k'``, ``'d_model'``
    (``Expert`` reads its own keys from the same config).
    """

    def __init__(self, config):
        super().__init__()
        self.num_experts = config['n_experts']
        self.top_k = config['top_k']
        self.experts = nn.ModuleList([Expert(config) for _ in range(self.num_experts)])
        self.gate = nn.Linear(config['d_model'], self.num_experts)
        # Detached gate logits of the most recent forward pass, kept only so
        # that callers can visualize routing preferences.
        self.last_gate_logits = None

    def forward(self, x):
        """Route every token through its top-k experts.

        Args:
            x: Tensor of shape ``(B, T, C)``.

        Returns:
            Tensor of shape ``(B, T, C)`` holding the weighted expert mix.
        """
        B, T, C = x.shape
        # reshape (not view) so non-contiguous inputs are accepted as well.
        x_flat = x.reshape(-1, C)
        gate_logits = self.gate(x_flat)
        self.last_gate_logits = gate_logits.detach()

        top_logits, indices = torch.topk(gate_logits, self.top_k, dim=-1)
        weights = F.softmax(top_logits, dim=-1)

        results = torch.zeros_like(x_flat)
        # Naive (slow) per-expert loop for the demo; a production version
        # would dispatch tokens with scatter/gather.
        for i, expert in enumerate(self.experts):
            slot_mask = indices == i              # (N, top_k)
            token_mask = slot_mask.any(dim=-1)    # (N,)
            if not token_mask.any():
                continue
            # topk indices are distinct per token, so at most one slot per
            # row matches and the masked sum picks exactly that weight.
            # Applying it here is what lets gradients reach the gate: the
            # unweighted sum left the router untrained.
            expert_weight = (weights * slot_mask).sum(dim=-1, keepdim=True)[token_mask]
            results[token_mask] += expert_weight * expert(x_flat[token_mask])

        return results.view(B, T, C)

class CortexHybrid(nn.Module):
    """Byte-level language model alternating Mamba and attention layers.

    Even-indexed layers are ``MambaBlock`` instances and odd-indexed layers
    are ``nn.TransformerEncoderLayer`` instances. The stack is followed by
    one ``MoELayer``, a final layer norm and a linear head that produces
    logits over the vocabulary.

    Config keys read here: ``'vocab_size'``, ``'d_model'``, ``'n_layers'``,
    ``'dropout'`` and the optional ``'n_heads'`` (default 4). The sub-modules
    read their own keys from the same config.
    """

    def __init__(self, config):
        super().__init__()
        d_model = config['d_model']
        self.embedding = nn.Embedding(config['vocab_size'], d_model)
        self.layers = nn.ModuleList()

        for i in range(config['n_layers']):
            # Alternate Mamba (memory) and attention (reasoning).
            if i % 2 == 0:
                self.layers.append(MambaBlock(config))
            else:
                self.layers.append(nn.TransformerEncoderLayer(
                    d_model=d_model,
                    nhead=config.get('n_heads', 4),
                    dim_feedforward=4 * d_model,
                    dropout=config['dropout'],
                    batch_first=True,
                ))

        self.moe = MoELayer(config)
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, config['vocab_size'])

    def forward(self, idx, targets=None):
        """Compute next-byte logits and, optionally, the training loss.

        Args:
            idx: Integer tensor of shape ``(B, T)`` with token ids.
            targets: Optional integer tensor of shape ``(B, T)`` with the
                expected next ids.

        Returns:
            tuple: ``(logits, loss)``. ``logits`` has shape
            ``(B, T, vocab_size)``; ``loss`` is the mean cross-entropy
            against ``targets``, or ``None`` when no targets are given.
        """
        B, T = idx.shape
        x = self.embedding(idx)

        # True above the diagonal = "may not attend". Without this mask each
        # position could look at later tokens, including its own target, so
        # training would learn to copy instead of predict.
        causal_mask = torch.triu(
            torch.ones(T, T, dtype=torch.bool, device=idx.device), diagonal=1
        )

        for layer in self.layers:
            if isinstance(layer, nn.TransformerEncoderLayer):
                x = layer(x, src_mask=causal_mask)
            else:
                x = layer(x)

        x = self.moe(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            B, T, C = logits.shape
            loss = F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))

        return logits, loss

# Build the network first, then move its parameters to the selected device.
model = CortexHybrid(config)
model = model.to(device)
# Report the total parameter count in millions.
print(f"üß† Cortex-1 Inicializado. Par√°metros: {sum(p.numel() for p in model.parameters())/1e6:.2f}M")

## 📊 Fase 3: Entrenamiento y Visualización en Tiempo Real
Observa cómo el modelo aprende y cómo decide qué expertos usar.

In [None]:
# 4. Training loop with a live dashboard
# `display` is only a builtin inside Jupyter; importing it keeps this cell
# runnable wherever IPython is installed.
from IPython.display import display

optimizer = torch.optim.AdamW(model.parameters(), lr=config['learning_rate'])

train_losses = []
expert_usage = torch.zeros(config['n_experts'])

plt.ion()
fig, (ax1, ax2) = plt.subplots(1, 2, figsize=(15, 5))

print("üî• Iniciando ignici√≥n de Cortex-1...")

# Make sure dropout is active even if an earlier cell switched to eval mode.
model.train()

# `step` rather than `iter`, so the builtin iter() is not shadowed.
for step in range(config['max_iters']):
    xb, yb = get_batch('train')
    logits, loss = model(xb, yb)

    optimizer.zero_grad(set_to_none=True)
    loss.backward()
    optimizer.step()

    train_losses.append(loss.item())

    # Expert "usage" is approximated from the gate logits of the last forward
    # pass: the per-expert mean raw logit, not an actual routing count.
    if model.moe.last_gate_logits is not None:
        # last_gate_logits is already detached by the MoE layer.
        usage = model.moe.last_gate_logits.mean(dim=0).cpu()
        expert_usage = 0.9 * expert_usage + 0.1 * usage  # moving average

    if step % 50 == 0:
        clear_output(wait=True)

        # Plot 1: training loss
        ax1.clear()
        ax1.plot(train_losses, label='Training Loss', color='#38bdf8')
        ax1.set_title('Curva de Aprendizaje (Loss)')
        ax1.set_xlabel('Iteraciones')
        ax1.set_ylabel('Loss')
        ax1.grid(True, alpha=0.1)

        # Plot 2: MoE expert preference
        ax2.clear()
        ax2.bar(range(config['n_experts']), expert_usage, color='#818cf8')
        ax2.set_title('Activaci√≥n de Expertos (MoE)')
        ax2.set_xlabel('ID del Experto')
        ax2.set_ylabel('Nivel de Actividad')

        display(fig)
        print(f"Iter {step}: Loss {loss.item():.4f}")

plt.ioff()
print("‚úÖ Entrenamiento completado.")

In [None]:
# 5. Prueba de Generación (Inferencia)
@torch.no_grad()
def generate(prompt, max_new_tokens=100):
    """Sample a continuation of ``prompt`` from the global ``model``.

    The prompt is encoded to UTF-8 bytes. At each step the model sees at
    most the last ``block_size`` bytes and the next byte is drawn from the
    softmax distribution over its final-position logits.

    Args:
        prompt: Text to continue.
        max_new_tokens: Number of bytes to sample.

    Returns:
        str: Prompt plus sampled bytes decoded as UTF-8; byte sequences that
        are not valid UTF-8 appear as U+FFFD replacement characters.

    Raises:
        ValueError: If ``prompt`` is empty while bytes are requested, since
            the model needs at least one position to predict from.
    """
    # Convert the text to raw bytes (one token id per byte).
    prompt_bytes = list(prompt.encode('utf-8'))
    if not prompt_bytes and max_new_tokens > 0:
        raise ValueError("prompt must contain at least one byte")
    idx = torch.tensor(prompt_bytes, dtype=torch.long, device=device).unsqueeze(0)

    # Sample in eval mode so dropout does not perturb the distribution, and
    # restore the previous mode afterwards so training can continue.
    was_training = model.training
    model.eval()
    try:
        for _ in range(max_new_tokens):
            idx_cond = idx[:, -config['block_size']:]
            logits, _ = model(idx_cond)
            logits = logits[:, -1, :]
            probs = F.softmax(logits, dim=-1)
            idx_next = torch.multinomial(probs, num_samples=1)
            idx = torch.cat((idx, idx_next), dim=1)
    finally:
        model.train(was_training)

    # Decode bytes to text; invalid or partial UTF-8 sequences are replaced.
    return bytes(idx[0].tolist()).decode('utf-8', errors='replace')

# Announce the sample, then generate a continuation of a fixed prompt and
# print it.
print("ü§ñ Cortex-1 dice:")
completion = generate("Artificial Intelligence is")
print(completion)