# 🏆 Cortex-10: The World Cup of AI Architectures

## 🌍 El Mundial de la Inteligencia Artificial
Bienvenido al experimento definitivo. No probaremos un modelo. Probaremos **TODOS**.

### 🥊 Los Competidores
1.  **Transformer (The Standard)**: El campeón actual (GPT).
2.  **Mamba (The Challenger)**: La serpiente lineal (SSM).
3.  **Hybrid (The Cyborg)**: Fusión Mamba + Atención.
4.  **RWKV (The LRM)**: Recurrent Weighted Key-Value. Eficiencia RNN con potencia GPT.
5.  **MoE (The Specialist)**: Mixture of Experts. Cerebro gigante, activación dispersa.

### 📅 El Formato
1.  **Los Clasificatorios (Qualifiers)**: 100 pruebas rápidas para CADA arquitectura. Solo el mejor de cada casa pasa a la final.
2.  **La Final (Adaptación)**: Los 5 campeones aprenderán Shakespeare y luego tendrán que aprender Matemáticas.
3.  **El Criterio**: Ganará quien tenga mejor **Plasticidad** (aprenda rápido sin destruir su cerebro anterior) y **Claridad Simbólica**.

---

In [None]:
# 1. Setup & Librerías
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Select the compute device: the GPU when CUDA is available, otherwise the CPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'
# Announce which device this run will use.
print(f"üöÄ Cortex-10 Engine: {device.upper()}")

def set_seed(seed=42):
    """Seed Python's, NumPy's and PyTorch's random generators with ``seed``.

    The CUDA generators are seeded as well when a CUDA device is available.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if not torch.cuda.is_available():
        return
    torch.cuda.manual_seed_all(seed)

# Seed every random generator once, up front, with the fixed seed 42.
set_seed(42)

### 2. El Arsenal (Definición de Arquitecturas)

In [None]:
# --- 1. MAMBA BLOCK (SSM) ---
class MambaBlock(nn.Module):
    """Simplified Mamba-style block: a gated, causal depthwise convolution.

    The input is projected to two ``d_model``-wide streams. The first goes
    through a depthwise ``Conv1d`` (kernel size 3) along the sequence axis
    followed by SiLU; the second is squashed with a sigmoid and used as a
    multiplicative gate. The gated result is projected back to ``d_model``.

    The convolution is *causal*: the output at position ``t`` only depends on
    inputs at positions ``t-2 .. t``. A centred convolution (``padding=1``)
    would let position ``t`` read token ``t+1``, which is exactly the target
    in next-token prediction.

    Args:
        d_model: Width of the input and output features.
    """

    def __init__(self, d_model):
        super().__init__()
        self.in_proj = nn.Linear(d_model, d_model * 2)
        self.out_proj = nn.Linear(d_model, d_model)
        # padding = kernel_size - 1 pads both ends; forward() trims the tail
        # so that only the left (past) padding is effectively used.
        self.conv = nn.Conv1d(d_model, d_model, kernel_size=3, padding=2, groups=d_model)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq_len, d_model).

        Returns:
            A tensor with the same shape as ``x``.
        """
        seq_len = x.shape[1]
        x_val, res = self.in_proj(x).chunk(2, dim=-1)
        # Conv1d expects (batch, channels, length).
        x_val = self.conv(x_val.transpose(1, 2))
        # Keep the first seq_len outputs: those are the ones that only see
        # the current and the two previous positions.
        x_val = x_val[..., :seq_len].transpose(1, 2)
        x_val = F.silu(x_val)
        return self.out_proj(x_val * torch.sigmoid(res))

# --- 2. RWKV BLOCK (Linear RNN) ---
class RWKVBlock(nn.Module):
    """RWKV-style time-mixing block with a real causal WKV recurrence.

    For every channel, the output at position ``t`` is a softmax-like weighted
    average of the values ``v_0 .. v_t``:

    * a past position ``i < t`` gets weight ``exp(k_i - (t - 1 - i) * w)``,
      where ``w = exp(time_decay) > 0`` is the learned per-channel decay;
    * the current position gets weight ``exp(k_t + time_first)``.

    The average is gated by ``sigmoid(receptance(x))`` and passed through the
    output projection. The recurrence is evaluated with a plain Python loop
    over the sequence (no custom CUDA kernel), so cost grows linearly with
    the sequence length.

    Args:
        d_model: Width of the input and output features.
    """

    def __init__(self, d_model):
        super().__init__()
        self.time_decay = nn.Parameter(torch.zeros(d_model))
        self.time_first = nn.Parameter(torch.zeros(d_model))
        self.key = nn.Linear(d_model, d_model, bias=False)
        self.value = nn.Linear(d_model, d_model, bias=False)
        self.receptance = nn.Linear(d_model, d_model, bias=False)
        self.output = nn.Linear(d_model, d_model, bias=False)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq_len, d_model).

        Returns:
            A tensor with the same shape as ``x``.
        """
        k = self.key(x)
        v = self.value(x)
        r = torch.sigmoid(self.receptance(x))
        return self.output(r * self._wkv(k, v))

    def _wkv(self, k, v):
        """Run the numerically stable WKV recurrence over the sequence axis."""
        seq_len = k.shape[1]
        if seq_len == 0:
            return torch.zeros_like(v)

        decay = torch.exp(self.time_decay)  # strictly positive decay rate
        bonus = self.time_first

        # Running numerator / denominator of the weighted average, stored
        # relative to a running maximum exponent so that exp() never overflows.
        num = torch.zeros_like(k[:, 0])
        den = torch.zeros_like(k[:, 0])
        max_exp = torch.full_like(k[:, 0], -1e38)

        outputs = []
        for t in range(seq_len):
            k_t, v_t = k[:, t], v[:, t]

            # Output for step t: accumulated past plus the current token,
            # which receives the extra `time_first` bonus.
            current = bonus + k_t
            shift = torch.maximum(max_exp, current)
            w_past = torch.exp(max_exp - shift)
            w_cur = torch.exp(current - shift)
            outputs.append((w_past * num + w_cur * v_t) / (w_past * den + w_cur))

            # State update: decay the past by one step, then absorb token t.
            decayed = max_exp - decay
            shift = torch.maximum(decayed, k_t)
            w_past = torch.exp(decayed - shift)
            w_cur = torch.exp(k_t - shift)
            num = w_past * num + w_cur * v_t
            den = w_past * den + w_cur
            max_exp = shift

        # Stack instead of writing in place so autograd can track every step.
        return torch.stack(outputs, dim=1)

# --- 3. MOE BLOCK (Mixture of Experts) ---
class Expert(nn.Module):
    """Position-wise feed-forward expert: Linear -> ReLU -> Linear (4x hidden width)."""

    def __init__(self, d_model):
        super().__init__()
        self.net = nn.Sequential(
            nn.Linear(d_model, 4 * d_model),
            nn.ReLU(),
            nn.Linear(4 * d_model, d_model),
        )

    def forward(self, x):
        """Return ``net(x)``; the last dimension stays ``d_model``."""
        return self.net(x)


class MoEBlock(nn.Module):
    """Top-k mixture of experts.

    A linear gate scores every expert for every token. The ``top_k`` best
    experts are kept, their scores are softmax-normalised, and the output is
    the sum of the selected experts' outputs, each scaled by its own
    normalised score.

    Args:
        d_model: Width of the input and output features.
        num_experts: Number of ``Expert`` networks.
        top_k: Number of experts mixed per token.

    Raises:
        ValueError: If ``top_k`` is not in ``1..num_experts``.
    """

    def __init__(self, d_model, num_experts=4, top_k=2):
        super().__init__()
        if not 1 <= top_k <= num_experts:
            raise ValueError(
                f"top_k must be between 1 and num_experts ({num_experts}), got {top_k}"
            )
        self.experts = nn.ModuleList([Expert(d_model) for _ in range(num_experts)])
        self.gate = nn.Linear(d_model, num_experts)
        self.top_k = top_k

    def forward(self, x):
        """Mix the selected experts for ``x`` of shape (..., d_model).

        Returns:
            A tensor with the same shape as ``x``.
        """
        # Gating: keep the top_k experts per token and normalise their scores.
        gate_logits = self.gate(x)
        weights, indices = torch.topk(gate_logits, self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)

        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            # Slots (along the top_k axis) in which each token picked expert i.
            # Dense evaluation for simplicity; an optimised version would use
            # scatter/gather to run each expert only on its own tokens.
            routed = indices == i
            if not routed.any():
                continue
            # Routing weight of expert i per token (0 where it was not picked).
            # Summing all top_k weights here would always give 1.0, which
            # ignores the gate and leaves it without any gradient.
            expert_weight = (weights * routed).sum(dim=-1, keepdim=True)
            out = out + expert_weight * expert(x)
        return out

# --- THE UNIVERSAL ORGANISM ---
class CortexOrganism(nn.Module):
    """Byte-level language model whose layer stack is selected by ``config``.

    Pipeline: embedding (256 symbols) -> ``n_layers`` blocks -> LayerNorm ->
    linear head over the 256 symbols.

    Args:
        config: Dict with keys
            ``'type'``: one of ``'Transformer'``, ``'Mamba'``, ``'RWKV'``,
            ``'MoE'``, ``'Hybrid'`` (Hybrid alternates Mamba on even layer
            indices and attention on odd ones);
            ``'n_layers'``: number of blocks;
            ``'d_model'``: feature width;
            ``'n_heads'``: attention heads (read for Transformer and Hybrid).

    Raises:
        ValueError: If ``config['type']`` is not a known architecture.
    """

    _KNOWN_TYPES = ('Transformer', 'Mamba', 'RWKV', 'MoE', 'Hybrid')

    def __init__(self, config):
        super().__init__()
        # Without this check an unknown type would silently build a model
        # with no layers at all (embedding -> norm -> head).
        if config['type'] not in self._KNOWN_TYPES:
            raise ValueError(
                f"Unknown architecture type {config['type']!r}; "
                f"expected one of {self._KNOWN_TYPES}"
            )
        self.config = config
        self.embedding = nn.Embedding(256, config['d_model'])
        self.layers = nn.ModuleList()

        for i in range(config['n_layers']):
            if config['type'] == 'Transformer':
                self.layers.append(self._attention_layer(config))
            elif config['type'] == 'Mamba':
                self.layers.append(MambaBlock(config['d_model']))
            elif config['type'] == 'RWKV':
                self.layers.append(RWKVBlock(config['d_model']))
            elif config['type'] == 'MoE':
                self.layers.append(MoEBlock(config['d_model']))
            elif config['type'] == 'Hybrid':
                if i % 2 == 0:
                    self.layers.append(MambaBlock(config['d_model']))
                else:
                    self.layers.append(self._attention_layer(config))

        self.ln_f = nn.LayerNorm(config['d_model'])
        self.head = nn.Linear(config['d_model'], 256)

    @staticmethod
    def _attention_layer(config):
        """Build one batch-first Transformer encoder layer from ``config``."""
        return nn.TransformerEncoderLayer(
            d_model=config['d_model'], nhead=config['n_heads'],
            dim_feedforward=4 * config['d_model'], batch_first=True, dropout=0.1,
        )

    def forward(self, idx, targets=None):
        """Compute next-symbol logits and, optionally, the training loss.

        Args:
            idx: Integer tensor of symbol ids, shape (batch, seq_len).
            targets: Optional integer tensor with the same shape as ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            (batch, seq_len, 256) and ``loss`` is the mean cross-entropy
            against ``targets``, or ``None`` when ``targets`` is ``None``.
        """
        x = self.embedding(idx)
        seq_len = idx.shape[1]
        causal_mask = None  # built lazily, only if an attention layer exists

        for layer in self.layers:
            if isinstance(layer, nn.TransformerEncoderLayer):
                if causal_mask is None:
                    # True marks a position that must NOT be attended to:
                    # everything strictly above the diagonal, i.e. the future.
                    # Without it attention reads the very tokens it is asked
                    # to predict.
                    causal_mask = torch.triu(
                        torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
                        diagonal=1,
                    )
                x = layer(x, src_mask=causal_mask)
            else:
                x = layer(x)

        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            # reshape (not view) so non-contiguous targets are accepted too.
            loss = F.cross_entropy(logits.reshape(B * T, C), targets.reshape(B * T))
        return logits, loss

### 3. Fase 1: Los Clasificatorios (Grand Qualifiers)
Buscamos el mejor hiperparámetro para cada especie.

In [None]:
# Dummy data for the qualifiers (speed): 10,000 uniformly random symbols in [0, 256).
data = torch.randint(0, 256, (10000,), dtype=torch.long)


def get_batch():
    """Sample 32 random windows of 64 symbols from the module-level ``data``.

    Returns:
        ``(x, y)``, both of shape (32, 64) and moved to ``device``; ``y`` is
        ``x`` shifted one position to the right (the next-symbol targets).
    """
    starts = torch.randint(len(data) - 64, (32,))
    inputs, labels = [], []
    for start in starts:
        inputs.append(data[start:start + 64])
        labels.append(data[start + 1:start + 65])
    return torch.stack(inputs).to(device), torch.stack(labels).to(device)

def run_qualifiers(n_trials=20, n_steps=10):
    """Random-search a configuration for every architecture.

    For each architecture, ``n_trials`` random configurations are sampled.
    Each one is trained for ``n_steps`` optimiser steps on batches from
    ``get_batch()`` and scored by the mean of its last (up to) three training
    losses. The lowest-scoring configuration per architecture is kept.

    Args:
        n_trials: Configurations sampled per architecture (default 20, a
            reduced budget for a quick demo; raise it for a thorough search).
        n_steps: Optimiser steps per configuration (default 10, just enough
            to discard clearly bad candidates).

    Returns:
        Dict mapping architecture name to its best configuration dict
        (keys ``'type'``, ``'n_layers'``, ``'d_model'``, ``'n_heads'``,
        ``'lr'``).

    Raises:
        ValueError: If ``n_trials`` or ``n_steps`` is smaller than 1.
    """
    if n_trials < 1 or n_steps < 1:
        raise ValueError("n_trials and n_steps must both be >= 1")

    architectures = ['Transformer', 'Mamba', 'RWKV', 'MoE', 'Hybrid']
    champions = {}

    # Report the number of trials that actually run, not a fixed figure.
    print(f"üèÜ INICIANDO CLASIFICATORIOS ({n_trials} Trials por Arquitectura)...")

    for arch in architectures:
        print(f"\nü•ä Grupo: {arch}")
        best_loss = float('inf')
        best_config = None

        for _ in range(n_trials):
            # Random search over depth, width, heads and learning rate.
            config = {
                'type': arch,
                'n_layers': random.choice([2, 4, 6]),
                'd_model': random.choice([64, 128, 256]),
                'n_heads': random.choice([2, 4, 8]),
                'lr': random.choice([1e-3, 5e-4, 1e-4])
            }

            # Short training sprint.
            model = CortexOrganism(config).to(device)
            optimizer = torch.optim.AdamW(model.parameters(), lr=config['lr'])

            losses = []
            for _ in range(n_steps):
                xb, yb = get_batch()
                _, loss = model(xb, yb)
                loss.backward()
                optimizer.step()
                optimizer.zero_grad()
                losses.append(loss.item())

            # Average the tail of the run to smooth out per-batch noise.
            final_loss = np.mean(losses[-3:])
            if final_loss < best_loss:
                best_loss = final_loss
                best_config = config

        print(f"   üëë Campe√≥n {arch}: Loss {best_loss:.4f} | Config {best_config}")
        champions[arch] = best_config

    return champions

# Run the qualifiers; the result maps each architecture name to its winning config.
finalists_configs = run_qualifiers()

### 4. Fase 2: La Gran Final (Adaptación)
Entrenamos a los 5 campeones en Shakespeare y luego los forzamos a aprender Matemáticas.

In [None]:
# Real data
# Fail fast on network problems: the timeout keeps the cell from hanging
# forever, and raise_for_status() stops an HTTP error page from silently
# becoming the "Shakespeare" training corpus.
_shakespeare_response = requests.get(
    "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt",
    timeout=30,
)
_shakespeare_response.raise_for_status()
shakespeare = _shakespeare_response.text
# Synthetic arithmetic corpus: one "Q:a+b=sum" line for every a, b in 0..99.
math_data = "".join([f"Q:{a}+{b}={a+b}\n" for a in range(100) for b in range(100)])

# Encoded corpora, keyed by the source string. Tokenising a corpus means one
# ord() call per character, so it is done once per text instead of per batch.
_ENCODED_TEXT_CACHE = {}


def get_batch_real(source_text):
    """Sample 32 random windows of 64 characters from ``source_text``.

    Characters are encoded with ``ord``. The encoded tensor is cached per
    distinct text, so repeated calls with the same corpus skip re-encoding.

    Args:
        source_text: Corpus to sample from; it needs at least 65 characters.

    Returns:
        ``(x, y)``, both of shape (32, 64) and moved to ``device``; ``y`` is
        ``x`` shifted one position to the right (the next-character targets).
    """
    tokens = _ENCODED_TEXT_CACHE.get(source_text)
    if tokens is None:
        tokens = torch.tensor([ord(c) for c in source_text], dtype=torch.long)
        _ENCODED_TEXT_CACHE[source_text] = tokens

    ix = torch.randint(len(tokens) - 64, (32,))
    x = torch.stack([tokens[i:i+64] for i in ix]).to(device)
    y = torch.stack([tokens[i+1:i+65] for i in ix]).to(device)
    return x, y

def _train_on_text(model, optimizer, source_text, steps=100):
    """Train ``model`` on batches from ``source_text``; return the last batch loss."""
    last_loss = float('nan')
    for _ in range(steps):
        xb, yb = get_batch_real(source_text)
        _, loss = model(xb, yb)
        loss.backward()
        optimizer.step()
        optimizer.zero_grad()
        last_loss = loss.item()
    return last_loss


def run_finals(configs):
    """Train every finalist on Shakespeare, then on maths, and score adaptation.

    For each architecture a fresh model is trained for 100 steps on the
    Shakespeare corpus and then for 100 steps on the arithmetic corpus.
    The *plasticity cost* is the summed L2 distance, parameter by parameter,
    between the weights right after the Shakespeare phase and the weights
    after the maths phase, i.e. how far the model had to move away from what
    it had already learned in order to adapt.

    Args:
        configs: Dict mapping architecture name to its configuration dict
            (as returned by ``run_qualifiers``); ``'lr'`` is read here.

    Returns:
        ``pandas.DataFrame`` with one row per architecture and the columns
        ``'Architecture'``, ``'Math Loss'``, ``'Plasticity'`` and ``'Score'``
        (``(1 / Math Loss) / Plasticity``; higher is better).
    """
    print("\nüèüÔ∏è LA GRAN FINAL: Plasticidad & Adaptaci√≥n")
    results = []

    for arch, config in configs.items():
        print(f"\nüèÉ Corriendo: {arch}")
        model = CortexOrganism(config).to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=config['lr'])

        # 1. Shakespeare phase
        loss_lit = _train_on_text(model, optim, shakespeare)

        # Snapshot AFTER the first task: taking it before would measure the
        # drift from random initialisation (Shakespeare learning included)
        # instead of the cost of adapting. detach() keeps the copies out of
        # the autograd graph.
        w_before = {k: v.detach().clone() for k, v in model.named_parameters()}

        # 2. Maths phase (adaptation)
        loss_math = _train_on_text(model, optim, math_data)

        # 3. Metrics
        with torch.no_grad():
            plasticity_cost = sum(
                (param - w_before[name]).norm().item()
                for name, param in model.named_parameters()
            )

        print(f"   üìä Lit Loss: {loss_lit:.3f} -> Math Loss: {loss_math:.3f}")
        print(f"   üß† Plasticity Cost: {plasticity_cost:.2f} (Menor es mejor)")

        results.append({
            'Architecture': arch,
            'Math Loss': loss_math,
            'Plasticity': plasticity_cost,
            'Score': (1/loss_math) / plasticity_cost  # simple heuristic
        })

    return pd.DataFrame(results)

# Run the final on the qualified configurations and show the ranking, best Score first.
df_results = run_finals(finalists_configs)
# NOTE(review): `display` is neither defined nor imported in this file — presumably the
# IPython/Jupyter built-in; confirm before running this as a plain script.
display(df_results.sort_values('Score', ascending=False))