# 🔬 Cortex-11: The Mega-Lab (30-Step Deep Dive)

Bienvenido al laboratorio definitivo. Has pedido detalle, rigor y visualización. Aquí tienes **33 secciones** que desglosan cada átomo del proceso.

---

## 🛠️ SECCIÓN 1: FUNDAMENTOS

### 1. Librerías y Motor
Importamos las herramientas y verificamos si tenemos aceleración por hardware (GPU).

In [None]:
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import requests
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns

# Pick the compute backend once: prefer the GPU when PyTorch can see one,
# otherwise fall back to the CPU. The rest of the notebook reads `device`.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"üöÄ Motor Cortex Iniciado en: {device.upper()}")

### 2. Reproducibilidad (La Semilla de Dios)
Para que este experimento sea ciencia y no magia, fijamos la aleatoriedad.

In [None]:
def set_seed(seed=42):
    """Seed Python's `random`, NumPy and PyTorch with the same value.

    Args:
        seed: Seed applied to every generator (default 42).

    When CUDA is available, all CUDA devices are seeded as well. A
    confirmation line is printed; nothing is returned.
    """
    # Same order as always: stdlib, NumPy, then the PyTorch CPU generator.
    for seeder in (random.seed, np.random.seed, torch.manual_seed):
        seeder(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    print(f"üîí Semilla fijada en: {seed}")


set_seed(42)

## 📚 SECCIÓN 2: DATOS

### 3. Descarga del Corpus (Shakespeare)
Obtenemos el texto crudo.

In [None]:
url = "https://raw.githubusercontent.com/karpathy/char-rnn/master/data/tinyshakespeare/input.txt"
# Bound the request so a stalled connection cannot hang the notebook, and
# fail loudly on HTTP errors instead of silently training on an error page.
response = requests.get(url, timeout=30)
response.raise_for_status()
shakespeare_text = response.text
print(f"üìú Shakespeare descargado. Longitud: {len(shakespeare_text)} caracteres.")

### 4. Análisis del Dataset
Antes de entrenar, miramos qué vamos a aprender.

In [None]:
# Show the first 200 characters of the corpus as a sanity check.
print("--- Muestra del Texto ---")
print(shakespeare_text[:200])

# Character-level vocabulary: the distinct characters, in sorted order.
print("\n--- Estad√≠sticas ---")
chars = sorted(set(shakespeare_text))
vocab_preview = ''.join(chars)
print(f"Vocabulario ({len(chars)} chars): {vocab_preview}")

### 5. Tokenización (Byte-Level)
Convertimos texto en números. Usamos bytes crudos (0-255) para universalidad.

In [None]:
# Byte-level tokenisation: encode to UTF-8 so every token id is guaranteed
# to lie in 0-255. `ord(c)` returns Unicode code points, which exceed 255
# for any non-Latin-1 character and would index past a 256-entry embedding.
# For pure-ASCII text the resulting ids are identical to the `ord` version.
data_tensor = torch.tensor(list(shakespeare_text.encode("utf-8")), dtype=torch.long)
print(f"Tensor de Datos: {data_tensor.shape}")
print(f"Ejemplo: 'First' -> {data_tensor[:5].tolist()}")

### 6. Motor de Batches
Cómo alimentamos al modelo.

In [None]:
def get_batch(data, batch_size=32, block_size=64):
    """Sample a random batch of (input, target) windows for next-token training.

    Args:
        data: Token sequence to sample from (assumed to be a 1-D tensor of
            token ids — TODO confirm against callers).
        batch_size: Number of windows per batch.
        block_size: Length of each window.

    Returns:
        Tuple ``(x, y)`` of stacked windows moved to the global ``device``;
        ``y`` is ``x`` shifted one position to the right.
    """
    # Start offsets are drawn from [0, len(data) - block_size) so the shifted
    # target window still fits inside `data`.
    starts = torch.randint(len(data) - block_size, (batch_size,))
    inputs, targets = [], []
    for start in starts:
        stop = start + block_size
        inputs.append(data[start:stop])
        targets.append(data[start + 1:stop + 1])
    x = torch.stack(inputs).to(device)
    y = torch.stack(targets).to(device)
    return x, y

## 🧠 SECCIÓN 3: ARQUITECTURAS (EL ARSENAL)

### 7. Componente: Mamba Block (SSM)
La memoria lineal.

In [None]:
class MambaBlock(nn.Module):
    """Simplified Mamba-style block: a gated, depthwise, causal 1-D convolution.

    The input is projected to two halves. One half goes through a depthwise
    convolution over the sequence axis followed by SiLU; the other half is
    squashed with a sigmoid and used as a multiplicative gate. The gated
    result is projected back to ``d_model``.

    Args:
        d_model: Feature width of the input and output.
    """

    def __init__(self, d_model):
        super().__init__()
        self.in_proj = nn.Linear(d_model, d_model * 2)
        self.out_proj = nn.Linear(d_model, d_model)
        # padding = kernel_size - 1; forward() trims the tail so each output
        # position only sees the current and the two previous positions.
        # A symmetric padding of 1 would let position t read position t+1,
        # which is exactly the next-token target during training.
        self.conv = nn.Conv1d(d_model, d_model, kernel_size=3, padding=2, groups=d_model)

    def forward(self, x):
        """Apply the block to ``x`` of shape (batch, seq_len, d_model).

        Returns:
            Tensor with the same shape as ``x``.
        """
        seq_len = x.shape[1]
        x_val, res = self.in_proj(x).chunk(2, dim=-1)
        # Conv1d expects (batch, channels, length); keep only the first
        # seq_len outputs to drop the positions that would look ahead.
        x_val = self.conv(x_val.transpose(1, 2))[..., :seq_len]
        x_val = F.silu(x_val.transpose(1, 2))
        return self.out_proj(x_val * torch.sigmoid(res))

### 8. Componente: RWKV Block (Linear RNN)
La recurrencia eficiente.

In [None]:
class RWKVBlock(nn.Module):
    """Heavily simplified RWKV-style block.

    Four bias-free linear maps (key, value, receptance, output) are applied
    to the last dimension. The key/value product is gated by the sigmoid of
    the receptance and then projected. Every operation is position-wise, so
    there is no mixing across the sequence axis in this version.

    Args:
        d_model: Feature width of the input and output.
    """

    def __init__(self, d_model):
        super().__init__()
        # Registered in this exact order so parameter names and seeded
        # initialisation stay unchanged.
        for name in ("key", "value", "receptance", "output"):
            setattr(self, name, nn.Linear(d_model, d_model, bias=False))

    def forward(self, x):
        """Return ``output(sigmoid(receptance(x)) * key(x) * value(x))``."""
        kv = self.key(x) * self.value(x)
        gate = torch.sigmoid(self.receptance(x))
        return self.output(gate * kv)  # simplified: no recurrence

### 9. Componente: MoE Block (Mixture of Experts)
Capacidad masiva, coste bajo.

In [None]:
class Expert(nn.Module):
    """Position-wise feed-forward expert: Linear -> ReLU -> Linear (4x hidden width)."""

    def __init__(self, d_model):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(d_model, 4*d_model), nn.ReLU(), nn.Linear(4*d_model, d_model))

    def forward(self, x):
        return self.net(x)


class MoEBlock(nn.Module):
    """Mixture-of-experts layer with top-k routing.

    A linear gate scores every expert per token; the ``top_k`` best scores
    are softmax-normalised and each selected expert's output is scaled by
    its own routing weight before being summed.

    Args:
        d_model: Feature width of the input and output.
        num_experts: Number of ``Expert`` sub-networks.
        top_k: Experts combined per token (clamped to ``num_experts``).
    """

    def __init__(self, d_model, num_experts=4, top_k=2):
        super().__init__()
        self.experts = nn.ModuleList([Expert(d_model) for _ in range(num_experts)])
        self.gate = nn.Linear(d_model, num_experts)
        # torch.topk raises if k exceeds the number of experts.
        self.top_k = min(top_k, num_experts)

    def forward(self, x):
        """Route ``x`` (..., d_model) through the selected experts.

        Returns:
            Tensor with the same shape as ``x``.
        """
        weights, indices = torch.topk(self.gate(x), self.top_k, dim=-1)
        weights = F.softmax(weights, dim=-1)
        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            chosen = indices == i
            if not chosen.any():
                continue
            # Routing weight of THIS expert per token (zero where it was not
            # selected). Summing all top-k weights would always give 1, which
            # discards the routing weights and starves the gate of gradient.
            expert_weight = (weights * chosen).sum(dim=-1, keepdim=True)
            # NOTE: the expert is still evaluated densely on every token;
            # unselected tokens are zeroed by the weight.
            out = out + expert_weight * expert(x)
        return out

### 10. El Organismo Universal
La clase que puede ser cualquier cosa según su ADN.

In [None]:
class CortexOrganism(nn.Module):
    """Byte-level language model whose layer stack is chosen by ``config``.

    Args:
        config: Dict with keys ``'type'`` (one of ``'Transformer'``,
            ``'Mamba'``, ``'RWKV'``, ``'MoE'``, ``'Hybrid'``), ``'n_layers'``
            and ``'d_model'``. ``'n_heads'`` is read only when Transformer
            layers are built (``'Transformer'`` and odd ``'Hybrid'`` layers).

    Raises:
        ValueError: If ``config['type']`` is not a known architecture.
    """

    def __init__(self, config):
        super().__init__()
        self.config = config
        d_model = config['d_model']
        arch = config['type']
        self.embedding = nn.Embedding(256, d_model)
        self.layers = nn.ModuleList()
        for i in range(config['n_layers']):
            if arch == 'Transformer':
                layer = nn.TransformerEncoderLayer(d_model, config['n_heads'], dim_feedforward=4*d_model, batch_first=True)
            elif arch == 'Mamba':
                layer = MambaBlock(d_model)
            elif arch == 'RWKV':
                layer = RWKVBlock(d_model)
            elif arch == 'MoE':
                layer = MoEBlock(d_model)
            elif arch == 'Hybrid':
                # Even layers are Mamba blocks, odd layers are attention.
                if i % 2 == 0:
                    layer = MambaBlock(d_model)
                else:
                    # NOTE(review): unlike the 'Transformer' branch this keeps
                    # the library-default dim_feedforward — confirm intended.
                    layer = nn.TransformerEncoderLayer(d_model, config['n_heads'], batch_first=True)
            else:
                # An unknown type used to build a model with zero layers.
                raise ValueError(f"Unknown architecture type: {arch!r}")
            self.layers.append(layer)
        self.ln_f = nn.LayerNorm(d_model)
        self.head = nn.Linear(d_model, 256)

    def forward(self, idx, targets=None):
        """Compute next-token logits and, optionally, the cross-entropy loss.

        Args:
            idx: Long tensor of token ids, shape (batch, seq_len).
            targets: Optional long tensor of the same shape as ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape
            (batch, seq_len, 256) and ``loss`` is ``None`` when ``targets``
            is not given.
        """
        x = self.embedding(idx)
        causal_mask = None
        for layer in self.layers:
            if isinstance(layer, nn.TransformerEncoderLayer):
                if causal_mask is None:
                    # True marks positions that may NOT be attended to, i.e.
                    # everything strictly in the future. Without this mask
                    # the encoder layer attends to the token it is asked to
                    # predict and the training loss is meaningless.
                    seq_len = idx.shape[1]
                    causal_mask = torch.triu(
                        torch.ones(seq_len, seq_len, dtype=torch.bool, device=idx.device),
                        diagonal=1,
                    )
                x = layer(x, src_mask=causal_mask)
            else:
                x = layer(x)
        x = self.ln_f(x)
        logits = self.head(x)
        loss = None
        if targets is not None:
            B, T, C = logits.shape
            # Flatten batch and time so each position is one classification.
            loss = F.cross_entropy(logits.reshape(B*T, C), targets.reshape(B*T))
        return logits, loss

### 11. Función de Generación (El Habla)
Para ver qué dicen los modelos.

In [None]:
def generate(model, prompt, max_len=50):
    """Sample a continuation of ``prompt`` from a byte-level model.

    Args:
        model: Module whose call returns ``(logits, loss)`` with logits of
            shape (batch, seq_len, vocab).
        prompt: Non-empty text to condition on; it is UTF-8 encoded so every
            token id stays within 0-255.
        max_len: Number of tokens to sample.

    Returns:
        The prompt plus the sampled bytes decoded as UTF-8; invalid byte
        sequences are rendered with the Unicode replacement character.

    Raises:
        ValueError: If ``prompt`` is empty (there is no last position to
            sample from).
    """
    if not prompt:
        raise ValueError("prompt must contain at least one character")

    # Run on whatever device the model lives on instead of relying on a
    # module-level global that may not match it.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    idx = torch.tensor(list(prompt.encode('utf-8')), dtype=torch.long, device=target_device).unsqueeze(0)

    # Remember the caller's mode: generate() is invoked in the middle of
    # training loops, and leaving the model in eval mode would silently
    # disable dropout for every training step that follows.
    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_len):
                logits, _loss = model(idx)
                probs = F.softmax(logits[:, -1, :], dim=-1)
                next_token = torch.multinomial(probs, 1)
                idx = torch.cat((idx, next_token), dim=1)
    finally:
        model.train(was_training)
    return bytes(idx[0].tolist()).decode('utf-8', errors='replace')

## 🏆 SECCIÓN 4: EL TORNEO

### 12. Configuración de los Clasificatorios
Definimos qué vamos a buscar.

In [None]:
# Architecture families that compete in the tournament, in evaluation order.
architectures = [
    'Transformer',
    'Mamba',
    'RWKV',
    'MoE',
    'Hybrid',
]
print("Gladiadores: {}".format(architectures))

### 13. Ejecución: Clasificatorios (Qualifiers)
Corremos 5 versiones de cada uno (ampliable a 100) para encontrar al campeón de cada clase.

In [None]:
# Qualifiers: for each architecture, train a handful of random configs for a
# short sprint and keep the config whose final training loss was lowest.
champions = {}
for arch in architectures:
    print(f"\nü•ä Buscando el mejor {arch}...")
    best_loss, best_config = float('inf'), None

    # 5 trials for the demo (raise towards 100 for a real search).
    for i in range(5):
        config = dict(
            type=arch,
            n_layers=random.choice([2, 4]),
            d_model=128,
            n_heads=4,
            lr=1e-3,
        )
        model = CortexOrganism(config).to(device)
        optim = torch.optim.AdamW(model.parameters(), lr=config['lr'])

        # Training sprint: 20 optimiser steps on random Shakespeare batches.
        for _ in range(20):
            xb, yb = get_batch(data_tensor)
            _, loss = model(xb, yb)
            loss.backward()
            optim.step()
            optim.zero_grad()

        # The score of a trial is the loss of its last batch.
        final_loss = loss.item()
        if final_loss < best_loss:
            best_loss, best_config = final_loss, config

        print(f"   Trial {i}: Loss {final_loss:.4f}")

    print(f"üëë Campe√≥n {arch}: Loss {best_loss:.4f}")
    champions[arch] = best_config

### 14. Preparación de la Final (Dataset Matemático)
Generamos el reto nuevo.

In [None]:
# Synthetic arithmetic corpus: one "Q:a+b=sum" line for every pair of
# operands in 0..99, encoded one token per character like the main corpus.
math_lines = [f"Q:{a}+{b}={a+b}\n" for a in range(100) for b in range(100)]
math_text = "".join(math_lines)
math_tensor = torch.tensor(list(map(ord, math_text)), dtype=torch.long)
print(f"üßÆ Dataset Matem√°tico: {len(math_text)} chars")

### 15. LA GRAN FINAL: Ejecución
Entrenamos a los campeones en Literatura y luego los adaptamos a Matemáticas.

In [None]:
# Grand final: every champion config is trained from scratch on Shakespeare,
# then fine-tuned on the arithmetic corpus; we record losses, samples and
# how far the weights moved.
results = []
print("üèüÔ∏è LA GRAN FINAL...")

for arch, config in champions.items():
    print(f"\nüèÉ Corriendo: {arch}")
    model = CortexOrganism(config).to(device)
    optim = torch.optim.AdamW(model.parameters(), lr=config['lr'])

    # Detached copy of the initial weights (no autograd graph kept alive).
    # NOTE(review): the snapshot is taken before the literature phase, so
    # "plasticity" measures total drift across BOTH phases — confirm that is
    # the intended baseline.
    w_before = {k: v.detach().clone() for k, v in model.named_parameters()}

    # 1. Literature
    for i in range(100):
        xb, yb = get_batch(data_tensor)
        _, loss = model(xb, yb)
        loss.backward(); optim.step(); optim.zero_grad()
    loss_lit = loss.item()
    sample_lit = generate(model, "The king ", 30)
    # generate() puts the model in eval mode; switch back so phase 2 really
    # trains with dropout enabled instead of silently running in eval mode.
    model.train()

    # 2. Math
    for i in range(100):
        xb, yb = get_batch(math_tensor)
        _, loss = model(xb, yb)
        loss.backward(); optim.step(); optim.zero_grad()
    loss_math = loss.item()
    sample_math = generate(model, "Q:10+10=", 10)

    # 3. Plasticity: sum of per-parameter L2 distances from the snapshot.
    with torch.no_grad():
        plasticity = sum((p - w_before[k]).norm().item() for k, p in model.named_parameters())

    print(f"   üìú Lit: \"{sample_lit}...\"")
    print(f"   üßÆ Math: \"{sample_math}...\"")
    print(f"   üß† Plasticity: {plasticity:.2f}")

    # 'Lit Loss' was computed but never reported; keep it next to the math
    # loss so both phases can be compared in the results table.
    results.append({'Arch': arch, 'Lit Loss': loss_lit, 'Math Loss': loss_math, 'Plasticity': plasticity, 'Sample': sample_math})

### 16. Análisis de Resultados
¿Quién ganó?

In [None]:
# Leaderboard: one row per architecture, best (lowest) math loss first.
df = pd.DataFrame(results)
df = df.sort_values(by='Math Loss')
display(df)

## 🔬 SECCIÓN 5: DEEP DIVE (EL GANADOR)

### 17. Selección del Ganador
Automáticamente elegimos al mejor.

In [None]:
# The leaderboard is sorted by math loss, so its first row is the winner.
winner_arch = df['Arch'].iloc[0]
winner_config = champions[winner_arch]
print(f"üèÜ El Ganador es: {winner_arch}")

### 18. Re-entrenamiento del Ganador (Larga Duración)
Ahora que sabemos quién es el mejor, lo entrenamos en serio.

In [None]:
# Long training run of the winning configuration on the Shakespeare corpus.
print(f"üèãÔ∏è Entrenando {winner_arch} en serio...")
model = CortexOrganism(winner_config).to(device)
# Use the learning rate the winner was selected with (falls back to 1e-3).
optim = torch.optim.AdamW(model.parameters(), lr=winner_config.get('lr', 1e-3))

losses = []
for i in range(500):
    xb, yb = get_batch(data_tensor)
    _, loss = model(xb, yb)
    loss.backward(); optim.step(); optim.zero_grad()
    losses.append(loss.item())
    if i % 100 == 0:
        sample = generate(model, 'The ', 20)
        # generate() switches the model to eval mode; without this the
        # remaining iterations would train with dropout disabled.
        model.train()
        print(f"   Iter {i}: Loss {loss.item():.4f} -> \"{sample}...\"")

### 19. Visualización: Curva de Aprendizaje
Cómo aprendió el campeón.

In [None]:
# Learning curve of the winner: per-iteration training loss on the current axes.
ax = plt.gca()
ax.plot(losses)
ax.set_title(f"Curva de Aprendizaje ({winner_arch})")
ax.set_xlabel("Iteraciones")
ax.set_ylabel("Loss")
plt.show()

### 20. Futuro: Guardado
Guardamos el cerebro para la posteridad.

In [None]:
# Persist only the weights (state dict) under a name that records the winner.
checkpoint_path = f"cortex_winner_{winner_arch}.pth"
torch.save(model.state_dict(), checkpoint_path)
print("üíæ Modelo guardado.")