# 🧬 Cortex-2: Advanced Evolutionary Research Lab

## 📜 Estado del Arte y Justificación Científica
Este entorno implementa una arquitectura de frontera basada en los siguientes papers disruptivos:

1.  **Mamba (SSM)**: *Gu, A., & Dao, T. (2023). "Mamba: Linear-Time Sequence Modeling with Selective State Spaces".* [arXiv:2312.00752](https://arxiv.org/abs/2312.00752)
    *   *Por qué*: Resuelve el cuello de botella cuadrático de los Transformers ($O(N^2)$) permitiendo contextos muy largos con coste lineal ($O(N)$).
2.  **Mixture of Experts (MoE)**: *Shazeer et al. (2017). "Outrageously Large Neural Networks: The Sparsely-Gated Mixture-of-Experts Layer".* [arXiv:1701.06538](https://arxiv.org/abs/1701.06538)
    *   *Por qué*: Desacopla la capacidad de computación (FLOPs) de la capacidad de memoria (Parámetros). Permite modelos gigantes que corren rápido.
3.  **Byte-Level Modeling**: *Xue et al. (2022). "ByT5: Towards a Token-Free Future".* [arXiv:2105.13626](https://arxiv.org/abs/2105.13626)
    *   *Por qué*: Elimina el sesgo humano del Tokenizer. Hace al modelo robusto a "ruido" y multilingüe por defecto.
4.  **Neural Architecture Search (NAS)**: *Real et al. (2019). "Regularized Evolution for Image Classifier Architecture Search".* [arXiv:1802.01548](https://arxiv.org/abs/1802.01548)
    *   *Por qué*: La intuición humana falla en espacios de alta dimensión. La evolución encuentra óptimos locales que nosotros ignoramos.

---

In [None]:
# 0. Configuración e Importaciones
import torch
import torch.nn as nn
import torch.nn.functional as F
import random
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import plotly.graph_objects as go
import plotly.express as px
import pandas as pd
from sklearn.decomposition import PCA
from IPython.display import clear_output, display

# Select the compute device: the GPU when CUDA is usable, otherwise the CPU.
if torch.cuda.is_available():
    device = 'cuda'
else:
    device = 'cpu'
print(f"üöÄ Cortex-2 Engine Active on: {device.upper()}")

def set_seed(seed=42):
    """Seed the Python, NumPy and PyTorch random number generators.

    When CUDA is available, every CUDA device is seeded as well. A
    confirmation message is printed once seeding is done.
    """
    for seed_fn in (random.seed, np.random.seed, torch.manual_seed):
        seed_fn(seed)
    if torch.cuda.is_available():
        torch.cuda.manual_seed_all(seed)
    print(f"üîí Semilla fijada en: {seed}")


# Seed once at start-up so the cells below begin from a known RNG state.
set_seed(seed=42)

## 🧠 Arquitectura Modular (Caja Blanca)
Hemos instrumentado el código para extraer **telemetría interna**. No es solo "forward pass", es un escáner cerebral.

In [None]:
# --- Componentes Instrumentados ---

class InstrumentedAttention(nn.Module):
    """Causal multi-head self-attention that records its attention map.

    The layer wraps ``nn.MultiheadAttention`` (``batch_first=True``) and, on
    every forward pass, stores the head-averaged attention weights on the CPU
    in ``last_attn_weights`` so they can be plotted afterwards.

    A causal mask is applied so that position ``t`` can only attend to
    positions ``<= t``. Without it, a model trained on next-token targets
    could read the token it is asked to predict.

    Args:
        config: mapping providing ``'d_model'`` (embedding width) and
            ``'n_heads'`` (number of attention heads).
    """
    def __init__(self, config):
        super().__init__()
        self.mha = nn.MultiheadAttention(config['d_model'], config['n_heads'], batch_first=True)
        # Head-averaged attention weights of the most recent forward pass
        self.last_attn_weights = None

    def forward(self, x):
        """Apply causal self-attention to ``x`` and return the attended tensor."""
        # With batch_first=True the sequence axis is the second-to-last one
        seq_len = x.size(-2)
        # Boolean mask: True marks (query, key) pairs that must NOT attend,
        # i.e. every key strictly in the future of the query.
        causal_mask = torch.triu(
            torch.ones(seq_len, seq_len, dtype=torch.bool, device=x.device),
            diagonal=1,
        )
        # Capture the attention weights (averaged across heads for simpler plots)
        out, weights = self.mha(
            x, x, x,
            attn_mask=causal_mask,
            need_weights=True,
            average_attn_weights=True,
        )
        self.last_attn_weights = weights.detach().cpu()
        return out

class MambaBlock(nn.Module):
    """Simplified Mamba-style block: a gated, causal depthwise convolution.

    The input is projected to twice its width and split into a value branch
    and a gate branch. The value branch goes through a depthwise 1-D
    convolution over the sequence axis followed by SiLU, is multiplied by the
    sigmoid of the gate branch, and is projected back to ``d_model``.

    The convolution is causal: the output at position ``t`` depends only on
    inputs at positions ``t-2 .. t``, so no future token leaks into a
    next-token prediction.

    Args:
        d_model: feature width of the input and output.
    """
    def __init__(self, d_model):
        super().__init__()
        self.in_proj = nn.Linear(d_model, d_model * 2)
        self.out_proj = nn.Linear(d_model, d_model)
        # padding = kernel_size - 1 pads both ends; forward() trims the right
        # overhang, which leaves a left-padded (causal) convolution.
        self.conv = nn.Conv1d(d_model, d_model, kernel_size=3, padding=2, groups=d_model)

    def forward(self, x):
        """Transform ``x`` of shape (batch, seq, d_model); the shape is preserved."""
        seq_len = x.size(1)
        x_val, res = self.in_proj(x).chunk(2, dim=-1)
        # Conv1d expects (batch, channels, seq); keep the first seq_len outputs
        # so each position only sees itself and its two predecessors.
        x_val = self.conv(x_val.transpose(1, 2))[..., :seq_len]
        x_val = F.silu(x_val.transpose(1, 2))
        return self.out_proj(x_val * torch.sigmoid(res))

class MoELayer(nn.Module):
    """Sparsely-gated mixture of experts with routing telemetry.

    A linear gate scores every expert for every token. The ``top_k`` best
    experts are selected per token, their scores are normalised with a
    softmax, and the layer output is the sum of the selected experts' outputs
    weighted by those probabilities. Using the weights in the output is what
    lets the gate receive gradients and learn.

    After each forward pass ``last_routing_dist`` holds, per expert, how many
    (token, slot) assignments it received.

    Args:
        d_model: feature width of the input and output.
        n_experts: number of expert feed-forward networks.
        top_k: number of experts activated per token (clamped to
            ``n_experts`` at forward time).
    """
    def __init__(self, d_model, n_experts, top_k=2):
        super().__init__()
        self.experts = nn.ModuleList([
            nn.Sequential(
                nn.Linear(d_model, 4 * d_model), nn.GELU(),
                nn.Linear(4 * d_model, d_model), nn.Dropout(0.1)
            ) for _ in range(n_experts)
        ])
        self.gate = nn.Linear(d_model, n_experts)
        self.top_k = top_k
        # Per-expert assignment counts from the most recent forward pass
        self.last_routing_dist = None

    def forward(self, x):
        """Route each token of ``x`` through its top-k experts and mix the results."""
        n_experts = len(self.experts)
        # topk raises if k exceeds the number of experts, so clamp it
        k = min(self.top_k, n_experts)

        gate_logits = self.gate(x)
        top_logits, indices = torch.topk(gate_logits, k, dim=-1)
        weights = F.softmax(top_logits, dim=-1)

        # Telemetry: which experts were activated, and how often
        self.last_routing_dist = indices.detach().cpu().reshape(-1).bincount(minlength=n_experts)

        out = torch.zeros_like(x)
        for i, expert in enumerate(self.experts):
            chosen = indices == i           # (..., k): slots routed to expert i
            mask = chosen.any(dim=-1)       # (...): tokens routed to expert i
            if mask.any():
                # An expert appears at most once in a token's top-k, so the
                # masked sum picks out exactly that expert's gate probability.
                gate_w = (weights * chosen).sum(dim=-1)[mask].unsqueeze(-1)
                out[mask] += gate_w * expert(x[mask])
        return out

class CortexOrganism(nn.Module):
    """Byte-level sequence model assembled from a DNA (hyper-parameter) dict.

    Pipeline: byte embedding (256 symbols) -> ``n_layers`` backbone layers ->
    final layer (MoE or dense) -> LayerNorm -> linear head over 256 bytes.

    Args:
        dna: mapping with the keys
            ``'d_model'``      feature width,
            ``'n_layers'``     number of backbone layers,
            ``'backbone'``     ``'mamba'`` (all MambaBlock), ``'hybrid'``
                               (MambaBlock on even layers, attention on odd
                               ones) or anything else (all attention),
            ``'moe_experts'``  number of experts in the final layer; ``0``
                               selects a plain linear layer instead.
            The whole dict is also handed to ``InstrumentedAttention``.
    """
    def __init__(self, dna):
        super().__init__()
        self.dna = dna
        self.embedding = nn.Embedding(256, dna['d_model'])
        self.layers = nn.ModuleList()

        for i in range(dna['n_layers']):
            if dna['backbone'] == 'mamba':
                self.layers.append(MambaBlock(dna['d_model']))
            elif dna['backbone'] == 'hybrid' and i % 2 == 0:
                self.layers.append(MambaBlock(dna['d_model']))
            else:
                # Attention layer instrumented to expose its attention map
                self.layers.append(InstrumentedAttention(dna))

        if dna['moe_experts'] > 0:
            self.final_layer = MoELayer(dna['d_model'], dna['moe_experts'])
        else:
            self.final_layer = nn.Linear(dna['d_model'], dna['d_model'])

        self.ln_f = nn.LayerNorm(dna['d_model'])
        self.head = nn.Linear(dna['d_model'], 256)

    def forward(self, idx, targets=None):
        """Compute next-byte logits and, when targets are given, the loss.

        Args:
            idx: integer tensor of byte ids, shape (batch, seq).
            targets: optional integer tensor of the same shape as ``idx``.

        Returns:
            ``(logits, loss)`` where ``logits`` has shape (batch, seq, 256)
            and ``loss`` is the mean cross-entropy, or ``None`` when
            ``targets`` is ``None``.
        """
        x = self.embedding(idx)
        for layer in self.layers:
            x = layer(x)
        # Always apply the final layer: the dense variant built when
        # moe_experts == 0 would otherwise be dead, never-trained weights.
        x = self.final_layer(x)
        x = self.ln_f(x)
        logits = self.head(x)

        loss = None
        if targets is not None:
            # reshape (not view) so non-contiguous targets are accepted too
            loss = F.cross_entropy(logits.reshape(-1, logits.size(-1)), targets.reshape(-1))
        return logits, loss

## 🔬 Visualización Avanzada
Aquí definimos las herramientas para inspeccionar el modelo.

In [None]:
def visualize_attention(model, input_bytes):
    """Plot the attention heat map recorded by the model's last attention layer.

    ``model.layers`` is scanned for ``InstrumentedAttention`` instances and
    the last one found is used. If there is none, or it has not recorded any
    weights yet, a warning is printed and nothing is drawn. ``input_bytes``
    is accepted but not used.
    """
    attn_blocks = [blk for blk in model.layers if isinstance(blk, InstrumentedAttention)]
    last_attn = attn_blocks[-1] if attn_blocks else None
    recorded = None if last_attn is None else last_attn.last_attn_weights

    if recorded is None:
        print("‚ö†Ô∏è No hay capas de atenci√≥n activas o registradas.")
        return

    # Only the first element of the batch is shown
    heat = recorded[0].numpy()

    plt.figure(figsize=(10, 8))
    sns.heatmap(heat, cmap='viridis')
    plt.title("Mapa de Calor de Atenci√≥n (Razonamiento)")
    plt.xlabel("Key Token")
    plt.ylabel("Query Token")
    plt.show()

def visualize_moe_routing(model):
    """Draw a bar chart of the per-expert load recorded by the model's MoE layer.

    Returns silently when the model has no ``final_layer``, when that layer
    is not a ``MoELayer``, or when no routing distribution has been recorded.
    """
    if not hasattr(model, 'final_layer'):
        return
    if not isinstance(model.final_layer, MoELayer):
        return

    load = model.final_layer.last_routing_dist
    if load is None:
        return

    expert_ids = range(len(load))
    plt.figure(figsize=(8, 4))
    plt.bar(expert_ids, load.numpy(), color='#818cf8')
    plt.title("Distribuci√≥n de Carga de Expertos (MoE)")
    plt.xlabel("ID del Experto")
    plt.ylabel("Tokens Procesados")
    plt.show()

## ⚔️ Módulo 4: El Torneo Evolutivo
Aquí ocurre la magia de la selección natural.

In [None]:
# --- Genoma y Bucle Evolutivo ---
class Genome:
    """One candidate architecture in the evolutionary search.

    Attributes:
        genes: search space, mapping each gene name to its allowed values.
        dna: the concrete choice for every gene (initially random).
        fitness: score assigned by the evaluation loop; starts at 0.0.
    """
    def __init__(self):
        self.genes = {
            'n_layers': [2, 4, 6],
            'd_model': [128, 256],
            'n_heads': [2, 4],
            'backbone': ['transformer', 'hybrid'],
            'moe_experts': [0, 4, 8],
            'learning_rate': [1e-3, 3e-4]
        }
        self.dna = {k: random.choice(v) for k, v in self.genes.items()}
        self.fitness = 0.0

    def mutate(self):
        """Change one randomly chosen gene to a different allowed value.

        The new value is drawn from the alternatives to the current one, so
        a mutation always alters the DNA (unless the gene has no other
        option).

        Returns:
            A human-readable message describing the mutation.
        """
        k = random.choice(list(self.genes))
        alternatives = [v for v in self.genes[k] if v != self.dna[k]]
        if alternatives:
            self.dna[k] = random.choice(alternatives)
        return f"üß¨ Mutaci√≥n: {k} -> {self.dna[k]}"

    def crossover(self, other):
        """Return a new Genome whose genes each come from ``self`` or ``other``.

        Every gene is taken from one of the two parents with roughly equal
        probability.
        """
        child = Genome()
        for k in self.genes:
            child.dna[k] = self.dna[k] if random.random() > 0.5 else other.dna[k]
        return child

# Dummy data: 1000 random byte values standing in for a real corpus
dummy_data = torch.randint(0, 256, (1000,), dtype=torch.long)


def get_batch(batch_size=16, block_size=32):
    """Sample a random batch of (input, target) windows from ``dummy_data``.

    Args:
        batch_size: number of windows in the batch.
        block_size: length of each window.

    Returns:
        ``(x, y)``, both of shape (batch_size, block_size) and moved to
        ``device``; ``y`` is ``x`` shifted one position to the right in the
        data, i.e. the next-byte targets.

    Raises:
        ValueError: if ``block_size`` leaves no room for a shifted target
            window inside ``dummy_data``.
    """
    if block_size >= len(dummy_data):
        raise ValueError("block_size must be smaller than the dataset length")
    # The largest start index keeps the target slice (start + 1 .. start +
    # block_size) inside the data.
    starts = torch.randint(len(dummy_data) - block_size, (batch_size,)).tolist()
    x = torch.stack([dummy_data[i:i + block_size] for i in starts]).to(device)
    y = torch.stack([dummy_data[i + 1:i + block_size + 1] for i in starts]).to(device)
    return x, y

def run_evolution(generations=3, pop_size=4):
    """Run the evolutionary architecture search and return the winner.

    Each generation, every genome is built into a ``CortexOrganism``, trained
    for 10 steps with AdamW, and scored with the mean of its last three
    training losses (lower is better). The better half of the population
    survives; the rest is refilled with crossover children of the survivors,
    each mutated with probability 0.4.

    Args:
        generations: number of generations to run.
        pop_size: number of genomes per generation.

    Returns:
        The best ``Genome`` of the last evaluated generation, or ``None``
        when nothing was evaluated (``generations`` or ``pop_size`` is 0).
    """
    population = [Genome() for _ in range(pop_size)]
    # Keep at least one survivor so reproduction always has a parent to pick
    n_survivors = max(1, pop_size // 2)
    best = None

    for gen in range(generations):
        print(f"\nüèÅ Generaci√≥n {gen+1}")
        for i, genome in enumerate(population):
            model = CortexOrganism(genome.dna).to(device)
            optim = torch.optim.AdamW(model.parameters(), lr=genome.dna['learning_rate'])

            # Sprint training: a handful of steps is enough to rank candidates
            losses = []
            for _ in range(10):
                xb, yb = get_batch()
                _, loss = model(xb, yb)
                optim.zero_grad()
                loss.backward()
                optim.step()
                losses.append(loss.item())

            recent = losses[-3:]
            genome.fitness = sum(recent) / len(recent)
            print(f"   Individuo {i} ({genome.dna['backbone']}): Loss {genome.fitness:.4f}")

        # Selection: lowest loss first
        population.sort(key=lambda g: g.fitness)
        survivors = population[:n_survivors]
        if survivors:
            best = survivors[0]

        # Reproduction: refill the population with children of the survivors
        new_pop = survivors[:]
        while len(new_pop) < pop_size:
            parent = random.choice(survivors)
            child = parent.crossover(random.choice(survivors))
            if random.random() < 0.4:
                child.mutate()
            new_pop.append(child)
        population = new_pop

    return best

# run_evolution() # Descomentar para correr el torneo de nuevo

## 🏆 Fase 5: Entrenamiento Final (Producción)

Una vez que la evolución ha encontrado el "ADN Perfecto", lo usamos para entrenar el modelo final de forma seria y reproducible.

In [None]:
# 1. Paste the winning DNA here (the result of the evolution run)
winning_dna = {
    'n_layers': 2,
    'd_model': 256,
    'n_heads': 8,
    'backbone': 'hybrid',
    'moe_experts': 0,
    'learning_rate': 0.001,
}

# 2. Fix the seed so the run can be repeated
set_seed(42)

# 3. Build the final model and its optimizer
print(f"üèóÔ∏è Construyendo Cortex-Final con ADN: {winning_dna}")
final_model = CortexOrganism(winning_dna).to(device)
optimizer = torch.optim.AdamW(
    final_model.parameters(),
    lr=winning_dna['learning_rate'],
)

# 4. Long training run (1000 iterations)
print("üî• Iniciando Entrenamiento de Producci√≥n...")
losses = []
for step in range(1000):
    xb, yb = get_batch()
    optimizer.zero_grad()
    _, loss = final_model(xb, yb)
    loss.backward()
    optimizer.step()

    current_loss = loss.item()
    losses.append(current_loss)

    # Progress report every 100 iterations
    if not step % 100:
        print(f"Iter {step}: Loss {current_loss:.4f}")

# 5. Save the trained weights
torch.save(final_model.state_dict(), "cortex_final.pth")
print("üíæ Modelo guardado como 'cortex_final.pth'")

# 6. Plot the final learning curve
plt.plot(losses)
plt.title("Curva de Aprendizaje Final")
plt.show()

In [None]:
# 7. Inferencia (Prueba de Fuego)
def generate(model, prompt, max_len=100):
    """Sample a continuation of ``prompt`` byte by byte from ``model``.

    The prompt is encoded as UTF-8 bytes. At each of ``max_len`` steps the
    whole sequence is fed to the model and the next byte is sampled from the
    softmax of the last position's logits. The model runs in eval mode
    without gradient tracking, and its previous train/eval mode is restored
    before returning.

    Args:
        model: module whose call returns ``(logits, loss)`` with logits of
            shape (batch, seq, 256).
        prompt: non-empty text to continue.
        max_len: number of bytes to generate.

    Returns:
        The prompt plus the generated bytes, decoded as UTF-8 with invalid
        sequences replaced.

    Raises:
        ValueError: if ``prompt`` encodes to zero bytes (there would be no
            position to predict from).
    """
    prompt_bytes = list(prompt.encode('utf-8'))
    if not prompt_bytes:
        raise ValueError("prompt must contain at least one byte")

    # Run on whatever device the model lives on; fall back to the CPU for a
    # parameter-less model.
    first_param = next(model.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')
    idx = torch.tensor(prompt_bytes, dtype=torch.long, device=target_device).unsqueeze(0)

    was_training = model.training
    model.eval()
    try:
        with torch.no_grad():
            for _ in range(max_len):
                logits, _loss = model(idx)
                probs = F.softmax(logits[:, -1, :], dim=-1)
                next_token = torch.multinomial(probs, 1)
                idx = torch.cat((idx, next_token), dim=1)
    finally:
        # Do not leave a model that was being trained stuck in eval mode
        model.train(was_training)
    return bytes(idx[0].tolist()).decode('utf-8', errors='replace')

# Smoke test: sample a continuation of a fixed prompt from the trained model.
sample_text = generate(final_model, "Artificial Intelligence is")
print(sample_text)