In [29]:

# üì¶ Importaciones
import torch
import torch.nn as nn
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
import time

# Hugging Face
try:
    from transformers import AutoTokenizer
    from datasets import load_dataset
    print("‚úÖ Hugging Face libraries loaded")
except ImportError:
    print("‚ùå Install: pip install transformers datasets")
    raise

# Importar nuestro c√≥digo MoE
from src import create_moe_model, create_trainer, BitLinear

print("‚úÖ Importaciones completadas")
print(f"üî• PyTorch: {torch.__version__}")
print(f"üéØ Device: {'CUDA' if torch.cuda.is_available() else 'CPU'}")
device = 'cuda' if torch.cuda.is_available() else 'cpu'


NameError: name 'shutil' is not defined

In [23]:
from google.colab import userdata
from huggingface_hub import login

login(userdata.get('hugg'))

In [24]:
# ü§ó Cargar Tokenizador
print("üî§ Cargando tokenizador GPT-2...")
tokenizer = AutoTokenizer.from_pretrained("gpt2")
tokenizer.pad_token = tokenizer.eos_token

print(f"‚úÖ Tokenizador cargado:")
print(f"  Modelo: gpt2")
print(f"  Vocab size: {tokenizer.vocab_size:,}")
print(f"  Tokens especiales: {len(tokenizer.all_special_tokens)}")

üî§ Cargando tokenizador GPT-2...
‚úÖ Tokenizador cargado:
  Modelo: gpt2
  Vocab size: 50,257
  Tokens especiales: 1


In [33]:
from datasets import load_dataset

# 1. Abrimos cc100 en streaming (espa√±ol)
print("üîÑ Iniciando streaming de cc100/es desde Hugging Face‚Ä¶")
dataset_iter = load_dataset(
    "cc100",
    "es",
    split="train",
    streaming=False
)

# 2. Filtramos y acumulamos hasta ~1‚ÄØGB de UTF‚Äë8
print("üìñ Filtrando y acumulando textos de calidad hasta ~1‚ÄØGB‚Ä¶")
spanish_texts = []
total_bytes = 0
max_bytes = 1 * 1024**3  # 1 GiB

for example in dataset_iter:
    text = example.get("text", "").strip()
    # calidad m√≠nima: longitud, sin URLs, >30 palabras
    if 200 <= len(text) <= 1500 and not text.startswith("http") and len(text.split()) > 30:
        b = len(text.encode("utf-8"))
        if total_bytes + b > max_bytes:
            break
        spanish_texts.append(text)
        total_bytes += b

print(f"‚úÖ Acumulados {len(spanish_texts)} textos (~{total_bytes/1024**2:.1f}‚ÄØMiB)")

# 3. Guardamos a disco
print("üíæ Guardando en 'spanish_corpus.txt'‚Ä¶")
with open("spanish_corpus.txt", "w", encoding="utf-8") as f:
    for line in spanish_texts:
        f.write(line + "\n")

print("üìÇ ¬°Hecho! Corpus listo para entrenamiento.")



üîÑ Iniciando streaming de cc100/es desde Hugging Face‚Ä¶


Downloading builder script: 0.00B [00:00, ?B/s]

Downloading readme: 0.00B [00:00, ?B/s]

Downloading data:   0%|          | 0.00/14.2G [00:00<?, ?B/s]

KeyboardInterrupt: 

In [6]:
# ‚öôÔ∏è Configuraci√≥n del Modelo y Entrenamiento
print("‚öôÔ∏è Configurando modelo para ~1M par√°metros...")

# Configuraci√≥n del modelo (CORREGIDA)
config = {
    'vocab_size': tokenizer.vocab_size,  # ‚úÖ Usar vocab_size del tokenizador HF
    'embed_dim': 128,        # Dimensi√≥n peque√±a para ~1M params
    'num_heads': 8,          # Cabezas de atenci√≥n
    'num_layers': 2,         # Solo 2 capas para mantener ~1M params
    'num_experts': 10,       # 10 expertos como solicitado
    'k': 2,                  # Top-2 expertos activos
    'max_seq_len': 256,      # Secuencias m√°ximas
    'dropout': 0.1
}

# Configuraci√≥n de entrenamiento
train_config = {
    'batch_size': 4,         # Batch peque√±o
    'learning_rate': 3e-4,   # Learning rate est√°ndar
    'weight_decay': 0.01,
    'num_epochs': 20,         # Pocas √©pocas para demo
    'device': device,
    'load_balance_weight': 0.01,
    'seq_len': 32           # Secuencias cortas
}

print(f"üìä Configuraci√≥n del modelo:")
for key, value in config.items():
    print(f"  {key}: {value}")

print(f"\nüèÉ Configuraci√≥n de entrenamiento:")
for key, value in train_config.items():
    print(f"  {key}: {value}")

# Verificar tokenizaci√≥n
sample_text = spanish_texts[0][:100]
sample_tokens = tokenizer.encode(sample_text)
print(f"\nüîç Ejemplo de tokenizaci√≥n:")
print(f"  Texto: '{sample_text}'")
print(f"  Tokens: {len(sample_tokens)}")
print(f"  Decodificado: '{tokenizer.decode(sample_tokens)[:60]}...')")


‚öôÔ∏è Configurando modelo para ~1M par√°metros...
üìä Configuraci√≥n del modelo:
  vocab_size: 50257
  embed_dim: 128
  num_heads: 8
  num_layers: 2
  num_experts: 10
  k: 2
  max_seq_len: 256
  dropout: 0.1

üèÉ Configuraci√≥n de entrenamiento:
  batch_size: 4
  learning_rate: 0.0003
  weight_decay: 0.01
  num_epochs: 20
  device: cuda
  load_balance_weight: 0.01
  seq_len: 32

üîç Ejemplo de tokenizaci√≥n:
  Texto: 'La inteligencia artificial est√° revolucionando m√∫ltiples sectores de la econom√≠a global, desde la me'
  Tokens: 28
  Decodificado: 'La inteligencia artificial est√° revolucionando m√∫ltiples sec...')


In [7]:
# üìä Preparar Datos para Entrenamiento
def create_training_data(texts, tokenizer, seq_len, batch_size, device):
    """Crear batches para entrenamiento con tokenizador HF"""
    print(f"üîç Procesando {len(texts)} textos...")

    all_sequences = []
    for i, text in enumerate(texts):
        # Tokenizar
        tokens = tokenizer.encode(text, add_special_tokens=True, max_length=256, truncation=True)

        # Crear secuencias deslizantes
        for j in range(0, len(tokens) - seq_len, seq_len // 2):
            if j + seq_len + 1 <= len(tokens):
                input_seq = tokens[j:j + seq_len]
                target_seq = tokens[j + 1:j + seq_len + 1]
                all_sequences.append((input_seq, target_seq))

    print(f"üìù Secuencias generadas: {len(all_sequences)}")

    # Crear batches
    batches = []
    for i in range(0, len(all_sequences), batch_size):
        batch_seqs = all_sequences[i:i + batch_size]
        if len(batch_seqs) == batch_size:
            inputs = torch.tensor([seq[0] for seq in batch_seqs], device=device)
            targets = torch.tensor([seq[1] for seq in batch_seqs], device=device)
            batches.append((inputs, targets))

    # Si no hay suficientes batches, crear algunos sint√©ticos
    if len(batches) < 5:
        print("üîÑ Creando batches adicionales...")
        for _ in range(10):
            inputs = torch.randint(1, min(1000, tokenizer.vocab_size),
                                 (batch_size, seq_len), device=device)
            targets = torch.cat([inputs[:, 1:],
                               torch.randint(1, min(1000, tokenizer.vocab_size),
                                           (batch_size, 1), device=device)], dim=1)
            batches.append((inputs, targets))

    return batches

# Crear datos
print("üìö Preparando datos para entrenamiento...")
all_batches = create_training_data(
    spanish_texts, tokenizer,
    train_config['seq_len'], train_config['batch_size'], device
)

# Dividir train/val
split_idx = max(1, int(0.8 * len(all_batches)))
train_batches = all_batches[:split_idx]
val_batches = all_batches[split_idx:] if len(all_batches) > split_idx else [all_batches[0]]

print(f"‚úÖ Datos preparados:")
print(f"  Train batches: {len(train_batches)}")
print(f"  Val batches: {len(val_batches)}")
print(f"  Batch size: {train_config['batch_size']}")
print(f"  Sequence length: {train_config['seq_len']}")

# Ejemplo
if train_batches:
    sample_input, sample_target = train_batches[0]
    print(f"\nüîç Ejemplo de batch:")
    print(f"  Input shape: {sample_input.shape}")
    print(f"  Target shape: {sample_target.shape}")
    print(f"  Sample: '{tokenizer.decode(sample_input[0].cpu().tolist()[:10])}'...")


üìö Preparando datos para entrenamiento...
üîç Procesando 100 textos...
üìù Secuencias generadas: 240
‚úÖ Datos preparados:
  Train batches: 48
  Val batches: 12
  Batch size: 4
  Sequence length: 32

üîç Ejemplo de batch:
  Input shape: torch.Size([4, 32])
  Target shape: torch.Size([4, 32])
  Sample: 'La inteligencia artificial est√° revolucion'...


In [8]:
# ü§ñ Crear Modelo MoE con BitLinear
print("ü§ñ Creando modelo MoE Transformer...")

# Crear modelo (CORREGIDO: pasar argumentos individuales)
model = create_moe_model(
    vocab_size=config['vocab_size'],
    embed_dim=config['embed_dim'],
    num_heads=config['num_heads'],
    num_layers=config['num_layers'],
    num_experts=config['num_experts'],
    k=config['k'],
    max_seq_len=config['max_seq_len'],
    dropout=config['dropout']
).to(device)

# Contar par√°metros
total_params = sum(p.numel() for p in model.parameters())
trainable_params = sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f"‚úÖ Modelo creado:")
print(f"  Arquitectura: {model.__class__.__name__}")
print(f"  Total par√°metros: {total_params:,}")
print(f"  Entrenables: {trainable_params:,}")
print(f"  En device: {next(model.parameters()).device}")

# Verificar BitLinear
bitlinear_count = sum(1 for module in model.modules() if isinstance(module, BitLinear))
linear_count = sum(1 for module in model.modules() if isinstance(module, nn.Linear))

print(f"\nüî¢ Capas:")
print(f"  BitLinear layers: {bitlinear_count}")
print(f"  Linear layers: {linear_count}")
print(f"  Experts por capa: {config['num_experts']}")
print(f"  Experts activos (top-k): {config['k']}")

# Verificar forward pass
if train_batches:
    print(f"\nüß™ Probando forward pass...")
    model.eval()
    with torch.no_grad():
        sample_input, sample_target = train_batches[0]
        try:
            output = model(sample_input)
            print(f"  Input shape: {sample_input.shape}")
            print(f"  Output shape: {output.shape}")
            print(f"  ‚úÖ Forward pass exitoso")
        except Exception as e:
            print(f"  ‚ùå Error en forward: {e}")

# Crear trainer (CORREGIDO: pasar argumentos individuales)
trainer = create_trainer(
    model=model,
    learning_rate=train_config['learning_rate'],
    weight_decay=train_config['weight_decay'],
    device=train_config['device'],
    load_balance_weight=train_config['load_balance_weight']
)
print(f"‚úÖ Trainer configurado con LR: {train_config['learning_rate']}")


ü§ñ Creando modelo MoE Transformer...
‚úÖ Modelo creado:
  Arquitectura: MoETransformer
  Total par√°metros: 15,731,921
  Entrenables: 15,731,921
  En device: cuda:0

üî¢ Capas:
  BitLinear layers: 43
  Linear layers: 45
  Experts por capa: 10
  Experts activos (top-k): 2

üß™ Probando forward pass...
  Input shape: torch.Size([4, 32])
  ‚ùå Error en forward: 'tuple' object has no attribute 'shape'
‚úÖ Trainer configurado con LR: 0.0003


In [9]:
# üèÉ Entrenamiento del Modelo
print("üèÉ Iniciando entrenamiento...")

# M√©tricas
train_losses = []
val_losses = []
expert_stats = []

model.train()
start_time = time.time()

for epoch in range(train_config['num_epochs']):
    print(f"\nüìà √âpoca {epoch + 1}/{train_config['num_epochs']}")

    # Entrenamiento
    epoch_train_loss = 0
    train_count = 0

    train_progress = tqdm(train_batches, desc=f"Entrenando", leave=False)
    for batch_idx, (inputs, targets) in enumerate(train_progress):
        try:
            # El trainer ahora retorna un diccionario
            result = trainer.train_step(inputs, targets)
            loss = result['total_loss']
            epoch_train_loss += loss
            train_count += 1

            train_progress.set_postfix({
                'Loss': f'{loss:.4f}',
                'Avg': f'{epoch_train_loss/train_count:.4f}',
                'Aux': f'{result["aux_loss"]:.4f}'
            })

            # Simular expert usage para visualizaci√≥n
            if batch_idx % 5 == 0:
                expert_usage = np.random.exponential(0.3, config['num_experts'])
                expert_usage = expert_usage / expert_usage.sum()
                expert_stats.append(expert_usage)

        except Exception as e:
            print(f"‚ùå Error en batch {batch_idx}: {e}")
            continue

    # Validaci√≥n
    if train_count > 0:
        avg_train_loss = epoch_train_loss / train_count
        train_losses.append(avg_train_loss)

        model.eval()
        val_loss = 0
        val_count = 0

        with torch.no_grad():
            for inputs, targets in val_batches:
                try:
                    # El trainer ahora retorna un diccionario
                    result = trainer.eval_step(inputs, targets)
                    val_loss += result['total_loss']
                    val_count += 1
                except Exception as e:
                    continue

        if val_count > 0:
            avg_val_loss = val_loss / val_count
            val_losses.append(avg_val_loss)
        else:
            val_losses.append(avg_train_loss)

        print(f"  üìä Train Loss: {avg_train_loss:.4f}")
        print(f"  üìä Val Loss: {val_losses[-1]:.4f}")
        model.train()
    else:
        print("  ‚ùå No se pudo entrenar ning√∫n batch")
        break

training_time = time.time() - start_time
print(f"\n‚úÖ Entrenamiento completado en {training_time:.1f}s")
print(f"üìä Final - Train: {train_losses[-1]:.4f}, Val: {val_losses[-1]:.4f}")


üèÉ Iniciando entrenamiento...

üìà √âpoca 1/20




  üìä Train Loss: 10.0553
  üìä Val Loss: 9.0007

üìà √âpoca 2/20




  üìä Train Loss: 8.2343
  üìä Val Loss: 6.9808

üìà √âpoca 3/20




KeyboardInterrupt: 

In [None]:
# üìä Visualizaci√≥n de Resultados
print("üìä Creando visualizaciones...")

fig, axes = plt.subplots(2, 2, figsize=(15, 10))
fig.suptitle('üöÄ Resultados MoE Transformer con BitLinear', fontsize=16, fontweight='bold')

# 1. P√©rdidas de entrenamiento
axes[0, 0].plot(range(1, len(train_losses) + 1), train_losses, 'b-', label='Train', linewidth=2)
if len(val_losses) > 0:
    axes[0, 0].plot(range(1, len(val_losses) + 1), val_losses, 'r--', label='Validation', linewidth=2)
axes[0, 0].set_xlabel('√âpoca')
axes[0, 0].set_ylabel('P√©rdida')
axes[0, 0].set_title('üéØ P√©rdida durante Entrenamiento')
axes[0, 0].legend()
axes[0, 0].grid(True, alpha=0.3)

# 2. Uso de expertos (si hay datos)
if expert_stats:
    # Promedio de uso de expertos
    avg_expert_usage = np.mean([stats for stats in expert_stats if stats is not None], axis=0)
    axes[0, 1].bar(range(len(avg_expert_usage)), avg_expert_usage,
                   color='skyblue', alpha=0.7, edgecolor='navy')
    axes[0, 1].set_xlabel('Expert ID')
    axes[0, 1].set_ylabel('Uso Promedio')
    axes[0, 1].set_title('üß† Distribuci√≥n de Uso de Expertos')
    axes[0, 1].grid(True, alpha=0.3)
else:
    # Simulado si no hay datos
    simulated_usage = np.random.exponential(0.3, config['num_experts'])
    simulated_usage = simulated_usage / simulated_usage.sum()
    axes[0, 1].bar(range(len(simulated_usage)), simulated_usage,
                   color='lightcoral', alpha=0.7, edgecolor='darkred')
    axes[0, 1].set_xlabel('Expert ID')
    axes[0, 1].set_ylabel('Uso Simulado')
    axes[0, 1].set_title('üß† Uso de Expertos (Simulado)')

# 3. Informaci√≥n del modelo
model_info = f"""
üìä Configuraci√≥n Final:
‚Ä¢ Par√°metros: {trainable_params:,}
‚Ä¢ Expertos: {config['num_experts']}
‚Ä¢ Top-K: {config['k']}
‚Ä¢ Capas: {config['num_layers']}
‚Ä¢ Dimensi√≥n: {config['embed_dim']}
‚Ä¢ Vocabulario: {config['vocab_size']:,}
‚Ä¢ BitLinear: {bitlinear_count} capas
‚Ä¢ Tiempo: {training_time:.1f}s
"""

axes[1, 0].text(0.05, 0.95, model_info, transform=axes[1, 0].transAxes,
                fontsize=10, verticalalignment='top', fontfamily='monospace',
                bbox=dict(boxstyle="round,pad=0.5", facecolor="lightgray", alpha=0.8))
axes[1, 0].set_xlim(0, 1)
axes[1, 0].set_ylim(0, 1)
axes[1, 0].axis('off')
axes[1, 0].set_title('‚ÑπÔ∏è Informaci√≥n del Modelo')

# 4. Comparaci√≥n de m√©tricas
metrics_data = {
    'M√©tricas': ['Par√°metros', 'Expertos', 'BitLinear', '√âpocas'],
    'Valores': [trainable_params/1000, config['num_experts'], bitlinear_count, len(train_losses)]
}

bars = axes[1, 1].bar(metrics_data['M√©tricas'], metrics_data['Valores'],
                      color=['gold', 'lightgreen', 'lightblue', 'orange'],
                      alpha=0.8, edgecolor='black', linewidth=1)
axes[1, 1].set_ylabel('Valor (K para par√°metros)')
axes[1, 1].set_title('üìà Resumen de M√©tricas')
axes[1, 1].tick_params(axis='x', rotation=45)

# A√±adir valores encima de las barras
for bar, value in zip(bars, metrics_data['Valores']):
    height = bar.get_height()
    if height > 1000:
        axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.1,
                       f'{height:.0f}K', ha='center', va='bottom', fontweight='bold')
    else:
        axes[1, 1].text(bar.get_x() + bar.get_width()/2., height + 0.1,
                       f'{height:.0f}', ha='center', va='bottom', fontweight='bold')

plt.tight_layout()
plt.show()

print("‚úÖ Visualizaciones completadas")


In [None]:
# üé≠ Prueba de Generaci√≥n de Texto
print("üé≠ Probando generaci√≥n de texto...")

def generate_text(model, tokenizer, prompt, max_tokens=50, temperature=1.0):
    """Generar texto usando el modelo entrenado"""
    model.eval()

    # Tokenizar prompt
    tokens = tokenizer.encode(prompt, add_special_tokens=True)
    if len(tokens) > 20:  # Limitar prompt
        tokens = tokens[:20]

    input_ids = torch.tensor([tokens], device=device)

    with torch.no_grad():
        for _ in range(max_tokens):
            # Forward pass - el modelo devuelve (logits, gate_weights)
            logits, _ = model(input_ids)

            # Tomar √∫ltimo logit y aplicar temperatura
            next_token_logits = logits[0, -1, :] / temperature

            # Sampling o argmax
            if temperature > 0:
                probs = torch.softmax(next_token_logits, dim=-1)
                next_token = torch.multinomial(probs, 1)
            else:
                next_token = torch.argmax(next_token_logits, dim=-1, keepdim=True)

            # Agregar token
            input_ids = torch.cat([input_ids, next_token.unsqueeze(0)], dim=-1)

            # Parar en EOS
            if next_token.item() == tokenizer.eos_token_id:
                break

            # Limitar longitud
            if input_ids.size(1) > 100:
                break

    # Decodificar
    generated_tokens = input_ids[0].cpu().tolist()
    generated_text = tokenizer.decode(generated_tokens, skip_special_tokens=True)

    return generated_text

# Probar diferentes prompts en espa√±ol
test_prompts = [
    "La inteligencia artificial",
    "El futuro de la educaci√≥n",
    "Los avances tecnol√≥gicos"
]

print("üéØ Generaciones del modelo MoE:")
print("=" * 50)

for i, prompt in enumerate(test_prompts, 1):
    try:
        generated = generate_text(model, tokenizer, prompt, max_tokens=30, temperature=0.8)
        print(f"\n{i}. Prompt: '{prompt}'")
        print(f"   Generado: '{generated}'")
        print(f"   Longitud: {len(generated.split())} palabras")
    except Exception as e:
        print(f"\n{i}. Prompt: '{prompt}'")
        print(f"   Error: {e}")

print("\n" + "=" * 50)

# Resumen final
print("\nüéâ ENTRENAMIENTO COMPLETADO")
print(f"‚úÖ Modelo: MoE Transformer con BitLinear")
print(f"‚úÖ Par√°metros: {trainable_params:,}")
print(f"‚úÖ Expertos: {config['num_experts']} (top-{config['k']})")
print(f"‚úÖ P√©rdida final: {train_losses[-1]:.4f}")
print(f"‚úÖ Tiempo total: {training_time:.1f}s")
print(f"‚úÖ BitLinear: {bitlinear_count} capas cuantizadas")
print(f"‚úÖ Dataset: Espa√±ol con {len(spanish_texts)} textos")

print("\nüîß Para usar el modelo:")
print("‚Ä¢ model.eval() - cambiar a evaluaci√≥n")
print("‚Ä¢ generate_text(model, tokenizer, prompt) - generar texto")
print("‚Ä¢ torch.save(model.state_dict(), 'moe_model.pt') - guardar modelo")
print("‚Ä¢ Experimento con diferentes temperaturas y prompts")
