# Benchmark CPU & GPU — Colab

Script para verificar que CPU y GPU llegan a su máximo rendimiento.
Ejecuta celda por celda y observa los resultados.

In [None]:
import torch
import time
import os
import multiprocessing
import threading
import subprocess
import math
import numpy as np
from concurrent.futures import ThreadPoolExecutor, ProcessPoolExecutor

def parse_mem_total_gb(meminfo_lines):
    """Return total RAM in GB from /proc/meminfo-style lines, or None.

    The first line containing ``MemTotal`` is used; its second
    whitespace-separated field is divided by 1e6 (the same conversion the
    notebook has always applied). None is returned when no such line exists
    or its value is not a plain integer.
    """
    for line in meminfo_lines:
        if 'MemTotal' in line:
            fields = line.split()
            if len(fields) >= 2 and fields[1].isdigit():
                return int(fields[1]) / 1e6
            return None
    return None


def parse_lscpu_field(lscpu_text, field):
    """Return the value of the ``field:`` row in ``lscpu`` output, or None.

    The key must match exactly, so a row such as ``BIOS Model name`` is not
    mistaken for ``Model name``. Only the first ':' splits key from value, so
    values that contain ':' are returned whole.
    """
    for line in lscpu_text.splitlines():
        key, sep, value = line.partition(':')
        if sep and key.strip() == field:
            return value.strip()
    return None


def physical_core_count(lscpu_text):
    """Return sockets x cores-per-socket from ``lscpu`` output, or None."""
    per_socket = parse_lscpu_field(lscpu_text, 'Core(s) per socket')
    sockets = parse_lscpu_field(lscpu_text, 'Socket(s)')
    if per_socket and sockets and per_socket.isdigit() and sockets.isdigit():
        return int(per_socket) * int(sockets)
    return None


# Run lscpu once; both the physical-core count and the CPU model come from it.
# Best effort: a missing or hanging lscpu just leaves those fields unknown.
try:
    lscpu_text = subprocess.run(
        ['lscpu'], capture_output=True, text=True, timeout=5
    ).stdout
except (OSError, subprocess.SubprocessError):
    lscpu_text = ''

print("="*60)
print("INFORMACION DEL SISTEMA")
print("="*60)
print(f"CPU cores (logicos): {multiprocessing.cpu_count()}")
# os.cpu_count() also reports logical CPUs, so the physical count is derived
# from lscpu instead of being mislabelled.
physical_cores = physical_core_count(lscpu_text)
print(f"CPU cores (fisicos): {physical_cores if physical_cores is not None else 'desconocido'}")
print(f"PyTorch: {torch.__version__}")
print(f"CUDA disponible: {torch.cuda.is_available()}")
if torch.cuda.is_available():
    gpu_props = torch.cuda.get_device_properties(0)
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # The attribute is `total_memory`; `total_mem` raises AttributeError.
    print(f"VRAM total: {gpu_props.total_memory / 1e9:.1f} GB")
    print(f"Compute capability: {gpu_props.major}.{gpu_props.minor}")
print()

# RAM info (Linux only; silently skipped where /proc/meminfo is unavailable)
try:
    with open('/proc/meminfo') as f:
        ram_gb = parse_mem_total_gb(f)
except OSError:
    ram_gb = None
if ram_gb is not None:
    print(f"RAM total: {ram_gb:.1f} GB")

# CPU model
cpu_model = parse_lscpu_field(lscpu_text, 'Model name')
if cpu_model:
    print(f"CPU: {cpu_model}")

## 1. Benchmark GPU — Operaciones matriciales

In [None]:
def time_matmul(size, dtype, device, iters, warmup=3):
    """Time ``torch.mm`` on two random ``size x size`` matrices.

    Args:
        size: Side length of the square operands.
        dtype: Torch dtype of the operands (e.g. ``torch.float16``).
        device: Torch device (or device string) to run on.
        iters: Number of timed multiplications.
        warmup: Untimed multiplications run first with the same shape and
            dtype, so one-time setup cost is not charged to the first timed
            iteration.

    Returns:
        ``(seconds_per_op, tflops)``, counting ``2 * size**3`` floating-point
        operations per multiplication.
    """
    on_cuda = torch.device(device).type == 'cuda'
    a = torch.randn(size, size, device=device, dtype=dtype)
    b = torch.randn(size, size, device=device, dtype=dtype)

    for _ in range(warmup):
        torch.mm(a, b)
    # CUDA launches are asynchronous: drain the queue before starting the clock.
    if on_cuda:
        torch.cuda.synchronize()

    t0 = time.perf_counter()
    for _ in range(iters):
        torch.mm(a, b)
    # ...and again before stopping it, so the timed kernels have finished.
    if on_cuda:
        torch.cuda.synchronize()
    seconds = (time.perf_counter() - t0) / iters

    return seconds, 2 * size**3 / seconds / 1e12


if not torch.cuda.is_available():
    print("NO HAY GPU — salta esta celda")
else:
    device = torch.device('cuda')
    print("="*60)
    print("BENCHMARK GPU — Multiplicacion de matrices")
    print("="*60)

    sizes = [1024, 2048, 4096, 8192]
    # (section header or None, dtype): FP32 first, then FP16.
    precisions = [
        (None, torch.float32),
        ("\n  --- FP16 (Tensor Cores) ---", torch.float16),
    ]
    for header, dtype in precisions:
        if header is not None:
            print(header)
        for size in sizes:
            # Fewer repetitions for the largest size to keep the cell short.
            iters = 20 if size <= 4096 else 5
            elapsed, tflops = time_matmul(size, dtype, device, iters)
            print(f"  [{size}x{size}] {elapsed*1000:.2f} ms/op | {tflops:.2f} TFLOPS")

    print("\n  GPU matmul benchmark completado.")

## 2. Benchmark GPU — Inferencia de red neuronal (simula AlphaSnake)

In [None]:
import torch.nn as nn


class SimpleResNet(nn.Module):
    """Small residual CNN used as a stand-in network for inference timing.

    Args:
        ch: Channel width of every convolution after the stem.
        blocks: Number of residual blocks.
        board: Side length of the square input; fixes the size of the final
            linear layer (``ch * board * board`` inputs, 4 outputs).

    The network takes input of shape ``(batch, 4, board, board)``.
    """

    def __init__(self, ch, blocks, board):
        super().__init__()
        self.stem = nn.Sequential(nn.Conv2d(4, ch, 3, padding=1), nn.BatchNorm2d(ch), nn.ReLU())
        layers = []
        for _ in range(blocks):
            layers.append(nn.Sequential(
                nn.Conv2d(ch, ch, 3, padding=1), nn.BatchNorm2d(ch), nn.ReLU(),
                nn.Conv2d(ch, ch, 3, padding=1), nn.BatchNorm2d(ch)
            ))
        self.blocks = nn.ModuleList(layers)
        self.fc = nn.Linear(ch * board * board, 4)

    def forward(self, x):
        x = self.stem(x)
        for block in self.blocks:
            # Residual connection followed by ReLU.
            x = torch.relu(x + block(x))
        return self.fc(x.flatten(1))


def time_inference(net, x, iters=200, warmup=20):
    """Return the mean seconds per forward pass of ``net`` on input ``x``.

    Runs ``warmup`` untimed passes, then ``iters`` timed passes, all under
    ``torch.no_grad()``. When ``x`` lives on a CUDA device the queue is
    synchronised before starting and before stopping the clock, because CUDA
    launches are asynchronous.
    """
    on_cuda = x.is_cuda
    with torch.no_grad():
        for _ in range(warmup):
            net(x)
        if on_cuda:
            torch.cuda.synchronize()

        t0 = time.perf_counter()
        for _ in range(iters):
            net(x)
        if on_cuda:
            torch.cuda.synchronize()
        total = time.perf_counter() - t0
    return total / iters


if not torch.cuda.is_available():
    print("NO HAY GPU — salta esta celda")
else:
    # Defined here so the cell does not depend on another cell having run.
    device = torch.device('cuda')

    print("="*60)
    print("BENCHMARK GPU — Inferencia NN (simula AlphaSnake)")
    print("="*60)

    configs = [
        ("10x10 (64ch, 6 bloques)", 64, 6, 10),
        ("20x20 (128ch, 10 bloques)", 128, 10, 20),
    ]

    for name, ch, blocks, board in configs:
        net = SimpleResNet(ch, blocks, board).to(device).eval()
        params = sum(p.numel() for p in net.parameters())
        print(f"\n  {name} — {params:,} params")

        # Sweep batch sizes; batch=1 is the MCTS-like case.
        for batch in [1, 8, 16, 32, 64]:
            x = torch.randn(batch, 4, board, board, device=device)
            elapsed = time_inference(net, x)
            inferences_per_sec = batch / elapsed
            print(f"    batch={batch:3d}: {elapsed*1000:.3f} ms | {inferences_per_sec:.0f} inf/s")

        # Release this model's memory before building the next configuration.
        del net
        torch.cuda.empty_cache()

    print("\n  NN inference benchmark completado.")

## 3. Benchmark GPU — Uso máximo (stress test)

In [None]:
def parse_gpu_sample(csv_line):
    """Parse one ``nvidia-smi`` CSV row into a metrics dict.

    Expects the five comma-separated fields requested by this cell:
    utilization, memory used, memory total, temperature, power draw.

    Returns:
        A dict with float values under ``util``, ``mem_used``, ``mem_total``,
        ``temp`` and ``power``, or None when the row has fewer than five
        fields or any of them is not numeric (e.g. ``[N/A]``).
    """
    fields = [field.strip() for field in csv_line.split(',')]
    if len(fields) < 5:
        return None
    try:
        util, mem_used, mem_total, temp, power = (float(v) for v in fields[:5])
    except ValueError:
        return None
    return {
        'util': util,
        'mem_used': mem_used,
        'mem_total': mem_total,
        'temp': temp,
        'power': power,
    }


if not torch.cuda.is_available():
    print("NO HAY GPU — salta esta celda")
else:
    # Defined here so the cell does not depend on another cell having run.
    device = torch.device('cuda')

    print("="*60)
    print("STRESS TEST GPU — 15 segundos de carga maxima")
    print("="*60)
    print("Ejecutando... monitorea con nvidia-smi en otra celda.")

    size = 4096
    a = torch.randn(size, size, device=device)
    b = torch.randn(size, size, device=device)

    duration = 15  # seconds
    # Kernel launches are asynchronous. Without periodic synchronisation the
    # loop keeps queueing work until the deadline, and draining that backlog
    # can extend the run well beyond `duration`.
    sync_every = 10

    # Monitor thread
    gpu_utils = []
    stop_monitor = threading.Event()

    def monitor_gpu():
        """Sample nvidia-smi about once per second until asked to stop."""
        query = [
            'nvidia-smi',
            '--query-gpu=utilization.gpu,memory.used,memory.total,temperature.gpu,power.draw',
            '--format=csv,noheader,nounits',
        ]
        while not stop_monitor.is_set():
            try:
                r = subprocess.run(query, capture_output=True, text=True, timeout=5)
                output = r.stdout.strip()
            except (OSError, subprocess.SubprocessError):
                # Monitoring is best effort; the stress loop must keep running.
                output = ''
            if output:
                # nvidia-smi prints one row per GPU; only the first is used.
                sample = parse_gpu_sample(output.splitlines()[0])
                if sample is not None:
                    gpu_utils.append(sample)
            # Unlike time.sleep, wait() returns as soon as stop is requested.
            stop_monitor.wait(1)

    t = threading.Thread(target=monitor_gpu, daemon=True)
    t.start()

    ops = 0
    t_start = time.perf_counter()
    try:
        while time.perf_counter() - t_start < duration:
            torch.mm(a, b)
            ops += 1
            if ops % sync_every == 0:
                torch.cuda.synchronize()
        torch.cuda.synchronize()
        # Stop the clock before shutting the monitor down, so waiting on the
        # thread does not deflate the reported throughput.
        elapsed = time.perf_counter() - t_start
    finally:
        stop_monitor.set()
        t.join(timeout=6)

    avg_tflops = (2 * size**3 * ops) / elapsed / 1e12

    print(f"\n  Duracion: {elapsed:.1f}s")
    print(f"  Operaciones: {ops}")
    print(f"  Rendimiento promedio: {avg_tflops:.2f} TFLOPS (FP32)")

    if gpu_utils:
        avg_util = np.mean([g['util'] for g in gpu_utils])
        max_util = max(g['util'] for g in gpu_utils)
        avg_temp = np.mean([g['temp'] for g in gpu_utils])
        avg_power = np.mean([g['power'] for g in gpu_utils])
        max_mem = max(g['mem_used'] for g in gpu_utils)
        print(f"\n  GPU Utilization: avg={avg_util:.0f}%, max={max_util:.0f}%")
        print(f"  VRAM: {max_mem:.0f} / {gpu_utils[0]['mem_total']:.0f} MiB")
        print(f"  Temperatura: {avg_temp:.0f}°C")
        print(f"  Potencia: {avg_power:.0f}W")

    del a, b
    torch.cuda.empty_cache()
    print("\n  Stress test GPU completado.")

## 4. Benchmark CPU — Single-thread y Multi-thread

In [None]:
print("="*60, "BENCHMARK CPU", "="*60, sep="\n")


def cpu_work(n=2_000_000):
    """Pure-Python CPU-bound workload used by the CPU benchmarks.

    Accumulates ``sin(k) * cos(k) * sqrt(k)`` for every integer ``k`` in
    ``[1, n)`` and returns the float total (``0.0`` when ``n <= 1``).
    """
    acc = 0.0
    for k in range(1, n):
        acc += math.sin(k) * math.cos(k) * math.sqrt(k)
    return acc

# --- Single thread ---
# Baseline: one full-size job. Every pool run below is compared against the
# time the same amount of work would take if run serially at this rate.
baseline_work = 2_000_000
print("\n  --- Single Thread ---")
t0 = time.perf_counter()
cpu_work(baseline_work)
single_time = time.perf_counter() - t0
print(f"  1 thread: {single_time:.2f}s")


def timed_pool_run(executor_cls, n_workers, work_size):
    """Run ``n_workers`` ``cpu_work(work_size)`` jobs in a pool; return seconds.

    ``executor_cls`` is ``ThreadPoolExecutor`` or ``ProcessPoolExecutor``. The
    timing covers pool start-up, all jobs, and pool shutdown.
    """
    start = time.perf_counter()
    with executor_cls(max_workers=n_workers) as pool:
        futures = [pool.submit(cpu_work, work_size) for _ in range(n_workers)]
        for future in futures:
            # result() re-raises any exception raised inside the worker.
            future.result()
    return time.perf_counter() - start


def pool_speedup(n_workers, work_size, elapsed):
    """Return estimated serial time for the same total work, over ``elapsed``."""
    serial_estimate = single_time * (work_size / baseline_work) * n_workers
    return serial_estimate / elapsed


# --- Multi thread (ThreadPool — shows the GIL bottleneck) ---
print("\n  --- Multi Thread (ThreadPoolExecutor) ---")
print("  NOTA: Python tiene GIL, asi que threads NO paralelizan CPU puro.")
print("  Esto muestra el overhead real del MCTS paralelo.")
n_threads_list = [1, 4, 8, 16, 32]
work_per_thread = 500_000

for n_threads in n_threads_list:
    elapsed = timed_pool_run(ThreadPoolExecutor, n_threads, work_per_thread)
    # Previously computed but never shown; reported like the process section.
    speedup = pool_speedup(n_threads, work_per_thread, elapsed)
    print(f"  {n_threads:2d} threads: {elapsed:.2f}s (total work = {n_threads}x) | speedup: {speedup:.2f}x")

# --- Multi process (ProcessPool — real parallelism) ---
print("\n  --- Multi Process (ProcessPoolExecutor) ---")
print("  Esto SI paraleliza en multiples cores.")
work_per_process = 1_000_000

for n_procs in [1, 2, 4, 8]:
    elapsed = timed_pool_run(ProcessPoolExecutor, n_procs, work_per_process)
    speedup = pool_speedup(n_procs, work_per_process, elapsed)
    print(f"  {n_procs} procesos: {elapsed:.2f}s | speedup: {speedup:.2f}x")

print("\n  CPU benchmark completado.")

## 5. Benchmark CPU + GPU — Simula carga tipo MCTS

In [None]:
print(
    "="*60,
    "BENCHMARK — Simula carga MCTS (Python loops + GPU inference)",
    "="*60,
    "Esto replica el patron real: loops Python + inferencia GPU.",
    sep="\n",
)

import torch.nn as nn

# Use the GPU when one is present, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device('cuda')
else:
    device = torch.device('cpu')


# Small network used to simulate inference
class TinyNet(nn.Module):
    """Two-convolution network with a linear head producing 5 outputs.

    Args:
        board: Side length of the square input; sizes the linear layer.
        ch: Channel width of both convolutions.

    The network takes input of shape ``(batch, 4, board, board)``.
    """

    def __init__(self, board=20, ch=128):
        super().__init__()
        conv_layers = [
            nn.Conv2d(4, ch, 3, padding=1),
            nn.BatchNorm2d(ch),
            nn.ReLU(),
            nn.Conv2d(ch, ch, 3, padding=1),
            nn.BatchNorm2d(ch),
            nn.ReLU(),
        ]
        self.conv = nn.Sequential(*conv_layers)
        self.fc = nn.Linear(ch * board * board, 5)

    def forward(self, x):
        features = self.conv(x)
        flat = features.flatten(1)
        return self.fc(flat)


net = TinyNet(20, 128)
net = net.to(device)
net = net.eval()

def simulate_mcts_game(net, board_size, num_sims, max_moves):
    """Simulate the cost pattern of one MCTS game: Python loops + inference.

    For each of ``max_moves`` moves, runs ``num_sims`` iterations, each doing
    a small amount of pure-Python work followed by one batch-1 forward pass
    of ``net`` on a random ``(1, 4, board_size, board_size)`` input.

    Inputs are created on the device of ``net``'s first parameter (CPU if the
    network has no parameters), so the function does not depend on a
    module-level ``device`` matching the network.

    Returns:
        ``(moves, total_inferences)``: moves played and forward passes run.
    """
    first_param = next(net.parameters(), None)
    target_device = first_param.device if first_param is not None else torch.device('cpu')

    moves = 0
    total_inferences = 0
    for _ in range(max_moves):
        # Simulate num_sims MCTS evaluations
        for _ in range(num_sims):
            # Deliberate Python-side busywork (stands in for tree traversal);
            # the result is intentionally discarded.
            sum(range(50))
            # Inference; no_grad is entered per simulation, as in the
            # per-evaluation pattern being measured.
            x = torch.randn(1, 4, board_size, board_size, device=target_device)
            with torch.no_grad():
                net(x)
            total_inferences += 1
        moves += 1
    return moves, total_inferences

def sync_if_cuda():
    """Wait for queued CUDA work so wall-clock timings cover it.

    Does nothing on the CPU fallback, where torch.cuda.synchronize() would
    raise because no CUDA device is available.
    """
    if device.type == 'cuda':
        torch.cuda.synchronize()


# Test: one short game (10 moves, 100 sims)
print("\n  Simulando 1 juego (10 moves x 100 sims = 1000 inferences)...")
t0 = time.perf_counter()
moves, infs = simulate_mcts_game(net, 20, 100, 10)
sync_if_cuda()
elapsed = time.perf_counter() - t0
print(f"  Tiempo: {elapsed:.2f}s")
print(f"  Inferences: {infs}")
print(f"  Inferences/s: {infs/elapsed:.0f}")
print(f"  Tiempo por movimiento: {elapsed/moves*1000:.1f} ms")
print(f"  Tiempo por simulacion: {elapsed/infs*1000:.2f} ms")

# Extrapolate to a full game
avg_moves_20x20 = 500  # conservative estimate
est_time_per_game = elapsed / moves * avg_moves_20x20
print(f"\n  --- Estimacion para juego completo 20x20 ---")
print(f"  ~{avg_moves_20x20} moves x 100 sims = {avg_moves_20x20*100:,} inferences")
print(f"  Tiempo estimado: {est_time_per_game:.0f}s ({est_time_per_game/60:.1f} min)")
print(f"  100 juegos (1 iter fast): {est_time_per_game*100/3600:.1f} horas")

# Now with threading (like the InferenceBatcher)
print("\n  --- Con 16 threads paralelos (como InferenceBatcher) ---")


def simulate_short_game(net, board_size):
    """Run one short simulated game: 10 moves of 100 simulations each."""
    return simulate_mcts_game(net, board_size, 100, 10)


t0 = time.perf_counter()
with ThreadPoolExecutor(max_workers=16) as pool:
    futures = [pool.submit(simulate_short_game, net, 20) for _ in range(16)]
    results = [f.result() for f in futures]
sync_if_cuda()
elapsed_parallel = time.perf_counter() - t0
total_infs = sum(r[1] for r in results)
# Time 16 sequential games would take, divided by the parallel wall time.
speedup = (elapsed * 16) / elapsed_parallel

print(f"  16 juegos paralelos: {elapsed_parallel:.2f}s")
print(f"  Total inferences: {total_infs:,}")
print(f"  Inferences/s: {total_infs/elapsed_parallel:.0f}")
print(f"  Speedup vs secuencial: {speedup:.2f}x")

est_parallel = est_time_per_game * 100 / speedup
print(f"  100 juegos con paralelismo: {est_parallel/3600:.1f} horas")

# GPU state right after the run. The sample is taken once the work has
# finished, so it is a snapshot, not an average over the benchmark.
try:
    r = subprocess.run(
        ['nvidia-smi', '--query-gpu=utilization.gpu,memory.used',
         '--format=csv,noheader,nounits'],
        capture_output=True, text=True, timeout=5
    )
    gpu_row = r.stdout.strip()
except (OSError, subprocess.SubprocessError):
    # Best effort: no nvidia-smi (e.g. CPU runtime) means nothing to report.
    gpu_row = ''
if gpu_row:
    # One "util, mem" row per GPU; report the first, with each value next to
    # its own unit instead of printing the raw CSV before "% util".
    gpu_fields = [value.strip() for value in gpu_row.splitlines()[0].split(',')]
    if len(gpu_fields) >= 2:
        print(f"\n  GPU actual: {gpu_fields[0]}% util, {gpu_fields[1]} MiB used")

del net
torch.cuda.empty_cache()
print("\n  MCTS simulation benchmark completado.")

## 6. Resumen y recomendaciones

In [None]:
# Banner followed by the static interpretation guide for the cells above.
print("="*60, "RESUMEN", "="*60, sep="\n")
print(
    """
INTERPRETACION DE RESULTADOS:

1. GPU Matmul:
   - T4:  ~8 TFLOPS FP32, ~65 TFLOPS FP16
   - A100: ~20 TFLOPS FP32, ~312 TFLOPS FP16
   - H100: ~50 TFLOPS FP32, ~990 TFLOPS FP16
   Si ves numeros cercanos a estos, la GPU esta rindiendo bien.

2. GPU Stress Test:
   - Deberia mostrar ~95-100% utilization.
   - Si es menor, hay throttling termico o de potencia.

3. NN Inference:
   - batch=1 es LENTO en cualquier GPU (latencia domina)
   - batch=16+ es donde la GPU brilla
   - Esto explica por que MCTS (batch=1) no aprovecha la GPU.

4. CPU Threads vs Processes:
   - ThreadPool: NO acelera trabajo CPU (GIL de Python)
   - ProcessPool: SI acelera (verdadero paralelismo)
   - MCTS usa threads (para compartir GPU) -> limitado por GIL

5. Simulacion MCTS:
   - Muestra el cuello de botella real: Python loops + batch=1
   - El speedup con threads viene de la GPU batching, no de CPU

CONCLUSION:
   La GPU SI funciona bien. El problema es que MCTS en Python
   no puede saturar la GPU porque el cuello de botella es el
   loop Python (GIL + overhead de objetos). La solucion real
   seria MCTS en C++ con bindings Python.
"""
)