# 🚀 Kuat GPU Decode - Performance Test

This notebook demonstrates GPU-accelerated VQ decode for Kuat archives.

**Key insight**: VQ decode is just a table lookup (`codebook[indices]`), which GPUs do ~100x faster than CPUs.

## Setup
1. Runtime → Change runtime type → **T4 GPU**
2. Run all cells
3. (Optional) Upload a real .qvq file to test with actual data

In [None]:
# Report the PyTorch build and, when CUDA is usable, the first GPU's name and memory
import torch

cuda_ready = torch.cuda.is_available()
print(f"PyTorch: {torch.__version__}")
print(f"CUDA available: {cuda_ready}")
if cuda_ready:
    print(f"GPU: {torch.cuda.get_device_name(0)}")
    # total_memory is reported in bytes; show it in decimal gigabytes
    total_gb = torch.cuda.get_device_properties(0).total_memory / 1e9
    print(f"Memory: {total_gb:.1f} GB")

In [None]:
import numpy as np
import time

class GPUCodebookDecoder:
    """
    GPU-accelerated VQ decode using PyTorch advanced indexing.

    The codebook is moved to ``device`` once, at construction. Each decode
    call then only transfers the index array and evaluates
    ``codebook[indices]`` on that device.

    Args:
        codebook_np: 2-D NumPy array of shape ``(codebook_size, entry_len)``
            where ``entry_len == patch_size * patch_size * channels``.
        device: Torch device that holds the codebook (default ``"cuda"``).
        patch_size: Side length of the square pixel patch stored in each
            codebook entry (default 2, i.e. 12 values per RGB entry).

    Raises:
        ValueError: If ``patch_size`` is not positive, the codebook is not
            2-D, or its entry length is not a non-zero multiple of
            ``patch_size ** 2``.
    """

    def __init__(self, codebook_np, device="cuda", patch_size=2):
        if patch_size < 1:
            raise ValueError(f"patch_size must be >= 1, got {patch_size}")
        if codebook_np.ndim != 2:
            raise ValueError(
                f"codebook must be 2-D (entries, values), got shape {codebook_np.shape}"
            )

        # The channel count is implied by the entry length, so grayscale or
        # RGBA codebooks work without extra arguments.
        pixels_per_patch = patch_size * patch_size
        entry_len = codebook_np.shape[1]
        channels, leftover = divmod(entry_len, pixels_per_patch)
        if leftover or channels == 0:
            raise ValueError(
                f"codebook entry length {entry_len} is not a non-zero multiple "
                f"of patch_size**2 = {pixels_per_patch}"
            )

        self.device = device
        self.patch_size = patch_size
        self.channels = channels
        self.codebook = torch.from_numpy(codebook_np).to(device)
        print(f"Codebook on {device}: {self.codebook.shape}")

    def decode_batch(self, indices, width, height, layout="NCHW"):
        """Decode patch indices to images on the decoder's device.

        Args:
            indices: NumPy integer array of shape ``(batch, patches)`` with
                ``patches == (height // patch_size) * (width // patch_size)``,
                in row-major patch order.
            width: Image width in pixels.
            height: Image height in pixels.
            layout: ``"NCHW"`` returns ``(batch, channels, height, width)``;
                any other value returns ``(batch, height, width, channels)``.

        Returns:
            Tensor on ``self.device`` with the codebook's dtype.

        Raises:
            RuntimeError: From the reshape, when the index count or the
                image dimensions do not match the patch grid.
        """
        batch_size = indices.shape[0]
        patch = self.patch_size
        patches_h, patches_w = height // patch, width // patch

        # Keep the host-to-device copy small: 8/16-bit indices always fit in
        # int32, so send them at that width and widen to int64 (the dtype
        # used for indexing) only once they are on the device.
        if indices.dtype.itemsize <= 2:
            idx_gpu = torch.from_numpy(indices.astype(np.int32)).to(self.device).long()
        else:
            idx_gpu = torch.from_numpy(indices.astype(np.int64)).to(self.device)

        # Gather: one codebook row per patch -> (B, P, entry_len)
        patches = self.codebook[idx_gpu]

        # Split each row into (patch_row, patch_col, channel), then interleave
        # the patch-grid axes with the within-patch axes to form full rows.
        patches = patches.reshape(
            batch_size, patches_h, patches_w, patch, patch, self.channels
        )
        images = patches.permute(0, 1, 3, 2, 4, 5).reshape(
            batch_size, height, width, self.channels
        )

        if layout == "NCHW":
            images = images.permute(0, 3, 1, 2)

        return images

    def decode_float(self, indices, width, height):
        """Decode to float32 NCHW images scaled by 1/255, ready for a model."""
        return self.decode_batch(indices, width, height, "NCHW").float() / 255.0

In [None]:
# Synthetic stand-in for the contents of a .qvq archive
width = height = 224
codebook_size = 1 << 16  # every value a 16-bit index can take
patches_per_image = (height // 2) * (width // 2)  # one index per 2x2 patch

# Random uint8 codebook, 12 values per entry (768 KB on GPU)
codebook = np.random.randint(
    low=0,
    high=256,
    size=(codebook_size, 12),
    dtype=np.uint8,
)

# Build the decoder; this moves the codebook to the GPU
decoder = GPUCodebookDecoder(codebook, device="cuda")

In [None]:
# =============================================================================
# BENCHMARK: CPU vs GPU Decode
# =============================================================================

def bench_cpu(codebook, indices, width, height, n=10):
    """Time the NumPy reference decode and return the median seconds per run.

    Each run gathers ``codebook[indices]`` and rearranges the result from a
    grid of 2x2 three-channel patches into ``(batch, height, width, 3)``
    images. The decoded images are discarded; only the timing is kept.

    Args:
        codebook: NumPy array indexed by ``indices``; each row holds 12 values.
        indices: Integer array of shape ``(batch, patches)``.
        width: Image width in pixels.
        height: Image height in pixels.
        n: Number of timed repetitions.

    Returns:
        Median wall-clock duration of one decode, in seconds.
    """
    num_images = indices.shape[0]
    grid_shape = (num_images, height // 2, width // 2, 2, 2, 3)
    image_shape = (num_images, height, width, 3)

    def timed_decode():
        # One full gather + rearrange, measured with the monotonic clock.
        started = time.perf_counter()
        gathered = codebook[indices].reshape(grid_shape)
        np.transpose(gathered, (0, 1, 3, 2, 4, 5)).reshape(image_shape)
        return time.perf_counter() - started

    return np.median([timed_decode() for _ in range(n)])

def bench_gpu(decoder, indices, width, height, n=10):
    """Time ``decoder.decode_batch`` and return the median seconds per run.

    Three untimed warmup calls run first. Each timed call is followed by a
    CUDA synchronize so that queued GPU work is included in the measurement.
    The synchronize is skipped when the decoder is not on a CUDA device, so
    the same benchmark also runs against a CPU decoder.

    Args:
        decoder: Object exposing ``decode_batch(indices, width, height)`` and,
            optionally, a ``device`` attribute (assumed CUDA when absent).
        indices: Index array passed through to ``decode_batch``.
        width: Image width in pixels.
        height: Image height in pixels.
        n: Number of timed repetitions.

    Returns:
        Median wall-clock duration of one decode, in seconds.
    """
    # torch.cuda.synchronize() raises on hosts without CUDA, so only call it
    # when the decoder really lives on a CUDA device.
    on_cuda = torch.device(getattr(decoder, "device", "cuda")).type == "cuda"

    def run_once():
        decoder.decode_batch(indices, width, height)
        if on_cuda:
            torch.cuda.synchronize()

    # Warmup
    for _ in range(3):
        run_once()

    times = []
    for _ in range(n):
        t0 = time.perf_counter()
        run_once()
        times.append(time.perf_counter() - t0)
    return np.median(times)

# Table header followed by a 55-character rule
table_header = "Batch Size |   CPU img/s |    GPU img/s | Speedup"
print(table_header)
print("-" * 55)

for batch_size in (32, 64, 128, 256, 512, 1024):
    # Fresh random uint16 indices for this batch size
    indices = np.random.randint(
        0, codebook_size, (batch_size, patches_per_image), dtype=np.uint16
    )

    # Median seconds per batch -> images per second (CPU is measured first)
    cpu_ips = batch_size / bench_cpu(codebook, indices, width, height)
    gpu_ips = batch_size / bench_gpu(decoder, indices, width, height)
    speedup = gpu_ips / cpu_ips

    print(f"{batch_size:10d} | {cpu_ips:11,.0f} | {gpu_ips:12,.0f} | {speedup:6.1f}x")

In [None]:
# =============================================================================
# PEAK THROUGHPUT TEST
# =============================================================================

print("\n" + "=" * 60)
print("PEAK THROUGHPUT TEST")
print("=" * 60)

batch_size = 512
num_batches = 100
indices = np.random.randint(0, codebook_size, (batch_size, patches_per_image), dtype=np.uint16)

# Warmup: untimed calls, then drain the GPU queue before the clock starts
for _ in range(10):
    decoder.decode_batch(indices, width, height)
torch.cuda.synchronize()

# Timed run: synchronize once at the end so all queued GPU work is counted
start = time.perf_counter()
for _ in range(num_batches):
    images = decoder.decode_batch(indices, width, height)
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

total_images = batch_size * num_batches
ips = total_images / elapsed

print(f"\nDecoded {total_images:,} images in {elapsed:.2f}s")
print(f"Throughput: {ips:,.0f} images/sec")
# Report pixel bandwidth from the configured dimensions (3 values per pixel)
# instead of a hard-coded 224x224, so it stays correct if width/height change.
print(f"\nAt {width}x{height} RGB: {ips * width * height * 3 / 1e9:.2f} GB/sec of pixels")

In [None]:
# =============================================================================
# END-TO-END: Simulated Training Loop
# =============================================================================

print("\n" + "=" * 60)
print("SIMULATED TRAINING LOOP")
print("=" * 60)

# Simple CNN for testing
import torch.nn as nn

model = nn.Sequential(
    nn.Conv2d(3, 64, 7, stride=2, padding=3),
    nn.ReLU(),
    nn.AdaptiveAvgPool2d(1),
    nn.Flatten(),
    nn.Linear(64, 10)
).cuda()

criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)

batch_size = 256
num_batches = 50

# Simulate training. The timed region also covers generating the random
# indices and labels, which stand in for reading a batch from disk.
start = time.perf_counter()
total_images = 0

for _ in range(num_batches):
    # Simulate loading indices from .qvq (this would be mmap read)
    indices = np.random.randint(0, codebook_size, (batch_size, patches_per_image), dtype=np.uint16)
    labels = torch.randint(0, 10, (batch_size,)).cuda()

    # GPU decode - images already on GPU!
    # Use the configured width/height: the index array above is sized from
    # them, so hard-coded dimensions would break the reshape if they changed.
    images = decoder.decode_float(indices, width, height)

    # Forward + backward
    outputs = model(images)
    loss = criterion(outputs, labels)

    optimizer.zero_grad()
    loss.backward()
    optimizer.step()

    total_images += batch_size

# Wait for all queued GPU work before stopping the clock
torch.cuda.synchronize()
elapsed = time.perf_counter() - start

print(f"\nTraining loop: {total_images:,} images in {elapsed:.2f}s")
print(f"Throughput: {total_images/elapsed:,.0f} images/sec (including forward/backward!)")

## 📊 Expected Results

| Metric | CPU Decode | GPU Decode |
|--------|------------|------------|
| Decode only | ~5,000 img/s | ~50,000+ img/s |
| With training | ~2,000 img/s | ~10,000+ img/s |

## 🔑 Key Benefits

1. **No CPU→GPU pixel copy**: Images are decoded directly on the GPU
2. **Tiny transfers**: Only ~25 KB of uint16 indices per 224x224 image (vs ~150 KB of decoded pixels), i.e. ~6.4 MB vs ~38 MB for a 256-image batch
3. **Parallel gather**: GPU does billions of lookups per second
4. **Memory efficient**: Codebook is only 768 KB on GPU