# Video-to-Manipulation Transformer: GPU-Only Stage 1 Training

This notebook implements the fastest GPU-only training approach for maximum performance on H200.

**Key Features:**
- Entire dataset cached in GPU memory (zero CPU-GPU transfers)
- Large batch sizes (1024+)
- BFloat16 mixed precision
- Compiled models for better performance
- No DataLoader overhead

**Requirements:**
- H200 GPU with 140GB memory
- PyTorch 2.0+ for torch.compile
- CUDA 12.0+

In [1]:
# Setup and imports
import os
import sys
import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
from torch.utils.data import DataLoader
from torch.cuda.amp import GradScaler, autocast
import numpy as np
import matplotlib.pyplot as plt
from tqdm.notebook import tqdm
import time
from datetime import datetime
import subprocess

# CRITICAL: Set multiprocessing start method for CUDA
import multiprocessing
try:
    multiprocessing.set_start_method('spawn', force=True)
except RuntimeError:
    pass  # Already set

# Set environment
os.environ['DEX_YCB_DIR'] = '/home/n231/231nProjectV2/dex-ycb-toolkit/data'

# CUDA optimizations for H200
torch.backends.cuda.matmul.allow_tf32 = True
torch.backends.cudnn.allow_tf32 = True
torch.backends.cudnn.benchmark = True
torch.backends.cudnn.deterministic = False

# Add project root to path
project_root = os.path.abspath('.')
sys.path.insert(0, project_root)

# Check GPU
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"Using device: {device}")
if torch.cuda.is_available():
    print(f"GPU: {torch.cuda.get_device_name()}")
    print(f"Memory: {torch.cuda.get_device_properties(0).total_memory / 1e9:.1f} GB")

Using device: cuda
GPU: NVIDIA H200
Memory: 150.0 GB


In [2]:
# Import our modules
from models.encoders.hand_encoder import HandPoseEncoder
from models.encoders.object_encoder import ObjectPoseEncoder
from models.encoders.contact_encoder import ContactDetectionEncoder
from data.dexycb_dataset import DexYCBDataset
from data.gpu_preprocessing import GPUVideoPreprocessor, FastDataCollator

print("✓ All modules imported successfully")

✓ All modules imported successfully


In [3]:
# H200 Optimized Configuration
config = {
    # Data settings - maximize throughput
    'batch_size': 256,  # Large batch for H200
    'num_workers': 0,   # CRITICAL: Set to 0 to avoid CUDA fork error
    'pin_memory': True,
    'persistent_workers': False,  # Not needed with 0 workers
    
    # Model settings - scale up
    'patch_size': 16,
    'image_size': [224, 224],
    'hand_hidden_dim': 1024,  # 2x larger
    'object_hidden_dim': 1024,
    'contact_hidden_dim': 512,
    'hand_layers': 12,  # 2x deeper
    'object_layers': 12,
    'contact_layers': 8,
    
    # Training settings
    'learning_rate': 5e-4,
    'num_epochs': 5,
    'grad_clip': 1.0,
    'warmup_steps': 100,
    'accumulation_steps': 2,  # Effective batch = 512
    
    # Mixed precision
    'use_amp': True,
    'amp_dtype': torch.bfloat16,
    
    # Logging
    'log_interval': 10,
    'val_interval': 50
}

print("H200 Configuration loaded")
print(f"Effective batch size: {config['batch_size'] * config['accumulation_steps']}")

H200 Configuration loaded
Effective batch size: 512


In [4]:
# Create datasets
print("Creating datasets...")

train_dataset = DexYCBDataset(split='s0_train', max_objects=10)
val_dataset = DexYCBDataset(split='s0_val', max_objects=10)

print(f"Training samples: {len(train_dataset):,}")
print(f"Validation samples: {len(val_dataset):,}")

# Create GPU-accelerated data loaders
# Use FastDataCollator to move data directly to GPU
collate_fn = FastDataCollator(device='cuda')

train_loader = DataLoader(
    train_dataset,
    batch_size=config['batch_size'],
    shuffle=True,
    num_workers=config['num_workers'],  # 0 to avoid CUDA fork
    pin_memory=config['pin_memory'],
    collate_fn=collate_fn,
    drop_last=True
)

val_loader = DataLoader(
    val_dataset,
    batch_size=config['batch_size'],
    shuffle=False,
    num_workers=0,  # Always 0 for validation
    pin_memory=True,
    collate_fn=collate_fn
)

print(f"Train batches per epoch: {len(train_loader)}")
print("✓ Data loaders created with GPU acceleration")

Creating datasets...
Training samples: 465,504
Validation samples: 23,200
Train batches per epoch: 1818
✓ Data loaders created with GPU acceleration


In [5]:
# Create scaled-up models for H200
print("Creating scaled-up models...")

patch_dim = 3 * config['patch_size'] * config['patch_size']

# Large hand encoder
hand_encoder = HandPoseEncoder(
    input_dim=patch_dim,
    hidden_dim=config['hand_hidden_dim'],
    num_layers=config['hand_layers'],
    num_heads=16,
    mlp_dim=4096,
    dropout=0.1
).to(device)

# Large object encoder
object_encoder = ObjectPoseEncoder(
    input_dim=patch_dim,
    hidden_dim=config['object_hidden_dim'],
    num_layers=config['object_layers'],
    num_heads=16,
    mlp_dim=4096,
    dropout=0.1,
    max_objects=10
).to(device)

# Large contact encoder
contact_encoder = ContactDetectionEncoder(
    input_dim=patch_dim,
    hidden_dim=config['contact_hidden_dim'],
    num_layers=config['contact_layers'],
    num_heads=16,
    mlp_dim=2048,
    dropout=0.1
).to(device)

# Compile models for better performance
if hasattr(torch, 'compile'):
    print("Compiling models...")
    hand_encoder = torch.compile(hand_encoder, mode='default')
    object_encoder = torch.compile(object_encoder, mode='default')
    contact_encoder = torch.compile(contact_encoder, mode='default')

# Model info
def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

total_params = (count_parameters(hand_encoder) + 
                count_parameters(object_encoder) + 
                count_parameters(contact_encoder))

print(f"\nModel parameters:")
print(f"  Hand encoder: {count_parameters(hand_encoder)/1e6:.1f}M")
print(f"  Object encoder: {count_parameters(object_encoder)/1e6:.1f}M")
print(f"  Contact encoder: {count_parameters(contact_encoder)/1e6:.1f}M")
print(f"  Total: {total_params/1e6:.1f}M")

print(f"\nGPU Memory: {torch.cuda.memory_allocated()/1e9:.1f} GB")

Creating scaled-up models...
Compiling models...

Model parameters:
  Hand encoder: 153.6M
  Object encoder: 153.2M
  Contact encoder: 27.6M
  Total: 334.4M

GPU Memory: 1.3 GB


In [6]:
# Create GPU preprocessor
gpu_preprocessor = GPUVideoPreprocessor(
    image_size=tuple(config['image_size']),
    patch_size=config['patch_size'],
    normalize=True,
    device='cuda'
).to(device)

# Setup optimizers and schedulers
optimizer_hand = optim.AdamW(hand_encoder.parameters(), lr=config['learning_rate'], weight_decay=0.01)
optimizer_object = optim.AdamW(object_encoder.parameters(), lr=config['learning_rate'], weight_decay=0.01)
optimizer_contact = optim.AdamW(contact_encoder.parameters(), lr=config['learning_rate'], weight_decay=0.01)

# Mixed precision scalers
scaler_hand = GradScaler(enabled=config['use_amp'])
scaler_object = GradScaler(enabled=config['use_amp'])
scaler_contact = GradScaler(enabled=config['use_amp'])

# Loss function
mse_loss = nn.MSELoss()

print("✓ Optimizers and mixed precision configured")

✓ Optimizers and mixed precision configured


  scaler_hand = GradScaler(enabled=config['use_amp'])
  scaler_object = GradScaler(enabled=config['use_amp'])
  scaler_contact = GradScaler(enabled=config['use_amp'])


In [7]:
# H200 Optimized Training Function
def train_epoch(epoch):
    """Optimized training for H200 with mixed precision"""
    hand_encoder.train()
    object_encoder.train()
    contact_encoder.train()
    
    total_loss = 0
    num_batches = 0
    data_time = 0
    compute_time = 0
    batch_start = time.time()
    
    # Zero gradients at start
    optimizer_hand.zero_grad(set_to_none=True)
    optimizer_object.zero_grad(set_to_none=True)
    optimizer_contact.zero_grad(set_to_none=True)
    
    progress_bar = tqdm(train_loader, desc=f'Epoch {epoch+1}/{config["num_epochs"]}')
    
    for batch_idx, batch in enumerate(progress_bar):
        # Data loading time
        data_time += time.time() - batch_start
        compute_start = time.time()
        
        # All data is already on GPU thanks to FastDataCollator
        # Process images through GPU preprocessor
        with torch.no_grad():
            patches = gpu_preprocessor(batch['color'])
        
        # Prepare ground truth
        hand_gt = batch['hand_joints_3d']
        if hand_gt.dim() == 4 and hand_gt.shape[1] == 1:
            hand_gt = hand_gt.squeeze(1)
        
        valid_hands = ~torch.all(hand_gt.view(hand_gt.shape[0], -1) == -1, dim=1)
        
        # === MIXED PRECISION FORWARD PASSES ===
        
        # Hand encoder
        with autocast(device_type='cuda', dtype=config['amp_dtype']):
            hand_output = hand_encoder(patches)
            
            if valid_hands.any():
                hand_loss = mse_loss(
                    hand_output['joints_3d'][valid_hands],
                    hand_gt[valid_hands]
                ) / config['accumulation_steps']
            else:
                hand_loss = torch.tensor(0.0, device=device, requires_grad=True)
        
        # Backward for hand
        scaler_hand.scale(hand_loss).backward()
        
        # Object encoder
        with autocast(device_type='cuda', dtype=config['amp_dtype']):
            object_output = object_encoder(patches, object_ids=batch.get('ycb_ids', None))
            
            # Object loss
            object_loss = torch.tensor(0.0, device=device, requires_grad=True)
            if batch['object_poses'].shape[1] > 0:
                valid_objects = ~torch.all(batch['object_poses'] == 0, dim=(2, 3))
                if valid_objects.any():
                    object_positions_gt = batch['object_poses'][:, :, :3, 3]
                    num_pred_objects = min(object_output['positions'].shape[1], batch['object_poses'].shape[1])
                    valid_mask = valid_objects[:, :num_pred_objects]
                    
                    if valid_mask.any():
                        pred_positions = object_output['positions'][:, :num_pred_objects]
                        gt_positions = object_positions_gt[:, :num_pred_objects]
                        pred_flat = pred_positions[valid_mask]
                        gt_flat = gt_positions[valid_mask]
                        object_loss = mse_loss(pred_flat, gt_flat) / config['accumulation_steps']
        
        # Backward for object
        scaler_object.scale(object_loss).backward()
        
        # Contact encoder (no ground truth)
        with autocast(device_type='cuda', dtype=config['amp_dtype']):
            contact_output = contact_encoder(
                hand_output['features'].detach(),
                object_output['features'].detach()
            )
        
        # Gradient accumulation
        if (batch_idx + 1) % config['accumulation_steps'] == 0:
            # Unscale and clip gradients
            scaler_hand.unscale_(optimizer_hand)
            scaler_object.unscale_(optimizer_object)
            
            torch.nn.utils.clip_grad_norm_(hand_encoder.parameters(), config['grad_clip'])
            torch.nn.utils.clip_grad_norm_(object_encoder.parameters(), config['grad_clip'])
            
            # Optimizer steps
            scaler_hand.step(optimizer_hand)
            scaler_object.step(optimizer_object)
            scaler_contact.step(optimizer_contact)
            
            # Update scalers
            scaler_hand.update()
            scaler_object.update()
            scaler_contact.update()
            
            # Zero gradients
            optimizer_hand.zero_grad(set_to_none=True)
            optimizer_object.zero_grad(set_to_none=True)
            optimizer_contact.zero_grad(set_to_none=True)
        
        compute_time += time.time() - compute_start
        
        # Update metrics
        total_loss += hand_loss.item() * config['accumulation_steps'] + object_loss.item() * config['accumulation_steps']
        num_batches += 1
        
        # Update progress bar
        gpu_mem = torch.cuda.memory_allocated() / 1e9
        samples_per_sec = (batch_idx + 1) * config['batch_size'] / (time.time() - progress_bar.start_t)
        
        progress_bar.set_postfix({
            'loss': f'{(hand_loss.item() + object_loss.item()) * config["accumulation_steps"]:.4f}',
            'gpu_gb': f'{gpu_mem:.1f}',
            'speed': f'{samples_per_sec:.0f}',
        })
        
        # Log periodically
        if (batch_idx + 1) % config['log_interval'] == 0:
            print(f"\n[{datetime.now().strftime('%H:%M:%S')}] "
                  f"GPU: {gpu_mem:.1f}GB | "
                  f"Speed: {samples_per_sec:.0f} samples/s")
        
        batch_start = time.time()
    
    return total_loss / max(num_batches, 1)


def validate():
    """Fast validation"""
    hand_encoder.eval()
    object_encoder.eval()
    
    total_loss = 0
    total_mpjpe = 0
    num_batches = 0
    num_valid_hands = 0
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(val_loader):
            if batch_idx >= 10:  # Quick validation
                break
                
            # GPU preprocessing
            patches = gpu_preprocessor(batch['color'])
            
            # Hand evaluation
            with autocast(device_type='cuda', dtype=config['amp_dtype']):
                hand_output = hand_encoder(patches)
                
            hand_gt = batch['hand_joints_3d']
            if hand_gt.dim() == 4 and hand_gt.shape[1] == 1:
                hand_gt = hand_gt.squeeze(1)
            
            valid_hands = ~torch.all(hand_gt.view(hand_gt.shape[0], -1) == -1, dim=1)
            
            if valid_hands.any():
                hand_loss = mse_loss(
                    hand_output['joints_3d'][valid_hands],
                    hand_gt[valid_hands]
                )
                
                # MPJPE metric
                mpjpe = (hand_output['joints_3d'][valid_hands] - hand_gt[valid_hands]).norm(dim=-1).mean()
                
                total_loss += hand_loss.item()
                total_mpjpe += mpjpe.item() * valid_hands.sum().item()
                num_valid_hands += valid_hands.sum().item()
            
            num_batches += 1
    
    return {
        'loss': total_loss / max(num_batches, 1),
        'mpjpe': total_mpjpe / max(num_valid_hands, 1) if num_valid_hands > 0 else 0
    }

In [8]:
# GPU Stats Function
def print_gpu_stats():
    """Print current GPU statistics"""
    if torch.cuda.is_available():
        allocated = torch.cuda.memory_allocated() / 1e9
        reserved = torch.cuda.memory_reserved() / 1e9
        total = torch.cuda.get_device_properties(0).total_memory / 1e9
        
        print("\n" + "="*60)
        print("GPU Statistics:")
        print(f"  Memory Allocated: {allocated:.1f} GB / {total:.1f} GB ({allocated/total*100:.1f}%)")
        print(f"  Memory Reserved: {reserved:.1f} GB")
        
        # Try to get utilization
        try:
            import subprocess
            result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu,power.draw,power.limit', 
                                   '--format=csv,noheader,nounits'], 
                                  capture_output=True, text=True)
            if result.returncode == 0:
                gpu_util, power_draw, power_limit = result.stdout.strip().split(', ')
                print(f"  GPU Utilization: {gpu_util}%")
                print(f"  Power Draw: {power_draw}W / {power_limit}W ({float(power_draw)/float(power_limit)*100:.1f}%)")
        except:
            pass
        print("="*60 + "\n")

In [9]:
# Run H200 Optimized Training
print("Starting H200 optimized training...")
print(f"Effective batch size: {config['batch_size'] * config['accumulation_steps']}")
print(f"Mixed precision: {config['amp_dtype']}")
print("-" * 60)

# Training history
history = {
    'train_loss': [],
    'val_loss': [],
    'val_mpjpe': [],
    'throughput': []
}

# Initial GPU stats
print_gpu_stats()

# Training loop
for epoch in range(config['num_epochs']):
    start_time = time.time()
    
    # Train
    train_loss = train_epoch(epoch)
    history['train_loss'].append(train_loss)
    
    # Validate
    val_metrics = validate()
    history['val_loss'].append(val_metrics['loss'])
    history['val_mpjpe'].append(val_metrics['mpjpe'])
    
    # Calculate throughput
    epoch_time = time.time() - start_time
    samples_processed = len(train_loader) * config['batch_size']
    throughput = samples_processed / epoch_time
    history['throughput'].append(throughput)
    
    print(f"\n{'='*60}")
    print(f"Epoch {epoch+1}/{config['num_epochs']} Summary:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_metrics['loss']:.4f}")
    print(f"  Val MPJPE: {val_metrics['mpjpe']*1000:.2f}mm")
    print(f"  Epoch Time: {epoch_time:.1f}s")
    print(f"  Throughput: {throughput:.1f} samples/s")
    print_gpu_stats()

print("\n✓ H200 optimized training completed!")

Starting H200 optimized training...
Effective batch size: 512
Mixed precision: torch.bfloat16
------------------------------------------------------------

GPU Statistics:
  Memory Allocated: 1.3 GB / 150.0 GB (0.9%)
  Memory Reserved: 1.4 GB
  GPU Utilization: 0%
  Power Draw: 114.15W / 700.00W (16.3%)



Epoch 1/5:   0%|          | 0/1818 [00:00<?, ?it/s]

RuntimeError: cannot pin 'torch.cuda.FloatTensor' only dense CPU tensors can be pinned

# Plot training curves
plt.figure(figsize=(15, 5))

plt.subplot(1, 3, 1)
plt.plot(history['train_loss'], label='Train Loss')
plt.plot(history['val_loss'], label='Val Loss')
plt.xlabel('Epoch')
plt.ylabel('Loss')
plt.title('Training and Validation Loss')
plt.legend()
plt.grid(True)

plt.subplot(1, 3, 2)
plt.plot([x * 1000 for x in history['val_mpjpe']], label='Val MPJPE (mm)')
plt.xlabel('Epoch')
plt.ylabel('MPJPE (mm)')
plt.title('Validation Hand Pose Error')
plt.legend()
plt.grid(True)

plt.subplot(1, 3, 3)
plt.plot(history['throughput'], label='Throughput')
plt.xlabel('Epoch')
plt.ylabel('Samples/sec')
plt.title('Training Throughput')
plt.legend()
plt.grid(True)

plt.tight_layout()
plt.show()

In [None]:
# Save checkpoints
checkpoint_dir = 'checkpoints/h200_optimized'
os.makedirs(checkpoint_dir, exist_ok=True)

# Save encoder states
torch.save({
    'model_state_dict': hand_encoder.state_dict(),
    'optimizer_state_dict': optimizer_hand.state_dict(),
    'config': config,
    'history': history
}, os.path.join(checkpoint_dir, 'hand_encoder.pth'))

torch.save({
    'model_state_dict': object_encoder.state_dict(),
    'optimizer_state_dict': optimizer_object.state_dict(),
    'config': config
}, os.path.join(checkpoint_dir, 'object_encoder.pth'))

torch.save({
    'model_state_dict': contact_encoder.state_dict(),
    'optimizer_state_dict': optimizer_contact.state_dict(),
    'config': config
}, os.path.join(checkpoint_dir, 'contact_encoder.pth'))

print(f"✓ Checkpoints saved to {checkpoint_dir}")

## GPU-Only Dataset Approach - Maximum Performance

This section implements a GPU-only dataset that caches everything in GPU memory, eliminating ALL CPU-GPU transfers during training.

In [10]:
# Import GPU-only dataset
from data.gpu_only_dataset import GPUOnlyDataset, GPUBatchGenerator

# GPU-only configuration
gpu_config = {
    'max_samples_train': 50000,  # Adjust based on GPU memory (50k samples ≈ 50GB)
    'max_samples_val': 5000,
    'batch_size': 1024,  # Large batch size
    'image_size': (224, 224),
    'cache_path': 'gpu_cache',  # Cache preprocessed data
    'dtype': torch.float32,  # Use bfloat16 to fit more samples
}

print("Creating GPU-only datasets...")
print(f"This will cache {gpu_config['max_samples_train']} training samples in GPU memory")
print("First run will be slow (preprocessing), subsequent runs will be instant")

# Clear GPU memory first
torch.cuda.empty_cache()
torch.cuda.synchronize()

print(f"\nCurrent GPU memory: {torch.cuda.memory_allocated()/1e9:.1f}GB")

Creating GPU-only datasets...
This will cache 50000 training samples in GPU memory
First run will be slow (preprocessing), subsequent runs will be instant

Current GPU memory: 3.2GB


In [11]:
# Create GPU-only datasets
# These load and preprocess everything once, then keep it in GPU memory

gpu_train_dataset = GPUOnlyDataset(
    split='s0_train',
    max_samples=gpu_config['max_samples_train'],
    image_size=gpu_config['image_size'],
    device='cuda',
    dtype=gpu_config['dtype'],
    cache_path=gpu_config['cache_path']
)

gpu_val_dataset = GPUOnlyDataset(
    split='s0_val',
    max_samples=gpu_config['max_samples_val'],
    image_size=gpu_config['image_size'],
    device='cuda',
    dtype=gpu_config['dtype'],
    cache_path=gpu_config['cache_path']
)

# Create batch generators (zero-copy from GPU memory)
gpu_train_loader = GPUBatchGenerator(gpu_train_dataset, batch_size=gpu_config['batch_size'], shuffle=True)
gpu_val_loader = GPUBatchGenerator(gpu_val_dataset, batch_size=gpu_config['batch_size']//2, shuffle=False)

print(f"\n✓ GPU-only datasets created:")
print(f"  Train: {len(gpu_train_dataset)} samples cached in GPU")
print(f"  Val: {len(gpu_val_dataset)} samples cached in GPU")
print(f"  GPU Memory Used: {torch.cuda.memory_allocated()/1e9:.1f} GB")
print(f"  Batches per epoch: {len(gpu_train_loader)}")

Building GPU dataset for s0_train...
Allocating GPU memory for 50000 samples...
Loading and preprocessing data...


RuntimeError: The expanded size of the tensor (48) must match the existing size (51) at non-singleton dimension 0.  Target sizes: [48].  Tensor sizes: [51]

In [13]:
# GPU-only training function
def gpu_only_train_epoch(epoch, model_hand, model_object, model_contact, 
                        opt_hand, opt_object, opt_contact, 
                        loader, preprocessor):
    """Training with zero CPU operations - everything stays on GPU"""
    model_hand.train()
    model_object.train()
    model_contact.train()
    
    total_loss = 0
    num_batches = 0
    epoch_start = time.time()
    
    # Progress bar
    progress_bar = tqdm(loader, desc=f'GPU-Only Epoch {epoch+1}')
    
    for batch_idx, batch in enumerate(progress_bar):
        # Everything is already on GPU - no transfers!
        
        # Create patches directly on GPU
        with torch.no_grad():
            patches = preprocessor.create_patches_batch(batch['color'])
        
        # Zero gradients
        opt_hand.zero_grad(set_to_none=True)
        opt_object.zero_grad(set_to_none=True)
        opt_contact.zero_grad(set_to_none=True)
        
        # Forward passes with mixed precision
        with autocast(device_type='cuda', dtype=torch.bfloat16):
            # Hand encoder
            hand_output = model_hand(patches)
            hand_gt = batch['hand_joints_3d']
            valid_hands = ~(hand_gt.view(hand_gt.shape[0], -1) == -1).all(dim=1)
            
            if valid_hands.any():
                hand_loss = F.mse_loss(hand_output['joints_3d'][valid_hands], hand_gt[valid_hands])
            else:
                hand_loss = torch.tensor(0.0, device='cuda')
            
            # Object encoder
            object_output = model_object(patches, object_ids=batch.get('ycb_ids'))
            object_loss = torch.tensor(0.0, device='cuda')
            
            if 'object_poses' in batch and batch['object_poses'].shape[1] > 0:
                valid_objects = ~(batch['object_poses'] == 0).all(dim=(2, 3))
                if valid_objects.any():
                    object_positions_gt = batch['object_poses'][:, :, :3, 3]
                    num_pred = min(object_output['positions'].shape[1], batch['object_poses'].shape[1])
                    valid_mask = valid_objects[:, :num_pred]
                    
                    if valid_mask.any():
                        pred_pos = object_output['positions'][:, :num_pred][valid_mask]
                        gt_pos = object_positions_gt[:, :num_pred][valid_mask]
                        object_loss = F.mse_loss(pred_pos, gt_pos)
            
            # Contact encoder
            contact_output = model_contact(
                hand_output['features'].detach(),
                object_output['features'].detach()
            )
            
            # Total loss
            total_batch_loss = hand_loss + object_loss
        
        # Backward pass
        total_batch_loss.backward()
        
        # Gradient clipping
        torch.nn.utils.clip_grad_norm_(model_hand.parameters(), 1.0)
        torch.nn.utils.clip_grad_norm_(model_object.parameters(), 1.0)
        
        # Optimizer steps
        opt_hand.step()
        opt_object.step()
        opt_contact.step()
        
        # Metrics
        total_loss += total_batch_loss.item()
        num_batches += 1
        
        # Update progress bar
        if batch_idx % 5 == 0:
            gpu_mem = torch.cuda.memory_allocated() / 1e9
            elapsed = time.time() - epoch_start
            samples_per_sec = (batch_idx + 1) * gpu_config['batch_size'] / elapsed
            
            progress_bar.set_postfix({
                'loss': f'{total_batch_loss.item():.4f}',
                'gpu': f'{gpu_mem:.1f}GB',
                'speed': f'{samples_per_sec:.0f}/s'
            })
    
    return total_loss / max(num_batches, 1)


def gpu_only_validate(model_hand, loader, preprocessor):
    """Fast validation on GPU"""
    model_hand.eval()
    
    total_loss = 0
    total_mpjpe = 0
    num_batches = 0
    num_valid = 0
    
    with torch.no_grad():
        for batch_idx, batch in enumerate(loader):
            if batch_idx >= 10:  # Quick validation
                break
            
            # GPU preprocessing
            patches = preprocessor.create_patches_batch(batch['color'])
            
            # Forward pass
            with autocast(device_type='cuda', dtype=torch.bfloat16):
                hand_output = model_hand(patches)
            
            hand_gt = batch['hand_joints_3d']
            valid_hands = ~(hand_gt.view(hand_gt.shape[0], -1) == -1).all(dim=1)
            
            if valid_hands.any():
                loss = F.mse_loss(hand_output['joints_3d'][valid_hands], hand_gt[valid_hands])
                mpjpe = (hand_output['joints_3d'][valid_hands] - hand_gt[valid_hands]).norm(dim=-1).mean()
                
                total_loss += loss.item()
                total_mpjpe += mpjpe.item() * valid_hands.sum().item()
                num_valid += valid_hands.sum().item()
            
            num_batches += 1
    
    return {
        'loss': total_loss / max(num_batches, 1),
        'mpjpe': total_mpjpe / max(num_valid, 1) if num_valid > 0 else 0
    }

In [None]:
# Run GPU-Only Training
print("Starting GPU-Only Training with cached dataset...")
print(f"Batch size: {gpu_config['batch_size']}")
print(f"Zero CPU operations - everything on GPU!")
print("-" * 60)

# Create fresh optimizers for GPU-only training
gpu_opt_hand = optim.AdamW(hand_encoder.parameters(), lr=2e-3)
gpu_opt_object = optim.AdamW(object_encoder.parameters(), lr=2e-3)
gpu_opt_contact = optim.AdamW(contact_encoder.parameters(), lr=2e-3)

# GPU-only training history
gpu_history = {
    'train_loss': [],
    'val_loss': [],
    'val_mpjpe': [],
    'throughput': []
}

# Initial GPU stats
print_gpu_stats()

# Training loop
num_epochs = 3  # Quick test
for epoch in range(num_epochs):
    epoch_start = time.time()
    
    # Train
    train_loss = gpu_only_train_epoch(
        epoch, 
        hand_encoder, object_encoder, contact_encoder,
        gpu_opt_hand, gpu_opt_object, gpu_opt_contact,
        gpu_train_loader, gpu_preprocessor
    )
    gpu_history['train_loss'].append(train_loss)
    
    # Validate
    val_metrics = gpu_only_validate(hand_encoder, gpu_val_loader, gpu_preprocessor)
    gpu_history['val_loss'].append(val_metrics['loss'])
    gpu_history['val_mpjpe'].append(val_metrics['mpjpe'])
    
    # Calculate throughput
    epoch_time = time.time() - epoch_start
    total_samples = len(gpu_train_loader) * gpu_config['batch_size']
    throughput = total_samples / epoch_time
    gpu_history['throughput'].append(throughput)
    
    print(f"\nEpoch {epoch+1} Summary:")
    print(f"  Train Loss: {train_loss:.4f}")
    print(f"  Val Loss: {val_metrics['loss']:.4f}")
    print(f"  Val MPJPE: {val_metrics['mpjpe']*1000:.2f}mm")
    print(f"  Epoch Time: {epoch_time:.1f}s")
    print(f"  Throughput: {throughput:.0f} samples/s")
    print(f"  GPU Memory: {torch.cuda.memory_allocated()/1e9:.1f}GB")
    
    # Check GPU utilization
    try:
        import subprocess
        result = subprocess.run(['nvidia-smi', '--query-gpu=utilization.gpu', 
                               '--format=csv,noheader,nounits'], 
                              capture_output=True, text=True)
        if result.returncode == 0:
            gpu_util = float(result.stdout.strip())
            print(f"  GPU Utilization: {gpu_util}% {'✓' if gpu_util > 80 else '⚠️'}")
    except:
        pass

print("\n✓ GPU-Only training completed!")
print_gpu_stats()

In [None]:
# Compare performance: Standard vs GPU-Only
print("Performance Comparison:")
print("-" * 60)

if 'history' in globals() and 'throughput' in history and len(history['throughput']) > 0:
    standard_throughput = np.mean(history['throughput'])
    print(f"Standard DataLoader: {standard_throughput:.0f} samples/s")
else:
    print("Standard DataLoader: Not run")

if 'gpu_history' in globals() and 'throughput' in gpu_history and len(gpu_history['throughput']) > 0:
    gpu_throughput = np.mean(gpu_history['throughput'])
    print(f"GPU-Only Dataset:    {gpu_throughput:.0f} samples/s")
    
    if 'standard_throughput' in locals():
        speedup = gpu_throughput / standard_throughput
        print(f"\nSpeedup: {speedup:.1f}x faster with GPU-only approach!")
else:
    print("GPU-Only Dataset: Not run")

# Plot comparison if both were run
if 'history' in globals() and 'gpu_history' in globals():
    plt.figure(figsize=(12, 4))
    
    plt.subplot(1, 2, 1)
    if len(history['train_loss']) > 0:
        plt.plot(history['train_loss'], label='Standard', marker='o')
    if len(gpu_history['train_loss']) > 0:
        plt.plot(gpu_history['train_loss'], label='GPU-Only', marker='s')
    plt.xlabel('Epoch')
    plt.ylabel('Training Loss')
    plt.title('Training Loss Comparison')
    plt.legend()
    plt.grid(True)
    
    plt.subplot(1, 2, 2)
    if len(history['throughput']) > 0:
        plt.bar(['Standard'], [np.mean(history['throughput'])], color='blue', alpha=0.6)
    if len(gpu_history['throughput']) > 0:
        plt.bar(['GPU-Only'], [np.mean(gpu_history['throughput'])], color='green', alpha=0.6)
    plt.ylabel('Samples/second')
    plt.title('Throughput Comparison')
    plt.grid(True, axis='y')
    
    plt.tight_layout()
    plt.show()

## Tips for Maximum GPU Utilization

### 1. **Adjust Dataset Cache Size**
```python
# For 140GB H200, you can cache more samples:
max_samples_train = 100000  # ~100GB
max_samples_train = 150000  # ~150GB (if using bfloat16)
```

### 2. **Use BFloat16 for More Samples**
```python
dtype = torch.bfloat16  # Half the memory, similar accuracy
```

### 3. **Increase Batch Size**
```python
batch_size = 2048  # or even 4096 if memory allows
```

### 4. **Monitor GPU Usage**
Run in another terminal:
```bash
watch -n 0.5 nvidia-smi
```

### 5. **Clear Cache Between Runs**
```python
torch.cuda.empty_cache()
torch.cuda.synchronize()
```

### Expected Results with GPU-Only Approach:
- **GPU Utilization**: 90%+ (was 20%)
- **Memory Usage**: 50-100GB (was 2.5GB)
- **Throughput**: 10,000+ samples/s (was ~500)
- **Power Draw**: 600W+ (was 172W)