In [1]:
import os, json, time, platform, subprocess, textwrap
from dataclasses import dataclass
from pathlib import Path
from typing import Any, Dict, Optional, List

import torch
# Expose only GPUs 0 and 1 to this kernel. Written without whitespace so the value
# matches the "0,1" form used by the other cells that set this same variable.
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"

In [2]:
# All run artifacts are written into a "runs" folder under the current working directory.
RUNS_DIR = Path.cwd().joinpath("runs")
RUNS_DIR.mkdir(exist_ok=True, parents=True)

In [3]:
RUNS_DIR

PosixPath('/home/lucasleow/project-titan/proj1_stackvalidation_regressionsuite/runs')

# Section 1 - Profiling Hardware

### Goal - Single JSON Snapshot to execute before experiment runs
- Environment / Stack
- GPU Topo / Interconnect
- Validation Checks

In [4]:
# Shell command runner with exception handling for json
def run_cmd(cmd: List[str], timeout: int = 30) -> Dict[str, Any]:
    """
    Execute `cmd` (an argv list, no shell) and describe the outcome as a plain dict.

    The returned dict always has the keys "cmd", "returncode", "stdout" and
    "stderr", so it can be handed to json.dumps unchanged.

    Any exception raised while running the command (for example a missing
    executable or an expired timeout) is not propagated; it is reported as
    returncode -1 with "<ExceptionType>: <message>" stored in "stderr".
    """
    try:
        completed = subprocess.run(
            cmd,
            capture_output=True,  # pipe both stdout and stderr back to us
            text=True,
            timeout=timeout,  # give up if the command hangs
            check=False,  # a non-zero exit status is recorded, not raised
        )
    except Exception as exc:
        return {
            "cmd": " ".join(cmd),
            "returncode": -1,
            "stdout": "",
            "stderr": f"{type(exc).__name__}: {exc}",
        }

    # Surrounding whitespace is trimmed before the text goes into the snapshot.
    return {
        "cmd": " ".join(cmd),
        "returncode": completed.returncode,
        "stdout": completed.stdout.strip(),
        "stderr": completed.stderr.strip(),
    }

In [5]:
# System Introspection
# Software + Hardware stack

def torch_stack_info() -> Dict[str, Any]:
    """
    Describe the PyTorch/CUDA stack and every GPU that torch can see.

    Returns a dict with the torch version, the CUDA version torch was built
    against, CUDA availability, the visible device count, the NCCL version
    (None when it cannot be queried) and a "gpus" list with one entry per device.
    """

    def describe_gpu(idx: int) -> Dict[str, Any]:
        props = torch.cuda.get_device_properties(idx)
        return {
            "index": idx,
            "name": props.name,
            "total_memory_gb": round(props.total_memory / (1024**3), 2),  # bytes -> GiB
            "sm_count": getattr(props, "multi_processor_count", None),
            "compute_capability": f"{props.major}.{props.minor}",
            # heuristic: treat compute capability major >= 8 as BF16-capable
            "bf16_supported": bool(getattr(props, "major", 0) >= 8),
        }

    info = dict(
        torch_version=torch.__version__,
        torch_cuda_build=torch.version.cuda,
        cuda_available=torch.cuda.is_available(),
        device_count=torch.cuda.device_count(),
    )

    # NCCL version is best-effort: any failure to query it is recorded as None.
    try:
        info["nccl_version"] = torch.cuda.nccl.version()
    except Exception:
        info["nccl_version"] = None

    # GPU properties are only queried when CUDA is usable.
    if torch.cuda.is_available():
        info["gpus"] = [describe_gpu(i) for i in range(torch.cuda.device_count())]
    else:
        info["gpus"] = []
    return info

torch_stack_info()

{'torch_version': '2.9.0+cu128',
 'torch_cuda_build': '12.8',
 'cuda_available': True,
 'device_count': 2,
 'nccl_version': (2, 27, 5),
 'gpus': [{'index': 0,
   'name': 'NVIDIA H200 NVL',
   'total_memory_gb': 139.8,
   'sm_count': 132,
   'compute_capability': '9.0',
   'bf16_supported': True},
  {'index': 1,
   'name': 'NVIDIA H200 NVL',
   'total_memory_gb': 139.8,
   'sm_count': 132,
   'compute_capability': '9.0',
   'bf16_supported': True}]}

### Understanding torch_stack_info:
```
- torch_version --> 2.9.0+cu128
```

In [6]:
# P2P - peer-access check between every pair of visible GPUs
def _peer_access_matrix(n: int) -> List[List[Any]]:
    """Return an n x n matrix of torch.cuda.can_device_access_peer(i, j) results.

    Diagonal entries are True. A query that raises is stored as an
    "ERR:<ExceptionType>:<message>" string instead of propagating.
    """
    matrix: List[List[Any]] = [[None for _ in range(n)] for _ in range(n)]
    for i in range(n):
        for j in range(n):
            if i == j:
                matrix[i][j] = True
                continue
            try:
                matrix[i][j] = bool(torch.cuda.can_device_access_peer(i, j))
            except Exception as e:
                matrix[i][j] = f"ERR:{type(e).__name__}:{e}"
    return matrix


def p2p_matrix() -> Dict[str, Any]:
    """
    Builds P2P access matrix using torch.
    Also attempts to enable peer access (best-effort).

    Returns:
        {"available": False, "reason": ...} when CUDA is unavailable, otherwise a
        dict with "available", "can_access_peer_before",
        "enable_peer_access_attempts" and "can_access_peer_after".

    The current CUDA device is left unchanged: each enable attempt runs inside a
    torch.cuda.device(i) context instead of calling torch.cuda.set_device(i).
    """
    if not torch.cuda.is_available():
        return {"available": False, "reason": "torch.cuda.is_available() is False"}

    n = torch.cuda.device_count()
    can = _peer_access_matrix(n)

    # NOTE(review): torch.cuda.enable_peer_access does not appear in the public
    # torch.cuda API - TODO confirm. It is looked up once; when it is missing every
    # pair is recorded as not enabled, with the same text the failed attribute
    # access used to produce, and no device switch is performed.
    enable_fn = getattr(torch.cuda, "enable_peer_access", None)
    enabled_attempts = []

    # Try enabling peer access (optional; doesn't require root)
    for i in range(n):
        for j in range(n):
            if i == j:
                continue
            if enable_fn is None:
                enabled_attempts.append({
                    "from": i, "to": j, "enabled": False,
                    "err": "AttributeError: module 'torch.cuda' has no attribute 'enable_peer_access'",
                })
                continue
            try:
                # Scoped device switch so the caller's current device is restored.
                with torch.cuda.device(i):
                    # enable_peer_access can throw if already enabled / unsupported
                    enable_fn(j)
                enabled_attempts.append({"from": i, "to": j, "enabled": True})
            except Exception as e:
                enabled_attempts.append({"from": i, "to": j, "enabled": False, "err": f"{type(e).__name__}: {e}"})

    # Re-check after enable attempts.
    # NOTE(review): can_device_access_peer looks like a capability query, so this
    # matrix may simply equal the "before" one - confirm.
    can_after = _peer_access_matrix(n)

    return {
        "available": True,
        "can_access_peer_before": can,
        "enable_peer_access_attempts": enabled_attempts,
        "can_access_peer_after": can_after,
    }

In [7]:
def nvidia_smi_bundle(nvlink_gpu_indices: Optional[List[int]] = None) -> Dict[str, Any]:
    """
    Collect the raw output of several nvidia-smi invocations via run_cmd.

    Args:
        nvlink_gpu_indices: GPU indices passed verbatim to `nvidia-smi nvlink -i`.
            Defaults to [0, 1], which reproduces the original "nvlink_gpu0" and
            "nvlink_gpu1" entries.

    Returns:
        Dict mapping a label to the run_cmd result dict for that command:
        "nvidia_smi", "topo_matrix", "query_gpu", "nvlink_summary", then one
        "nvlink_gpu<idx>" entry per requested index.
    """
    if nvlink_gpu_indices is None:
        nvlink_gpu_indices = [0, 1]

    bundle: Dict[str, Any] = {}

    bundle["nvidia_smi"] = run_cmd(["nvidia-smi"])
    bundle["topo_matrix"] = run_cmd(["nvidia-smi", "topo", "-m"])

    # Structured GPU query: health + clocks + ecc + power
    query_fields = [
        "index", "name", "uuid", "pci.bus_id",
        "temperature.gpu",
        "utilization.gpu", "utilization.memory",
        "memory.total", "memory.used",
        "clocks.sm", "clocks.mem",
        "power.draw", "power.limit",
        "ecc.mode.current",
        "clocks_throttle_reasons.active",
    ]
    bundle["query_gpu"] = run_cmd([
        "nvidia-smi",
        f"--query-gpu={','.join(query_fields)}",
        "--format=csv,noheader,nounits"
    ])

    # NVLink status (command support varies by driver)
    bundle["nvlink_summary"] = run_cmd(["nvidia-smi", "nvlink", "-s"])
    # Per-GPU NVLink detail (best-effort); run_cmd records failures instead of raising.
    for idx in nvlink_gpu_indices:
        bundle[f"nvlink_gpu{idx}"] = run_cmd(["nvidia-smi", "nvlink", "-i", str(idx)])

    return bundle

In [8]:
def derive_section1_checks(snapshot: Dict[str, Any]) -> Dict[str, Any]:
    """
    Turn a Section 1 snapshot into a pass/fail gate.

    Args:
        snapshot: dict with optional "torch", "p2p" and "nvidia" sub-dicts. Missing
            sections are treated as empty, which makes the related checks fail.

    Returns:
        {"overall_pass": bool, "checks": [...]} where every check is a dict with
        "name", "pass" and "details". overall_pass is True only if all checks pass.
    """
    import re  # local import: `re` is not part of the notebook's import cell

    checks = []
    torch_info = snapshot.get("torch", {})
    p2p = snapshot.get("p2p", {})
    topo = snapshot.get("nvidia", {}).get("topo_matrix", {}).get("stdout", "") or ""

    # Check: 2 GPUs
    gpu_count = torch_info.get("device_count", 0)
    checks.append({
        "name": "cuda_visible_2_gpus",
        "pass": (gpu_count == 2),
        "details": f"torch sees device_count={gpu_count}"
    })

    # Check: CUDA available
    checks.append({
        "name": "cuda_available",
        "pass": bool(torch_info.get("cuda_available", False)),
        "details": f"torch.cuda.is_available()={torch_info.get('cuda_available')}"
    })

    # Check: P2P both directions
    p2p_after = p2p.get("can_access_peer_after")

    def peer_entry(row: int, col: int) -> Any:
        # Safe lookup: a missing matrix or one smaller than 2x2 (single GPU)
        # yields None instead of raising IndexError while building "details".
        if not isinstance(p2p_after, list):
            return None
        try:
            return p2p_after[row][col]
        except (IndexError, TypeError):
            return None

    fwd = peer_entry(0, 1)
    rev = peer_entry(1, 0)
    # `is True` on purpose: "ERR:..." strings stored by the P2P probe must not pass.
    p2p_ok = (fwd is True) and (rev is True)

    checks.append({
        "name": "p2p_enabled_bidirectional",
        "pass": bool(p2p_ok),
        "details": f"can_access_peer_after[0][1]={fwd}, "
                   f"[1][0]={rev}"
    })

    # Check: NVLink present in topo matrix
    # topo -m shows "NV<digits>" tokens when NVLink exists. A plain substring test
    # for "NV" also matches the "NV#" line of the legend, so only whole tokens of
    # the form NV<digits> are counted.
    has_nv_token = any(re.fullmatch(r"NV\d+", tok) for tok in topo.split())
    nvlink_present = has_nv_token and ("GPU0" in topo) and ("GPU1" in topo)
    checks.append({
        "name": "topology_indicates_nvlink",
        "pass": bool(nvlink_present),
        "details": "Found 'NV' token in nvidia-smi topo -m output" if nvlink_present else "No NV token detected in topo -m"
    })

    overall_pass = all(c["pass"] for c in checks)

    return {"overall_pass": overall_pass, "checks": checks}

In [9]:
def collect_section1_snapshot() -> Dict[str, Any]:
    """
    Assemble the full Section 1 snapshot.

    Gathers a timestamp, host/interpreter details, a few environment variables,
    the torch stack description, the P2P matrix and the nvidia-smi bundle, then
    attaches the gate computed from that data under "section1_gate".
    """
    tracked_env = ("CONDA_DEFAULT_ENV", "VIRTUAL_ENV", "CUDA_VISIBLE_DEVICES")

    snap: Dict[str, Any] = {}
    snap["ts_unix"] = time.time()
    snap["ts_readable"] = time.strftime("%Y-%m-%d %H:%M:%S")
    snap["host"] = {
        "hostname": platform.node(),
        "platform": platform.platform(),
        "kernel": platform.release(),
        "python": platform.python_version(),
        "cwd": str(Path.cwd()),
        # unset variables are stored as None
        "env": {name: os.environ.get(name) for name in tracked_env},
    }
    snap["torch"] = torch_stack_info()
    snap["p2p"] = p2p_matrix()
    snap["nvidia"] = nvidia_smi_bundle()

    # The gate is derived last because it reads the sections collected above.
    snap["section1_gate"] = derive_section1_checks(snap)
    return snap

# Each execution writes into its own timestamped folder under RUNS_DIR.
run_id = time.strftime("run_%Y%m%d_%H%M%S")
out_dir = RUNS_DIR / run_id
out_dir.mkdir(exist_ok=True, parents=True)

# Collect the snapshot and persist it as pretty-printed JSON.
snapshot = collect_section1_snapshot()
snapshot_path = out_dir / "baseline_snapshot.json"
snapshot_text = json.dumps(snapshot, indent=2)
snapshot_path.write_text(snapshot_text)

# Human-readable summary of the gate.
print("Saved:", snapshot_path)
print("\n=== Section 1 Gate ===")
if snapshot["section1_gate"]["overall_pass"]:
    print("OVERALL:", "PASS")
else:
    print("OVERALL:", "FAIL")
for c in snapshot["section1_gate"]["checks"]:
    status = "PASS" if c["pass"] else "FAIL"
    print(f"- {c['name']}: {status} | {c['details']}")

Saved: /home/lucasleow/project-titan/proj1_stackvalidation_regressionsuite/runs/run_20260108_124018/baseline_snapshot.json

=== Section 1 Gate ===
OVERALL: PASS
- cuda_visible_2_gpus: PASS | torch sees device_count=2
- cuda_available: PASS | torch.cuda.is_available()=True
- p2p_enabled_bidirectional: PASS | can_access_peer_after[0][1]=True, [1][0]=True
- topology_indicates_nvlink: PASS | Found 'NV' token in nvidia-smi topo -m output


## Analysis of Record

1. Environment Mapping
    - CUDA_VISIBLE_DEVICES="2,3"
    - torch.gpus index 0 and 1 (map torch 0 to gpu 2 and torch 1 to gpu 3)
    - nvidia.query_gpu -> 0, 1, 2, 3

2. Topo
    - NV6 between all GPUs (full mesh) - 6 lanes for communication
    - NCCL all-reduce and multi-GPU training depend heavily on this

    - NVLink: Link 0 through Link 17 are all active at 26 GB/s each - 477 GB/s unidirectional, 900 GB/s bidirectional in aggregate
        - per link theoretical speed (links are up, negotiated speed is high)
        - look out for missing links / links showing 0 GBps / inconsistent speed across links
    
    - Instead of PIX (PCIe internal switch) -> Fastest possible PCIe - GPUDirect P2P (allocated memory)
    - PHB (PCIe Host Bridge) -> travel to CPU bus then to PCIe to another GPU

3. Hardware Health
    - 143771 MiB - 141GB H200 HBM3e
    - 33C - 35C Temp (current cold)
    - 70W draw, 600W limit

4. Drivers
    - PyTorch 2.9.0+cu128
    - torch CUDA 12.8 matches (torch compiled on Cuda 12.8)

#### Report
- Isolation: Jobs restricted to GPUs 2 & 3 via CUDA_VISIBLE_DEVICES=2,3
- Compatibility: Driver supports CUDA13.0; PyTorch built for CUDA12.8; CUDA avail
- Interconnect: NVLink present (NV6 topo mesh), P2P enabled bidirectionally
- Health: Temps low (33-35C), ECC enabled (Error Correction, GPU fixing corruption)
- 

# Section 2 - GPU Benchmarking Tests

1. General MatMul (GEMM) Benchmark (with warmup, CUDA event timing, median/p95) + TFLOPS (math ops / s)
    - TFLOPs low -> possible Hardware / Driver problem
    - TFLOPs high -> possible code inefficiency (data loading / shape mismatch)
2. P2P Copy Benchmark (GPU <-> GPU)
    - a driver update may disable NVLink programmatically -> slower performance
    - the benchmark will show the drop, e.g. from ~400 GB/s (NVLink) to ~30 GB/s (PCIe)
3. NCCL All-Reduce Benchmark
    - not just copying data, sharing and computation of data (ring pattern)  
    - if P2P Copy Fast but NCCL slow, possible communication algorithm issue (Ring vs Tree algo)

### Idea
- when training fails, need to troubleshoot potential issues
    1. Math (GEMM)
    2. Wire (P2P)
    3. Teamwork (NCCL)

In [10]:
import os, time, json, math, socket
from dataclasses import dataclass, asdict
from typing import Any, Dict, List, Tuple, Optional

import torch
import torch.distributed as dist
import torch.multiprocessing as mp


### Section 2 Helpers

In [11]:
def _visible_device_map_hint():
    # With CUDA_VISIBLE_DEVICES=2,3:
    # torch device 0 -> physical GPU 2
    # torch device 1 -> physical GPU 3
    return os.environ.get("CUDA_VISIBLE_DEVICES", None)

# Synchronize for CPU Timers - else CPU returns immediately
def cuda_sync_all_visible():
    """Wait on the host until every visible CUDA device has been synchronized.

    Does nothing when CUDA is unavailable.
    """
    if torch.cuda.is_available():
        for dev_idx in range(torch.cuda.device_count()):
            # blocks the CPU until this device's queued work has completed
            torch.cuda.synchronize(dev_idx)

def percentile(sorted_vals: list[float], quantile: float) -> float:
    """
    Nearest-rank percentile of an already sorted list.

    Args:
        sorted_vals: values in ascending order (the list is not sorted here).
        quantile: fraction in [0, 1]; values <= 0 return the first element and
            values >= 1 return the last element.

    Returns:
        The element at the rank nearest to (len - 1) * quantile, or NaN for an
        empty list. Exact ties (.5) always resolve to the higher rank, so the
        result does not depend on whether the lower rank happens to be even
        (which is what the built-in round() would do).
    """
    if not sorted_vals:
        return float('nan')

    # handle floats with precision e.g. 1.0000002  with >= or <=
    if quantile <= 0: # 0th percentile -> smallest value in sorted set
        return sorted_vals[0]
    if quantile >= 1: # 100th percentile -> largest value
        return sorted_vals[-1]

    # quantile strictly between 0 and 1: the position is non-negative, so adding
    # 0.5 and truncating rounds half up; the result never exceeds len - 1.
    idx = int((len(sorted_vals) - 1) * quantile + 0.5)
    return sorted_vals[idx]
    
def summarize_times_ms(times_ms: List[float]) -> Dict[str, float]:
    """
    Summarize a list of timings (milliseconds).

    Returns median, p95, p99, min and max of the values plus the sample count
    "n" (stored as a float). For an empty list min and max are NaN.
    """
    ordered = sorted(times_ms)
    if ordered:
        fastest, slowest = ordered[0], ordered[-1]
    else:
        fastest = float("nan")
        slowest = float("nan")

    summary: Dict[str, float] = {}
    summary["median_ms"] = percentile(ordered, 0.50)
    summary["p95_ms"] = percentile(ordered, 0.95)
    summary["p99_ms"] = percentile(ordered, 0.99)
    summary["min_ms"] = fastest
    summary["max_ms"] = slowest
    summary["n"] = float(len(ordered))
    return summary

def time_cuda_op_ms(
    op_fn,
    device: int,
    warmup: int,
    iters: int
) -> list[float]:
    """
    Time an operation using CUDA events on specific device idx (visible idx)
    op_fn must enqueue work on current device stream

    Args:
        op_fn: zero-argument callable; called `warmup` times untimed, then
            `iters` times between a pair of CUDA events.
        device: device index passed to torch.cuda.set_device and
            torch.cuda.synchronize.
        warmup: number of untimed calls made before measuring.
        iters: number of timed calls.

    Returns:
        One elapsed time in milliseconds per timed call, in call order
        (an empty list when iters is 0).

    Note:
        The current device is set to `device` and is not restored on return.
    """
    torch.cuda.set_device(device)
    
    # warmup gpu
    for _ in range(warmup):
        op_fn()
    
    # make sure the warmup work has finished before the first timed call
    torch.cuda.synchronize(device)
    
    # the same pair of events is re-recorded on every iteration
    start = torch.cuda.Event(enable_timing=True)
    end = torch.cuda.Event(enable_timing=True)
    
    times = []
    for _ in range(iters):
        start.record()
        op_fn()
        end.record()
        # wait for the device before reading the elapsed time between the events
        torch.cuda.synchronize(device)
        times.append(start.elapsed_time(end)) # in ms
    
    return times

### Section 2 Report Builder

In [12]:
def section2_meta() -> Dict[str, Any]:
    """
    Describe the environment a Section 2 benchmark row was produced in.

    Returns a dict with a unix timestamp, the raw CUDA_VISIBLE_DEVICES value,
    torch / CUDA-build / NCCL versions, the visible device count and one entry
    per visible GPU.

    The NCCL version is best-effort: it is None when CUDA is unavailable or when
    querying it raises, matching how torch_stack_info() treats it.
    """
    cuda_ok = torch.cuda.is_available()

    nccl_version = None
    if cuda_ok:
        try:
            nccl_version = torch.cuda.nccl.version()
        except Exception:
            # do not let a missing NCCL binding abort the whole benchmark row
            nccl_version = None

    meta = {
        "ts_unix": time.time(),
        "cuda_visible_devices": _visible_device_map_hint(),
        "torch_version": torch.__version__,
        "torch_cuda_build": torch.version.cuda,
        "nccl_version": nccl_version,
        "device_count_visible": torch.cuda.device_count(),
        "gpus": [],
    }
    if cuda_ok:
        for i in range(torch.cuda.device_count()):
            p = torch.cuda.get_device_properties(i)
            meta["gpus"].append({
                "index": i,
                "name": p.name,
                # decimal GB (bytes / 1e9); torch_stack_info() reports bytes / 1024**3
                "total_mem_gb": round(p.total_memory / 1e9, 2),
                "sm_count": p.multi_processor_count,
                "cc": f"{p.major}.{p.minor}",
            })
    return meta


def mk_row(test_name: str, params: Dict[str, Any], metrics: Dict[str, Any]) -> Dict[str, Any]:
    """Wrap one benchmark result in the Section 2 row layout.

    The row carries the section tag "2", the test name, the given params and
    metrics dicts (not copied), and a fresh section2_meta() snapshot.
    """
    row: Dict[str, Any] = dict(section="2", test_name=test_name)
    row["params"] = params
    row["metrics"] = metrics
    row["meta"] = section2_meta()
    return row

## 2.1 GEMM Benchmark (TFLOPs)
- Tera-Floating Point Operations per second
- 1 trillion floating point calc per second

In [13]:
@dataclass
class GemmCase:
    """One GEMM benchmark shape: an (M x K) matrix multiplied by a (K x N) matrix."""
    M: int  # rows of the left operand
    N: int  # columns of the right operand
    K: int  # shared inner dimension
    name: str  # label shown in the printed line and stored as params["case"]


def gemm_tflops(M: int, N: int, K: int, ms: float) -> float:
    """
    Convert a GEMM runtime into TFLOPS.

    An (M x K) @ (K x N) product is counted as 2 * M * N * K floating point
    operations (one multiply and one add per accumulated term). `ms` is the
    runtime in milliseconds; a zero or negative runtime yields infinity.
    """
    elapsed_s = ms / 1e3
    if elapsed_s <= 0:
        return float("inf")
    flop_count = 2.0 * M * N * K
    return flop_count / elapsed_s / 1e12


def run_gemm_bf16(
    cases: List[GemmCase],
    device: int = 0,
    warmup: int = 10, # allow clock speed to stabilize
    iters: int = 30
) -> List[Dict[str, Any]]:
    """
    Benchmark BF16 matrix multiplication for every case on one GPU.

    For each case two random BF16 operands are allocated on `device`, `A @ B` is
    timed with time_cuda_op_ms, and a result row (see mk_row) is built holding
    the timing summary plus TFLOPS computed from the median and the p95 time.
    One summary line per case is printed.

    Returns:
        The list of result rows, in the order of `cases`.
    """
    torch.cuda.set_device(device)
    target = f"cuda:{device}"
    results = []

    for case in cases:
        # Operands are created once per case, outside the timed callable, so
        # allocation and random fill are not part of the measurement.
        lhs = torch.randn((case.M, case.K), device=target, dtype=torch.bfloat16)
        rhs = torch.randn((case.K, case.N), device=target, dtype=torch.bfloat16)

        def matmul_once():
            # the product is returned to the caller, which discards it
            return lhs @ rhs

        stats = summarize_times_ms(
            time_cuda_op_ms(matmul_once, device=device, warmup=warmup, iters=iters)
        )
        tflops_med = gemm_tflops(case.M, case.N, case.K, stats["median_ms"])
        # derived from the p95 (slow-tail) time
        tflops_p95 = gemm_tflops(case.M, case.N, case.K, stats["p95_ms"])

        params = {
            "case": case.name,
            "M": case.M, "N": case.N, "K": case.K,
            "device": device, "warmup": warmup, "iters": iters,
        }
        metrics = {**stats, "tflops_median": tflops_med, "tflops_p95": tflops_p95}
        results.append(mk_row(test_name="gemm_bf16", params=params, metrics=metrics))

        print(f"[GEMM BF16] {case.name:16s} M={case.M} N={case.N} K={case.K} | "
              f"median {stats['median_ms']:.3f} ms | TFLOPS {tflops_med:.2f}")

    return results

In [14]:
# Define GEMM Benchmark Scenarios
# Each entry is GemmCase(M, N, K, name): an (M x K) @ (K x N) multiply.
gemm_cases = [
    GemmCase(4096, 4096, 4096, "square_4k"), # square baseline (M = N = K)
    GemmCase(8192, 8192, 8192, "square_8k"), # Larger square matrix
    GemmCase(8192, 16384, 4096, "mlp_like"),     # (tokens) x hidden, hidden x 4hidden: M=8192, K=4096, N=16384
    GemmCase(16384, 4096, 4096, "proj_like"),    # projection-ish: many rows, square weight
]

In [15]:
rows_gemm = run_gemm_bf16(gemm_cases, device=0, warmup=10, iters=30)

[GEMM BF16] square_4k        M=4096 N=4096 K=4096 | median 0.197 ms | TFLOPS 695.99
[GEMM BF16] square_8k        M=8192 N=8192 K=8192 | median 1.534 ms | TFLOPS 716.71
[GEMM BF16] mlp_like         M=8192 N=16384 K=4096 | median 1.771 ms | TFLOPS 620.69
[GEMM BF16] proj_like        M=16384 N=4096 K=4096 | median 0.895 ms | TFLOPS 614.22


## 2.2 P2P Copy Benchmark (GB/s)
- measure actual data transfer speed between 2 GPUs
- determine if system using NVLink or PCIe
- 20-60GBps (possibly PCIe)
- 150-450GBps (NVLink & bonded)

In [16]:
def run_p2p_copy(
    sizes_mb: list[int],
    warmup: int=20,
    iters: int=50,
    src_dvc: int=0,
    dst_dvc: int=1
) -> list[dict[str, Any]]:
    """
    Benchmark GPU-to-GPU tensor copies in both directions.

    Every size in `sizes_mb` is measured for src_dvc -> dst_dvc and then for
    dst_dvc -> src_dvc. Each measurement appends one row (see mk_row) and prints
    one summary line.

    Args:
        sizes_mb: payload sizes in MiB (1 MiB = 1024 * 1024 bytes).
        warmup: untimed copies before measuring.
        iters: timed copies per measurement.
        src_dvc: visible index of the first device.
        dst_dvc: visible index of the second device.

    Returns:
        List of result rows: all sizes for the first direction, then all sizes
        for the reverse direction.

    Raises:
        AssertionError: if fewer than two CUDA devices are visible.
    """
    assert torch.cuda.device_count() >= 2 # need 2 visible GPUs for P2P

    rows = []

    # GPU0 -> GPU1 then GPU1 -> GPU0
    def one_direction(src: int, dst: int):
        torch.cuda.set_device(src)

        # The whole measurement lives inside the loop so every requested size is
        # benchmarked (previously only the last size reached the timing code).
        for mb in sizes_mb:
            # Data payload
            # torch.empty skips initialisation; only the transfer is of interest
            n_bytes = mb * 1024 * 1024
            n = n_bytes // 2 # bf16 = 2 bytes
            src_t = torch.empty((n,), device=f"cuda:{src}", dtype=torch.bfloat16)
            dst_t = torch.empty((n,), device=f"cuda:{dst}", dtype=torch.bfloat16)

            # copy is launched on dst device stream
            def op():
                with torch.cuda.device(dst):
                    # copy to destination
                    dst_t.copy_(src_t, non_blocking=True)

            # Time on dst device
            times_ms = time_cuda_op_ms(op, device=dst, warmup=warmup, iters=iters)
            stats = summarize_times_ms(times_ms)

            # Bandwidth in GB/s (decimal, bytes / 1e9) using median
            sec = stats['median_ms'] / 1e3
            gbps = (n_bytes / sec) / 1e9 if sec > 0 else float('inf')
            rows.append(mk_row(
                    test_name="p2p_copy",
                    params={
                        "src": src, "dst": dst,
                        "size_mb": mb,
                        "warmup": warmup, "iters": iters
                    },
                    metrics={**stats, "gbps_median": gbps}
                ))

            print(f"[P2P COPY] {src}->{dst} size={mb:4d} MB | median {stats['median_ms']:.3f} ms | {gbps:.2f} GB/s")

            # Drop the buffers before the next size is allocated so two payload
            # sizes are never resident at the same time.
            del src_t, dst_t

    one_direction(src_dvc, dst_dvc)
    one_direction(dst_dvc, src_dvc)
    return rows
            

In [17]:
rows_p2p = run_p2p_copy(
    sizes_mb=[4096],
    warmup=20,
    iters=20,
    src_dvc=0,
    dst_dvc=1
)

[P2P COPY] 0->1 size=4096 MB | median 32.232 ms | 133.25 GB/s
[P2P COPY] 1->0 size=4096 MB | median 32.232 ms | 133.25 GB/s


In [18]:
# NOTE(review): this rebinds rows_p2p, so the rows returned by the previous
# run_p2p_copy(...) cell are no longer reachable through that name.
rows_p2p = run_p2p_copy(
    sizes_mb=[8192],
    warmup=20,
    iters=30,
    src_dvc=0,
    dst_dvc=1
)

[P2P COPY] 0->1 size=8192 MB | median 64.806 ms | 132.55 GB/s
[P2P COPY] 1->0 size=8192 MB | median 64.805 ms | 132.55 GB/s


### P2P benchmark interpretation
- Direct GPU-GPU traffic (>90GBps)
- Slight asymmetry due to scheduling, clocks, background contention

- small data = latency
- large data = sustained bandwidth

## 2.3 - NCCL All-Reduce Benchmark
- P2P was just copying of data
- NCCL involves All-Reduce algorithm (computation + copying)

#### torchrun Caveats
- running torchrun directly inside notebook cell often hangs / spawns weirdly
- run as subprocess from notebook

or execute in terminal where nccl_allreduce_benchmark.py resides
```
 CUDA_VISIBLE_DEVICES=0,1 torchrun --standalone --nproc_per_node=2 nccl_allreduce_benchmark.py
```

In [19]:
%%bash
set -euo pipefail

# Project root. Defaults to the original location; export BENCH_BASE_DIR in the
# kernel environment to run this cell from a different checkout.
BASE_DIR="${BENCH_BASE_DIR:-/home/lucasleow/project-titan/proj1_stackvalidation_regressionsuite}"
RUNS_DIR="${BASE_DIR}/runs"
BENCH_SCRIPT="nccl_allreduce_benchmark.py"

# sanity checks (fail fast with clear error)
# These run BEFORE mkdir -p: creating RUNS_DIR first would also create a missing
# BASE_DIR and make the directory check pass unconditionally.
if [[ ! -d "$BASE_DIR" ]]; then
    echo "ERROR: BASE_DIR does not exist: $BASE_DIR" >&2
    exit 1
fi
if [[ ! -f "$BASE_DIR/$BENCH_SCRIPT" ]]; then
    echo "ERROR: benchmark script not found: $BASE_DIR/$BENCH_SCRIPT" >&2
    exit 1
fi

mkdir -p "$RUNS_DIR"

TS=$(date +%Y%m%d_%H%M%S)

export CUDA_VISIBLE_DEVICES=0,1

# NCCL "X-ray"
export NCCL_DEBUG=INFO
export NCCL_DEBUG_SUBSYS=INIT,ENV,TOPO
export TORCH_DISTRIBUTED_DEBUG=DETAIL

# NCCL documents %h (hostname) and %p (PID) as the NCCL_DEBUG_FILE placeholders;
# %r is not one of them, so a "rank%r" name would not be split per process.
# NOTE(review): confirm against the NCCL version in use.
export NCCL_DEBUG_FILE="${RUNS_DIR}/nccl_${TS}_%h_pid%p.log"
export NCCL_TOPO_DUMP_FILE="${RUNS_DIR}/nccl_topo_${TS}.xml"

cd "$BASE_DIR"
# pipefail (set above) makes a torchrun failure fail the cell even though the
# output is piped through tee.
torchrun --standalone --nproc_per_node=2 "$BENCH_SCRIPT" 2>&1 | tee "${RUNS_DIR}/nccl_stdout_${TS}.log"


W0108 12:40:31.737000 3270521 torch/distributed/run.py:803] 
W0108 12:40:31.737000 3270521 torch/distributed/run.py:803] *****************************************
W0108 12:40:31.737000 3270521 torch/distributed/run.py:803] Setting OMP_NUM_THREADS environment variable for each process to be 1 in default, to avoid your system being overloaded, please further tune the variable for optimal performance in your application as needed. 
W0108 12:40:31.737000 3270521 torch/distributed/run.py:803] *****************************************
[Gloo] Rank [Gloo] Rank 1 is connected to 10 peer ranks. Expected number of connected peer ranks is :  is connected to 11
 peer ranks. Expected number of connected peer ranks is : 1
[NCCL AR]    1 MB | med   0.240 ms | p95   0.301 ms | algBW    4.37 GB/s | busBW    4.37 GB/s
[NCCL AR]    4 MB | med   0.255 ms | p95   0.288 ms | algBW   16.44 GB/s | busBW   16.44 GB/s
[NCCL AR]   16 MB | med   0.381 ms | p95   0.420 ms | algBW   43.99 GB/s | busBW   43.99 GB/s
[

### NCCL All-Reduce Benchmark
- 64MB -> 100.11 GBps (all-reduce moving at that bandwidth) (matches P2P speed)
- 256MB -> 54GBps (50% drop) - NCCL switches protocol based on size
    - small size -> LL (Low Latency)
    - large size -> Ring or Tree 
    - suspected Bulk Throughput 

# Section 3 - Application Sanity Check (vLLM)

### Objectives:
- GPU can start and idle without problem
1. vLLM relies on complex CUDA kernels (PagedAttention)
    - PyTorch work != vLLM work because it might be compiled for different CUDA version

2. Memory Check
    - Loading weights into VRAM + allocating KV Cache requires more memory than raw tensors

3. Tokenizer Check
    - If CPU-based tokenizer is incompatible with `tokenizer.json`, model will hang or output garbage

In [3]:
import time
import numpy as np
from vllm import LLM, SamplingParams
import os

In [None]:
# --- CONFIG ---
# Module-level settings read by run_benchmark().
os.environ["CUDA_VISIBLE_DEVICES"] = "0,1"
MODEL_ID = "Qwen/Qwen2.5-1.5B-Instruct"
TP_SIZE = 2 # passed to LLM(tensor_parallel_size=...)
# NOTE(review): PROMPT_LEN is not referenced by run_benchmark(); the prompt there is a fixed string.
PROMPT_LEN = 100 # Approximate token count for input
OUTPUT_LEN = 100 # Tokens to generate
CONCURRENCIES = [1, 16, 32] # Stress Levels

In [None]:
def run_benchmark():
    """
    Offline throughput sweep for the configured vLLM model.

    Loads MODEL_ID with tensor parallelism TP_SIZE, then for every batch size in
    CONCURRENCIES generates OUTPUT_LEN tokens (greedy decoding) for that many
    copies of a fixed prompt and prints wall-clock time and generated tokens per
    second. A summary table is printed at the end. Returns None.

    The first batch size is measured without a separate warm-up request.
    """

    # 1. Init Engine
    print(f"Loading {MODEL_ID} with TP={TP_SIZE}")
    llm = LLM(
        model=MODEL_ID,
        tensor_parallel_size=TP_SIZE, # split model across <TP_SIZE> worker processes
        # NOTE(review): trust_remote_code allows model-repo code to run locally, and
        # the captured log reports the flag "has no effect here" - consider dropping it.
        trust_remote_code=True,
        gpu_memory_utilization=0.9 # (0.9 * Total) - Weight memory = Allocation for KV Cache
    )

    # 2. Define Workload
    dummy_prompt = "Explain the detailed history of the Roman Empire and its fall, including economic factors, military strategies, and political corruption. " * 3
    sampling_params = SamplingParams(
        max_tokens=OUTPUT_LEN, # hard limit for num tokens allowed to be generated
        temperature=0. # deterministic measurement 0 - always pick highest probability
    )

    results = {}

    # 3. Batch Sweep loop
    for batch_size in CONCURRENCIES:
        print(f"\nRunning Batch Size: {batch_size}")
        prompts = [dummy_prompt] * batch_size

        # No torch.cuda.synchronize() here: the captured log shows the GPU work
        # running in separate EngineCore/Worker processes, which a synchronize in
        # this process cannot wait on, and generate() only returns once the
        # outputs exist.
        # perf_counter is a monotonic high-resolution clock, better suited than
        # time.time() for sub-second intervals.
        start_time = time.perf_counter() # begin generation
        outputs = llm.generate(prompts, sampling_params) # generation
        total_time_taken_for_generation = time.perf_counter() - start_time # end generation

        # 4. Metric Calc
        # Offline mode - Processing work - measure throughput
        total_tokens_generated = sum(len(o.outputs[0].token_ids) for o in outputs)
        if total_time_taken_for_generation > 0:
            tokens_per_sec = total_tokens_generated / total_time_taken_for_generation
        else:
            # guard against a zero interval instead of raising ZeroDivisionError
            tokens_per_sec = float("inf")

        print(f"Done in {total_time_taken_for_generation:.2f}s")
        print(f"Throughput: {tokens_per_sec:.2f} tokens / sec")

        results[batch_size] = tokens_per_sec

    # 5. Report
    print("\n--- Baseline Report Card ---")
    print("| Concurrency | Throughput (tok/s) |")
    print("|-------------|--------------------|")
    for bs, tps in results.items():
        print(f"| {bs:<11} | {tps:<18.2f} |")

In [6]:
run_benchmark()

Loading Qwen/Qwen2.5-1.5B-Instruct with TP=2
INFO 01-08 12:41:44 [utils.py:253] non-default args: {'trust_remote_code': True, 'tensor_parallel_size': 2, 'disable_log_stats': True, 'model': 'Qwen/Qwen2.5-1.5B-Instruct'}


The argument `trust_remote_code` is to be used with Auto classes. It has no effect here and is ignored.


INFO 01-08 12:41:46 [model.py:514] Resolved architecture: Qwen2ForCausalLM
INFO 01-08 12:41:46 [model.py:1661] Using max model len 32768
INFO 01-08 12:41:48 [scheduler.py:230] Chunked prefill is enabled with max_num_batched_tokens=16384.
[0;36m(EngineCore_DP0 pid=3273329)[0;0m INFO 01-08 12:41:50 [core.py:93] Initializing a V1 LLM engine (v0.13.0) with config: model='Qwen/Qwen2.5-1.5B-Instruct', speculative_config=None, tokenizer='Qwen/Qwen2.5-1.5B-Instruct', skip_tokenizer_init=False, tokenizer_mode=auto, revision=None, tokenizer_revision=None, trust_remote_code=True, dtype=torch.bfloat16, max_seq_len=32768, download_dir=None, load_format=auto, tensor_parallel_size=2, pipeline_parallel_size=1, data_parallel_size=1, disable_custom_all_reduce=False, quantization=None, enforce_eager=False, kv_cache_dtype=auto, device_config=cuda, structured_outputs_config=StructuredOutputsConfig(backend='auto', disable_fallback=False, disable_any_whitespace=False, disable_additional_properties=False, r

[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:41:58 [cuda.py:351] Using FLASH_ATTN attention backend out of potential backends: ('FLASH_ATTN', 'FLASHINFER', 'TRITON_ATTN', 'FLEX_ATTENTION')


[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP1 pid=3273345)[0;0m We recommend installing via `pip install torch-c-dlpack-ext`


model.safetensors:   0%|          | 0.00/3.09G [00:00<?, ?B/s]

[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:45:48 [weight_utils.py:487] Time spent downloading weights for Qwen/Qwen2.5-1.5B-Instruct: 229.183872 seconds
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:45:49 [weight_utils.py:527] No model.safetensors.index.json found in remote.


Loading safetensors checkpoint shards:   0% Completed | 0/1 [00:00<?, ?it/s]


[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:45:49 [default_loader.py:308] Loading weights took 0.19 seconds
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP1 pid=3273345)[0;0m INFO 01-08 12:45:49 [weight_utils.py:527] No model.safetensors.index.json found in remote.
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:45:50 [gpu_model_runner.py:3659] Model loading took 1.4490 GiB memory and 231.530852 seconds
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:45:54 [backends.py:643] Using cache directory: /home/lucasleow/.cache/vllm/torch_compile_cache/231df38584/rank_0_0/backbone for vLLM's torch.compile
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:45:54 [backends.py:703] Dynamo bytecode transform time: 3.13 s
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO

Capturing CUDA graphs (mixed prefill-decode, PIECEWISE): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:01<00:00, 29.62it/s]
Capturing CUDA graphs (decode, FULL):  96%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–Œ| 49/51 [00:00<00:00, 52.31it/s]

[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP1 pid=3273345)[0;0m INFO 01-08 12:46:06 [custom_all_reduce.py:216] Registering 5814 cuda graph addresses


Capturing CUDA graphs (decode, FULL): 100%|â–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆâ–ˆ| 51/51 [00:00<00:00, 57.01it/s]


[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:46:06 [custom_all_reduce.py:216] Registering 5814 cuda graph addresses
[0;36m(EngineCore_DP0 pid=3273329)[0;0m [0;36m(Worker_TP0 pid=3273343)[0;0m INFO 01-08 12:46:07 [gpu_model_runner.py:4587] Graph capturing finished in 4 secs, took 0.41 GiB
[0;36m(EngineCore_DP0 pid=3273329)[0;0m INFO 01-08 12:46:07 [core.py:259] init engine (profile, create kv cache, warmup model) took 16.49 seconds
INFO 01-08 12:46:09 [llm.py:360] Supported tasks: ['generate']

Running Batch Size: 1


Adding requests:   0%|          | 0/1 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/1 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Done in 0.28s
Throughput: 355.70 tokens / sec

Running Batch Size: 16


Adding requests:   0%|          | 0/16 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/16 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Done in 0.28s
Throughput: 5715.68 tokens / sec

Running Batch Size: 32


Adding requests:   0%|          | 0/32 [00:00<?, ?it/s]

Processed prompts:   0%|          | 0/32 [00:00<?, ?it/s, est. speed input: 0.00 toks/s, output: 0.00 toks/s]

Done in 0.30s
Throughput: 10539.59 tokens / sec

--- 📊 Baseline Report Card ---
| Concurrency | Throughput (tok/s) |
|-------------|--------------------|
| 1           | 355.70             |
| 16          | 5715.68            |
| 32          | 10539.59           |


# Section 7 - KV Cache Stress Test
- How much conversation history before GPU Crashes?
- KV Cache grows linearly with context