In [None]:

import subprocess, sys, time, math

# Detection defaults, overwritten further down once a usable GPU library is found:
#   backend  - name of the library that will drive the GPU, or None
#   gpu_name - human-readable device name (the literal string "None" until detected)
#   has_cuda - whether a CUDA device was reported as usable
backend, gpu_name, has_cuda = None, "None", False

def run_cmd(x):
    """Run shell command ``x`` and echo its stdout, best-effort.

    The command is executed through the shell with a 5 second timeout.
    Its whitespace-trimmed stdout is printed only when the command exits
    with status 0 and produced non-blank output. Any exception (timeout,
    launch failure, ...) is swallowed, so this never raises.

    Returns:
        None in every case.
    """
    try:
        completed = subprocess.run(
            x,
            shell=True,
            capture_output=True,
            text=True,
            timeout=5,
        )
        if completed.returncode != 0:
            return
        trimmed = completed.stdout.strip()
        if not trimmed:
            return
        print(trimmed)
    except Exception:
        return

# --- Backend detection -------------------------------------------------------
# First choice: PyTorch. The whole probe is best-effort: any exception (torch
# not installed, broken install, driver error, ...) leaves `backend` as None so
# the CuPy probe below gets a chance.
try:
    import torch
    try:
        # Optional tuning step; ignored if this torch build does not support it.
        import torch.backends.cuda
        torch.set_float32_matmul_precision("high")
    except Exception:
        pass
    has_cuda = torch.cuda.is_available()
    if has_cuda:
        # Only claim the torch backend once the device name was read successfully.
        gpu_name = torch.cuda.get_device_name(0)
        backend = "torch"
except Exception:
    pass

# Second choice: CuPy, tried only when PyTorch did not yield a CUDA device.
if backend is None:
    try:
        import cupy as cp
        # The device properties mapping stores the name in a form that needs
        # .decode() to become a str.
        gpu_name = cp.cuda.runtime.getDeviceProperties(0)["name"].decode()
        backend = "cupy"
        has_cuda = True
    except Exception:
        pass

# Summary: `backend` prints as None and `gpu_name` as "None" when nothing was found.
print(f"Backend: {backend}")
print(f"GPU: {gpu_name}")
# Also list the GPUs via nvidia-smi; run_cmd prints nothing if the command fails.
run_cmd("nvidia-smi -L")


In [None]:

import time, math, sys

def _fmt(x):
    if x >= 1e3: return f"{x/1e3:.2f}K"
    return f"{x:.0f}"

def torch_bench(cpu_size=2048, gpu_size_max=8192, fft_size_max=4096):
    """Benchmark GEMM, element-wise math and 2-D FFT with PyTorch.

    The CPU is always benchmarked; the CUDA device is benchmarked as well when
    ``torch.cuda.is_available()``. Timings (mean seconds per call), GEMM
    throughput and CPU-vs-GPU speedups are printed; nothing is returned.

    Args:
        cpu_size: Upper bound on the square matrix side used for the CPU runs
            (and the side used for everything when CUDA is unavailable).
        gpu_size_max: Upper bound on the square matrix side used on the GPU.
            The actual side is derived from free GPU memory, clamped to
            ``[min(2048, gpu_size_max), gpu_size_max]`` and rounded down to a
            multiple of 128.
        fft_size_max: Upper bound on the side of the FFT input.

    Raises:
        ValueError: If any size argument is not an integer >= 2.

    Notes:
        * The GPU usually runs a larger problem than the CPU, so speedups are
          reported as throughput ratios (work per second), not as raw time
          ratios. When both sizes are equal the two definitions coincide.
        * Every kernel gets one untimed warm-up call so one-time start-up costs
          are not attributed to the measured loop.
    """
    for label, value in (
        ("cpu_size", cpu_size),
        ("gpu_size_max", gpu_size_max),
        ("fft_size_max", fft_size_max),
    ):
        if not isinstance(value, int) or value < 2:
            raise ValueError(f"{label} must be an integer >= 2, got {value!r}")

    import math

    import torch

    cuda_ok = torch.cuda.is_available()
    if cuda_ok:
        free_mem, total_mem = torch.cuda.mem_get_info()
        budget = int(min(free_mem, total_mem*0.5))
        # GEMM holds three float32 N x N matrices (a, b, result): 12 bytes per element.
        N = int((budget/12)**0.5)
        N = max(min(2048, gpu_size_max), min(N, gpu_size_max))
        if N >= 128:
            N = (N//128)*128
        M = min(fft_size_max, N)
    else:
        N = cpu_size
        M = min(fft_size_max, cpu_size)

    def _timed(op, device, repeats):
        """Mean seconds per call of ``op`` after one untimed warm-up call."""
        op()
        if device.type == "cuda": torch.cuda.synchronize()
        t0 = time.perf_counter()
        for _ in range(repeats):
            op()
        # CUDA launches are asynchronous: wait for completion before stopping the clock.
        if device.type == "cuda": torch.cuda.synchronize()
        # Clamp so a sub-resolution measurement can never cause a division by zero.
        return max((time.perf_counter()-t0)/repeats, 1e-12)

    def bench_mm(device, N, repeats=5, dtype=torch.float32):
        """Time an N x N matrix product; returns (seconds per call, GFLOP/s)."""
        a = torch.randn((N,N), device=device, dtype=dtype)
        b = torch.randn((N,N), device=device, dtype=dtype)
        per = _timed(lambda: a @ b, device, repeats)
        gflops = (2*N*N*N)/1e9/per
        return per, gflops

    def bench_elem(device, N, repeats=5):
        """Time sin + exp + tanh over an N x N tensor; returns seconds per call."""
        x = torch.randn((N,N), device=device)
        # Always evaluate on the same finite input: feeding the result back into
        # exp() overflows float32 to inf/NaN after a couple of iterations.
        return _timed(lambda: torch.sin(x) + torch.exp(x) + torch.tanh(x), device, repeats)

    def bench_fft2(device, M, repeats=3):
        """Time a 2-D FFT of an M x M complex64 tensor; returns seconds per call."""
        x = torch.randn((M,M), device=device, dtype=torch.complex64)
        return _timed(lambda: torch.fft.fft2(x), device, repeats)

    def _fft_work(m):
        """Relative cost model for an m x m 2-D FFT (n log2 n with n = m*m)."""
        return m*m*math.log2(m*m)

    dev_gpu = torch.device("cuda" if cuda_ok else "cpu")
    dev_cpu = torch.device("cpu")

    N_cpu = min(N, cpu_size)
    M_cpu = min(M, cpu_size)

    res = {}
    per, gflops = bench_mm(dev_cpu, N_cpu, repeats=3)
    res["CPU GEMM s"] = per
    res["CPU GEMM GFLOP/s"] = gflops
    res["CPU Elem s"] = bench_elem(dev_cpu, N_cpu, repeats=2)
    res["CPU FFT2 s"] = bench_fft2(dev_cpu, M_cpu, repeats=2)

    if cuda_ok:
        per, gflops = bench_mm(dev_gpu, N, repeats=5)
        res["GPU GEMM s"] = per
        res["GPU GEMM GFLOP/s"] = gflops
        res["GPU Elem s"] = bench_elem(dev_gpu, N, repeats=3)
        res["GPU FFT2 s"] = bench_fft2(dev_gpu, M, repeats=3)
        print(f"N={N}, M={M}")
    else:
        print("CUDA not available")

    keys = ["CPU GEMM s","GPU GEMM s","CPU GEMM GFLOP/s","GPU GEMM GFLOP/s","CPU Elem s","GPU Elem s","CPU FFT2 s","GPU FFT2 s"]
    for k in keys:
        if k in res:
            print(f"{k}: {res[k]:.4f}")
    if "GPU GEMM s" in res:
        # CPU and GPU may have run different sizes, so compare work per second.
        gemm_speedup = res["GPU GEMM GFLOP/s"]/res["CPU GEMM GFLOP/s"]
        elem_speedup = (res["CPU Elem s"]/(N_cpu*N_cpu))/(res["GPU Elem s"]/(N*N))
        fft_speedup = (res["CPU FFT2 s"]/_fft_work(M_cpu))/(res["GPU FFT2 s"]/_fft_work(M))
        print(f"CPU sizes: N={N_cpu}, M={M_cpu} (speedups are normalized by problem size)")
        print(f"GEMM speedup: {gemm_speedup:.1f}x")
        print(f"Elem speedup: {elem_speedup:.1f}x")
        print(f"FFT2 speedup: {fft_speedup:.1f}x")

def cupy_bench():
    """Benchmark GEMM, element-wise math and 2-D FFT on the GPU with CuPy.

    The matrix side ``N`` is derived from free device memory (at most half of
    total memory, three float32 matrices), clamped to [2048, 8192] and rounded
    down to a multiple of 128; the FFT side is ``M = min(4096, N)``. Mean
    seconds per call and GEMM throughput are printed; nothing is returned.

    Every kernel gets one untimed warm-up call so that first-call overhead is
    not included in the measurement.

    Raises:
        Whatever ``import cupy`` or the CUDA runtime raises when CuPy or a
        usable device is unavailable; the caller is expected to handle it.
    """
    import cupy as cp
    free_mem, total_mem = cp.cuda.runtime.memGetInfo()
    budget = int(min(free_mem, total_mem*0.5))
    # GEMM holds three float32 N x N matrices (a, b, result): 12 bytes per element.
    N = int((budget/12)**0.5)
    N = max(2048, min(N, 8192))
    N = (N//128)*128
    M = min(4096, N)

    def _timed(op, repeats):
        """Mean seconds per call of ``op`` after one untimed warm-up call."""
        op()
        cp.cuda.Stream.null.synchronize()
        t0 = time.perf_counter()
        for _ in range(repeats):
            op()
        # Kernel launches are asynchronous: wait for completion before stopping the clock.
        cp.cuda.Stream.null.synchronize()
        return (time.perf_counter()-t0)/repeats

    def bench_mm(N, repeats=5):
        """Time an N x N float32 matrix product; returns (seconds per call, GFLOP/s)."""
        a = cp.random.randn(N,N, dtype=cp.float32)
        b = cp.random.randn(N,N, dtype=cp.float32)
        per = _timed(lambda: a.dot(b), repeats)
        gflops = (2*N*N*N)/1e9/per
        return per, gflops

    def bench_elem(N, repeats=5):
        """Time sin + exp + tanh over an N x N float32 array; returns seconds per call."""
        # Generate float32 directly (as bench_mm does) instead of via a float64 temporary.
        x = cp.random.randn(N,N, dtype=cp.float32)
        # Always evaluate on the same finite input: feeding the result back into
        # exp() overflows float32 to inf/NaN after a couple of iterations.
        return _timed(lambda: cp.sin(x) + cp.exp(x) + cp.tanh(x), repeats)

    def bench_fft2(M, repeats=3):
        """Time a 2-D FFT of an M x M complex64 array; returns seconds per call."""
        real = cp.random.randn(M,M, dtype=cp.float32)
        imag = cp.random.randn(M,M, dtype=cp.float32)
        # astype pins the dtype to complex64 regardless of scalar promotion rules.
        x = (real + 1j*imag).astype(cp.complex64)
        return _timed(lambda: cp.fft.fft2(x), repeats)

    gpu_mm, gpu_gflops = bench_mm(N, repeats=5)
    gpu_elem = bench_elem(N, repeats=3)
    gpu_fft = bench_fft2(M, repeats=3)
    print(f"N={N}, M={M}")
    print(f"GPU GEMM s: {gpu_mm:.4f}")
    print(f"GPU GEMM GFLOP/s: {gpu_gflops:.2f}")
    print(f"GPU Elem s: {gpu_elem:.4f}")
    print(f"GPU FFT2 s: {gpu_fft:.4f}")

# Driver: prefer the PyTorch benchmark and fall back to CuPy if PyTorch is
# missing or its benchmark fails.
try:
    import torch
    torch_bench()
except Exception as torch_err:
    # A missing package is the expected reason for falling back; anything else
    # (out of memory, driver error, ...) is a real failure worth showing rather
    # than hiding behind the install hint.
    if not isinstance(torch_err, ImportError):
        print(f"PyTorch benchmark failed: {type(torch_err).__name__}: {torch_err}")
    try:
        import cupy as cp
        cupy_bench()
    except Exception as cupy_err:
        if not isinstance(cupy_err, ImportError):
            print(f"CuPy benchmark failed: {type(cupy_err).__name__}: {cupy_err}")
        print("Install PyTorch (CUDA build) or CuPy to run the GPU benchmark.")
