In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Titan V peak rates: these define the two ceilings of the roofline.
peak_compute_fp16 = 120e12  # FLOP/s  (120 TFLOP/s)
peak_bandwidth = 652.8e9    # bytes/s (652.8 GB/s)

# Measured Conv2D kernel variants (FP16 assumed throughout for simplicity).
#   flops      - floating-point operations performed by the kernel
#   runtime_ms - kernel runtime in milliseconds
#   dram_bw    - achieved DRAM bandwidth in GB/s
kernels = [
    dict(label="Conv2D Baseline", flops=5.81e10, runtime_ms=224.382, dram_bw=143.41),
    dict(label="Conv2D NHWC FP16", flops=1.16e11, runtime_ms=720.251, dram_bw=75.085),
    dict(label="Conv2D NHWC FP16 + Shared", flops=1.16e11, runtime_ms=95.541, dram_bw=26.564),
]

# Roofline coordinates for every kernel:
#   operational intensity = FLOPs / bytes moved, with
#       bytes moved = bandwidth [GB/s] * runtime [s] * 1e9
#   achieved performance  = FLOPs / runtime [s]
labels = [entry["label"] for entry in kernels]
perf = [entry["flops"] / (entry["runtime_ms"] / 1000) for entry in kernels]
oi = [
    entry["flops"] / (entry["dram_bw"] * (entry["runtime_ms"] / 1000) * 1e9)
    for entry in kernels
]

# Plot Roofline
# The ridge point is the operational intensity where the memory roof meets the
# compute roof. The x-axis must extend past it, otherwise the knee of the
# roofline is never drawn (for the peaks defined above it sits near
# 184 FLOP/Byte, i.e. beyond a fixed upper bound of 10**2).
ridge_oi = peak_compute_fp16 / peak_bandwidth
oi_max_exp = max(2.0, np.floor(np.log10(ridge_oi)) + 1)  # next decade above the ridge
oi_range = np.logspace(-1, oi_max_exp, 100)
roofline_mem = peak_bandwidth * oi_range
roofline_compute = np.full_like(oi_range, peak_compute_fp16)

plt.figure(figsize=(10, 7))
# Attainable performance is the lower of the two ceilings at each intensity.
plt.loglog(oi_range, np.minimum(roofline_mem, roofline_compute), label='Roofline', linewidth=3, color='gray')
plt.axhline(y=peak_compute_fp16, color='red', linestyle='--', label='Compute Bound (120 TFLOP/s)')
plt.plot(oi_range, roofline_mem, linestyle='--', color='blue', label='Memory Bound (652.8 GB/s)')

# Plot kernel points (one marker per measured kernel)
colors = ['orange', 'darkorange', 'red']
for kernel_oi, kernel_perf, kernel_label, kernel_color in zip(oi, perf, labels, colors):
    plt.scatter(kernel_oi, kernel_perf, label=kernel_label, s=100, color=kernel_color, marker='x')

# Labels and formatting
plt.xlabel('Operational Intensity (FLOP/Byte)')
plt.ylabel('Performance (FLOP/s)')
plt.title('Roofline Model for Titan V (FP16)')
plt.legend()
plt.grid(True)  # the original line lacked the closing parenthesis (SyntaxError)
plt.show()


In [None]:
# Measured classifier kernel variants, one record per variant:
# FLOPs performed, runtime in milliseconds, and a DRAM bandwidth figure.
kernels = [
    dict(label="Classifier Baseline", flops=1.34217728e7, runtime_ms=4.237, dram_bw=18.917),
    dict(label="Classifier NHWC FP16", flops=2.68435456e7, runtime_ms=4.489, dram_bw=9.5165),
    dict(label="Classifier NHWC FP16 + Shared", flops=2.68435456e7, runtime_ms=4.446, dram_bw=9.5342),
]
