# Model Integration: Convolution Kernels



In [None]:
pwd

In [None]:
pip install torch

# "Vanilla" Convolution and PTX Convolution

In [None]:
import torch

class VanillaConv2D(torch.nn.Module):
    """Run ``custom_conv.vanilla_convolve`` over every sample of a batch.

    The mask is converted once, at construction time, to a contiguous
    float32 CUDA tensor. ``forward`` then hands each sample, the mask and a
    pre-allocated output slice to the extension function.
    """

    def __init__(self, mask: torch.Tensor) -> None:
        """Store ``mask`` as a contiguous float32 CUDA tensor.

        Args:
            mask: Convolution mask. It is made contiguous, cast to float32
                and moved to CUDA.
        """
        super().__init__()
        # A non-persistent buffer follows ``.to()`` / ``.cuda()`` calls made
        # on the module, while ``state_dict()`` keeps the same keys as before.
        self.register_buffer(
            "mask", mask.contiguous().float().cuda(), persistent=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve each sample of ``x`` with the stored mask.

        Args:
            x: Input batch with exactly four dimensions, read as
                ``(B, C, H, W)``.

        Returns:
            A tensor created with ``torch.zeros_like(x)`` whose slices along
            the first dimension were passed to the extension as the output
            argument.

        Raises:
            ValueError: If ``x`` is not 4-dimensional.
            ImportError: If the ``custom_conv`` extension is not installed.
        """
        if x.dim() != 4:
            raise ValueError(
                f"expected a 4-D (B, C, H, W) input, got {x.dim()}-D"
            )

        # Imported lazily: the extension is built by a separate setup step,
        # so it may not exist yet when this class is defined.
        import custom_conv

        out = torch.zeros_like(x)
        for b in range(x.shape[0]):
            # The return value is ignored, so the extension is presumably
            # writing its result into the third argument in place.
            custom_conv.vanilla_convolve(x[b], self.mask, out[b])
        return out


class PTXConv2D(torch.nn.Module):
    """Run ``custom_conv.ptx_convolve`` over every sample of a batch.

    The mask is converted once, at construction time, to a contiguous
    float32 CUDA tensor. ``forward`` then hands each sample, the mask and a
    pre-allocated output slice to the extension function.
    """

    def __init__(self, mask: torch.Tensor) -> None:
        """Store ``mask`` as a contiguous float32 CUDA tensor.

        Args:
            mask: Convolution mask. It is made contiguous, cast to float32
                and moved to CUDA.
        """
        super().__init__()
        # A non-persistent buffer follows ``.to()`` / ``.cuda()`` calls made
        # on the module, while ``state_dict()`` keeps the same keys as before.
        self.register_buffer(
            "mask", mask.contiguous().float().cuda(), persistent=False
        )

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        """Convolve each sample of ``x`` with the stored mask.

        Args:
            x: Input batch with exactly four dimensions, read as
                ``(B, C, H, W)``.

        Returns:
            A tensor created with ``torch.zeros_like(x)`` whose slices along
            the first dimension were passed to the extension as the output
            argument.

        Raises:
            ValueError: If ``x`` is not 4-dimensional.
            ImportError: If the ``custom_conv`` extension is not installed.
        """
        if x.dim() != 4:
            raise ValueError(
                f"expected a 4-D (B, C, H, W) input, got {x.dim()}-D"
            )

        # Imported lazily: the extension is built by a separate setup step,
        # so it may not exist yet when this class is defined.
        import custom_conv

        out = torch.zeros_like(x)
        for b in range(x.shape[0]):
            # The return value is ignored, so the extension is presumably
            # writing its result into the third argument in place.
            custom_conv.ptx_convolve(x[b], self.mask, out[b])
        return out

# Setup

In [None]:
from setuptools import setup
from torch.utils.cpp_extension import CUDAExtension, BuildExtension

# CUDA translation units compiled into the ``custom_conv`` extension.
# NOTE(review): 'convoluion_inline_ptx.cu' looks like a misspelling of
# 'convolution_inline_ptx.cu' -- confirm against the file name on disk.
extension_sources = [
    'convolution.cu',
    'convoluion_inline_ptx.cu',
]

custom_conv_extension = CUDAExtension(
    name='custom_conv',
    sources=extension_sources,
)

# NOTE(review): no ``script_args`` are passed, so setuptools takes the build
# command from the command line; presumably this belongs in a ``setup.py``
# run from a shell rather than in a notebook cell -- confirm.
setup(
    name='custom_conv',
    ext_modules=[custom_conv_extension],
    cmdclass={'build_ext': BuildExtension},
)

# Run Benchmarks

In [None]:
import torch
import torch.utils.benchmark as benchmark
import VanillaConv2D, PTXConv2D


def benchmark_model(model, input_tensor, label, warmup=10, runs=50,
                    min_run_time=1.0):
    """Time ``model(input_tensor)`` with ``torch.utils.benchmark``.

    Args:
        model: Callable invoked as ``model(input_tensor)``.
        input_tensor: The single argument passed to ``model``.
        label: Label attached to the measurement.
        warmup: Number of untimed calls made before measuring.
        runs: Unused; kept so existing callers that pass it keep working.
        min_run_time: Minimum total measuring time in seconds, forwarded to
            ``Timer.blocked_autorange``.

    Returns:
        ``(median, stddev)`` of the per-call time in seconds. ``stddev`` is
        the sample standard deviation over the measured blocks, or ``0.0``
        when only one block was collected.
    """
    import statistics

    for _ in range(warmup):
        model(input_tensor)

    # Drain pending warm-up kernels so they do not leak into the timing.
    # Guarded so the helper also works on CPU-only machines.
    if torch.cuda.is_available():
        torch.cuda.synchronize()

    # ``globals`` already exposes ``model`` and ``x`` to the statement. The
    # former ``from __main__ import ...`` setup failed because those names
    # are function parameters, not module-level names.
    timer = benchmark.Timer(
        stmt='model(x)',
        globals={'model': model, 'x': input_tensor},
        num_threads=1,
        label=label,
    )
    results = timer.blocked_autorange(min_run_time=min_run_time)
    print(results)

    # ``Measurement`` has no ``stddev`` attribute, so derive the spread
    # from the per-block timings.
    block_times = results.times
    stddev = statistics.stdev(block_times) if len(block_times) > 1 else 0.0
    return results.median, stddev

def run_benchmarks():
    """Benchmark both custom convolution modules and plot the comparison.

    Builds a random ``(4, 3, 256, 256)`` CUDA input and a 5x5 averaging
    kernel, then times ``VanillaConv2D`` and ``PTXConv2D`` with
    ``benchmark_model``. Prints each median with its spread and the
    resulting speed-up, and shows a bar chart with error bars.
    """
    # Imported here so the function does not depend on another notebook cell
    # having imported matplotlib first.
    import matplotlib.pyplot as plt

    device = 'cuda'
    B, C, H, W = 4, 3, 256, 256

    input_tensor = torch.rand(B, C, H, W, device=device)
    kernel = torch.ones(5, 5, device=device) / 25.0

    model_vanilla = VanillaConv2D(kernel).to(device)
    model_ptx = PTXConv2D(kernel).to(device)

    # Run benchmarks
    vanilla_sec, vanilla_std_sec = benchmark_model(
        model_vanilla, input_tensor, "Vanilla CUDA")
    ptx_sec, ptx_std_sec = benchmark_model(
        model_ptx, input_tensor, "Inline PTX CUDA")

    # torch.utils.benchmark measures in seconds; convert once so the "ms"
    # labels below are accurate.
    ms_per_second = 1e3
    vanilla_time = vanilla_sec * ms_per_second
    vanilla_std = vanilla_std_sec * ms_per_second
    ptx_time = ptx_sec * ms_per_second
    ptx_std = ptx_std_sec * ms_per_second

    # Print results
    print(f"Vanilla CUDA: {vanilla_time:.3f} ms ± {vanilla_std:.3f}")
    print(f"Inline  PTX: {ptx_time:.3f} ms ± {ptx_std:.3f}")
    print(f"Speedup (PTX over Vanilla): {vanilla_time / ptx_time:.2f}x")

    # Plot
    labels = ['Vanilla CUDA', 'Inline PTX']
    times = [vanilla_time, ptx_time]
    stds = [vanilla_std, ptx_std]

    plt.figure(figsize=(8, 5))
    bars = plt.bar(labels, times, yerr=stds, capsize=8,
                   color=['skyblue', 'salmon'])
    plt.ylabel('Execution Time (ms)')
    plt.title(f'Convolution Kernel Benchmark (B={B}, C={C}, H={H}, W={W})')
    for bar, t, err in zip(bars, times, stds):
        # Anchor the label at the top of the error bar and nudge it up by a
        # fixed number of points, so placement does not depend on the scale.
        plt.annotate(
            f"{t:.2f} ms",
            (bar.get_x() + bar.get_width() / 2.0, t + err),
            textcoords='offset points',
            xytext=(0, 3),
            ha='center',
            va='bottom',
        )
    plt.grid(True, axis='y', linestyle='--', alpha=0.7)
    plt.tight_layout()
    plt.show()

# Create Roofline Plot

In [None]:
import matplotlib.pyplot as plt
import numpy as np

# Kernels being compared; one scatter point and one annotation per entry.
kernels = ['Vanilla CUDA Kernel', 'Inline PTX Kernel']

# Placeholder measurements, index-aligned with ``kernels``.
# Swap in real numbers from profiling before drawing conclusions.
arithmetic_intensity = [4.0, 6.5]   # FLOPs per byte moved
achieved_performance = [900, 1450]  # GFLOPs/s

# Hardware ceilings -- example figures, set these for the GPU actually used.
peak_performance = 17000  # compute ceiling, GFLOPs/s
memory_bandwidth = 1555   # memory ceiling, GB/s

# 500 log-spaced sample points covering 0.1 .. 100 FLOPs/byte.
intensity_range = np.logspace(-1, 2, 500)

# Attainable performance: the bandwidth-limited slope, capped at the
# compute ceiling.
roofline = (intensity_range * memory_bandwidth).clip(max=peak_performance)

fig, ax = plt.subplots(figsize=(10, 6))

ax.plot(
    intensity_range,
    roofline,
    label="Theoretical Roofline",
    color='black',
    linewidth=2,
)
ax.scatter(
    arithmetic_intensity,
    achieved_performance,
    color=['blue', 'red'],
    s=100,
)

# Label each measured point; the first kernel is blue, every other one red.
for i, (kernel, point) in enumerate(
        zip(kernels, zip(arithmetic_intensity, achieved_performance))):
    label_color = 'darkred' if i else 'darkblue'
    ax.annotate(
        kernel,
        point,
        textcoords="offset points",
        xytext=(10, 10),
        ha='left',
        fontsize=10,
        color=label_color,
    )

# Roofline plots are conventionally log-log.
ax.set_xscale('log')
ax.set_yscale('log')
ax.set_xlabel('Arithmetic Intensity (FLOPs/Byte)', fontsize=12)
ax.set_ylabel('Performance (GFLOPs/s)', fontsize=12)
ax.set_title('Roofline Model: Vanilla vs Inline PTX CUDA Kernel', fontsize=14)
ax.grid(True, which="both", ls="--", linewidth=0.5)
ax.legend()
fig.tight_layout()
plt.show()