# Lab 2: Arithmetic Intensity Visualization Lab

By completing this assignment, you will:
1. Understand the concept of **arithmetic intensity** and its impact on GPU performance
2. Learn how to measure GPU compute performance (TFLOPS)
3. Benchmark matrix operations and analyze the transition from memory-bound to compute-bound operations
4. Create visualizations to analyze GPU utilization using the Roofline model

## Instructions for Grading
1. Complete **Task 1** and **Task 2** (the only TODO sections)
2. Run all cells to generate the `submission/answers.yaml` file
3. Submit the `submission` directory (containing `answers.yaml`) for grading

## Grading (30 points)
1. TFLOPS Calculation — *required but not graded*
2. Arithmetic Intensity Calculation — **30 points**
3. Visualization — *provided (no implementation needed)*

## Environment Setup

runtime base container: `nvcr.io/nvidia/nemo-automodel:25.11`

In [None]:
!pip install pandas plotly pyyaml -q

In [None]:
# plot visualization
# Set Plotly's default renderer once so later figures display inline.
import plotly.io as pio

# NOTE(review): "notebook" targets the classic Jupyter Notebook front end; other
# front ends (JupyterLab, VS Code, Colab) may need a different renderer name — confirm.
pio.renderers.default = "notebook"  # classic Jupyter Notebook

In [None]:
from typing import Dict, List

import numpy as np
import pandas as pd
import plotly.graph_objects as go
import torch
import yaml
from plotly.subplots import make_subplots

## Background

**Arithmetic Intensity (AI)** = FLOPs / Bytes Accessed

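For matrix multiplication `C = A @ B` where A and B are N×N matrices:
- FLOPs = `2 * N³` (one multiply and one add per inner-product term)
- Bytes accessed = `3 * N² * bytes_per_element` (A, B, and C)
- AI = `2 * N³ / (3 * N² * bytes_per_element)` = `2N / (3 * bytes_per_element)`, so AI grows linearly with N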

## Task 1: Implement TFLOPS Calculation (required, not graded)

**Formula**: 
- FLOPs for N×N matmul: `2 * N³`
- TFLOPS (Tera FLOPs per Second) = `(FLOPs / time_seconds) / 1e12`

In [None]:
def calculate_tflops(matrix_size: int, time_seconds: float) -> float:
    """
    Calculate achieved TFLOPS for matrix multiplication C = X @ X.

    Student exercise (Task 1): this stub raises until it is implemented.

    Formula:
        FLOPs for an N x N matmul = 2 * N**3
        TFLOPS = (FLOPs / time_seconds) / 1e12

    Args:
        matrix_size: Size of square matrix (N x N)
        time_seconds: Execution time in seconds

    Returns:
        Achieved TFLOPS (Tera FLOPs per second)

    Raises:
        NotImplementedError: Always, until Task 1 is completed.
    """
    # TODO: Implement this function
    raise NotImplementedError("Complete Task 1")

## Task 2: Implement Arithmetic Intensity Calculation (30 points)

**Formula**:
- Bytes = `3 * N² * bytes_per_element` — counts two N×N input operands plus one N×N output (2 bytes per element for FP16, 4 bytes for FP32)
- AI = `FLOPs / Bytes`

In [None]:
def calculate_arithmetic_intensity(matrix_size: int, dtype: torch.dtype) -> float:
    """
    Calculate arithmetic intensity (FLOPs per Byte) for matrix multiplication.

    Student exercise (Task 2): this stub raises until it is implemented.

    Formula:
        FLOPs = 2 * N**3
        Bytes = 3 * N**2 * bytes_per_element (2 bytes for FP16, 4 bytes for FP32)
        AI = FLOPs / Bytes

    Args:
        matrix_size: Size of square matrix (N x N)
        dtype: Data type of matrix; determines bytes_per_element

    Returns:
        Arithmetic intensity (FLOPs/Byte)

    Raises:
        NotImplementedError: Always, until Task 2 is completed.
    """
    # TODO: Implement this function
    raise NotImplementedError("Complete Task 2")

In [None]:
# Provided: Benchmark function
def benchmark_matmul(
    size: int,
    dtype: torch.dtype,
    device: torch.device,
    num_warmup: int = 10,
    num_iterations: int = 100,
) -> Dict:
    """Benchmark matrix multiplication C = X @ X for a given size.

    Issues ``num_warmup`` untimed ``torch.matmul(X, X)`` calls, then times
    ``num_iterations`` back-to-back calls with CUDA events and reports the
    per-call average.

    Args:
        size: Side length N of the square N x N input matrix.
        dtype: Element type of the input matrix (e.g. ``torch.float16``).
        device: CUDA device to run the benchmark on.
        num_warmup: Number of untimed matmul calls issued before measuring.
        num_iterations: Number of timed matmul calls; must be positive.

    Returns:
        Dict with keys ``"size"``, ``"time_ms"`` (average milliseconds per
        matmul), ``"achieved_tflops"`` and ``"arithmetic_intensity"``.

    Raises:
        ValueError: If ``num_iterations`` is not positive, or ``device`` is
            not a CUDA device (CUDA events cannot time CPU work).
    """
    if num_iterations <= 0:
        raise ValueError("num_iterations must be a positive integer")
    device = torch.device(device)
    if device.type != "cuda":
        raise ValueError(f"benchmark_matmul requires a CUDA device, got {device}")

    # Make `device` the current CUDA device so that the events and the
    # synchronize() calls act on the same GPU that holds X.
    with torch.cuda.device(device):
        X = torch.randn(size, size, dtype=dtype, device=device)

        # Warmup: let kernel selection / lazy initialization happen untimed.
        for _ in range(num_warmup):
            torch.matmul(X, X)

        torch.cuda.synchronize()
        start_event = torch.cuda.Event(enable_timing=True)
        end_event = torch.cuda.Event(enable_timing=True)

        start_event.record()
        for _ in range(num_iterations):
            # Result is intentionally discarded so no extra output tensor is
            # kept alive across iterations.
            torch.matmul(X, X)
        end_event.record()
        # Kernels launch asynchronously; wait until end_event has completed
        # before reading the elapsed time.
        torch.cuda.synchronize()

        elapsed_time_ms = start_event.elapsed_time(end_event)

    avg_time_ms = elapsed_time_ms / num_iterations
    avg_time_s = avg_time_ms / 1000.0

    achieved_tflops = calculate_tflops(size, avg_time_s)
    arithmetic_intensity = calculate_arithmetic_intensity(size, dtype)

    return {
        "size": size,
        "time_ms": avg_time_ms,
        "achieved_tflops": achieved_tflops,
        "arithmetic_intensity": arithmetic_intensity,
    }

In [None]:
# Provided: Run benchmarks
def run_benchmarks(sizes: List[int], dtype: torch.dtype = torch.float16) -> pd.DataFrame:
    """Benchmark each matrix size on ``cuda:0`` and collect the results.

    Prints the GPU name and total memory, then one formatted table row per
    size as each benchmark finishes.

    Args:
        sizes: Matrix side lengths N to benchmark, in the order given.
        dtype: Element type passed through to ``benchmark_matmul``.

    Returns:
        DataFrame with one row per entry of ``sizes``, built from the dicts
        returned by ``benchmark_matmul``.

    Raises:
        RuntimeError: If CUDA is not available.
    """
    if not torch.cuda.is_available():
        raise RuntimeError("CUDA is not available!")

    gpu = torch.device("cuda:0")
    gpu_info = torch.cuda.get_device_properties(gpu)
    rule = "-" * 60

    print(f"GPU: {gpu_info.name}")
    print(f"Total Memory: {gpu_info.total_memory / 1e9:.2f} GB")
    print(f"\nRunning benchmarks with dtype={dtype}")
    print(rule)
    print(f"{'Size':>8} {'Time(ms)':>12} {'TFLOPS':>10} {'AI':>12}")
    print(rule)

    rows = []
    for n in sizes:
        row = benchmark_matmul(n, dtype, gpu)
        rows.append(row)
        # Print as we go so progress is visible while larger sizes still run.
        table_line = (
            f"{row['size']:8d} {row['time_ms']:12.4f} "
            f"{row['achieved_tflops']:10.2f} "
            f"{row['arithmetic_intensity']:12.2f}"
        )
        print(table_line)

    print(rule)
    return pd.DataFrame(rows)

In [None]:
# Provided: Visualization
def plot_results(df: pd.DataFrame, peak_tflops: float = 989, peak_bandwidth_gbps: float = 3350):
    """Create interactive Plotly visualizations with 4 subplots.

    The subplots are: arithmetic intensity vs. size, achieved TFLOPS vs. size
    (with a horizontal peak line), execution time vs. size, and a roofline
    plot of achieved TFLOPS against arithmetic intensity.

    Args:
        df: Benchmark results with columns ``"arithmetic_intensity"``,
            ``"achieved_tflops"`` and ``"time_ms"``. Matrix sizes are taken
            from a ``"size"`` column when present (the direct output of
            ``run_benchmarks``), otherwise from the index (the same frame
            after ``set_index("size")``).
        peak_tflops: Compute ceiling, drawn as the peak line and used as the
            flat part of the roofline.
        peak_bandwidth_gbps: Memory bandwidth; divided by 1000 and multiplied
            by arithmetic intensity to give the sloped part of the roofline
            in TFLOPS.

    Returns:
        The assembled Plotly figure.
    """
    fig = make_subplots(
        rows=2, cols=2,
        subplot_titles=(
            "Arithmetic Intensity vs Matrix Size",
            "Achieved TFLOPS vs Matrix Size",
            "Execution Time vs Matrix Size",
            "Roofline Model",
        ),
    )

    # Accept both the raw run_benchmarks() frame ("size" column) and the
    # size-indexed frame; falling back to a RangeIndex would plot row numbers.
    if "size" in df.columns:
        sizes = df["size"].to_numpy()
    else:
        sizes = df.index.to_numpy()

    # Plot 1: Arithmetic Intensity vs Matrix Size
    fig.add_trace(
        go.Scatter(x=sizes, y=df["arithmetic_intensity"], mode="lines+markers",
                   name="Arithmetic Intensity", line=dict(color="blue", width=3)),
        row=1, col=1,
    )

    # Plot 2: Achieved TFLOPS vs Matrix Size
    fig.add_trace(
        go.Scatter(x=sizes, y=df["achieved_tflops"], mode="lines+markers",
                   name="Achieved TFLOPS", line=dict(color="green", width=3)),
        row=1, col=2,
    )
    fig.add_hline(y=peak_tflops, line_dash="dash", line_color="red",
                  annotation_text=f"Peak ({peak_tflops} TFLOPS)", row=1, col=2)

    # Plot 3: Execution Time vs Matrix Size
    fig.add_trace(
        go.Scatter(x=sizes, y=df["time_ms"], mode="lines+markers",
                   name="Execution Time", line=dict(color="red", width=3)),
        row=2, col=1,
    )

    # Plot 4: Roofline Model (markers colored by matrix size)
    fig.add_trace(
        go.Scatter(x=df["arithmetic_intensity"], y=df["achieved_tflops"], mode="markers",
                   name="Achieved Performance",
                   marker=dict(size=12, color=sizes, colorscale="Viridis", showscale=True,
                              colorbar=dict(title="Matrix<br>Size", x=1.15))),
        row=2, col=2,
    )

    # Roofline (hardware limit): min(bandwidth * AI, peak compute), sampled on
    # a log grid that extends slightly beyond the measured AI range.
    ai_range = np.logspace(np.log10(df["arithmetic_intensity"].min() * 0.5),
                           np.log10(df["arithmetic_intensity"].max() * 2), 100)
    memory_bound = (peak_bandwidth_gbps / 1000) * ai_range
    roofline = np.minimum(memory_bound, peak_tflops)
    fig.add_trace(
        go.Scatter(x=ai_range, y=roofline, mode="lines", name="Roofline",
                   line=dict(color="red", width=3, dash="dash")),
        row=2, col=2,
    )

    # Update axes
    fig.update_xaxes(title_text="Matrix Size (N)", type="log", row=1, col=1)
    fig.update_yaxes(title_text="Arithmetic Intensity (FLOPs/Byte)", row=1, col=1)
    fig.update_xaxes(title_text="Matrix Size (N)", type="log", row=1, col=2)
    fig.update_yaxes(title_text="TFLOPS", row=1, col=2)
    fig.update_xaxes(title_text="Matrix Size (N)", type="log", row=2, col=1)
    fig.update_yaxes(title_text="Time (ms)", type="log", row=2, col=1)
    fig.update_xaxes(title_text="Arithmetic Intensity (FLOPs/Byte)", type="log", row=2, col=2)
    fig.update_yaxes(title_text="Performance (TFLOPS)", type="log", row=2, col=2)

    fig.update_layout(title_text="Matrix Multiplication Performance Analysis",
                      height=900, width=1400, showlegend=True)
    return fig

---
## Run Benchmarks & Visualization

In [None]:
# Matrix sizes and run benchmarks
# Powers of two from 2**4 = 16 up to 2**14 = 16384.
sizes = [2 ** exponent for exponent in range(4, 15)]

# Benchmark FP16 first, then FP32; index each table by "size" so rows can be
# looked up by matrix size later.
df_fp16, df_fp32 = (
    run_benchmarks(sizes, precision).set_index("size")
    for precision in (torch.float16, torch.float32)
)

# Bare expression so the notebook renders the FP16 table as the cell output.
df_fp16

In [None]:
# Visualization (H100 specs)
peak_tflops = 989  # compute ceiling, in TFLOPS
peak_bandwidth_gbps = 3350  # memory bandwidth; plot_results divides this by 1000

# Pass the hardware limits by keyword so the two numbers cannot be swapped.
fig = plot_results(
    df_fp16,
    peak_tflops=peak_tflops,
    peak_bandwidth_gbps=peak_bandwidth_gbps,
)
fig.show()

---
## Generate answers.yaml for Submission

In [None]:
# Generate answers for grading - extract directly from benchmark DataFrames
from pathlib import Path

answers = {}

# Only arithmetic-intensity values are recorded, for both precisions.
for n in [16, 64, 256, 1024, 4096]:
    answers[f"ai_fp16_n{n}"] = float(df_fp16.loc[n, "arithmetic_intensity"])
    answers[f"ai_fp32_n{n}"] = float(df_fp32.loc[n, "arithmetic_intensity"])

# Create the output directory from Python (no shell escape needed); it is fine
# if the directory already exists.
submission_dir = Path("submission")
submission_dir.mkdir(parents=True, exist_ok=True)

# Serialize once so the saved file and the printed preview are identical.
answers_yaml = yaml.dump(answers, default_flow_style=False, sort_keys=True)

# Save
with open(submission_dir / "answers.yaml", "w", encoding="utf-8") as f:
    f.write(answers_yaml)

print("✅ answers.yaml generated!")
print(answers_yaml)

## Submission

Submit the `submission` directory for grading. To run the grader yourself:
```bash
python grade_assignment.py submission
```