In [None]:
import pandas as pd
import plotly.express as px

In [None]:
# Read the per-run metadata and discard the CSV's first column
# (presumably a saved row index -- confirm against the file).
metadata_df = pd.read_csv("stress_test_metadata.csv")
cols = metadata_df.columns[1:].tolist()
metadata_df = metadata_df.loc[:, cols]
metadata_df

In [None]:
def load_ncu_csv(path, metadata_df, keep_units=False, metrics=None):
    """Load a UTF-16 ncu CSV export and attach per-run metadata to it.

    The export is expected to carry a units row directly below the header
    (that is why the first data row is dropped when ``keep_units`` is False).

    Args:
        path: Path of the CSV file to read (decoded as UTF-16).
        metadata_df: Frame with ``blocksize`` and ``quant_type`` columns. It is
            aligned to the measurement rows on the index, so row ``k`` of the
            metadata describes the ``k``-th measurement row.
        keep_units: When False (default) the units row is dropped, thousands
            separators are stripped and every metric column is cast to float.
            When True the units row is kept as row 0 and the metric columns are
            left as the raw strings read from the file, since a column holding
            unit names cannot be cast to float.
        metrics: Metric column names to keep. Defaults to the notebook-level
            ``METRICS_TO_KEEP`` list, which must then be defined.

    Returns:
        A DataFrame with ``Kernel Name``, the metric columns, ``Blocksize`` and
        ``QuantType``. Rows without a matching metadata row (including the
        units row when it is kept) get NaN in the two metadata columns.

    Raises:
        KeyError: If a requested column is missing from the CSV or metadata.
        ValueError: If a metric value cannot be parsed as a float.
    """
    duration_col = "gpu__time_duration_measured_user.avg"
    # Only touch the notebook-level default when the caller did not pass a list.
    metric_cols = list(METRICS_TO_KEEP if metrics is None else metrics)

    df = pd.read_csv(path, encoding="utf-16")
    # Keep only the bare kernel name: drop template arguments and the "void" return type.
    df["Kernel Name"] = df["Kernel Name"].apply(lambda x: str(x).split("<")[0].replace("void", ""))
    # Explicit copy so the column assignments below never act on a slice of the raw frame.
    df = df[["Kernel Name"] + metric_cols].copy()

    # Copy the metadata columns so shifting their index cannot leak back to the caller.
    meta = metadata_df[["blocksize", "quant_type"]].copy()

    if keep_units:
        # Row 0 is the units row, so measurement k lives at index k + 1.
        meta.index = meta.index + 1
    else:
        df = df.iloc[1:].reset_index(drop=True)
        # Strip thousands separators *before* casting; "1,500.0" is not a valid float literal.
        for col in metric_cols:
            df[col] = df[col].apply(lambda x: str(x).replace(",", "")).astype(float)
        if duration_col in metric_cols:
            # Scale to seconds; assumes the export reports microseconds -- TODO confirm
            # against the units row of the CSV.
            df[duration_col] = df[duration_col] * 1e-6

    df["Blocksize"] = meta["blocksize"]
    df["QuantType"] = meta["quant_type"]
    return df


# Benchmark exports for the two kernel versions being compared.
baseline_csv_path = "baseline_bnb_kernel_benchmark.csv"
improved_csv_path = "improved_bnb_kernel_benchmark.csv"

# Tag every row with the version it came from so the frames can be stacked later.
df_improved = load_ncu_csv(improved_csv_path, metadata_df).assign(Version="Improved")
df_baseline = load_ncu_csv(baseline_csv_path, metadata_df).assign(Version="Baseline")
df_baseline.head()

In [None]:
# Baseline runs: median of every retained metric per (quant type, kernel, block size).
agg_baseline = (
    df_baseline
    .groupby(by=["QuantType", "Kernel Name", "Blocksize"])[METRICS_TO_KEEP]
    .agg("median")
)
agg_baseline

In [None]:
# Improved runs: median of every retained metric per (quant type, kernel, block size).
agg_improved = (
    df_improved
    .groupby(by=["QuantType", "Kernel Name", "Blocksize"])[METRICS_TO_KEEP]
    .agg("median")
)
agg_improved

In [None]:
# Stack the row-level measurements of both versions; each frame keeps its own row labels.
df = pd.concat(objs=[df_baseline, df_improved], axis=0)
df

In [None]:
# Long-form table of the per-group medians, improved rows first, each labelled with its version.
agg_df = pd.concat(
    [
        agg_improved.reset_index().assign(Version="Improved"),
        agg_baseline.reset_index().assign(Version="Baseline"),
    ]
)
agg_df

In [None]:
def plot(df, quant_type, metric, title=None, width=900, height=500):
    """Show a grouped bar chart of one metric per block size, split by version.

    One facet row is drawn per kernel name; bars are coloured by ``Version``
    and grouped side by side for each block size.

    Args:
        df: Frame with ``QuantType``, ``Blocksize``, ``Version``,
            ``Kernel Name`` and the ``metric`` column.
        quant_type: Value of ``QuantType`` to keep before plotting.
        metric: Name of the column plotted on the y axis.
        title: Figure title. Defaults to the latency title this notebook has
            always used; pass an explicit title when plotting another metric.
        width: Figure width in pixels.
        height: Figure height in pixels.

    Raises:
        ValueError: If no row of ``df`` has the requested ``quant_type``
            (previously this rendered an empty figure without any hint).
    """
    custom_colors = ["#1f77b4", "#ff7f0e", "#2ca02c", "#d62728", "#9467bd"]
    if title is None:
        title = "Total latency by Block Size [100 iterations Sum of All Access Patterns]"

    subset = df[df["QuantType"] == quant_type]
    if subset.empty:
        available = sorted(map(str, df["QuantType"].dropna().unique()))
        raise ValueError(f"no rows with QuantType == {quant_type!r}; available: {available}")

    fig = px.bar(
        subset,
        x="Blocksize",
        y=metric,
        color="Version",
        facet_row="Kernel Name",
        barmode="group",
        title=title,
        color_discrete_sequence=custom_colors,
        labels={"Kernel Name": ""},
    )

    # Treat block sizes as discrete categories on every facet's x axis so the
    # bars are evenly spaced and each one is labelled with its block size.
    blocksize_values = sorted(subset["Blocksize"].unique())
    fig.update_xaxes(
        type="category",
        tickmode="array",
        tickvals=blocksize_values,
        ticktext=[str(val) for val in blocksize_values],
    )

    # Unlink the y axes so each kernel's facet gets its own scale.
    fig.update_yaxes(matches=None)
    fig.update_layout(width=width, height=height)
    fig.show()


# Compare the measured kernel duration of both versions for the fp4 quantization type.
metric = "gpu__time_duration_measured_user.avg"
quant_type = "fp4"
plot(agg_df, quant_type=quant_type, metric=metric)

In [None]:
# Relative change of the improved medians versus baseline, in percent, per
# (quant type, kernel, block size) group.
total_diff = agg_improved.sub(agg_baseline).div(agg_baseline).mul(100)

# For these two metrics lower is better, so flip the sign: a positive number
# then always reads as an improvement.
total_diff["gpu__time_duration_measured_user.avg"] = -total_diff["gpu__time_duration_measured_user.avg"]
total_diff["smsp__sass_branch_targets_threads_divergent.sum"] = -total_diff[
    "smsp__sass_branch_targets_threads_divergent.sum"
]

total_diff.round(3)

In [None]:
# Medians per (quant type, kernel) across all block sizes; the block size and
# version columns are removed first so only the metrics are aggregated.
agg_baseline2 = (
    df_baseline
    .drop(columns=["Blocksize", "Version"])
    .groupby(by=["QuantType", "Kernel Name"])
    .agg("median")
)
agg_improved2 = (
    df_improved
    .drop(columns=["Blocksize", "Version"])
    .groupby(by=["QuantType", "Kernel Name"])
    .agg("median")
)

# Percent change versus baseline; sign flipped for the lower-is-better metrics
# so that positive values always mean the improved version did better.
overall_diff = agg_improved2.sub(agg_baseline2).div(agg_baseline2).mul(100)
overall_diff["gpu__time_duration_measured_user.avg"] = -overall_diff["gpu__time_duration_measured_user.avg"]
overall_diff["smsp__sass_branch_targets_threads_divergent.sum"] = -overall_diff[
    "smsp__sass_branch_targets_threads_divergent.sum"
]

overall_diff.round(3)