In [None]:

import numpy as np
import matplotlib as mpl
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
import matplotlib.transforms as mtrans
import matplotlib.patches as mpatches
import os

# Directory that receives every figure saved by the cells below.
os.makedirs("plots", exist_ok=True)
# Values used by the cells below to filter the result tables; they are matched
# against the "hw", "precision", "usecase", "parallelism" and "range" columns.
hw = ["NVIDIA", "AMD", "INTEL"]
precision = ["SINGLE", "DOUBLE"]
use_case = ["LDC", "VKS"]
compilers = ["dpcpp", "AdaptiveCpp"]
sycl_ranges = ["range", "ndrange"]

In [None]:
# Preprocess data
sns.set_theme()

# Load the benchmark results and drop the mixed-precision runs: the plots in
# this cell only compare the SINGLE and DOUBLE configurations.
res = pd.read_csv("results.csv")
res = res.loc[~res["precision"].isin(["MIXED1", "MIXED2"])]

# Find the best Fortran implementation for each use case / hardware / precision.
# "Best" means the highest mlups. The previous version sorted ascending and took
# the first row, which stored the *slowest* Fortran result as the baseline.
# best_impl[usecase][hw][precision] is None when no Fortran measurement exists.
fortran_res = res[res["impl"] == "fortran"]
best_impl = dict()
for u in use_case:
    best_impl[u] = dict()
    for h in hw:
        best_impl[u][h] = dict()
        for p in precision:
            candidates = fortran_res.loc[
                (fortran_res["hw"] == h)
                & (fortran_res["usecase"] == u)
                & (fortran_res["precision"] == p),
                "mlups",
            ]
            best_impl[u][h][p] = candidates.max() if not candidates.empty else None

print(best_impl)

# Get sycl values
# .copy() gives an independent frame: the edits below must not write through a
# slice of `res` (SettingWithCopyWarning), and `res` is re-used further down.
sycl_csv = res[(res["impl"] == "sycl")].copy()
# "mlups" is overwritten with speedups (or NaN) below, so make sure it is float.
sycl_csv["mlups"] = sycl_csv["mlups"].astype(float)

# Divide sycl values by best fortran values
for u in use_case:
    for h in hw:
        for p in precision:
            config_mask = (sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p)
            if not config_mask.any():
                continue
            baseline = best_impl[u][h][p]
            if baseline is None:
                # No Fortran reference for this configuration: a speedup cannot be
                # computed (dividing by None used to raise TypeError), so mark it
                # as missing instead of leaving raw mlups mixed in with speedups.
                sycl_csv.loc[config_mask, "mlups"] = np.nan
            else:
                sycl_csv.loc[config_mask, "mlups"] = sycl_csv.loc[config_mask, "mlups"] / baseline


# For every configuration and compiler keep only the row with the highest value.
for u in use_case:
    for h in hw:
        for p in precision:
            for c in compilers:
                rows = sycl_csv.loc[(sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p) & (sycl_csv["parallelism"] == c)]
                if not rows.empty:
                    # Sort a new frame (no in-place sort on a slice) and drop
                    # everything except the last, i.e. largest, entry.
                    ranked = rows.sort_values(by="mlups")
                    sycl_csv = sycl_csv.drop(ranked.index[:-1])

# Average speedup of the remaining rows per use case and hardware.
for u in use_case:
    for h in hw:
        rows = sycl_csv.loc[(sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u)]
        print(f"{u} - {h} -> {rows['mlups'].mean()}")


# Substitute each "dpcpp" entry in column "parallelism" with "Intel DPC++".
# regex=False: plain substring replacement regardless of the pandas version.
sycl_csv["parallelism"] = sycl_csv["parallelism"].str.replace("dpcpp", "Intel DPC++", regex=False)

# Speedup figure: one panel per (compiler, hardware) pair, laid out in a single row.
# At this point the "mlups" column of `sycl_csv` holds SYCL mlups divided by the
# Fortran baseline (see the normalisation loop earlier in this cell).
parallelisms = ["AdaptiveCpp", "Intel DPC++"]
total_plots = len(hw) * len(parallelisms)

fig, axes = plt.subplots(nrows=1, ncols=total_plots, figsize=(total_plots * 5, 5), sharey=True)
# One colour per precision.
color_palette = sns.color_palette("viridis", len(precision))
color_map = dict(zip(precision, color_palette))

# Index of the panel being filled; advanced once per (compiler, hardware) pair.
plot_index = 0
for parallelism in parallelisms:
    for h in hw:
        # Assuming 'use_case' and 'precision' are defined and 'res' is filtered accordingly
        for k, u in enumerate(use_case):
            for l, p in enumerate(precision):
                df_filtered = sycl_csv[(sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p) & (sycl_csv["parallelism"] == parallelism)]
                if not df_filtered.empty:
                    mean_mlups = df_filtered["mlups"].mean()
                    ax = axes[plot_index]
                    # Bars of one use case are grouped around x = k, offset by 0.2 per precision.
                    ax.bar(k + l*0.2, mean_mlups, color=color_map[p], width=0.2, label=f'{p}' if plot_index == 0 else "")
                    # The hardware name is used as the title, placed below the axes (y = -0.2).
                    ax.set_title(f'{h}', y = -0.2)
                    # Panels 1 and 4 are the middle panels of each compiler group when
                    # `hw` has three entries; they carry the compiler name as a top label.
                    if plot_index == 1 or plot_index == 4:
                        ax.set_xlabel(f'{parallelism}')
                        # move it to the top
                        ax.xaxis.set_label_position('top')
                    # Reference line at speedup 1.
                    ax.axhline(1, color='red', linewidth=3, linestyle='--')
                    ax.set_xticks(range(len(use_case)))
                    ax.set_xticklabels(use_case),
                    # Hatch every bar currently on the axes: '/' for AdaptiveCpp, 'x' otherwise.
                    if parallelism == "AdaptiveCpp":
                        # iterate over bars
                        for bar in ax.patches:
                            # set hatching
                            bar.set_hatch('/')
                            # set hatch color
                            bar.set_edgecolor('black')
                    else:
                        # iterate over bars
                        for bar in ax.patches:
                            # set hatching
                            bar.set_hatch('x')
                            # set hatch color
                            bar.set_edgecolor('black')
                    if plot_index == 0:
                        ax.set_ylabel('Speedup')
                    # Shift the tick labels horizontally by 6 display units.
                    # NOTE(review): this and the other axis setup above run once per bar, not
                    # once per panel, so the shift may be applied repeatedly — confirm intended.
                    trans = mtrans.Affine2D().translate(6, 0)
                    for t in ax.get_xticklabels():
                        t.set_transform(t.get_transform()+trans)
                    # Increase all subplots font size
                    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                                ax.get_xticklabels() + ax.get_yticklabels()):
                        item.set_fontsize(23)
                        
        plot_index += 1

# Remove the gaps between panels so they read as one grouped figure.
plt.subplots_adjust(wspace=0, hspace=0)

# Figure-level legend built from proxy patches, one per precision.
legend_handles = [mpatches.Patch(color=color_map[p], label=p) for p in precision]

fig.legend(handles=legend_handles, title='Precision', loc='upper center', ncol=4, fontsize=20, title_fontsize=20, bbox_to_anchor=(0.5, 1.15))
# g.fig.suptitle('Bar Plot with Two Groups by Type', y=1.03) # Adjust title and its position
# g.set_axis_labels('Category', 'Value') # Set x and y axis labels
plt.savefig("plots/speedup.pdf", bbox_inches='tight')
plt.show()



# Start again from the raw SYCL measurements (absolute mlups, not speedups).
# .copy() so that dropping rows and rewriting the "parallelism" column below
# act on an independent frame rather than on a slice of `res`.
sycl_csv = res[(res["impl"] == "sycl")].copy()
for u in use_case:
    for h in hw:
        for p in precision:
            for c in compilers:
                rows = sycl_csv.loc[(sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p) & (sycl_csv["parallelism"] == c)]
                if not rows.empty:
                    # Keep only the row with the highest mlups: sort a new frame
                    # (no in-place sort on a slice) and drop all but the last entry.
                    ranked = rows.sort_values(by="mlups")
                    sycl_csv = sycl_csv.drop(ranked.index[:-1])

# Substitute each "dpcpp" entry in column "parallelism" with "Intel DPC++".
# regex=False: plain substring replacement regardless of the pandas version.
sycl_csv["parallelism"] = sycl_csv["parallelism"].str.replace("dpcpp", "Intel DPC++", regex=False)

# MLUP/s figure: same layout as the speedup figure above, but `sycl_csv` was reset
# from `res` just before this point, so "mlups" holds the raw measured values.
parallelisms = ["AdaptiveCpp", "Intel DPC++"]
total_plots = len(hw) * len(parallelisms)

fig, axes = plt.subplots(nrows=1, ncols=total_plots, figsize=(total_plots * 5, 5), sharey=True)
# One colour per precision.
color_palette = sns.color_palette("viridis", len(precision))
color_map = dict(zip(precision, color_palette))

# Index of the panel being filled; advanced once per (compiler, hardware) pair.
plot_index = 0
for parallelism in parallelisms:
    for h in hw:
        # Assuming 'use_case' and 'precision' are defined and 'res' is filtered accordingly
        for k, u in enumerate(use_case):
            for l, p in enumerate(precision):
                df_filtered = sycl_csv[(sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p) & (sycl_csv["parallelism"] == parallelism)]
                if not df_filtered.empty:
                    mean_mlups = df_filtered["mlups"].mean()
                    ax = axes[plot_index]
                    # Bars of one use case are grouped around x = k, offset by 0.2 per precision.
                    ax.bar(k + l*0.2, mean_mlups, color=color_map[p], width=0.2, label=f'{p}' if plot_index == 0 else "")
                    # The hardware name is used as the title, placed below the axes (y = -0.2).
                    ax.set_title(f'{h}', y = -0.2)
                    # Panels 1 and 4 are the middle panels of each compiler group when
                    # `hw` has three entries; they carry the compiler name as a top label.
                    if plot_index == 1 or plot_index == 4:
                        ax.set_xlabel(f'{parallelism}')
                        ax.xaxis.set_label_position('top')
                    ax.set_xticks(range(len(use_case)))
                    ax.set_xticklabels(use_case),
                    # Hatch every bar currently on the axes: '/' for AdaptiveCpp, 'x' otherwise.
                    if parallelism == "AdaptiveCpp":
                        # iterate over bars
                        for bar in ax.patches:
                            # set hatching
                            bar.set_hatch('/')
                            # set hatch color
                            bar.set_edgecolor('black')
                    else:
                        # iterate over bars
                        for bar in ax.patches:
                            # set hatching
                            bar.set_hatch('x')
                            # set hatch color
                            bar.set_edgecolor('black')
                    if plot_index == 0:
                        ax.set_ylabel('MLUP/s')
                    # Shift the tick labels horizontally by 6 display units.
                    # NOTE(review): this and the other axis setup above run once per bar, not
                    # once per panel, so the shift may be applied repeatedly — confirm intended.
                    trans = mtrans.Affine2D().translate(6, 0)
                    for t in ax.get_xticklabels():
                        t.set_transform(t.get_transform()+trans)
                    # Increase all subplots font size
                    for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                                ax.get_xticklabels() + ax.get_yticklabels()):
                        item.set_fontsize(23)
                        
        plot_index += 1

# Remove the gaps between panels so they read as one grouped figure.
plt.subplots_adjust(wspace=0, hspace=0)

# Figure-level legend built from proxy patches, one per precision.
legend_handles = [mpatches.Patch(color=color_map[p], label=p) for p in precision]

fig.legend(handles=legend_handles, title='Precision', loc='upper center', ncol=4, fontsize=20, title_fontsize=20, bbox_to_anchor=(0.5, 1.15))
# g.fig.suptitle('Bar Plot with Two Groups by Type', y=1.03) # Adjust title and its position
# g.set_axis_labels('Category', 'Value') # Set x and y axis labels
plt.savefig("plots/mlups.pdf", bbox_inches='tight')
plt.show()

In [None]:
from matplotlib.colors import LinearSegmentedColormap


# Reset from the raw results; .copy() so new columns are added to an independent
# frame instead of a slice of `res`.
sycl_csv = res[(res["impl"] == "sycl")].copy()
# Add a new column combining the values of columns "alloc_type" and "range".
sycl_csv["alloc_type_range"] = sycl_csv["alloc_type"] + "\n" + sycl_csv["range"]
fortran_par_types = ["doconcurrent", "openacc", "offload"]

# Duplicate each row once per Fortran parallelisation and tag the copies with the
# entries of fortran_par_types, so every SYCL row can be compared to each baseline.
n_baselines = len(fortran_par_types)
sycl_csv = sycl_csv.loc[sycl_csv.index.repeat(n_baselines)].reset_index(drop=True)
sycl_csv["fortran_par"] = fortran_par_types * (len(sycl_csv) // n_baselines)
# Float from the start: the column is filled with ratios below. Rows without a
# matching Fortran measurement keep the value 0.
sycl_csv["speedup"] = 0.0

fortran_res = res[res["impl"] == "fortran"]
for h in hw:
    for p in precision:
        for u in use_case:
            for fortran_par in fortran_par_types:
                value = fortran_res[(fortran_res["parallelism"] == fortran_par) & (fortran_res["hw"] == h) & (fortran_res["usecase"] == u) & (fortran_res["precision"] == p)].reset_index(drop=True)
                if not value.empty:
                    # Speedup relative to the first matching Fortran row, written
                    # directly with .loc instead of a subset copy + update().
                    config_mask = (sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p) & (sycl_csv["fortran_par"] == fortran_par)
                    sycl_csv.loc[config_mask, "speedup"] = sycl_csv.loc[config_mask, "mlups"] / value["mlups"].iloc[0]

# On INTEL, overwrite the "doconcurrent" speedups with the largest speedup found
# for the same precision and use case.
# NOTE(review): presumably a placeholder because no doconcurrent baseline exists on
# INTEL (the heatmap code relabels that column "N.D.") — confirm.
for p in precision:
    for u in use_case:
        intel_mask = (sycl_csv["hw"] == "INTEL") & (sycl_csv["precision"] == p) & (sycl_csv["usecase"] == u)
        if not intel_mask.any():
            # idxmax() on an empty selection used to raise ValueError here.
            continue
        max_value = sycl_csv.loc[intel_mask, "speedup"].max()
        sycl_csv.loc[intel_mask & (sycl_csv["fortran_par"] == "doconcurrent"), "speedup"] = max_value


# Filter only LDC use case (copy: the column rewrite below must not hit a slice)
sycl_csv = sycl_csv[sycl_csv["usecase"] == "LDC"].copy()
# Human-readable titles for each precision.
beaufity_precision = dict(
    SINGLE="Single Precision",
    DOUBLE="Double Precision",
    MIXED1="Mixed Precision 1",
    MIXED2="Mixed Precision 2"
)

# Rename each "doconcurrent" entry of column "fortran_par" to "DC".
sycl_csv["fortran_par"] = sycl_csv["fortran_par"].str.replace("doconcurrent", "DC", regex=False)

# Red -> yellow -> green colormap (darker shades); identical for every heatmap,
# so it is built once instead of inside the loop.
colors = [(0.7, 0, 0), (0.7, 0.7, 0), (0, 0.5, 0)]
cmap = LinearSegmentedColormap.from_list("DarkRedYellowGreen", colors)

# One heatmap per hardware / compiler / precision: rows are allocation+range
# variants, columns are the Fortran baselines, cells are mean speedups.
for h in hw:
    for compiler in compilers:
        sycl_csv_tmp = sycl_csv[(sycl_csv["hw"] == h) & (sycl_csv["parallelism"] == compiler)]
        # NOTE(review): `res` is filtered to exclude MIXED1/MIXED2 when it is loaded,
        # so with that filter in place these subsets are empty — confirm which
        # precisions this figure is meant to cover.
        for p in ["MIXED1", "MIXED2"]:
            sycl_csv_tmp_2 = sycl_csv_tmp[sycl_csv_tmp["precision"] == p]
            agg_sycl_csv = sycl_csv_tmp_2.groupby(['alloc_type_range', 'fortran_par'])['speedup'].mean().reset_index()

            # Pivot the aggregated DataFrame
            pivot_sycl_csv = agg_sycl_csv.pivot(index='alloc_type_range', columns='fortran_par', values='speedup')
            if pivot_sycl_csv.empty:
                # sns.heatmap raises on empty data; skip and say so instead of
                # aborting the whole cell.
                print(f"No data for {h} - {compiler} - {p}: heatmap skipped")
                continue

            # Explicit figure per heatmap so consecutive plots never share axes.
            fig, ax = plt.subplots()
            sns.heatmap(data=pivot_sycl_csv, annot=True,fmt=".1f",cmap=cmap, linewidth=0.5, ax=ax)
            ax.set_title(f'{beaufity_precision[p]}')
            # Break multi-word column labels onto separate lines.
            _xlabels = ax.get_xticklabels()
            for labels in _xlabels:
                labels.set(text=labels.get_text().replace(' ',"\n"))
            ax.set_xticklabels(_xlabels, rotation=-360, ha='center', rotation_mode='anchor')
            ax.set_yticklabels(ax.get_yticklabels(), ha='center', x=-0.17, rotation=0)
            ax.set(xlabel='',ylabel='')
            # Rewrite the cell annotations.
            for t in ax.texts:
                if h == "INTEL" and t.get_position()[0] == 0.5:
                    # x == 0.5 is the centre of the first heatmap column; on INTEL
                    # that column is relabelled "N.D.".
                    t.set_text("N.D.")
                elif t.get_text() == "0.0":
                    # A speedup of 0 means no baseline was found for this cell.
                    t.set_text("N.S.")
                else:
                    t.set_text(t.get_text() + "x")
                # Increase font size
                t.set_fontsize(23)
            if p == "DOUBLE" or p == "MIXED2":
                # Hide the y tick labels on the second plot of each precision pair.
                ax.set_yticklabels([])
            # Increase font size
            for item in ([ax.title, ax.xaxis.label, ax.yaxis.label] +
                        ax.get_xticklabels() + ax.get_yticklabels()):
                item.set_fontsize(23)

            fig.savefig(f"plots/{h}_{compiler}_{p}.pdf", bbox_inches='tight')
            plt.show()
            # Release the figure; many are created in this loop.
            plt.close(fig)

# ax = sns.heatmap(data = sycl_csv)


# for fortran_par in fortran_par_types:
#     value = res[(res["impl"] == "fortran") & (res["parallelism"] == fortran_par)]
#     sycl_csv[fortran_par] = sycl_csv["mlups"] / 


In [None]:
## barplot with speedup compared to offload
from matplotlib.colors import LinearSegmentedColormap


# Reset from the raw results; .copy() so new columns are added to an independent
# frame instead of a slice of `res`.
sycl_csv = res[(res["impl"] == "sycl")].copy()
# Add a new column combining the values of columns "alloc_type" and "range".
sycl_csv["alloc_type_range"] = sycl_csv["alloc_type"] + "\n" + sycl_csv["range"]
# Float from the start: filled with ratios below. Rows without a Fortran offload
# measurement keep the value 0.
sycl_csv["speedup"] = 0.0

offload_res = res[(res["impl"] == "fortran") & (res["parallelism"] == "offload")]
for h in hw:
    for p in precision:
        for u in use_case:
            value = offload_res[(offload_res["hw"] == h) & (offload_res["usecase"] == u) & (offload_res["precision"] == p)].reset_index(drop=True)
            if not value.empty:
                # Speedup relative to the first matching offload row, written
                # directly with .loc instead of a subset copy + update().
                config_mask = (sycl_csv["hw"] == h) & (sycl_csv["usecase"] == u) & (sycl_csv["precision"] == p)
                sycl_csv.loc[config_mask, "speedup"] = sycl_csv.loc[config_mask, "mlups"] / value["mlups"].iloc[0]

# Filter only LDC use case (copy: the column rewrite below must not hit a slice).
# The statistics cell further down re-uses this `sycl_csv` and its "speedup" column.
sycl_csv = sycl_csv[sycl_csv["usecase"] == "LDC"].copy()
beaufity_precision = dict(
    SINGLE="Single Precision",
    DOUBLE="Double Precision",
    MIXED1="Mixed Precision 1",
    MIXED2="Mixed Precision 2"
)

# regex=False: plain substring replacement regardless of the pandas version.
sycl_csv["parallelism"] = sycl_csv["parallelism"].str.replace("dpcpp", "Intel DPC++", regex=False)

# Fixed hue colours per hardware, shared by both scatter plots.
hw_palette = {'NVIDIA': 'Green', 'AMD': 'Brown', 'INTEL': 'Blue'}

fig = sns.relplot(kind="scatter", x='alloc_type_range', y='speedup', hue='hw', style='precision', col="parallelism", data=sycl_csv, s=200, palette=hw_palette)
# add a red line at y=1
fig.map(plt.axhline, y=1, color='red', linestyle='--')
fig.set_titles('{col_name}')
fig.set_xlabels("")
fig.set_ylabels("Speedup over FORTRAN Offload")

plt.savefig("plots/speedup_over_offload.pdf", bbox_inches='tight')


# Detail view: same plot without the AMD MIXED1 rows and without the
# NVIDIA "Intel DPC++" rows that use plain `range`.
amd_mixed = sycl_csv.loc[(sycl_csv["hw"] == "AMD") & (sycl_csv["precision"] == "MIXED1")]
dpcpp_range = sycl_csv.loc[(sycl_csv["parallelism"] == "Intel DPC++") & (sycl_csv["range"] == "range") & (sycl_csv["hw"] == "NVIDIA")]
sycl_csv_copy = sycl_csv.drop(amd_mixed.index).drop(dpcpp_range.index)

fig = sns.relplot(kind="scatter", x='alloc_type_range', y='speedup', hue='hw', style='precision', col="parallelism", data=sycl_csv_copy, s=200, palette=hw_palette)
# add a red line at y=1
fig.map(plt.axhline, y=1, color='red', linestyle='--')
fig.set_titles('{col_name}')
fig.set_xlabels("")
fig.set_ylabels("Speedup over FORTRAN Offload")

plt.savefig("plots/speedup_over_offload_detail.pdf", bbox_inches='tight')


In [None]:
# Intel DPC++ range
intel_csv = pd.read_csv("./results_INTEL_SYCL.csv")


def _acpp_over_dpcpp(frame, prec, range_kind):
    """Return mean AdaptiveCpp mlups divided by mean dpcpp mlups for one precision/range pair."""
    same_config = (frame["precision"] == prec) & (frame["range"] == range_kind)
    acpp_mean = frame.loc[same_config & (frame["parallelism"] == "AdaptiveCpp"), "mlups"].mean()
    dpcpp_mean = frame.loc[same_config & (frame["parallelism"] == "dpcpp"), "mlups"].mean()
    return acpp_mean / dpcpp_mean


# Only the VKS use case is considered, split by allocation type.
is_vks = intel_csv["usecase"] == "VKS"

# Device allocations
device_rows = intel_csv[is_vks & (intel_csv["alloc_type"] == "device")]
for p in precision:
    for r in ["ndrange", "range"]:
        print(f"Device - {p} - {r}: {_acpp_over_dpcpp(device_rows, p, r)}")
print("\n")
# Shared allocations
shared_rows = intel_csv[is_vks & (intel_csv["alloc_type"] == "shared")]
for p in precision:
    for r in ["ndrange", "range"]:
        print(f"{p} - {r} - {_acpp_over_dpcpp(shared_rows, p, r)}")

In [None]:
# AMD
amd_csv = pd.read_csv("results.csv")

# Keep the LDC use case with device allocations on the two GPUs being compared.
amd_csv_tmp = amd_csv[(amd_csv["usecase"] == "LDC") & (amd_csv["alloc_type"] == "device") & amd_csv["hw"].isin(["AMD", "NVIDIA"])]
for c in compilers:
    for p in precision:
        for r in ["range", "ndrange"]:
            same_config = (amd_csv_tmp["precision"] == p) & (amd_csv_tmp["range"] == r) & (amd_csv_tmp["parallelism"] == c)
            nvidia_tmp = amd_csv_tmp[same_config & (amd_csv_tmp["hw"] == "NVIDIA")]
            amd_tmp = amd_csv_tmp[same_config & (amd_csv_tmp["hw"] == "AMD")]
            # Ratio of mean AMD mlups to mean NVIDIA mlups. The label says "Device"
            # because the data is filtered to device allocations above (it used to
            # read "Shared", which contradicted the filter).
            print(f"Device - {c} - {p} - {r}: {amd_tmp['mlups'].mean() / nvidia_tmp['mlups'].mean()}")
            
    

In [None]:
# Percentage of entries in range and nd_range above or under the baseline.
# Relies on `sycl_csv` (including its "speedup" column) as left behind by the
# "speedup compared to offload" cell.
# regex=False is required: as a regular expression "Intel DPC++" is either invalid
# or does not match the literal text, depending on the Python/pandas version.
# assign() builds a new frame, so no slice of another frame is written to.
sycl_csv = sycl_csv.assign(
    parallelism=sycl_csv["parallelism"].str.replace("Intel DPC++", "dpcpp", regex=False)
)

alloc_types = ["device", "shared"]
# Allowed values of every grouping field, in the order they are reported.
share_domains = {"hw": hw, "compiler": compilers, "alloc_type": alloc_types, "range": sycl_ranges}

# One record per (hw, compiler, alloc_type, range) combination that has data:
# the percentage of its rows with speedup >= 1 ("above") and < 1 ("under").
shares = []
for h in hw:
    for c in compilers:
        for alloc_type in alloc_types:
            for r in sycl_ranges:
                subset = sycl_csv[(sycl_csv["hw"] == h) & (sycl_csv['range'] == r) & (sycl_csv["parallelism"] == c) & (sycl_csv["alloc_type"] == alloc_type)]
                if not subset.empty:
                    shares.append({
                        "hw": h,
                        "compiler": c,
                        "alloc_type": alloc_type,
                        "range": r,
                        "above": len(subset[subset['speedup'] >= 1]) / len(subset) * 100,
                        "under": len(subset[subset['speedup'] < 1]) / len(subset) * 100,
                    })


def _mean_or_nan(values):
    """Mean of `values`, or NaN (without a NumPy warning) when there are none."""
    return np.mean(values) if values else float("nan")


def _print_means(group_by):
    """Print the mean above/under percentages for every value combination of `group_by`.

    `group_by` is a list of keys of `share_domains`; combinations are visited in
    nested-loop order (first field outermost). Combinations without data are
    simply left out of the mean.
    """
    combos = [()]
    for field in group_by:
        combos = [combo + (value,) for combo in combos for value in share_domains[field]]
    for combo in combos:
        selected = [rec for rec in shares
                    if all(rec[field] == value for field, value in zip(group_by, combo))]
        label = " - ".join(combo)
        print(f"{label} - Above: {_mean_or_nan([rec['above'] for rec in selected])}")
        print(f"{label} - Under: {_mean_or_nan([rec['under'] for rec in selected])}")


print("\n")
print("For each hardware how many times the SYCL results are above the baseline")
# List the combinations that have no data before reporting the means.
present = {(rec["hw"], rec["compiler"], rec["alloc_type"], rec["range"]) for rec in shares}
for h in hw:
    for r in sycl_ranges:
        for c in compilers:
            for alloc_type in alloc_types:
                if (h, c, alloc_type, r) not in present:
                    print(f"{h} - {c} - {alloc_type} - {r}")
_print_means(["hw"])

print("\n")
print("For each compiler how manu times is above the baseline")
_print_means(["compiler"])


print("\n")
print("For each compiler and range, how many times is above the baseline")
_print_means(["compiler", "range"])

print("\n")
print("For each range, how many times is above the baseline")
_print_means(["range"])

print("\n")
print("For each alloc_type, how manu times is above the baseline")
_print_means(["alloc_type"])

print("\n")
print("For each alloc_type and compiler, how many times is above the baseline")
_print_means(["alloc_type", "compiler"])


print("\n")
print("For each alloc_type and hardware, how many times is above the baseline")
_print_means(["alloc_type", "hw"])


# print("\n")
# print("For each compiler and alloc_type, the ratio between range and ndrange mlups on NVIDIA hardware")
# for c in compilers:
#     for alloc_type in ["device", "shared"]:
#         range_tmp = sycl_csv[(sycl_csv["hw"] == "NVIDIA") & (sycl_csv["parallelism"] == c) & (sycl_csv["range"] == "range") & (sycl_csv["alloc_type"] == alloc_type)]
#         ndrange_tmp = sycl_csv[(sycl_csv["hw"] == "NVIDIA") & (sycl_csv["parallelism"] == c) & (sycl_csv["range"] == "ndrange") & (sycl_csv["alloc_type"] == alloc_type)]
#         print(f"{c} - {alloc_type}: {range_tmp['mlups'].mean() / ndrange_tmp['mlups'].mean()}")

print("\n")
print("For each hardware,  compiler, alloc_type, the ratio between range and ndrange mlups")
for h in hw:
    for c in compilers:
        for alloc_type in alloc_types:
            same_config = (sycl_csv["hw"] == h) & (sycl_csv["parallelism"] == c) & (sycl_csv["alloc_type"] == alloc_type)
            range_tmp = sycl_csv[same_config & (sycl_csv["range"] == "range")]
            ndrange_tmp = sycl_csv[same_config & (sycl_csv["range"] == "ndrange")]
            print(f"{h} - {c} - {alloc_type}: {range_tmp['mlups'].mean() / ndrange_tmp['mlups'].mean()}")

# print("\n")
# print(f"Range: mean of above: {np.mean(above['range'])}")
# print(f"Range: mean of under: {np.mean(under['range'])}")

# print("\n")
# print(f"ND Range: mean of above: {np.mean(above['ndrange'])}")
# print(f"ND Range: mean of under: {np.mean(under['ndrange'])}")

In [None]:
def plot_roofline_2(roofline_data,max_gflops,max_bandwidth,hw,hw_name, rotation_angle, alloc_type, aa_ticks, flop_ticks):
    """Draw a log-log roofline plot for one device and save it as a PDF.

    Parameters:
        roofline_data: path of the CSV file with the measurements. The columns
            "hw", "alloc_type", "precision", "implementation", "AI" and "GFLOPS"
            are read.
        max_gflops: peak compute rate (height of the horizontal roof).
        max_bandwidth: peak memory bandwidth (slope of the bandwidth roof).
        hw: value of the "hw" column to select; must be one of "NVIDIA V100S",
            "Intel Max 1100" or "AMD MI100" (used to place the peak label).
        hw_name: short device name used in labels and in the output file name.
        rotation_angle: rotation in degrees of the bandwidth label.
        alloc_type: value of the "alloc_type" column to select.
        aa_ticks: x tick positions (arithmetic intensity); the first and last
            entries also bound the drawn roofs.
        flop_ticks: y tick positions.

    Only rows whose precision is MIXED1 or MIXED2 are plotted. The figure is saved
    to plots/{hw_name}_{alloc_type}_roofline.pdf and left open; nothing is returned.
    """
    csv_roofline_data = pd.read_csv(roofline_data)
    selected = (
        (csv_roofline_data["hw"] == hw)
        & (csv_roofline_data["alloc_type"] == alloc_type)
        & csv_roofline_data["precision"].isin(["MIXED1", "MIXED2"])
    )
    # .copy(): the "implementation" column is rewritten below, which must not
    # happen on a slice of the frame that was read (SettingWithCopyWarning).
    csv_roofline_data = csv_roofline_data.loc[selected].copy()
    # rename dpcpp to Intel DPC++
    csv_roofline_data["implementation"] = csv_roofline_data["implementation"].str.replace("dpcpp", "Intel DPC++", regex=False)

    # Arithmetic intensity at which the bandwidth roof meets the compute roof.
    balance_point = max_gflops / max_bandwidth

    # Bandwidth roof: from the first x tick up to the balance point.
    bandwidth_x = np.linspace(aa_ticks[0], balance_point)
    bandwidth_y = [max_bandwidth * x for x in bandwidth_x]

    # Compute roof: flat from the balance point to the last x tick.
    flops_y = [max_gflops, max_gflops]
    flops_x = [balance_point, aa_ticks[-1]]

    plt.figure(figsize=(6, 4))

    plot = sns.lineplot(x=bandwidth_x, y=bandwidth_y, label=f'{hw_name} Bandwidth',linestyle='dashed',legend=False, color="royalblue")
    plot = sns.lineplot(x=flops_x, y=flops_y, label=f'{hw_name} TFLOPs', linestyle='dashed', legend=False, color="firebrick")
    plot = sns.scatterplot(data=csv_roofline_data, x='AI', y='GFLOPS',hue='precision',style='implementation',s=70, legend="brief"
                           , style_order=["AdaptiveCpp + range", "Intel DPC++ + range", "AdaptiveCpp + ndrange", "Intel DPC++ + ndrange"])
    # Drop the first two legend entries (the two roof lines); only the scatter
    # entries are shown.
    handles,labels = plot.get_legend_handles_labels()
    handles = handles[2:]
    labels  = labels[2:]
    plot.legend(handles,labels,loc="lower right", markerscale=1.2, ncol=1, frameon=True,fancybox=True, shadow=True,
                facecolor="White", prop = {"size" : 8.5})
    plot.set(xscale='log', yscale='log')
    plot.set(yticks=flop_ticks, xticks=aa_ticks, yticklabels=flop_ticks, xticklabels=aa_ticks)
    # Start the x axis at the first tick.
    plot.set_xlim(left=aa_ticks[0])

    # Re-use the roof line colours for their text labels.
    bandwidth_color = plot.get_lines()[0].get_color()
    flops_color = plot.get_lines()[1].get_color()
    # Compute-roof label: placed left of the balance point by a per-device
    # fraction of it, slightly above the roof.
    flop_text_x_pos = dict({
        "NVIDIA V100S": 0.2,
        "Intel Max 1100": 0.39,
        "AMD MI100": 0.25
    })
    plot.text(x=balance_point - flop_text_x_pos[hw]  * balance_point, y=(max_gflops + 300),
              s=f"FP64: {max_gflops / 1000} TF/s", color=flops_color, fontsize=13)
    # Bandwidth-roof label: anchored on the roof at the second x tick.
    plot.text(x=aa_ticks[1],y=aa_ticks[1]*max_bandwidth + 1,s=f"{hw_name} HBM2 BW: {max_bandwidth / 1000} TB/s",
              color=bandwidth_color, rotation=rotation_angle, rotation_mode='anchor', fontsize=13)
    plt.xlabel('Arithmetic Intensity (FLOPs/Byte)', fontsize=13)
    plt.ylabel('FLOP-rate (GF/S)', fontsize=13)

    # Thin grid lines at every tick position.
    for thick in aa_ticks:
        plt.axvline(x=thick, color='gray', linestyle='-', linewidth=0.5)
    for thick in flop_ticks:
        plt.axhline(y=thick, color='gray', linestyle='-', linewidth=0.5)

    plt.savefig(f"plots/{hw_name}_{alloc_type}_roofline.pdf", bbox_inches='tight')

# One entry per device: the roofline parameters and the allocation types to plot.
# max_gflops is in GFLOPs, max_bandwidth in GB/s.
roofline_runs = [
    dict(device="NVIDIA V100S", label="V100S",
         x_ticks=[0.4, 0.7, 1,2, 4, 7, 10],
         y_ticks=[200, 400, 700, 1000, 4000, 7000, 13000],
         peak=6300, bandwidth=1100, angle=27, allocs=["device", "shared"]),
    dict(device="AMD MI100", label="MI100",
         x_ticks=[1,2, 4, 7, 10, 20],
         y_ticks=[700, 1000, 4000, 7000, 13000],
         peak=10500, bandwidth=890, angle=36, allocs=["device"]),
    dict(device="Intel Max 1100", label="Intel Max 1100",
         x_ticks=[0.4, 0.7, 1,2, 4, 7, 10, 20],
         y_ticks=[200, 400, 700, 1000, 4000, 7000, 13000],
         peak=9150, bandwidth=800, angle=33, allocs=["device", "shared"]),
]

for run in roofline_runs:
    # Same module-level names as before, so their final values are unchanged.
    aa_ticks = run["x_ticks"]
    flop_ticks = run["y_ticks"]
    max_gflops = run["peak"]
    max_bandwidth = run["bandwidth"]
    rotation_angle = run["angle"]
    for alloc_type in run["allocs"]:
        plot_roofline_2("./roofline_data.csv", max_gflops, max_bandwidth, run["device"], run["label"], rotation_angle, alloc_type, aa_ticks, flop_ticks)





In [None]:
import pprint

# Roofline measurements; get_pp() below reads the "hw", "precision", "range",
# "alloc_type" and "FLOP/Peak" columns of this table.
csv = pd.read_csv("roofline_data.csv")

# Performance portability 
def get_pp(csv, hardware):
    """Collect the highest "FLOP/Peak" value, divided by 100, at several levels for one device.

    Rows of `csv` whose "hw" equals `hardware` are considered. The result is a
    nested dict:

        {"best": overall maximum,
         <precision>: {"max": maximum for that precision,
                       "range" / "ndrange": {"max": maximum for that range type,
                                             "device" / "shared": maximum for that
                                             allocation type}}}

    A level without any rows yields NaN.
    """
    def peak_fraction(frame):
        # Highest FLOP/Peak entry of the frame, divided by 100.
        return frame["FLOP/Peak"].max() / 100

    device_rows = csv[csv["hw"] == hardware]
    summary = {"best": peak_fraction(device_rows)}
    for prec_name in device_rows["precision"].unique():
        prec_rows = device_rows[device_rows["precision"] == prec_name]
        prec_entry = {"max": peak_fraction(prec_rows)}
        for range_kind in ("range", "ndrange"):
            range_rows = prec_rows[prec_rows["range"] == range_kind]
            range_entry = {"max": peak_fraction(range_rows)}
            for alloc_kind in ("device", "shared"):
                range_entry[alloc_kind] = peak_fraction(range_rows[range_rows["alloc_type"] == alloc_kind])
            prec_entry[range_kind] = range_entry
        summary[prec_name] = prec_entry
    return summary

# Per-device efficiency tables (fractions of peak) as returned by get_pp().
nvidia = get_pp(csv, "NVIDIA V100S")
amd = get_pp(csv, "AMD MI100")
intel = get_pp(csv, "Intel Max 1100")

NUM_HW = 3


def _harmonic_pp(efficiencies):
    """Harmonic mean of the per-device efficiencies.

    Returns 0.0 when any efficiency is missing (NaN) or not positive: a
    configuration that does not run on one device is treated as having zero
    performance there, which drives the harmonic mean to 0. Previously a missing
    value propagated and the whole entry became NaN.
    """
    values = [float(e) for e in efficiencies]
    if any(np.isnan(v) or v <= 0 for v in values):
        return 0.0
    return len(values) / sum(1 / v for v in values)


# Performance-portability value per precision, per range type and per allocation
# type, always across the NUM_HW devices above.
# NOTE(review): the top-level key is "pp metric" while the per-range key is
# "pp_metric"; both are kept as they were.
pp = dict()
for p in precision:
    pp[p] = dict()
    pp[p]["pp metric"] = _harmonic_pp([nvidia[p]["max"], amd[p]["max"], intel[p]["max"]])
    for r in sycl_ranges:
        pp[p][r] = dict()
        pp[p][r]["pp_metric"] = _harmonic_pp([nvidia[p][r]["max"], amd[p][r]["max"], intel[p][r]["max"]])
        for alloc in ["device", "shared"]:
            # A device without data for this allocation type (e.g. AMD with shared
            # memory) counts as zero performance, giving 0.
            pp[p][r][alloc] = _harmonic_pp([nvidia[p][r][alloc], amd[p][r][alloc], intel[p][r][alloc]])


pprint.pp(pp)
    

