In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

In [2]:
#global parameters
cudadir = "/global/common/cori/software/cuda/10.0"

# Functions

In [3]:
def parse_filename(filename):
    
    #empty dicts
    result={}
    
    #add network name
    result["Network Name"] = re.match(r'.*\.name_(.*?)\.',file).groups()[0]
    result["Batch Size"] = int(re.match(r'.*\.batchsize_(.*?)\.',file).groups()[0])
    result["Input Shape"] = re.match(r'.*\.inputshape_(.*?)\.',file).groups()[0]
    result["Kernel Shape"] = re.match(r'.*\.kernelshape_(.*?)\.',file).groups()[0]
    result["Stride Size"] = int(re.match(r'.*\.stride_(.*?)\.',file).groups()[0])
    result["Data Format"] = re.match(r'.*\.dataformat_(.*?)\.',file).groups()[0]
    result["Pass"] = re.match(r'.*\.pass_(.*?)\.',file).groups()[0]
    prec = int(re.match(r'.*\.fp(.*?)\.',file).groups()[0])
    result["Precision"] = "FP16" if prec==16 else "FP32";
    metric = re.match(r'.*\.metric_(.*?)\.',file).groups()[0]
    
    return result, metric


def import_nvprof_metric(filename, timeline=False):
    #execute nvprof and parse file
    args = [os.path.join(cudadir, "bin/nvprof"),"--csv","-i",filename]
    skiprows = 2
    
    #if timeline is enabled, we have to skip less rows also
    if timeline:
        args.append("--print-gpu-trace")
        skiprows = 1
    
    #open subprocess and communicate
    p = sp.Popen(args, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = p.communicate()

    #get timeline from csv
    profiledf = pd.read_csv(StringIO(stderr.decode("utf-8")),skiprows=skiprows).dropna(how="all").rename(columns={"Kernel": "Name"})
    profiledf["Collection Type"] = "kernel"
    
    #return result
    return profiledf


def import_nvprof_overview(filename, nvtx=False):
    #execute nvprof and parse file
    args = [os.path.join(cudadir, "bin/nvprof"),"--csv","-i",filename]
    
    #open subprocess and communicate
    p = sp.Popen(args, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = p.communicate()

    #now remove the ranges
    inp = stderr.decode("utf-8")
    
    #get the profiling data
    profile = inp.split("======== NVTX result")[0]
    
    if nvtx:
        marker = inp.split("======== NVTX result")[1]

    #we can readily use the profile info
    profiledf = pd.read_csv(StringIO(profile), skiprows=1, header=[0,1]).dropna(how="all")
    
    #make the time units the same:
    for col in profiledf.columns:
        if col[1] == "ms":
            profiledf[col] *= 10**(-3)
        elif col[1] == "us":
            profiledf[col] *= 10**(-6)
        elif col[1] == "ns":
            profiledf[col] *= 10**(-9)
            
    #now drop that header
    profiledf.columns = profiledf.columns.droplevel(1)
    
    #now sort
    profiledf = profiledf.sort_values(by=["Type", "Name"]).reset_index(drop=True)
    profiledf["Metric Name"] = "time"
    
    #some renamings
    profiledf.loc[ profiledf["Type"] == "GPU activities", "Type" ] = "gpu_activities"
    profiledf.loc[ profiledf["Type"] == "API calls", "Type" ] = "api_calls"
    
    #rename columns
    profiledf.rename(columns={"Type": "Collection Type"}, inplace=True)
    
    if nvtx:
        markerdflist = []
        for it in re.finditer(r"========\s{1,}Range(.*?)(==|$)", marker, flags=re.DOTALL):
            #read into DF
            tmpdf = pd.read_csv(StringIO(it.groups()[0]),skiprows=lambda x: x in [0,2], header=0)
            del tmpdf["Time(%)"]
    
            #drop rows without info
            tmpdf = tmpdf[ ~tmpdf["Type"].str.contains("were profiled in this range") ]
    
            #extract range name:
            rangename = tmpdf.loc[ tmpdf["Type"] == "Range:", "Name" ][0]
        
            #some renamings
            tmpdf.loc[ tmpdf["Type"] == "Range:", "Name" ] = "total"
            tmpdf.loc[ tmpdf["Type"] == "Range:", "Type" ] = "range"
            tmpdf.loc[ tmpdf["Type"] == "GPU activities", "Type" ] = "gpu_activities"
            tmpdf.loc[ tmpdf["Type"] == "API calls", "Type" ] = "api_calls"
    
            #add the rangename to the entries
            tmpdf["Range Name"] = rangename
    
            #renaming
            tmpdf.rename(columns={"Type": "Collection Type"}, inplace=True)
    
            #add to list
            markerdflist.append(tmpdf)
    
        #concat the crap
        markerdf = pd.concat(markerdflist).sort_values(by=["Range Name", "Time"], ascending=[True, False]).reset_index(drop=True)
    else:
        markerdf = pd.DataFrame()
    
    return profiledf, markerdf

def combine_metrics(df, metrics):
    return pd.DataFrame.from_records([{"Metric Count": df[m].values[0], "Metric Name": m.replace("read","").replace("write","").replace("__","_"), \
    "Metric Mode": "read" if "read" in m else "write" if "write" in m else "total"} for m in metrics])


def replace_tc_string(value):
    value = int(re.match(r".*?\((.*?)\)",value).groups()[0])
    return value


def transpose_frame(df_times, df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]
    tc_peak_perf_flops = 125*10**12

    #just pick the gpu activities for now
    profiledf = df_times[ df_times["Collection Type"] == "gpu_activities" ].copy()
    profiledf.sort_values(by=["Name"],inplace=True)
    profiledf.reset_index(drop=True, inplace=True)
    profiledf.rename(columns={"Avg": "Time Avg"}, inplace=True)
    del profiledf["Time(%)"]
    del profiledf["Time"]
    del profiledf["Min"]
    del profiledf["Max"]
    del profiledf["Metric Name"]
    del profiledf["Collection Type"]

    #remove the calibration
    alignkeys = selectkeys[:-2]
    profiledf = profiledf.groupby(alignkeys).apply(lambda x: x[ (~x["Name"].isin(x.loc[x["Pass"].str.startswith("calibrate"), "Name"].values)) ])
    profiledf.reset_index(drop=True, inplace=True)
    
    #as metricdf use df_summary
    metricdf = df_metrics.copy()

    #now, get the AI-relevant stuff:
    #FLOPS 32
    flopdf = metricdf[ metricdf["Metric Name"].str.contains("flop_count_sp") ].sort_values(selectkeys).rename(columns={"Avg": "FP32 Flops Avg"})
    #add to timings
    profiledf = profiledf.merge(flopdf[selectkeys+["FP32 Flops Avg"]], on=selectkeys, how="inner")
    
    #monitor that: if that changes be warned
    numrows = profiledf.shape[0]

    #FLOPS 16 non-TC
    flopdf = metricdf[ metricdf["Metric Name"].str.contains("flop_count_hp") ].sort_values(selectkeys).rename(columns={"Avg": "FP16 non-TC Flops Avg"})
    #add to timings
    mergedf = profiledf.merge(flopdf[selectkeys+["FP16 non-TC Flops Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        #print(profiledf, flopdf)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    
    
    #FLOPS TC
    flopdf = metricdf[ metricdf["Metric Name"].str.contains("tensor_precision_fu_utilization") ].sort_values(selectkeys).rename(columns={"Avg": "TC Flops Avg"})
    tmpdf = flopdf.merge(profiledf, how="inner", on=selectkeys).sort_values(selectkeys)
    tmpdf["TC Flops Avg"] *= tc_peak_perf_flops/10. * tmpdf["Time Avg"]
    #add to timings
    mergedf = profiledf.merge(tmpdf[selectkeys+["TC Flops Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf

    
    #fill NA values here
    profiledf.fillna(0., inplace=True)

    #FLOPS FP16: add TC and non-TC FP16 flops together
    profiledf["FP16 Flops Avg"] = profiledf["TC Flops Avg"] + profiledf["FP16 non-TC Flops Avg"]

    #total flops
    profiledf["Flops Avg"] = profiledf["FP16 Flops Avg"] + profiledf["FP32 Flops Avg"]

    #flop fractions
    profiledf["TC Flops Fraction Avg"] = profiledf["TC Flops Avg"]/profiledf["Flops Avg"]
    profiledf["FP16 Flops Fraction Avg"] = profiledf["FP16 Flops Avg"]/profiledf["Flops Avg"]
    profiledf["FP16 non-TC Flops Fraction Avg"] = profiledf["FP16 non-TC Flops Avg"]/profiledf["Flops Avg"]
    profiledf["FP32 Flops Fraction Avg"] = profiledf["FP32 Flops Avg"]/profiledf["Flops Avg"]


    #shared
    #project out
    shareddf = metricdf[ metricdf["Metric Name"].str.contains("shared") ].sort_values(selectkeys)
    #get reads and writes
    sharedreadsdf = shareddf.loc[(shareddf["Metric Name"]=="shared_transactions") & (shareddf["Metric Mode"]=="read"), selectkeys+["Avg"]]
    sharedwritesdf = shareddf.loc[(shareddf["Metric Name"]=="shared_transactions") & (shareddf["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    shareddf = sharedwritesdf.merge(sharedreadsdf, on=selectkeys, how="outer").fillna(0.)
    shareddf["Shared Transactions Avg"] = shareddf["Avg_x"] + shareddf["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(shareddf[selectkeys+["Shared Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        #get the complement:
        print(profiledf[ ~profiledf.index.isin(mergedf.index) ])
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    
    
    #atomic
    #project out
    atomicdf = metricdf[ metricdf["Metric Name"] == "atomic_transactions" ].sort_values(selectkeys)
    #get reads and writes
    atomicdf = atomicdf[selectkeys+["Avg"]].rename(columns={"Avg": "Atomic Transactions Avg"})
    #add to timings
    mergedf = profiledf.merge(atomicdf[selectkeys+["Atomic Transactions Avg"]], on=selectkeys, how="inner")
    
    #check
    if mergedf.shape[0] != numrows:
        #get the complement:
        print(profiledf[ ~profiledf.index.isin(mergedf.index) ])
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf

    
    #L1
    #project out
    l1df = metricdf[ (metricdf["Metric Name"].str.contains("gst_")) | (metricdf["Metric Name"].str.contains("gld_")) ].sort_values(selectkeys)
    #get reads and writes
    l1readsdf = l1df.loc[(l1df["Metric Name"]=="gld_transactions"), selectkeys+["Avg"]]
    l1writesdf = l1df.loc[(l1df["Metric Name"]=="gst_transactions"), selectkeys+["Avg"]]
    #combine
    l1df = l1writesdf.merge(l1readsdf, on=selectkeys, how="outer").fillna(0.)
    l1df["L1 Transactions Avg"] = l1df["Avg_x"] + l1df["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(l1df[selectkeys+["L1 Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, l1df)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf

    
    #L2
    #project out
    l2df = metricdf[ metricdf["Metric Name"].str.contains("l2") ].sort_values(selectkeys)
    #get reads and writes
    l2readsdf = l2df.loc[(l2df["Metric Name"]=="l2_transactions") & (l2df["Metric Mode"]=="read"), selectkeys+["Avg"]]
    l2writesdf = l2df.loc[(l2df["Metric Name"]=="l2_transactions") & (l2df["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    l2df = l2writesdf.merge(l2readsdf, on=selectkeys, how="outer").fillna(0.)
    l2df["L2 Transactions Avg"] = l2df["Avg_x"] + l2df["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(l2df[selectkeys+["L2 Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, l2df)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    
    
    #DRAM
    #project out
    dramdf = metricdf[ metricdf["Metric Name"].str.contains("dram") ].sort_values(selectkeys)
    #get reads and writes
    dramreadsdf = dramdf.loc[(dramdf["Metric Name"]=="dram_transactions") & (dramdf["Metric Mode"]=="read"), selectkeys+["Avg"]]
    dramwritesdf = dramdf.loc[(dramdf["Metric Name"]=="dram_transactions") & (dramdf["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    dramdf = dramwritesdf.merge(dramreadsdf, on=selectkeys, how="outer").fillna(0.)
    dramdf["DRAM Transactions Avg"] = dramdf["Avg_x"] + dramdf["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(dramdf[selectkeys+["DRAM Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, dramdf)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    

    #SYSMEM
    #project out
    sysmemdf = metricdf[ metricdf["Metric Name"].str.contains("sysmem") ].sort_values(selectkeys)
    #get reads and writes
    sysmemreadsdf = sysmemdf.loc[(sysmemdf["Metric Name"]=="sysmem_transactions") & (sysmemdf["Metric Mode"]=="read"), selectkeys+["Avg"]]
    sysmemwritesdf = sysmemdf.loc[(sysmemdf["Metric Name"]=="sysmem_transactions") & (sysmemdf["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    sysmemdf = sysmemwritesdf.merge(sysmemreadsdf, on=selectkeys, how="outer").fillna(0.)
    sysmemdf["Sysmem Transactions Avg"] = sysmemdf["Avg_x"] + sysmemdf["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(sysmemdf[selectkeys+["Sysmem Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, sysmemdf)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    

    #clean up and sort:
    profiledf.sort_values(selectkeys).reset_index(drop=True, inplace=True)

    #get performance first
    profiledf["Performance GFlop/s"] = profiledf["Flops Avg"]/(profiledf["Time Avg"]*10**9)
    profiledf["FP32 Performance GFlop/s"] = profiledf["FP32 Flops Avg"]/(profiledf["Time Avg"]*10**9)
    profiledf["FP16 Performance GFlop/s"] = profiledf["FP16 Flops Avg"]/(profiledf["Time Avg"]*10**9)
    profiledf["TC Performance GFlop/s"] = profiledf["TC Flops Avg"]/(profiledf["Time Avg"]*10**9)

    #get AI:
    #L1 is L1+shared
    profiledf["L1 AI"] = profiledf["Flops Avg"]/(32.*(profiledf["L1 Transactions Avg"]+profiledf["Shared Transactions Avg"]+profiledf["Atomic Transactions Avg"]))
    profiledf["FP32 L1 AI"] = profiledf["FP32 Flops Avg"]/(32.*(profiledf["L1 Transactions Avg"]+profiledf["Shared Transactions Avg"]+profiledf["Atomic Transactions Avg"]))
    profiledf["FP16 L1 AI"] = profiledf["FP16 Flops Avg"]/(32.*(profiledf["L1 Transactions Avg"]+profiledf["Shared Transactions Avg"]+profiledf["Atomic Transactions Avg"]))
    #L2
    profiledf["L2 AI"] = profiledf["Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
    profiledf["FP32 L2 AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
    profiledf["FP16 L2 AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
    #DRAM
    profiledf["DRAM AI"] = profiledf["Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
    profiledf["FP32 DRAM AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
    profiledf["FP16 DRAM AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
    #Sysmem
    profiledf["Sysmem AI"] = profiledf["Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])
    profiledf["FP32 Sysmem AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])
    profiledf["FP16 Sysmem AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])

    #sort results
    profiledf.sort_values(by=selectkeys).reset_index(drop=True, inplace=True)
    
    return profiledf

# Check for Missing Data

In [4]:
#datadir:
datadirs = ["./data/sanitized"]

#get metric list
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-2])
    metric = splt[-2].split("metric_")[1]
    
    #append to records
    records.append({"prefix": prefix, "metric": metric, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix", "metric"])

#get all metrics
all_metrics = list(recorddf["metric"].unique())

#group by metric:
missingrecorddf = pd.DataFrame(recorddf.groupby("prefix").apply(lambda x: pd.Series([y for y in all_metrics if y not in list(x["metric"])])))

#create exclusion list:
excludelist = list(missingrecorddf.reset_index()["prefix"].unique())

#print the missing ones
missingrecorddf

# Check for Duplicate Files

In [5]:
##datadir:
#datadirs = ["./data/good_new", "./data/good_new_2"]
#
##do the brute force comparison
#for id1, datadir_1 in enumerate(datadirs):
#    
#    #files 1
#    files_1 = [ x for x in os.listdir(datadir_1) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]
#    
#    for id2 in range(id1+1,len(datadirs)):
#        
#        datadir_2 = datadirs[id2]
#        
#        #files 2
#        files_2 = [ x for x in os.listdir(datadir_2) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]
#        
#        #report dups:
#        dups = [x for x in files_1 if x in files_2]
#        
#        #print
#        print("Duplicates in {} and {}:".format(datadir_1, datadir_2))
#        
#        #move dups out of the way
#        if not os.path.isdir(os.path.join(datadir_2,"redundant")):
#            os.mkdir(os.path.join(datadir_2,"redundant"))
#        for file in dups:
#            source = os.path.join(datadir_2,file)
#            target = os.path.join(datadir_2,"redundant",file)
#            print("Move {} to {}".format(source,target))
#            shutil.move(source,target)

# Import Data

In [9]:
#sort by those keys:
sortkeys = ["Network Name", "Input Shape", "Kernel Shape", \
            "Batch Size", "Stride Size", "Data Format", "Pass", \
            "Precision", "Device", "Name", "Metric Name"]

#limit the input
#recorddf = recorddf[ recorddf["prefix"].str.startswith("profile.name_ResNet50-2.batchsize_16.inputshape_112x112x64.kernelshape_3x3x64x128.stride_2.dataformat_NHWC.fp16") ]

#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
all_metrics = recorddf["metric"].unique()
all_passes = set([x.split(".pass_")[1].replace(".pass_","") for x in recorddf["prefix"].unique()])

#metrics
df_profiles = []

for pref in all_prefixes:
    
    #set empty lists
    df_times = []
    df_timeline = []
    df_summary = []
    
    #print prefix
    #print(pref)
    
    #loop over passes
    for pas in all_passes:
        
        #project frame
        selectdf = recorddf[ recorddf["prefix"] == pref+".pass_"+pas ]
        
        #loop over metrics
        for met in [x for x in all_metrics if x != "time"]:
            
            #filename
            file = selectdf.loc[ selectdf["metric"]==met, "file" ].values[0]
        
            #extract metric name
            parameters, metric = parse_filename(os.path.basename(file))
            metrics = metric.split("-")
    
            #import as timeline
            tmpdf = import_nvprof_metric(file, timeline=True)
            for key in parameters:
                tmpdf[key] = parameters[key]
        
            #replace "Idle (0)" with 0.:
            for metric in metrics:
                if metric=="tensor_precision_fu_utilization":
                    tmpdf[metric] = tmpdf[metric].apply(lambda x: replace_tc_string(x))
    
            #combine read and write metrics
            tmpdf = tmpdf.groupby([x for x in tmpdf.columns if x not in metrics]).apply(lambda x: combine_metrics(x, metrics)).reset_index()
            lev = [x for x in tmpdf.columns if x.startswith("level_")][0]
            del tmpdf[lev]
            df_timeline.append(tmpdf)
    
            #import as summary
            tmpdf = import_nvprof_metric(file, timeline=False).sort_values(by="Name").reset_index(drop=True)
            tmpdf["Metric Mode"] = "read" if "read" in metric else "write" if "write" in metric else "write" if "store" in metric else "read" if "load" in metric else "total"
            tmpdf["Metric Name"] = metric.replace("read","").replace("write","").replace("store","").replace("load","").replace("__","_")
            for key in parameters:
                tmpdf[key] = parameters[key]
            del tmpdf["Metric Description"]
    
            #replace "Idle (0)" with 0.:
            for metric in metrics:
                if metric=="tensor_precision_fu_utilization":
                    tmpdf[ "Min" ] = tmpdf[ "Min" ].apply(lambda x: replace_tc_string(x))
                    tmpdf[ "Max" ] = tmpdf[ "Max" ].apply(lambda x: replace_tc_string(x))
                    tmpdf[ "Avg" ] = tmpdf[ "Avg" ].apply(lambda x: replace_tc_string(x))
            df_summary.append(tmpdf)
        
        #do time now
        file = selectdf.loc[ selectdf["metric"] == "time", "file" ].values[0]
        timedf, markerdf = import_nvprof_overview(file)
    
        #extract metric name
        parameters, _ = parse_filename(os.path.basename(file))
        for key in parameters:
            timedf[key] = parameters[key]
        df_times.append(timedf)
        
    #concat into frame
    metricdf = pd.concat(df_summary, sort=True)
    timedf = pd.concat(df_times, sort=True)
    
    #transpose
    profiledf = transpose_frame(timedf, metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)

# Compute AI Results

In [11]:
#profiledf[ (profiledf["Network Name"]=="ResNet50-2") &\
#           (profiledf["Input Shape"]=="112x112x64") &\
#           (profiledf["Batch Size"]==16) &\
#           (profiledf["Precision"]=="FP32") &\
#           (profiledf["Stride Size"]==2) &\
#           (profiledf["Pass"]=="forward") &\
#           (profiledf["Kernel Shape"]=="9x9x64x64")
#         ]
profiledf

Unnamed: 0,Time Avg,Batch Size,Calls,Data Format,Input Shape,Kernel Shape,Name,Network Name,Pass,Precision,...,FP16 L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,Sysmem AI,FP32 Sysmem AI,FP16 Sysmem AI
0,0.000001,64,20,NHWC,112x112x64,3x3x64x64,cudnn::gemm::computeOffsetsKernel(cudnn::gemm:...,ResNet50-2,forward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
1,0.000002,64,20,NHWC,112x112x64,3x3x64x64,cudnn::gemm::computeWgradOffsetsKernel(cudnn::...,ResNet50-2,backward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
2,0.000034,64,20,NHWC,112x112x64,3x3x64x64,void Eigen::internal::EigenMetaKernel<Eigen::T...,ResNet50-2,backward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
3,0.000175,64,40,NHWC,112x112x64,3x3x64x64,"void nchwToNhwcKernel<__half, __half, float, b...",ResNet50-2,backward,FP16,...,0.131217,0.420749,0.210375,0.210375,0.500518,0.250259,0.250259,4.071680e+05,203584.0,2.035840e+05
4,0.000143,64,40,NHWC,112x112x64,3x3x64x64,"void nchwToNhwcKernel<__half, __half, float, b...",ResNet50-2,forward,FP16,...,0.128010,0.404267,0.202133,0.202133,0.499351,0.249675,0.249675,3.271168e+05,163558.4,1.635584e+05
5,0.000003,64,20,NHWC,112x112x64,3x3x64x64,"void nhwcToNchwKernel<float, __half, float, bo...",ResNet50-2,backward,FP16,...,0.000000,0.105369,0.105369,0.000000,60.631579,60.631579,0.000000,2.304000e+02,230.4,0.000000e+00
6,0.000001,64,20,NHWC,112x112x64,3x3x64x64,"void scalePackedTensor_kernel<float, float>(cu...",ResNet50-2,backward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
7,0.000857,64,20,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::PadInputCustomKernel...,ResNet50-2,backward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
8,0.000852,64,20,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::PadInputCustomKernel...,ResNet50-2,forward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
9,0.000003,64,20,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::ShuffleInTensor3Simp...,ResNet50-2,forward,FP16,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00


In [12]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
                     "Batch Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#get the aggregated performance, including all kernels:
#compute weights: multiply all measures by the number of invocations
weighted = True
if weighted:
    #first, get all the names of metrics which need to be weighted
    metrics = [x for x in combineddf.columns if "Avg" in x]
    for metric in metrics:
        combineddf[metric] *= combineddf["Calls"]
    
#sum up
combineddf = profiledf.groupby(by=combinedselectkeys).sum()

#the flop fractions need to be recomputed
combineddf["TC Flops Fraction Avg"] = combineddf["TC Flops Avg"]/combineddf["Flops Avg"]
combineddf["FP16 Flops Fraction Avg"] = combineddf["FP16 Flops Avg"]/combineddf["Flops Avg"]
combineddf["FP16 non-TC Flops Fraction Avg"] = combineddf["FP16 non-TC Flops Avg"]/combineddf["Flops Avg"]
combineddf["FP32 Flops Fraction Avg"] = combineddf["FP32 Flops Avg"]/combineddf["Flops Avg"]

#get performance first
combineddf["Performance GFlop/s"] = combineddf["Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["FP32 Performance GFlop/s"] = combineddf["FP32 Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["FP16 Performance GFlop/s"] = combineddf["FP16 Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["TC Performance GFlop/s"] = combineddf["TC Flops Avg"]/(combineddf["Time Avg"]*10**9)

#get AI:
#L1 is L1+shared
combineddf["L1 AI"] = combineddf["Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
combineddf["FP32 L1 AI"] = combineddf["FP32 Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
combineddf["FP16 L1 AI"] = combineddf["FP16 Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
combineddf["TC L1 AI"] = combineddf["TC Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
#L2
combineddf["L2 AI"] = combineddf["Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["FP32 L2 AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["FP16 L2 AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["TC L2 AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
#DRAM
combineddf["DRAM AI"] = combineddf["Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["FP32 DRAM AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["FP16 DRAM AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["TC DRAM AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
#Sysmem
combineddf["Sysmem AI"] = combineddf["Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["FP32 Sysmem AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["FP16 Sysmem AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["TC Sysmem AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])

#print
combineddf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Time Avg,Calls,FP32 Flops Avg,FP16 non-TC Flops Avg,TC Flops Avg,FP16 Flops Avg,Flops Avg,TC Flops Fraction Avg,FP16 Flops Fraction Avg,FP16 non-TC Flops Fraction Avg,...,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,Sysmem AI,FP32 Sysmem AI,FP16 Sysmem AI,TC L1 AI,TC L2 AI,TC DRAM AI,TC Sysmem AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,0.000509,220,17490432,9748992,9719687000.0,9729436000.0,9746927000.0,0.997205,0.998206,0.001,...,52.097169,0.093486,52.003683,6768699.0,12146.13,6756553.0,15.215602,24.841025,51.951575,6749783.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,0.000458,160,19419648,7377408,10390420000.0,10397800000.0,10417220000.0,0.997428,0.998136,0.000708,...,59.363863,0.110665,59.253198,10851270.0,20228.8,10831040.0,21.855944,39.67991,59.211157,10823360.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,0.001034,220,22544896,12960256,80743250000.0,80756210000.0,80778760000.0,0.99956,0.999721,0.00016,...,343.227656,0.095793,343.131863,56096360.0,15656.18,56080700.0,46.342289,148.203693,343.076795,56071700.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,0.000575,160,32301568,8217088,18121580000.0,18129790000.0,18162090000.0,0.997769,0.998221,0.000452,...,74.363519,0.132257,74.231262,18918850.0,33647.47,18885200.0,22.69027,44.744757,74.197618,18876640.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,backward,0.001661,220,38552064,19382784,148589800000.0,148609100000.0,148647700000.0,0.99961,0.999741,0.00013,...,421.636576,0.109352,421.527224,103227600.0,26772.27,103200800.0,44.678579,151.603412,421.472245,103187300.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x512,2,16,forward,0.000849,160,58065408,9896448,35714250000.0,35724150000.0,35782210000.0,0.998101,0.998377,0.000277,...,95.729266,0.155344,95.573922,37273140.0,60484.8,37212650.0,24.713571,51.603652,95.547445,37202340.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,backward,0.000532,180,23515648,8143360,6997012000.0,7005156000.0,7028672000.0,0.995496,0.996654,0.001159,...,40.315782,0.134883,40.180899,5491150.0,18371.6,5472778.0,15.696336,25.442065,40.134189,5466416.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,forward,0.000428,160,12978688,6957568,7371562000.0,7378520000.0,7391499000.0,0.997303,0.998244,0.000941,...,42.261447,0.074207,42.187241,7699478.0,13519.47,7685958.0,20.913705,35.739775,42.14746,7678711.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,32,backward,0.00095,220,43160576,16286720,11710650000.0,11726940000.0,11770100000.0,0.994949,0.996333,0.001384,...,36.881717,0.135244,36.746473,8173679.0,29972.62,8143706.0,14.285218,22.542927,36.695439,8132396.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,32,forward,0.000794,160,25938944,13896704,11731440000.0,11745330000.0,11771270000.0,0.996616,0.997796,0.001181,...,33.607287,0.074056,33.53323,12261740.0,27019.73,12234720.0,16.639647,28.587706,33.493555,12220250.0


In [None]:
#combineddf = combineddf.reset_index()
#seldf = combineddf[ (combineddf["Network Name"]=="ResNet50-2") &\
#           (combineddf["Input Shape"]=="112x112x64") &\
#           (combineddf["Precision"]=="FP32")]
#seldf
#combineddf[["FP32 L2 AI", "FP32 L1 AI"]]
combineddf[["L2 Transactions Avg", "L1 Transactions Avg"]]

# Export Data

In [13]:
outputdir = "./results"

profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))