In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re

# Functions

In [2]:
def parse_filename(filename):
    
    #empty dicts
    result={}
    
    #add network name
    result["Network Name"] = re.match(r'.*\.name_(.*?)\.',file).groups()[0]
    result["Batch Size"] = int(re.match(r'.*\.batchsize_(.*?)\.',file).groups()[0])
    result["Input Shape"] = re.match(r'.*\.inputshape_(.*?)\.',file).groups()[0]
    result["Kernel Shape"] = re.match(r'.*\.kernelshape_(.*?)\.',file).groups()[0]
    result["Stride Size"] = int(re.match(r'.*\.stride_(.*?)\.',file).groups()[0])
    result["Data Format"] = re.match(r'.*\.dataformat_(.*?)\.',file).groups()[0]
    result["Pass"] = re.match(r'.*\.pass_(.*?)\.',file).groups()[0]
    prec = int(re.match(r'.*\.fp(.*?)\.',file).groups()[0])
    result["Precision"] = "FP16" if prec==16 else "FP32";
    metric = re.match(r'.*\.metric_(.*?)\.',file).groups()[0]
    
    return result, metric


def import_nvprof_metric(filename, timeline=False):
    #execute nvprof and parse file
    args = ["/project/projectdirs/mpccc/tkurth/cuda10/bin/nvprof","--csv","-i",filename]
    skiprows = 2
    
    #if timeline is enabled, we have to skip less rows also
    if timeline:
        args.append("--print-gpu-trace")
        skiprows = 1
    
    #open subprocess and communicate
    p = sp.Popen(args, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = p.communicate()

    #get timeline from csv
    profiledf = pd.read_csv(StringIO(stderr.decode("utf-8")),skiprows=skiprows).dropna(how="all").rename(columns={"Kernel": "Name"})
    profiledf["Collection Type"] = "kernel"
    
    #return result
    return profiledf


def import_nvprof_overview(filename, nvtx=False):
    #execute nvprof and parse file
    args = ["/project/projectdirs/mpccc/tkurth/cuda10/bin/nvprof","--csv","-i",filename]
    
    #open subprocess and communicate
    p = sp.Popen(args, stdout=sp.PIPE, stderr=sp.PIPE)
    stdout, stderr = p.communicate()

    #now remove the ranges
    inp = stderr.decode("utf-8")
    
    #get the profiling data
    profile = inp.split("======== NVTX result")[0]
    
    if nvtx:
        marker = inp.split("======== NVTX result")[1]

    #we can readily use the profile info
    profiledf = pd.read_csv(StringIO(profile), skiprows=1, header=[0,1]).dropna(how="all")
    
    #make the time units the same:
    for col in profiledf.columns:
        if col[1] == "ms":
            profiledf[col] *= 10**(-3)
        elif col[1] == "us":
            profiledf[col] *= 10**(-6)
        elif col[1] == "ns":
            profiledf[col] *= 10**(-9)
            
    #now drop that header
    profiledf.columns = profiledf.columns.droplevel(1)
    
    #now sort
    profiledf = profiledf.sort_values(by=["Type", "Name"]).reset_index(drop=True)
    profiledf["Metric Name"] = "time"
    
    #some renamings
    profiledf.loc[ profiledf["Type"] == "GPU activities", "Type" ] = "gpu_activities"
    profiledf.loc[ profiledf["Type"] == "API calls", "Type" ] = "api_calls"
    
    #rename columns
    profiledf.rename(columns={"Type": "Collection Type"}, inplace=True)
    
    if nvtx:
        markerdflist = []
        for it in re.finditer(r"========\s{1,}Range(.*?)(==|$)", marker, flags=re.DOTALL):
            #read into DF
            tmpdf = pd.read_csv(StringIO(it.groups()[0]),skiprows=lambda x: x in [0,2], header=0)
            del tmpdf["Time(%)"]
    
            #drop rows without info
            tmpdf = tmpdf[ ~tmpdf["Type"].str.contains("were profiled in this range") ]
    
            #extract range name:
            rangename = tmpdf.loc[ tmpdf["Type"] == "Range:", "Name" ][0]
        
            #some renamings
            tmpdf.loc[ tmpdf["Type"] == "Range:", "Name" ] = "total"
            tmpdf.loc[ tmpdf["Type"] == "Range:", "Type" ] = "range"
            tmpdf.loc[ tmpdf["Type"] == "GPU activities", "Type" ] = "gpu_activities"
            tmpdf.loc[ tmpdf["Type"] == "API calls", "Type" ] = "api_calls"
    
            #add the rangename to the entries
            tmpdf["Range Name"] = rangename
    
            #renaming
            tmpdf.rename(columns={"Type": "Collection Type"}, inplace=True)
    
            #add to list
            markerdflist.append(tmpdf)
    
        #concat the crap
        markerdf = pd.concat(markerdflist).sort_values(by=["Range Name", "Time"], ascending=[True, False]).reset_index(drop=True)
    else:
        markerdf = pd.DataFrame()
    
    return profiledf, markerdf

def combine_metrics(df, metrics):
    return pd.DataFrame.from_records([{"Metric Count": df[m].values[0], "Metric Name": m.replace("read","").replace("write","").replace("__","_"), \
    "Metric Mode": "read" if "read" in m else "write" if "write" in m else "total"} for m in metrics])


def replace_tc_string(value):
    value = int(re.match(r".*?\((.*?)\)",value).groups()[0])
    return value

# Import Data

In [3]:
#datadir:
datadir = "./data/good_new"
#datadir = "/global/cscratch1/sd/tkurth/tf_cnn_kernels/runs/62888"

#sort by those:
sortkeys = ["Network Name", "Input Shape", "Kernel Shape", \
            "Batch Size", "Stride Size", "Data Format", "Pass", \
            "Precision", "Device", "Name", "Metric Name"]

#init lists to zero
df_timeline = []
df_summary = []
df_summary_derived = []

#get metric list
files = [ x for x in os.listdir(datadir) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]

#metrics
for file in files:
    
    #extract metric name
    parameters, metric = parse_filename(file)
    metrics = metric.split("-")
    
    #print("Reading {}".format(file))
    
    if metric == "time":
        continue

    #import as timeline
    tmpdf = import_nvprof_metric(os.path.join(datadir,file), timeline=True)
    for key in parameters:
        tmpdf[key] = parameters[key]
        
    #replace "Idle (0)" with 0.:
    for metric in metrics:
        if metric=="tensor_precision_fu_utilization":
            tmpdf[metric] = tmpdf[metric].apply(lambda x: replace_tc_string(x))
    
    #combine read and write metrics
    tmpdf = tmpdf.groupby([x for x in tmpdf.columns if x not in metrics]).apply(lambda x: combine_metrics(x, metrics)).reset_index()
    lev = [x for x in tmpdf.columns if x.startswith("level_")][0]
    del tmpdf[lev]
    df_timeline.append(tmpdf)
    
    #import as summary
    tmpdf = import_nvprof_metric(os.path.join(datadir,file), timeline=False).sort_values(by="Name").reset_index(drop=True)
    tmpdf["Metric Mode"] = "read" if "read" in metric else "write" if "write" in metric else "total"
    tmpdf["Metric Name"] = metric.replace("read","").replace("write","").replace("__","_")
    for key in parameters:
        tmpdf[key] = parameters[key]
    del tmpdf["Metric Description"]
    
    #replace "Idle (0)" with 0.:
    for metric in metrics:
        if metric=="tensor_precision_fu_utilization":
            tmpdf[ "Min" ] = tmpdf[ "Min" ].apply(lambda x: replace_tc_string(x))
            tmpdf[ "Max" ] = tmpdf[ "Max" ].apply(lambda x: replace_tc_string(x))
            tmpdf[ "Avg" ] = tmpdf[ "Avg" ].apply(lambda x: replace_tc_string(x))
    df_summary.append(tmpdf)

#concat the frames
df_timeline = pd.concat(df_timeline, sort=True)
df_timeline = df_timeline.sort_values(by=sortkeys+["Metric Mode", "Correlation_ID"]).reset_index(drop=True)
df_summary = pd.concat(df_summary, sort=True)

#compute summary df:
tmpdf = df_timeline.groupby([x for x in df_timeline.columns if x not in ["Metric Mode", "Metric Count"]]).apply(lambda x: pd.Series({"Metric Count": x["Metric Count"].values.sum(), "Metric Mode": "total"})).reset_index()
tmpdf.sort_values(sortkeys+["Correlation_ID"]).reset_index(drop=True, inplace=True)
df_summary_derived = tmpdf.groupby(sortkeys).apply(lambda x: pd.Series({"Invocations": x["Metric Count"].count(), \
                                                               "Min": x["Metric Count"].min(), \
                                                               "Max": x["Metric Count"].max(), \
                                                               "STD": x["Metric Count"].std(), \
                                                               "Average": x["Metric Count"].mean()})).reset_index().sort_values(by="Name").fillna(0.)

#timings
files = [ x for x in os.listdir(datadir) if ((os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp")) and "metric_time" in x ]
df_times = []
for file in files:
    timedf, markerdf = import_nvprof_overview(os.path.join(datadir,file))
    
    #extract metric name
    parameters, _ = parse_filename(file)
    for key in parameters:
        timedf[key] = parameters[key]
    
    #append frame
    df_times.append(timedf)
df_time = pd.concat(df_times)

# Compute AI Results

In [6]:
#Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
selectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
             "Batch Size", "Pass", "Name"]
tc_peak_perf_flops = 125*10**12

#just pick the gpu activities for now
profiledf = df_time[ df_time["Collection Type"] == "gpu_activities" ].copy()
profiledf.sort_values(by=["Name"],inplace=True)
profiledf.reset_index(drop=True, inplace=True)
profiledf.rename(columns={"Avg": "Time Avg"}, inplace=True)
del profiledf["Time(%)"]
del profiledf["Time"]
del profiledf["Min"]
del profiledf["Max"]
del profiledf["Metric Name"]
del profiledf["Collection Type"]

#remove the calibration
alignkeys = selectkeys[:-2]
profiledf = profiledf.groupby(alignkeys).apply(lambda x: x[ (~x["Name"].isin(x.loc[x["Pass"].str.startswith("calibrate"), "Name"].values)) ])
profiledf.reset_index(drop=True, inplace=True)

#as metricdf use df_summary
metricdf = df_summary.copy()

#now, get the AI-relevant stuff:
#FLOPS 32
flopdf = metricdf[ metricdf["Metric Name"].str.contains("flop_count_sp") ].sort_values(selectkeys).rename(columns={"Avg": "FP32 Flops Avg"})
#add to timings
profiledf = profiledf.merge(flopdf[selectkeys+["FP32 Flops Avg"]], on=selectkeys, how="left")

#FLOPS 16 non-TC
flopdf = metricdf[ metricdf["Metric Name"].str.contains("flop_count_hp") ].sort_values(selectkeys).rename(columns={"Avg": "FP16 non-TC Flops Avg"})
#add to timings
profiledf = profiledf.merge(flopdf[selectkeys+["FP16 non-TC Flops Avg"]], on=selectkeys, how="left")

#FLOPS TC
flopdf = metricdf[ metricdf["Metric Name"].str.contains("tensor_precision_fu_utilization") ].sort_values(selectkeys).rename(columns={"Avg": "TC Flops Avg"})
tmpdf = flopdf.merge(profiledf, how="inner", on=selectkeys).sort_values(selectkeys)
tmpdf["TC Flops Avg"] *= tc_peak_perf_flops/10. * tmpdf["Time Avg"]
#add to timings
profiledf = profiledf.merge(tmpdf[selectkeys+["TC Flops Avg"]], on=selectkeys, how="left")

#FLOPS FP16
profiledf["FP16 Flops Avg"] = profiledf["TC Flops Avg"] + profiledf["FP16 non-TC Flops Avg"]

#L1
#project out
l1df = metricdf[ (metricdf["Metric Name"].str.contains("gst_")) | (metricdf["Metric Name"].str.contains("gld_")) ].sort_values(selectkeys)
#get reads and writes
l1readsdf = l1df.loc[(l1df["Metric Name"]=="gld_transactions"), selectkeys+["Avg"]]
l1writesdf = l1df.loc[(l1df["Metric Name"]=="gst_transactions"), selectkeys+["Avg"]]
#combine
l1df = l1writesdf.merge(l1readsdf, on=selectkeys, how="outer").fillna(0.)
l1df["L1 Transactions Avg"] = l1df["Avg_x"] + l1df["Avg_y"]
#add to timings
profiledf = profiledf.merge(l1df[selectkeys+["L1 Transactions Avg"]], on=selectkeys, how="inner")

#L2
#project out
l2df = metricdf[ metricdf["Metric Name"].str.contains("l2") ].sort_values(selectkeys)
#get reads and writes
l2readsdf = l2df.loc[(l2df["Metric Name"]=="l2_transactions") & (l2df["Metric Mode"]=="read"), selectkeys+["Avg"]]
l2writesdf = l2df.loc[(l2df["Metric Name"]=="l2_transactions") & (l2df["Metric Mode"]=="write"), selectkeys+["Avg"]]
#combine
l2df = l2writesdf.merge(l2readsdf, on=selectkeys, how="outer").fillna(0.)
l2df["L2 Transactions Avg"] = l2df["Avg_x"] + l2df["Avg_y"]
#add to timings
profiledf = profiledf.merge(l2df[selectkeys+["L2 Transactions Avg"]], on=selectkeys, how="inner")

#DRAM
#project out
dramdf = metricdf[ metricdf["Metric Name"].str.contains("dram") ].sort_values(selectkeys)
#get reads and writes
dramreadsdf = dramdf.loc[(dramdf["Metric Name"]=="dram_transactions") & (dramdf["Metric Mode"]=="read"), selectkeys+["Avg"]]
dramwritesdf = dramdf.loc[(dramdf["Metric Name"]=="dram_transactions") & (dramdf["Metric Mode"]=="write"), selectkeys+["Avg"]]
#combine
dramdf = dramwritesdf.merge(dramreadsdf, on=selectkeys, how="outer").fillna(0.)
dramdf["DRAM Transactions Avg"] = dramdf["Avg_x"] + dramdf["Avg_y"]
#add to timings
profiledf = profiledf.merge(dramdf[selectkeys+["DRAM Transactions Avg"]], on=selectkeys, how="inner")

#SYSMEM
#project out
sysmemdf = metricdf[ metricdf["Metric Name"].str.contains("sysmem") ].sort_values(selectkeys)
#get reads and writes
sysmemreadsdf = sysmemdf.loc[(sysmemdf["Metric Name"]=="sysmem_transactions") & (sysmemdf["Metric Mode"]=="read"), selectkeys+["Avg"]]
sysmemwritesdf = sysmemdf.loc[(sysmemdf["Metric Name"]=="sysmem_transactions") & (sysmemdf["Metric Mode"]=="write"), selectkeys+["Avg"]]
#combine
sysmemdf = sysmemwritesdf.merge(sysmemreadsdf, on=selectkeys, how="outer").fillna(0.)
sysmemdf["Sysmem Transactions Avg"] = sysmemdf["Avg_x"] + sysmemdf["Avg_y"]
#add to timings
profiledf = profiledf.merge(sysmemdf[selectkeys+["Sysmem Transactions Avg"]], on=selectkeys, how="inner")

#clean up and sort:
profiledf.sort_values(selectkeys).reset_index(drop=True, inplace=True)

#get performance first
profiledf["FP32 Performance GFlop/s"] = profiledf["FP32 Flops Avg"]/(profiledf["Time Avg"]*10**9)
profiledf["FP16 Performance GFlop/s"] = profiledf["FP16 Flops Avg"]/(profiledf["Time Avg"]*10**9)
profiledf["TC Performance GFlop/s"] = profiledf["TC Flops Avg"]/(profiledf["Time Avg"]*10**9)

#get AI:
#L2
profiledf["FP32 L2 AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
profiledf["FP16 L2 AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
#DRAM
profiledf["FP32 DRAM AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
profiledf["FP16 DRAM AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
#Sysmem
profiledf["FP32 Sysmem AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])
profiledf["FP16 Sysmem AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])

#sort results
profiledf.sort_values(by=selectkeys).reset_index(drop=True, inplace=True)

In [7]:
#profiledf
profiledf

Unnamed: 0,Calls,Time Avg,Name,Network Name,Batch Size,Input Shape,Kernel Shape,Stride Size,Data Format,Pass,...,Sysmem Transactions Avg,FP32 Performance GFlop/s,FP16 Performance GFlop/s,TC Performance GFlop/s,FP32 L2 AI,FP16 L2 AI,FP32 DRAM AI,FP16 DRAM AI,FP32 Sysmem AI,FP16 Sysmem AI
0,20,0.000002,cudnn::gemm::computeOffsetsKernel(cudnn::gemm:...,ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,forward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
1,20,0.000002,cudnn::gemm::computeWgradOffsetsKernel(cudnn::...,ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
2,20,0.000034,void Eigen::internal::EigenMetaKernel<Eigen::T...,ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
3,20,0.000069,"void nchwToNhwcKernel<__half, __half, float, b...",ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,186.601043,186.601043,0.0,0.246204,0.246204,0.255864,0.255864,8.028160e+04,8.028160e+04
4,20,0.000101,"void nchwToNhwcKernel<__half, __half, float, b...",ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,24.038462,24.038462,0.0,0.054506,0.054506,0.076638,0.076638,1.518750e+04,1.518750e+04
5,20,0.000002,"void nhwcToNchwKernel<float, __half, float, bo...",ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,3.913478,0.000000,0.0,0.068917,0.000000,16.333333,0.000000,5.880000e+01,0.000000e+00
6,20,0.000002,"void scalePackedTensor_kernel<float, float>(cu...",ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
7,20,0.000042,void tensorflow::functor::PadInputCustomKernel...,ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,backward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
8,20,0.000042,void tensorflow::functor::PadInputCustomKernel...,ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,forward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00
9,20,0.000003,void tensorflow::functor::ShuffleInTensor3Simp...,ResNet50-1,16,224x224x3,7x7x3x64,2,NHWC,forward,...,5.0,0.000000,0.000000,0.0,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.000000e+00


In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
                     "Batch Size", "Pass"]

#get the aggregated performance, including all kernels:
combineddf = profiledf.groupby(by=combinedselectkeys).sum()

#get performance first
combineddf["FP32 Performance GFlop/s"] = combineddf["FP32 Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["FP16 Performance GFlop/s"] = combineddf["FP16 Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["TC Performance GFlop/s"] = combineddf["TC Flops Avg"]/(combineddf["Time Avg"]*10**9)

#get AI:
#L2
combineddf["FP32 L1 AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["L1 Transactions Avg"])
combineddf["FP16 L1 AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["L1 Transactions Avg"])
combineddf["TC L1 AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["L1 Transactions Avg"])
#L2
combineddf["FP32 L2 AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["FP16 L2 AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["TC L2 AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
#DRAM
combineddf["FP32 DRAM AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["FP16 DRAM AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["TC DRAM AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
#Sysmem
combineddf["FP32 Sysmem AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["FP16 Sysmem AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["TC Sysmem AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])

#print
combineddf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Calls,Time Avg,FP32 Flops Avg,FP16 non-TC Flops Avg,TC Flops Avg,FP16 Flops Avg,L1 Transactions Avg,L2 Transactions Avg,DRAM Transactions Avg,Sysmem Transactions Avg,...,FP32 DRAM AI,FP16 DRAM AI,FP32 Sysmem AI,FP16 Sysmem AI,FP32 L1 AI,FP16 L1 AI,TC L1 AI,TC L2 AI,TC DRAM AI,TC Sysmem AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,backward,220,0.00149,98777330.0,15275056.0,96554680000.0,96569950000.0,54617585,52475151,6960787,55.0,...,0.443454,433.544503,56123.48,54869290.0,0.056516,55.253467,55.244727,57.500237,433.475926,54860610.0
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,forward,120,0.000527,3930587000.0,0.0,0.0,0.0,10642868,4138245,3203439,30.0,...,38.343433,0.0,4094362.0,0.0,11.541142,0.0,0.0,0.0,0.0,0.0
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,32,backward,120,0.002743,9865605000.0,152886778.0,0.0,152886800.0,41152554,13574336,9031960,30.0,...,34.134359,0.528978,10276670.0,159257.1,7.491641,0.116098,0.0,0.0,0.0,0.0
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,32,forward,120,0.000994,7861174000.0,0.0,0.0,0.0,21272249,8153172,6450430,30.0,...,38.084546,0.0,8188723.0,0.0,11.548459,0.0,0.0,0.0,0.0,0.0
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,64,backward,120,0.003643,19731210000.0,305768038.0,0.0,305768000.0,82177623,28995687,17632496,30.0,...,34.969543,0.541911,20553340.0,318508.4,7.503263,0.116276,0.0,0.0,0.0,0.0
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,64,forward,120,0.001988,15722350000.0,0.0,0.0,0.0,42532264,16210118,12824596,30.0,...,38.311023,0.0,16377450.0,0.0,11.551781,0.0,0.0,0.0,0.0,0.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,backward,180,0.000529,23515650.0,8143360.0,6912488000.0,6920631000.0,8878020,8593930,5446903,40.0,...,0.134914,39.705079,18371.6,5406743.0,0.082773,24.360129,24.331465,25.135792,39.658359,5400381.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,16,forward,160,0.000427,12978690.0,6957568.0,7383562000.0,7390520000.0,6347143,6446126,5465782,30.0,...,0.074204,42.254476,13519.47,7698458.0,0.0639,36.387041,36.352786,35.794573,42.214696,7691211.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,32,backward,220,0.00095,43160580.0,16286720.0,11710650000.0,11726940000.0,16883440,16233820,9972842,45.0,...,0.135244,36.746473,29972.62,8143706.0,0.079887,21.705693,21.675548,22.542927,36.695439,8132396.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x64,2,32,forward,160,0.000794,25938940.0,13896704.0,11731440000.0,11745330000.0,12650451,12823954,10945611,30.0,...,0.074056,33.53323,27019.73,12234720.0,0.064076,29.014119,28.979791,28.587706,33.493555,12220250.0


In [9]:
combineddf = combineddf.reset_index()
seldf = combineddf[ (combineddf["Network Name"]=="VGG-2") &\
           (combineddf["Input Shape"]=="224x224x64") &\
           (combineddf["Precision"]=="FP32") &\
           (combineddf["Pass"]=="backward") ]
seldf["FP32 Flops Avg"]

48    3.291087e+10
50    6.579421e+10
52    0.000000e+00
Name: FP32 Flops Avg, dtype: float64

# Export Data

In [10]:
outputdir = "./results"

profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))