In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
# Global parameters shared by the cells below.
# cudadir is handed to import_nsight_metric as its cuda_dir argument.
cudadir = "/usr/common/software/cuda/10.2.89"
# Parent directory of the current working directory.
homedir = os.path.split(os.getcwd())[0]

In [3]:
# Input and output directories.
# datadirs: directories that are scanned for *.nsight-cuprof-report files.
# Previously used inputs (kept for reference):
#   ../scripts/tf_cnn_kernels_nsight/runs/386219
#   ../scripts/tf_cnn_kernels_nsight/runs/386058
datadirs = [
    "../data/tf_2.0b/new_nsight",
]
# outputdir: directory the CSV exports are written to.
outputdir = "../results/tf2_nsight/results_NHWC"

# Functions

In [4]:
def _merge_kernel_time(metricdf, selectkeys, metric_name, column):
    """Attach a per-kernel time column derived from a cycle counter.

    For every kernel (unique combination of ``selectkeys``) the "total" value
    of ``metric_name`` is divided by its "rate" value times 1e9, and the
    result is merged back into ``metricdf`` as ``column``.
    """
    is_metric = metricdf["Metric Name"] == metric_name
    cyclesdf = metricdf.loc[is_metric & (metricdf["Metric Type"] == "total"), selectkeys + ["Metric Value"]]
    ratesdf = metricdf.loc[is_metric & (metricdf["Metric Type"] == "rate"), selectkeys + ["Metric Value"]]
    # after the merge: Metric Value_x = cycles, Metric Value_y = rate;
    # a kernel that lacks one of the two gets 0 for the missing one
    timedf = cyclesdf.merge(ratesdf, on=selectkeys, how="outer").fillna(0.)
    # NOTE(review): the 1e9 factor presumably converts the reported rate unit to cycles/s -- confirm
    timedf[column] = timedf["Metric Value_x"] / (timedf["Metric Value_y"] * 1e9)
    # 0/0 produces NaN, which is reported as a time of 0
    timedf = timedf.fillna(0.)
    return metricdf.merge(timedf[selectkeys + [column]], on=selectkeys, how="inner")


def _merge_metric_sum(metricdf, selectkeys, mask, column):
    """Sum "Metric Value" per kernel over the rows selected by ``mask`` and merge it back as ``column``."""
    summed = (metricdf.loc[mask, selectkeys + ["Metric Value"]]
              .groupby(selectkeys).sum().reset_index()
              .rename(columns={"Metric Value": column}))
    return metricdf.merge(summed[selectkeys + [column]], on=selectkeys, how="inner")


def transpose_frame(df_metrics):
    """Turn a long metric table (one row per kernel and metric) into one row per kernel.

    Parameters
    ----------
    df_metrics : pandas.DataFrame
        Must contain the key columns listed in ``selectkeys`` below plus
        "Metric Name", "Metric Type" ("total", "rate", "read" or "write") and a
        numeric "Metric Value". The input frame is not modified.

    Returns
    -------
    pandas.DataFrame
        One row per kernel, sorted by the key columns with a fresh 0..n-1
        index. Besides the key columns (and any extra input columns) it holds
        the timing, FLOP, transaction/byte, performance and arithmetic
        intensity (AI) columns computed below. Rows of the "calibrate" pass
        are dropped. All merges are inner joins, so a kernel that lacks one of
        the metric groups is dropped from the result.

    Raises
    ------
    ValueError
        If for some kernel the tensor core time is non-zero and deviates by
        more than 1% from the CUDA time.
    """
    selectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]

    # peak value used to turn the tensor core utilization into FLOPs
    # NOTE(review): presumably the tensor core peak of the profiled GPU -- confirm
    tc_peak_perf_flops = 125*10**12

    # work on a sorted copy so the caller's frame stays untouched
    metricdf = df_metrics.sort_values(by=selectkeys).reset_index(drop=True)

    #remove the calibration
    metricdf = metricdf[metricdf["Pass"] != "calibrate"]


    ####### Get timing information

    ### CUDA Time
    metricdf = _merge_kernel_time(metricdf, selectkeys, "smsp__cycles_elapsed", "CUDA Time Avg")

    ### Tensor Core Time
    metricdf = _merge_kernel_time(metricdf, selectkeys, "smsp__pipe_tensor_op_hmma_cycles_active", "TC Time Avg")

    ### check: wherever tensor cores were active, both times have to agree within 1%
    reldiff = abs(metricdf["CUDA Time Avg"] - metricdf["TC Time Avg"]) / metricdf["CUDA Time Avg"]
    tmpdf = metricdf.loc[(reldiff > 0.01) & (metricdf["TC Time Avg"] != 0)]
    if not tmpdf.empty:
        print(tmpdf)
        raise ValueError("CUDA Time not consistent wit TC Time")


    ####### Get number of FLOPs

    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), ["Metric Value"]] *= 2

    ### FP32 FLOPs (FP64 is currently not counted)
    metrics = ['smsp__sass_thread_inst_executed_op_fadd_pred_on',
               'smsp__sass_thread_inst_executed_op_ffma_pred_on',
               'smsp__sass_thread_inst_executed_op_fmul_pred_on']
    metricdf = _merge_metric_sum(metricdf, selectkeys, metricdf["Metric Name"].isin(metrics), "FP32 FLOPs Avg")

    ### FP16 FLOPs
    metrics = ['smsp__sass_thread_inst_executed_op_hadd_pred_on',
               'smsp__sass_thread_inst_executed_op_hfma_pred_on',
               'smsp__sass_thread_inst_executed_op_hmul_pred_on']
    metricdf = _merge_metric_sum(metricdf, selectkeys, metricdf["Metric Name"].isin(metrics), "FP16 FLOPs Avg")

    ### TC FLOPs = peak * utilization * tensor core time
    # regex=False: the dots in the metric name are meant literally
    is_tc_util = metricdf["Metric Name"].str.contains("tensor_op_hmma.avg.pct_of_peak", regex=False)
    tmpdf = metricdf.loc[is_tc_util, selectkeys+["TC Time Avg", "Metric Value"]].copy()
    # the metric value is a percentage
    tmpdf["Utilization"] = 0.01 * tmpdf["Metric Value"]
    tmpdf["TC FLOPs Avg"] = tc_peak_perf_flops * tmpdf["Utilization"] * tmpdf["TC Time Avg"]
    metricdf = metricdf.merge(tmpdf[selectkeys+["TC FLOPs Avg"]], on=selectkeys, how="inner")

    ### Total FLOPs
    metricdf["FLOPs Avg"] = metricdf["FP32 FLOPs Avg"] + metricdf["FP16 FLOPs Avg"] + metricdf["TC FLOPs Avg"]

    ### FLOPs fractions
    for prefix in ("FP32 ", "FP16 ", "TC "):
        metricdf[prefix + "FLOPs Fraction Avg"] = metricdf[prefix + "FLOPs Avg"] / metricdf["FLOPs Avg"]


    ####### Get number of bytes

    ### Shared transactions
    metricdf = _merge_metric_sum(metricdf, selectkeys,
                                 metricdf["Metric Name"].str.contains("smsp__inst_executed_op_shared"),
                                 "Shared Transactions Avg")

    ### L1 atomic transactions
    metrics = ['l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom',
               'l1tex__t_set_accesses_pipe_lsu_mem_global_op_red',
               'l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom',
               'l1tex__t_set_accesses_pipe_tex_mem_surface_op_red']
    metricdf = _merge_metric_sum(metricdf, selectkeys, metricdf["Metric Name"].isin(metrics),
                                 "L1 Atomic Transactions Avg")

    ### Local transactions
    metricdf = _merge_metric_sum(metricdf, selectkeys,
                                 metricdf["Metric Name"].str.contains("l1tex__t_sectors_pipe_lsu_mem_local_op"),
                                 "Local Transactions Avg")

    ### Global transactions
    metricdf = _merge_metric_sum(metricdf, selectkeys,
                                 metricdf["Metric Name"].str.contains("l1tex__t_sectors_pipe_lsu_mem_global_op"),
                                 "Global Transactions Avg")

    ### L1 Bytes (every transaction is counted as 32 bytes)
    metricdf["L1 Transactions Avg"] = (metricdf["Shared Transactions Avg"] + metricdf["L1 Atomic Transactions Avg"]
                            + metricdf["Local Transactions Avg"] + metricdf["Global Transactions Avg"])
    metricdf["L1 Bytes Avg"] = metricdf["L1 Transactions Avg"] * 32

    ### L2 atomic & reduction: rows typed "total" are counted twice
    # NOTE(review): presumably because atomics/reductions read and write -- confirm
    metricdf.loc[(metricdf["Metric Name"].str.contains("lts__t_sectors_op")) & (metricdf["Metric Type"]=="total"), ["Metric Value"]] *= 2

    ### L2 transactions
    l2df = metricdf.loc[metricdf["Metric Name"].str.contains("lts__t_sectors_op"), selectkeys+["Metric Value"] ]
    l2df = l2df.groupby(selectkeys).sum().reset_index().rename(columns={"Metric Value": "L2 Transactions Avg"})
    l2df["L2 Bytes Avg"] = l2df["L2 Transactions Avg"] * 32
    metricdf = metricdf.merge(l2df[selectkeys+["L2 Transactions Avg", "L2 Bytes Avg"]], on=selectkeys, how="inner")

    ### DRAM Bytes
    dramdf = metricdf[ metricdf["Metric Name"].str.contains("dram__sectors") ].sort_values(selectkeys)
    # get reads and writes
    dramreadsdf = dramdf.loc[(dramdf["Metric Name"]=="dram__sectors") & (dramdf["Metric Type"]=="read"), selectkeys+["Metric Value"]]
    dramwritesdf = dramdf.loc[(dramdf["Metric Name"]=="dram__sectors") & (dramdf["Metric Type"]=="write"), selectkeys+["Metric Value"]]
    # combine: Metric Value_x = writes, Metric Value_y = reads
    dramdf = dramwritesdf.merge(dramreadsdf, on=selectkeys, how="outer").fillna(0.)
    dramdf["DRAM Transactions Avg"] = dramdf["Metric Value_x"] + dramdf["Metric Value_y"]
    dramdf["DRAM Bytes Avg"] = dramdf["DRAM Transactions Avg"] * 32
    metricdf = metricdf.merge(dramdf[selectkeys+["DRAM Transactions Avg", "DRAM Bytes Avg"]], on=selectkeys, how="inner")


    ####### Collapse to one row per kernel
    # once the per-metric columns are gone, all rows of a kernel are identical
    metricdf = metricdf.drop(columns=["Metric Value", "Metric Name", "Metric Type"]).drop_duplicates(keep='first')


    ### Get performance
    for prefix in ("", "FP32 ", "FP16 "):
        metricdf[prefix + "Performance GFlop/s"] = metricdf[prefix + "FLOPs Avg"] / (metricdf["CUDA Time Avg"]*10**9)
    # tensor core performance is relative to the tensor core time (NaN if tensor cores were idle)
    metricdf["TC Performance GFlop/s"] = metricdf["TC FLOPs Avg"] / (metricdf["TC Time Avg"]*10**9)

    ### Get AI = FLOPs / bytes for every memory level and FLOP category
    for level in ("L1", "L2", "DRAM"):
        for prefix in ("", "FP32 ", "FP16 ", "TC "):
            metricdf[prefix + level + " AI"] = metricdf[prefix + "FLOPs Avg"] / metricdf[level + " Bytes Avg"]


    ### Cleanup
    # sort_values returns a new frame, so the result has to be assigned; chaining
    # reset_index(inplace=True) onto it would only modify a discarded temporary
    metricdf = metricdf.sort_values(by=selectkeys).reset_index(drop=True)

    return metricdf

# Import Data

In [5]:
#get all the report files from all input directories
#(the listing is sorted so the result does not depend on the file system order)
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in sorted(os.listdir(datadir)) if os.path.splitext(x)[-1] == ".nsight-cuprof-report" ]

#fail early with a clear message instead of a KeyError from sort_values on an empty frame
if not files:
    raise FileNotFoundError("no .nsight-cuprof-report files found in: " + ", ".join(datadirs))

#one record per report: the prefix is the file name without its last extension
records = [ {"prefix": os.path.splitext(os.path.basename(path))[0], "file": path} for path in files ]

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"])

In [6]:
#sort by those keys:
sortkeys = ["Network Name", "Input Shape", "Kernel Shape", \
            "Batch Size", "Stride Size", "Data Format", "Pass", \
            "Precision", "Device", "Name"]

#to limit the input, filter recorddf on its "prefix" column here, e.g.
#recorddf = recorddf[ recorddf["prefix"].str.startswith("profile.name_ResNet50-2.") ]

#every report prefix has the form <configuration>.pass_<pass>;
#split on the same ".pass_" token that is used to rebuild the prefix below
all_prefixes = set([x.split(".pass_")[0] for x in recorddf["prefix"]])
all_passes = set([x.split(".pass_")[1] for x in recorddf["prefix"].unique()])

#one profile frame per configuration
df_profiles = []

#iterate in sorted order: plain set iteration order changes between kernel
#sessions and would make the row order of the exported CSV irreproducible
for pref in sorted(all_prefixes):

    #metric frames of all passes of this configuration
    df_metrics = []
    for pas in sorted(all_passes):

        #project frame
        pass_files = recorddf.loc[ recorddf["prefix"] == pref + ".pass_" + pas, "file" ].values

        #project the individual files
        reports = [x for x in pass_files if x.endswith(".nsight-cuprof-report")]
        if not reports:
            #all_passes is the union over all configurations, so a
            #configuration may simply not have a report for this pass
            continue
        metricfile = reports[0]

        #get the parameters from the filename
        parameters = parse_filename_nsight(os.path.basename(metricfile))

        #metrics
        metricdf = import_nsight_metric(metricfile, cuda_dir=cudadir)
        for key in parameters:
            metricdf[key] = parameters[key]

        #fuse read/write metrics together: the base name of a metric is its
        #name without the .sum suffix and without the read/write/ld/st tags
        unique_metrics = metricdf["Metric Name"].unique()
        unique_metrics = set([x.replace(".sum","").replace("_write","").replace("_read","").replace("_ld","").replace("_st","") for x in unique_metrics])

        #add the metric type; later assignments override earlier ones.
        #regex=False: all patterns are literals (".per_second" contains a dot)
        metricdf["Metric Type"] = "total"
        #read
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_read", regex=False), "Metric Type" ] = "read"
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_ld", regex=False), "Metric Type" ] = "read"
        #write
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_write", regex=False), "Metric Type" ] = "write"
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains("_st", regex=False), "Metric Type" ] = "write"
        #rate
        metricdf.loc[ metricdf[ "Metric Name" ].str.contains(".per_second", regex=False), "Metric Type" ] = "rate"

        #replace every metric name by its base name
        for metric in unique_metrics:
            metricdf.loc[ metricdf[ "Metric Name"].str.startswith(metric), "Metric Name" ] = metric

        #append to DF:
        df_metrics.append(metricdf)

    metricdf = pd.concat(df_metrics)

    #compute the profile
    profiledf = transpose_frame(metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)
profiledf.reset_index(drop=True, inplace=True)

In [7]:
#display(profiledf.columns)
#tmplist = ['Name', 'Invocations', 'Pass', 'L1 Transactions Avg', 'L2 Transactions Avg', 'DRAM Transactions Avg']
#display(profiledf[tmplist])

# Compute AI Results

In [8]:
#profiledf[ (profiledf["Network Name"]=="ResNet50-2") &\
#           (profiledf["Input Shape"]=="112x112x64") &\
#           (profiledf["Batch Size"]==16) &\
#           (profiledf["Precision"]=="FP32") &\
#           (profiledf["Stride Size"]==2) &\
#           (profiledf["Pass"]=="forward") &\
#           (profiledf["Kernel Shape"]=="7x7x64x64")
#         ]
#profiledf

In [9]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
                     "Batch Size", "Pass"]

#copy profiledf so that this cell can be re-run without weighting twice
combineddf = profiledf.copy()

#get the aggregated performance, including all kernels:
#compute weights: multiply all measures by the number of invocations
weighted = True
if weighted:
    #all per-invocation averages carry "Avg" in their column name
    metrics = [x for x in combineddf.columns if "Avg" in x]
    combineddf[metrics] = combineddf[metrics].multiply(combineddf["Invocations"], axis=0)

#sum up; the group keys become the (Multi)Index of the result.
#numeric_only=True keeps text columns such as the kernel name out of the sum
#(recent pandas versions would otherwise concatenate the strings)
combineddf = combineddf.groupby(by=combinedselectkeys).sum(numeric_only=True)

#ratios cannot be summed, so everything below is recomputed from the summed totals

#the flop fractions need to be recomputed
for prefix in ("FP32 ", "FP16 ", "TC "):
    combineddf[prefix + "FLOPs Fraction Avg"] = combineddf[prefix + "FLOPs Avg"] / combineddf["FLOPs Avg"]

### Get performance
for prefix in ("", "FP32 ", "FP16 "):
    combineddf[prefix + "Performance GFlop/s"] = combineddf[prefix + "FLOPs Avg"] / (combineddf["CUDA Time Avg"]*10**9)
#tensor core performance is relative to the tensor core time
combineddf["TC Performance GFlop/s"] = combineddf["TC FLOPs Avg"] / (combineddf["TC Time Avg"]*10**9)

### Get AI = FLOPs / bytes for every memory level and FLOP category
for level in ("L1", "L2", "DRAM"):
    for prefix in ("", "FP32 ", "FP16 ", "TC "):
        combineddf[prefix + level + " AI"] = combineddf[prefix + "FLOPs Avg"] / combineddf[level + " Bytes Avg"]

#keep the rows ordered by the group keys. The keys live in the index, so they
#must not be dropped by a reset_index(drop=True)
combineddf = combineddf.sort_index()

In [10]:
# Show the aggregated time, FLOP and transaction totals of every configuration
summary_columns = [
    'CUDA Time Avg',
    'FP32 FLOPs Avg',
    'FP16 FLOPs Avg',
    'TC FLOPs Avg',
    'FLOPs Avg',
    'L1 Transactions Avg',
    'L2 Transactions Avg',
    'DRAM Transactions Avg',
]
display(combineddf[summary_columns])
# combineddf.keys
# combineddf.columns

# combineddf['Name']
# combineddf.iloc[0,1]
# combineddf.iloc[2]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,CUDA Time Avg,FP32 FLOPs Avg,FP16 FLOPs Avg,TC FLOPs Avg,FLOPs Avg,L1 Transactions Avg,L2 Transactions Avg,DRAM Transactions Avg
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,backward,0.087054,2.768432e+11,4.416810e+09,1.301671e+12,1.582931e+12,2.776795e+09,1.007221e+09,2.773773e+08
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,forward,0.011637,7.835484e+10,1.277573e+09,0.000000e+00,7.963241e+10,7.917887e+08,1.326833e+08,5.590063e+07
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,backward,0.053656,2.364867e+09,5.911347e+08,3.600567e+12,3.603523e+12,3.098490e+09,2.415621e+09,4.377863e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,forward,0.013514,1.027604e+09,5.911347e+08,7.284012e+11,7.300200e+11,5.651085e+08,4.560803e+08,1.466827e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,0.055820,2.416681e+09,1.068933e+09,1.875959e+12,1.879444e+12,1.338203e+09,1.009104e+09,5.470152e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,0.027021,1.481035e+11,1.104937e+09,0.000000e+00,1.492084e+11,1.051534e+09,2.858008e+08,1.662337e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,3,16,backward,0.042358,1.386929e+11,1.723765e+09,9.121670e+11,1.052584e+12,1.015623e+09,8.485502e+20,3.275743e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,3,16,forward,0.006848,1.186202e+08,5.269095e+08,1.284342e+11,1.290797e+11,1.897878e+08,1.813047e+08,1.177080e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,backward,0.138504,5.626266e+09,5.269095e+08,1.291752e+13,1.292367e+13,6.526844e+09,4.239338e+09,6.819464e+08
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,forward,0.024543,2.055209e+09,5.269095e+08,1.421264e+12,1.423846e+12,1.321918e+09,7.659668e+08,1.784323e+08


In [11]:
#combineddf = combineddf.reset_index()
#seldf = combineddf[ (combineddf["Network Name"]=="ResNet50-2") &\
#           (combineddf["Input Shape"]=="112x112x64") &\
#           (combineddf["Precision"]=="FP32")]
#seldf
#combineddf[["FP32 L2 AI", "FP32 L1 AI"]]
# Arithmetic intensity with respect to L2 and L1 for every configuration
combineddf.loc[:, ["L2 AI", "L1 AI"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,L2 AI,L1 AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,backward,4.911195e+01,17.814283
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,forward,1.875529e+01,3.142900
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,backward,4.661745e+01,36.343542
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,forward,5.001998e+01,40.369458
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,5.820273e+01,43.889180
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,1.631473e+01,4.434250
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,3,16,backward,3.876404e-11,32.387235
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,3,16,forward,2.224841e+01,21.253953
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,backward,9.526600e+01,61.877510
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,forward,5.809025e+01,33.659572


# Export Data

In [12]:
# Make sure the output directory exists; to_csv does not create missing directories
os.makedirs(outputdir, exist_ok=True)

# Per-kernel profile and the profile aggregated over all kernels of a configuration
profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))