In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
#global parameters
cudadir = "/usr/common/software/cuda/10.0.130"

In [3]:
#input and output dirs
datadirs = ["./data/tf_2.0b/sanitized"]
outputdir = "./results/tf_2.0b/results_NHWC"

# Functions

In [4]:
def transpose_frame(df_times, df_metrics):
    #Copy the profile frame to make sure not to overwrite it and potentially read it in again if we screwed it up
    selectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", "Batch Size", "Pass", "Name"]
    tc_peak_perf_flops = 125*10**12

    #just pick the gpu activities for now
    profiledf = df_times[ df_times["Collection Type"] == "gpu_activities" ].copy()
    profiledf.sort_values(by=["Name"],inplace=True)
    profiledf.reset_index(drop=True, inplace=True)
    profiledf.rename(columns={"Avg": "Time Avg"}, inplace=True)
    del profiledf["Time(%)"]
    del profiledf["Time"]
    del profiledf["Min"]
    del profiledf["Max"]
    del profiledf["Metric Name"]
    del profiledf["Collection Type"]

    #remove the calibration
    alignkeys = selectkeys[:-2]
    profiledf = profiledf.groupby(alignkeys).apply(lambda x: x[ (~x["Name"].isin(x.loc[x["Pass"].str.startswith("calibrate"), "Name"].values)) ])
    profiledf.reset_index(drop=True, inplace=True)
    
    #as metricdf use df_summary
    metricdf = df_metrics.copy()

    #now, get the AI-relevant stuff:
    #FLOPS 32
    flopdf = metricdf[ metricdf["Metric Name"].str.contains("flop_count_sp") ].sort_values(selectkeys).rename(columns={"Avg": "FP32 Flops Avg"})
    #add to timings
    profiledf = profiledf.merge(flopdf[selectkeys+["FP32 Flops Avg"]], on=selectkeys, how="inner")
    
    #monitor that: if that changes be warned
    numrows = profiledf.shape[0]

    #FLOPS 16 non-TC
    flopdf = metricdf[ metricdf["Metric Name"].str.contains("flop_count_hp") ].sort_values(selectkeys).rename(columns={"Avg": "FP16 non-TC Flops Avg"})
    #add to timings
    mergedf = profiledf.merge(flopdf[selectkeys+["FP16 non-TC Flops Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        #print(profiledf, flopdf)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    
    
    #FLOPS TC
    flopdf = metricdf[ metricdf["Metric Name"].str.contains("tensor_precision_fu_utilization") ].sort_values(selectkeys).rename(columns={"Avg": "TC Flops Avg"})
    tmpdf = flopdf.merge(profiledf, how="inner", on=selectkeys).sort_values(selectkeys)
    tmpdf["TC Flops Avg"] *= tc_peak_perf_flops/10. * tmpdf["Time Avg"]
    #add to timings
    mergedf = profiledf.merge(tmpdf[selectkeys+["TC Flops Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf

    
    #fill NA values here
    profiledf.fillna(0., inplace=True)

    #FLOPS FP16: add TC and non-TC FP16 flops together
    profiledf["FP16 Flops Avg"] = profiledf["TC Flops Avg"] + profiledf["FP16 non-TC Flops Avg"]

    #total flops
    profiledf["Flops Avg"] = profiledf["FP16 Flops Avg"] + profiledf["FP32 Flops Avg"]

    #flop fractions
    profiledf["TC Flops Fraction Avg"] = profiledf["TC Flops Avg"]/profiledf["Flops Avg"]
    profiledf["FP16 Flops Fraction Avg"] = profiledf["FP16 Flops Avg"]/profiledf["Flops Avg"]
    profiledf["FP16 non-TC Flops Fraction Avg"] = profiledf["FP16 non-TC Flops Avg"]/profiledf["Flops Avg"]
    profiledf["FP32 Flops Fraction Avg"] = profiledf["FP32 Flops Avg"]/profiledf["Flops Avg"]


    #shared
    #project out
    shareddf = metricdf[ metricdf["Metric Name"].str.contains("shared") ].sort_values(selectkeys)
    #get reads and writes
    sharedreadsdf = shareddf.loc[(shareddf["Metric Name"]=="shared_transactions") & (shareddf["Metric Mode"]=="read"), selectkeys+["Avg"]]
    sharedwritesdf = shareddf.loc[(shareddf["Metric Name"]=="shared_transactions") & (shareddf["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    shareddf = sharedwritesdf.merge(sharedreadsdf, on=selectkeys, how="outer").fillna(0.)
    shareddf["Shared Transactions Avg"] = shareddf["Avg_x"] + shareddf["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(shareddf[selectkeys+["Shared Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        #get the complement:
        print(profiledf[ ~profiledf.index.isin(mergedf.index) ])
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    
    
    #atomic
    #project out
    atomicdf = metricdf[ metricdf["Metric Name"] == "atomic_transactions" ].sort_values(selectkeys)
    #get reads and writes
    atomicdf = atomicdf[selectkeys+["Avg"]].rename(columns={"Avg": "Atomic Transactions Avg"})
    #add to timings
    mergedf = profiledf.merge(atomicdf[selectkeys+["Atomic Transactions Avg"]], on=selectkeys, how="inner")
    
    #check
    if mergedf.shape[0] != numrows:
        #get the complement:
        print(profiledf[ ~profiledf.index.isin(mergedf.index) ])
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf

    
    #L1
    #project out
    l1df = metricdf[ (metricdf["Metric Name"].str.contains("gst_")) | (metricdf["Metric Name"].str.contains("gld_")) ].sort_values(selectkeys)
    #get reads and writes
    l1readsdf = l1df.loc[(l1df["Metric Name"]=="gld_transactions"), selectkeys+["Avg"]]
    l1writesdf = l1df.loc[(l1df["Metric Name"]=="gst_transactions"), selectkeys+["Avg"]]
    #combine
    l1df = l1writesdf.merge(l1readsdf, on=selectkeys, how="outer").fillna(0.)
    l1df["L1 Transactions Avg"] = l1df["Avg_x"] + l1df["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(l1df[selectkeys+["L1 Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, l1df)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf

    
    #L2
    #project out
    l2df = metricdf[ metricdf["Metric Name"].str.contains("l2") ].sort_values(selectkeys)
    #get reads and writes
    l2readsdf = l2df.loc[(l2df["Metric Name"]=="l2_transactions") & (l2df["Metric Mode"]=="read"), selectkeys+["Avg"]]
    l2writesdf = l2df.loc[(l2df["Metric Name"]=="l2_transactions") & (l2df["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    l2df = l2writesdf.merge(l2readsdf, on=selectkeys, how="outer").fillna(0.)
    l2df["L2 Transactions Avg"] = l2df["Avg_x"] + l2df["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(l2df[selectkeys+["L2 Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, l2df)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    
    
    #DRAM
    #project out
    dramdf = metricdf[ metricdf["Metric Name"].str.contains("dram") ].sort_values(selectkeys)
    #get reads and writes
    dramreadsdf = dramdf.loc[(dramdf["Metric Name"]=="dram_transactions") & (dramdf["Metric Mode"]=="read"), selectkeys+["Avg"]]
    dramwritesdf = dramdf.loc[(dramdf["Metric Name"]=="dram_transactions") & (dramdf["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    dramdf = dramwritesdf.merge(dramreadsdf, on=selectkeys, how="outer").fillna(0.)
    dramdf["DRAM Transactions Avg"] = dramdf["Avg_x"] + dramdf["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(dramdf[selectkeys+["DRAM Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, dramdf)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    

    #SYSMEM
    #project out
    sysmemdf = metricdf[ metricdf["Metric Name"].str.contains("sysmem") ].sort_values(selectkeys)
    #get reads and writes
    sysmemreadsdf = sysmemdf.loc[(sysmemdf["Metric Name"]=="sysmem_transactions") & (sysmemdf["Metric Mode"]=="read"), selectkeys+["Avg"]]
    sysmemwritesdf = sysmemdf.loc[(sysmemdf["Metric Name"]=="sysmem_transactions") & (sysmemdf["Metric Mode"]=="write"), selectkeys+["Avg"]]
    #combine
    sysmemdf = sysmemwritesdf.merge(sysmemreadsdf, on=selectkeys, how="outer").fillna(0.)
    sysmemdf["Sysmem Transactions Avg"] = sysmemdf["Avg_x"] + sysmemdf["Avg_y"]
    #add to timings
    mergedf = profiledf.merge(sysmemdf[selectkeys+["Sysmem Transactions Avg"]], on=selectkeys, how="inner")

    #check
    if mergedf.shape[0] != numrows:
        print(profiledf, sysmemdf)
        raise ValueError("Something went wrong, check consistency of inputs")
    else:
        profiledf = mergedf
    

    #clean up and sort:
    profiledf.sort_values(selectkeys).reset_index(drop=True, inplace=True)

    #get performance first
    profiledf["Performance GFlop/s"] = profiledf["Flops Avg"]/(profiledf["Time Avg"]*10**9)
    profiledf["FP32 Performance GFlop/s"] = profiledf["FP32 Flops Avg"]/(profiledf["Time Avg"]*10**9)
    profiledf["FP16 Performance GFlop/s"] = profiledf["FP16 Flops Avg"]/(profiledf["Time Avg"]*10**9)
    profiledf["TC Performance GFlop/s"] = profiledf["TC Flops Avg"]/(profiledf["Time Avg"]*10**9)

    #get AI:
    #L1 is L1+shared
    profiledf["L1 AI"] = profiledf["Flops Avg"]/(32.*(profiledf["L1 Transactions Avg"]+profiledf["Shared Transactions Avg"]+profiledf["Atomic Transactions Avg"]))
    profiledf["FP32 L1 AI"] = profiledf["FP32 Flops Avg"]/(32.*(profiledf["L1 Transactions Avg"]+profiledf["Shared Transactions Avg"]+profiledf["Atomic Transactions Avg"]))
    profiledf["FP16 L1 AI"] = profiledf["FP16 Flops Avg"]/(32.*(profiledf["L1 Transactions Avg"]+profiledf["Shared Transactions Avg"]+profiledf["Atomic Transactions Avg"]))
    #L2
    profiledf["L2 AI"] = profiledf["Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
    profiledf["FP32 L2 AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
    profiledf["FP16 L2 AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["L2 Transactions Avg"])
    #DRAM
    profiledf["DRAM AI"] = profiledf["Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
    profiledf["FP32 DRAM AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
    profiledf["FP16 DRAM AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["DRAM Transactions Avg"])
    #Sysmem
    profiledf["Sysmem AI"] = profiledf["Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])
    profiledf["FP32 Sysmem AI"] = profiledf["FP32 Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])
    profiledf["FP16 Sysmem AI"] = profiledf["FP16 Flops Avg"]/(32.*profiledf["Sysmem Transactions Avg"])

    #sort results
    profiledf.sort_values(by=selectkeys).reset_index(drop=True, inplace=True)
    
    return profiledf

# Check for Missing Data

In [5]:
#get metric list
files = []
for datadir in datadirs:
    files += [ os.path.join(datadir,x) for x in os.listdir(datadir) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]

#recs
records = []

#build feature list:
for path in files:
    
    #filename
    file = os.path.basename(path)
    
    #path
    path = os.path.dirname(path)
    
    #splitup
    splt = file.split(".")
    
    prefix = ".".join(splt[0:-2])
    metric = splt[-2].split("metric_")[1]
    
    #append to records
    records.append({"prefix": prefix, "metric": metric, "file": os.path.join(path, file)})

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix", "metric"])

#get all metrics
all_metrics = list(recorddf["metric"].unique())

#group by metric:
missingrecorddf = pd.DataFrame(recorddf.groupby("prefix").apply(lambda x: pd.Series([y for y in all_metrics if y not in list(x["metric"])])))

#create exclusion list:
excludelist = list(missingrecorddf.reset_index()["prefix"].unique())

#print the missing ones
missingrecorddf

# Check for Duplicate Files

In [6]:
##datadir:
#datadirs = ["./data/good_new", "./data/good_new_2"]
#
##do the brute force comparison
#for id1, datadir_1 in enumerate(datadirs):
#    
#    #files 1
#    files_1 = [ x for x in os.listdir(datadir_1) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]
#    
#    for id2 in range(id1+1,len(datadirs)):
#        
#        datadir_2 = datadirs[id2]
#        
#        #files 2
#        files_2 = [ x for x in os.listdir(datadir_2) if (os.path.splitext(x)[-1] == ".nvprof") or (os.path.splitext(x)[-1] == ".nvvp") ]
#        
#        #report dups:
#        dups = [x for x in files_1 if x in files_2]
#        
#        #print
#        print("Duplicates in {} and {}:".format(datadir_1, datadir_2))
#        
#        #move dups out of the way
#        if not os.path.isdir(os.path.join(datadir_2,"redundant")):
#            os.mkdir(os.path.join(datadir_2,"redundant"))
#        for file in dups:
#            source = os.path.join(datadir_2,file)
#            target = os.path.join(datadir_2,"redundant",file)
#            print("Move {} to {}".format(source,target))
#            shutil.move(source,target)

# Import Data

In [None]:
#sort by those keys:
sortkeys = ["Network Name", "Input Shape", "Kernel Shape", \
            "Batch Size", "Stride Size", "Data Format", "Pass", \
            "Precision", "Device", "Name", "Metric Name"]

#limit the input
#recorddf = recorddf[ recorddf["prefix"].str.startswith("profile.name_ResNet50-2.batchsize_16.inputshape_112x112x64.kernelshape_7x7x64x64.stride_2.dataformat_NHWC.fp32") ]

#group by prefixes and files
all_prefixes = set([x.split(".pass")[0] for x in recorddf["prefix"]])
all_metrics = recorddf["metric"].unique()
all_passes = set([x.split(".pass_")[1].replace(".pass_","") for x in recorddf["prefix"].unique()])

#metrics
df_profiles = []

for pref in all_prefixes:
    
    #set empty lists
    df_times = []
    df_timeline = []
    df_summary = []
    
    #print prefix
    #print(pref)
    
    #loop over passes
    for pas in all_passes:
        
        #project frame
        selectdf = recorddf[ recorddf["prefix"] == pref + ".pass_" + pas ]
        
        #loop over metrics
        for met in [x for x in all_metrics if x != "time"]:
            
            #filename
            file = selectdf.loc[ selectdf["metric"] == met, "file" ].values[0]
        
            #extract metric name
            parameters, metric = parse_filename(os.path.basename(file))
            metrics = metric.split("-")
    
            #import as timeline
            tmpdf = import_nvprof_metric(file, timeline=True, cuda_dir=cudadir)
            for key in parameters:
                tmpdf[key] = parameters[key]
        
            #replace "Idle (0)" with 0.:
            for metric in metrics:
                if metric=="tensor_precision_fu_utilization":
                    tmpdf[metric] = tmpdf[metric].apply(lambda x: replace_tc_string(x))
    
            #combine read and write metrics
            tmpdf = tmpdf.groupby([x for x in tmpdf.columns if x not in metrics]).apply(lambda x: combine_metrics(x, metrics)).reset_index()
            lev = [x for x in tmpdf.columns if x.startswith("level_")][0]
            del tmpdf[lev]
            df_timeline.append(tmpdf)
    
            #import as summary
            tmpdf = import_nvprof_metric(file, timeline=False, cuda_dir=cudadir).sort_values(by="Name").reset_index(drop=True)
            tmpdf["Metric Mode"] = "read" if "read" in metric else "write" if "write" in metric else "write" if "store" in metric else "read" if "load" in metric else "total"
            tmpdf["Metric Name"] = metric.replace("read","").replace("write","").replace("store","").replace("load","").replace("__","_")
            for key in parameters:
                tmpdf[key] = parameters[key]
            del tmpdf["Metric Description"]
    
            #replace "Idle (0)" with 0.:
            for metric in metrics:
                if metric=="tensor_precision_fu_utilization":
                    tmpdf[ "Min" ] = tmpdf[ "Min" ].apply(lambda x: replace_tc_string(x))
                    tmpdf[ "Max" ] = tmpdf[ "Max" ].apply(lambda x: replace_tc_string(x))
                    tmpdf[ "Avg" ] = tmpdf[ "Avg" ].apply(lambda x: replace_tc_string(x))
            df_summary.append(tmpdf)
        
        #do time now
        file = selectdf.loc[ selectdf["metric"] == "time", "file" ].values[0]
        timedf, markerdf = import_nvprof_overview(file, cuda_dir=cudadir)
    
        #extract metric name
        parameters, _ = parse_filename(os.path.basename(file))
        for key in parameters:
            timedf[key] = parameters[key]
        df_times.append(timedf)
        
    #concat into frame
    metricdf = pd.concat(df_summary, sort=True)
    timedf = pd.concat(df_times, sort=True)
    
    #transpose
    profiledf = transpose_frame(timedf, metricdf)
    df_profiles.append(profiledf)

#concat everything
profiledf = pd.concat(df_profiles)

# Compute AI Results

In [22]:
#profiledf[ (profiledf["Network Name"]=="ResNet50-2") &\
#           (profiledf["Input Shape"]=="112x112x64") &\
#           (profiledf["Batch Size"]==16) &\
#           (profiledf["Precision"]=="FP32") &\
#           (profiledf["Stride Size"]==2) &\
#           (profiledf["Pass"]=="forward") &\
#           (profiledf["Kernel Shape"]=="7x7x64x64")
#         ]
profiledf

Unnamed: 0,Time Avg,Batch Size,Calls,Data Format,Input Shape,Kernel Shape,Name,Network Name,Pass,Precision,...,FP16 L1 AI,L2 AI,FP32 L2 AI,FP16 L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,Sysmem AI,FP32 Sysmem AI,FP16 Sysmem AI
0,0.000001,16,20,NHWC,112x112x64,3x3x64x64,cudnn::gemm::computeOffsetsKernel(cudnn::gemm:...,ResNet50-2,forward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
1,0.000001,16,20,NHWC,112x112x64,3x3x64x64,cudnn::gemm::computeOffsetsKernel(cudnn::gemm:...,ResNet50-2,backward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
2,0.000015,16,40,NHWC,112x112x64,3x3x64x64,void Eigen::internal::EigenMetaKernel<Eigen::T...,ResNet50-2,backward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
3,0.000975,16,40,NHWC,112x112x64,3x3x64x64,"void cudnn::detail::dgrad_engine<float, int=12...",ResNet50-2,backward,FP32,...,0.000000,1.885012,1.885012,0.000000,31.791065,31.791065,0.000000,2.377359e+07,23773593.6,0.000000e+00
4,0.000463,16,40,NHWC,112x112x64,3x3x64x64,"void cudnn::detail::wgrad_alg0_engine<float, i...",ResNet50-2,backward,FP32,...,0.000000,8.661180,8.661180,0.000000,28.079700,28.079700,0.000000,2.314691e+07,23146905.6,0.000000e+00
5,0.000078,16,40,NHWC,112x112x64,3x3x64x64,"void scalePackedTensor_kernel<float, float>(cu...",ResNet50-2,backward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
6,0.000206,16,40,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::PadInputCustomKernel...,ResNet50-2,backward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
7,0.000202,16,20,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::PadInputCustomKernel...,ResNet50-2,forward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
8,0.000203,16,60,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::PadInputCustomKernel...,ResNet50-2,backward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00
9,0.000003,16,100,NHWC,112x112x64,3x3x64x64,void tensorflow::functor::ShuffleInTensor3Simp...,ResNet50-2,backward,FP32,...,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000e+00,0.0,0.000000e+00


In [23]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Data Format", "Input Shape", "Kernel Shape", "Stride Size", \
                     "Batch Size", "Pass"]

#copy profiledf
combineddf = profiledf.copy()

#get the aggregated performance, including all kernels:
#compute weights: multiply all measures by the number of invocations
weighted = True
if weighted:
    #first, get all the names of metrics which need to be weighted
    metrics = [x for x in combineddf.columns if "Avg" in x]
    for metric in metrics:
        combineddf[metric] *= combineddf["Calls"]
    
#sum up
combineddf = combineddf.groupby(by=combinedselectkeys).sum()

#the flop fractions need to be recomputed
combineddf["TC Flops Fraction Avg"] = combineddf["TC Flops Avg"]/combineddf["Flops Avg"]
combineddf["FP16 Flops Fraction Avg"] = combineddf["FP16 Flops Avg"]/combineddf["Flops Avg"]
combineddf["FP16 non-TC Flops Fraction Avg"] = combineddf["FP16 non-TC Flops Avg"]/combineddf["Flops Avg"]
combineddf["FP32 Flops Fraction Avg"] = combineddf["FP32 Flops Avg"]/combineddf["Flops Avg"]

#get performance first
combineddf["Performance GFlop/s"] = combineddf["Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["FP32 Performance GFlop/s"] = combineddf["FP32 Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["FP16 Performance GFlop/s"] = combineddf["FP16 Flops Avg"]/(combineddf["Time Avg"]*10**9)
combineddf["TC Performance GFlop/s"] = combineddf["TC Flops Avg"]/(combineddf["Time Avg"]*10**9)

#get AI:
#L1 is L1+shared
combineddf["L1 AI"] = combineddf["Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
combineddf["FP32 L1 AI"] = combineddf["FP32 Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
combineddf["FP16 L1 AI"] = combineddf["FP16 Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
combineddf["TC L1 AI"] = combineddf["TC Flops Avg"]/(32.*(combineddf["L1 Transactions Avg"]+combineddf["Shared Transactions Avg"]+combineddf["Atomic Transactions Avg"]))
#L2
combineddf["L2 AI"] = combineddf["Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["FP32 L2 AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["FP16 L2 AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
combineddf["TC L2 AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["L2 Transactions Avg"])
#DRAM
combineddf["DRAM AI"] = combineddf["Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["FP32 DRAM AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["FP16 DRAM AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
combineddf["TC DRAM AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["DRAM Transactions Avg"])
#Sysmem
combineddf["Sysmem AI"] = combineddf["Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["FP32 Sysmem AI"] = combineddf["FP32 Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["FP16 Sysmem AI"] = combineddf["FP16 Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])
combineddf["TC Sysmem AI"] = combineddf["TC Flops Avg"]/(32.*combineddf["Sysmem Transactions Avg"])

#print
combineddf

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,Time Avg,Calls,FP32 Flops Avg,FP16 non-TC Flops Avg,TC Flops Avg,FP16 Flops Avg,Flops Avg,TC Flops Fraction Avg,FP16 Flops Fraction Avg,FP16 non-TC Flops Fraction Avg,...,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,Sysmem AI,FP32 Sysmem AI,FP16 Sysmem AI,TC L1 AI,TC L2 AI,TC DRAM AI,TC Sysmem AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,backward,0.196639,420,811565707960,12464662320,0.0,12464660000.0,824030400000.0,0.0,0.015126,0.015126,...,115.27097,113.527328,1.743642,12262360.0,12076870.0,185486.0,0.0,0.0,0.0,0.0
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,forward,0.008887,60,78354841600,1168128000,0.0,1168128000.0,79522970000.0,0.0,0.014689,0.014689,...,67.353405,66.364038,0.989367,8283643.0,8161963.0,121680.0,0.0,0.0,0.0,0.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,backward,0.046682,460,2671575040,64225280,3553212000000.0,3553276000000.0,3555948000000.0,0.999231,0.999249,1.8e-05,...,328.518062,0.246815,328.271248,48314510.0,36298.57,48278210.0,24.100168,48.738226,328.265314,48277340.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,forward,0.008267,60,1027604480,64225280,714099800000.0,714164000000.0,715191600000.0,0.998473,0.998563,9e-05,...,456.051929,0.655266,455.396662,74499120.0,107042.1,74392080.0,30.140257,71.45994,455.355708,74385390.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,0.042112,640,2472714240,542023680,1250946000000.0,1251488000000.0,1253960000000.0,0.997596,0.998028,0.000432,...,98.51902,0.194272,98.324748,12245710.0,24147.6,12221560.0,23.488733,43.163538,98.282163,12216260.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,0.006987,80,256901120,16056320,203463000000.0,203479100000.0,203736000000.0,0.99866,0.998739,7.9e-05,...,96.245516,0.121361,96.124155,15916870.0,20070.4,15896800.0,29.082267,56.758448,96.11657,15895550.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,backward,0.112195,460,5626265600,128450560,11626290000000.0,11626420000000.0,11632040000000.0,0.999505,0.999516,1.1e-05,...,614.692084,0.297318,614.394766,158044100.0,76443.83,157967600.0,32.609811,88.644047,614.387978,157965900.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,forward,0.014299,60,2055208960,128450560,1241030000000.0,1241158000000.0,1243214000000.0,0.998244,0.998347,0.000103,...,396.779583,0.655933,396.12365,129501400.0,214084.3,129287300.0,26.186449,61.873982,396.082654,129274000.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,0.051999,640,6416015360,561029120,2224060000000.0,2224621000000.0,2231037000000.0,0.996873,0.997124,0.000251,...,139.642428,0.401584,139.240844,21787470.0,62656.4,21724810.0,22.326575,44.164706,139.205729,21719330.0
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,0.007908,80,513802240,32112640,303853500000.0,303885600000.0,304399400000.0,0.998207,0.998312,0.000105,...,93.04582,0.157054,92.888766,23781200.0,40140.8,23741060.0,26.707765,60.885441,92.87895,23738550.0


In [24]:
#combineddf = combineddf.reset_index()
#seldf = combineddf[ (combineddf["Network Name"]=="ResNet50-2") &\
#           (combineddf["Input Shape"]=="112x112x64") &\
#           (combineddf["Precision"]=="FP32")]
#seldf
#combineddf[["FP32 L2 AI", "FP32 L1 AI"]]
combineddf[["L2 AI", "L1 AI"]]

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,Unnamed: 4_level_0,Unnamed: 5_level_0,Unnamed: 6_level_0,Unnamed: 7_level_0,L2 AI,L1 AI
Precision,Network Name,Data Format,Input Shape,Kernel Shape,Stride Size,Batch Size,Pass,Unnamed: 8_level_1,Unnamed: 9_level_1
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,backward,12.409742,1.853958
FP16,ResNet50-1,NHWC,224x224x3,7x7x3x64,2,16,forward,23.558102,2.267101
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,backward,48.775752,24.118724
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,1,16,forward,71.569199,30.18634
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,backward,43.267561,23.54534
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x128,2,16,forward,56.834593,29.121282
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,backward,88.687924,32.625952
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,1,16,forward,61.982853,26.232526
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,backward,44.303254,22.396615
FP16,ResNet50-2,NHWC,112x112x64,3x3x64x256,2,16,forward,60.99483,26.755749


# Export Data

In [26]:
profiledf.to_csv(os.path.join(outputdir,"full_profile.csv"))
combineddf.to_csv(os.path.join(outputdir,"combined_profile.csv"))