In [1]:
import subprocess as sp
import numpy as np
import pandas as pd
from io import StringIO
import os
import re
import shutil

from utils import *

In [2]:
# Global parameters used by the cells below.
# Root of the CUDA installation; handed to import_nsight_metric as `cuda_dir`.
cudadir = "/usr/common/software/cuda/10.2.89"
# Parent of the directory the notebook is currently running in.
homedir = os.path.split(os.getcwd())[0]

In [3]:
# Input and output locations.
# Earlier runs pointed `datadirs` at one of:
#   ["../scripts/tf_cnn_kernels_nsight/runs/386219"]
#   ["../scripts/tf_cnn_kernels_nsight/runs/386058"]
#   os.path.join(homedir,"data/tf_2.0b/new_nsight")
# Every directory listed here is scanned for .ncu-rep reports.
datadirs = [
    "../data/pytorch_1.5",
]
# NOTE(review): presumably the destination for exported results; no cell shown here reads it.
outputdir = "../results/pytorch_1.5"

# Functions

In [4]:
def _merge_on_keys(frame, extra, keys, columns):
    """Inner-join the selected `columns` of `extra` onto `frame` using `keys`."""
    return frame.merge(extra[keys + columns], on=keys, how="inner")


def _time_from_cycles(frame, metric_name, keys, column):
    """Derive a per-kernel time column from a cycle counter and its rate.

    The "total" rows of `metric_name` are divided by the "rate" rows times 1e9.
    Kernels for which either value is missing, or for which the quotient is
    NaN (0/0), end up with 0.
    """
    is_metric = frame["Metric Name"] == metric_name
    projection = keys + ["Metric Value"]
    cycles = frame.loc[is_metric & (frame["Metric Type"] == "total"), projection]
    rates = frame.loc[is_metric & (frame["Metric Type"] == "rate"), projection]
    # after the merge "Metric Value_x" holds the cycles and "Metric Value_y" the rate
    timing = cycles.merge(rates, on=keys, how="outer").fillna(0.)
    # NOTE(review): the 1e9 factor presumably converts a per-nanosecond rate - confirm the unit
    timing[column] = timing["Metric Value_x"] / (timing["Metric Value_y"] * 1e9)
    return timing.fillna(0.)


def _sum_metric_values(frame, mask, keys, column):
    """Sum "Metric Value" per key group over the rows selected by `mask`."""
    summed = frame.loc[mask, keys + ["Metric Value"]].groupby(keys).sum().reset_index()
    return summed.rename(columns={"Metric Value": column})


def _read_plus_write(frame, metric_name, keys, label):
    """Add the "read" and "write" rows of `metric_name` per key group.

    Returns a frame holding "<label> Transactions Avg" and "<label> Bytes Avg",
    where every transaction is counted as 32 bytes. A missing read or write
    value is treated as 0.
    """
    is_metric = frame["Metric Name"] == metric_name
    projection = keys + ["Metric Value"]
    reads = frame.loc[is_metric & (frame["Metric Type"] == "read"), projection]
    writes = frame.loc[is_metric & (frame["Metric Type"] == "write"), projection]
    traffic = writes.merge(reads, on=keys, how="outer").fillna(0.)
    traffic[label + " Transactions Avg"] = traffic["Metric Value_x"] + traffic["Metric Value_y"]
    traffic[label + " Bytes Avg"] = traffic[label + " Transactions Avg"] * 32
    return traffic


def transpose_frame(df_metrics, tc_peak_perf_flops=125*10**12):
    """Turn a long table of per-kernel metrics into one wide row per kernel.

    Parameters
    ----------
    df_metrics : pandas.DataFrame
        One row per (kernel, metric). Must contain the key columns
        "Precision", "Network Name", "Batch Size", "Pass" and "Name" as well
        as "Metric Name", "Metric Type" ("total", "rate", "read" or "write")
        and "Metric Value". Any further column is carried through unchanged.
        The frame itself is not modified.
    tc_peak_perf_flops : number, optional
        Peak tensor-core throughput in FLOP/s used to convert tensor-core
        utilization into FLOPs. The default of 125e12 is presumably the peak
        of the profiled Volta GPU - confirm before using other hardware.

    Returns
    -------
    pandas.DataFrame
        The input without its three metric columns, de-duplicated, sorted by
        the key columns and with a fresh 0..n-1 index. Added columns: CUDA and
        tensor-core time, FP32/FP16/TC FLOPs and their fractions, L1/L2/DRAM/
        SYSMEM transactions and bytes, performance in GFlop/s and arithmetic
        intensities (AI).

    Notes
    -----
    Every derived quantity is attached with an inner join, so a kernel that
    lacks any one of the required metric groups is dropped from the result.
    FP64 instructions are not part of the FLOP totals.
    """
    selectkeys = ["Precision", "Network Name", "Batch Size", "Pass", "Name"]

    # sort_values returns a new frame, so the caller's data is left untouched
    metricdf = df_metrics.sort_values(by=selectkeys).reset_index(drop=True)

    ####### Get timing information
    for metric_name, column in (("smsp__cycles_elapsed", "CUDA Time Avg"),
                                ("smsp__pipe_tensor_op_hmma_cycles_active", "TC Time Avg")):
        timing = _time_from_cycles(metricdf, metric_name, selectkeys, column)
        metricdf = _merge_on_keys(metricdf, timing, selectkeys, [column])

    ####### Get number of FLOPs

    ### FMA FLOPs = number of FMA instructions x 2
    metricdf.loc[metricdf["Metric Name"].str.contains("fma"), "Metric Value"] *= 2

    ### FP32 and FP16 FLOPs: add, fma and mul instruction counts summed per kernel
    flop_metrics = (
        ("FP32 FLOPs Avg", ['smsp__sass_thread_inst_executed_op_fadd_pred_on',
                            'smsp__sass_thread_inst_executed_op_ffma_pred_on',
                            'smsp__sass_thread_inst_executed_op_fmul_pred_on']),
        ("FP16 FLOPs Avg", ['smsp__sass_thread_inst_executed_op_hadd_pred_on',
                            'smsp__sass_thread_inst_executed_op_hfma_pred_on',
                            'smsp__sass_thread_inst_executed_op_hmul_pred_on']),
    )
    for column, metrics in flop_metrics:
        flopsdf = _sum_metric_values(metricdf, metricdf["Metric Name"].isin(metrics), selectkeys, column)
        metricdf = _merge_on_keys(metricdf, flopsdf, selectkeys, [column])

    ### TC FLOPs = peak throughput x utilization x tensor-core time
    tcdf = metricdf.loc[metricdf["Metric Name"] == "sm__inst_executed_pipe_tensor_op_hmma",
                        selectkeys + ["TC Time Avg", "Metric Value"]].copy()
    # the metric value is treated as a percentage, hence the factor 0.01
    tcdf["Utilization"] = 0.01 * tcdf["Metric Value"]
    tcdf["TC FLOPs Avg"] = tc_peak_perf_flops * tcdf["Utilization"] * tcdf["TC Time Avg"]
    metricdf = _merge_on_keys(metricdf, tcdf, selectkeys, ["TC FLOPs Avg"])

    ### Total FLOPs
    metricdf["FLOPs Avg"] = metricdf["FP32 FLOPs Avg"] + metricdf["FP16 FLOPs Avg"] + metricdf["TC FLOPs Avg"]

    ### FLOPs fractions
    metricdf["FP32 FLOPs Fraction Avg"] = metricdf["FP32 FLOPs Avg"]/metricdf["FLOPs Avg"]
    metricdf["FP16 FLOPs Fraction Avg"] = metricdf["FP16 FLOPs Avg"]/metricdf["FLOPs Avg"]
    metricdf["TC FLOPs Fraction Avg"]   = metricdf["TC FLOPs Avg"]/metricdf["FLOPs Avg"]

    ####### Get number of bytes

    ### L1: shared, atomic, local and global transactions
    atomic_metrics = ['l1tex__t_set_accesses_pipe_lsu_mem_global_op_atom',
                      'l1tex__t_set_accesses_pipe_lsu_mem_global_op_red',
                      'l1tex__t_set_accesses_pipe_tex_mem_surface_op_atom',
                      'l1tex__t_set_accesses_pipe_tex_mem_surface_op_red']
    l1_components = (
        ("Shared Transactions Avg",
         lambda names: names.str.contains("l1tex__data_pipe_lsu_wavefronts_mem_shared_op")),
        ("L1 Atomic Transactions Avg",
         lambda names: names.isin(atomic_metrics)),
        ("Local Transactions Avg",
         lambda names: names.str.contains("l1tex__t_sectors_pipe_lsu_mem_local_op")),
        ("Global Transactions Avg",
         lambda names: names.str.contains("l1tex__t_sectors_pipe_lsu_mem_global_op")),
    )
    for column, select in l1_components:
        # the mask is rebuilt each round because every merge creates a new frame
        partdf = _sum_metric_values(metricdf, select(metricdf["Metric Name"]), selectkeys, column)
        metricdf = _merge_on_keys(metricdf, partdf, selectkeys, [column])

    metricdf["L1 Transactions Avg"] = (metricdf["Shared Transactions Avg"] + metricdf["L1 Atomic Transactions Avg"]
                            + metricdf["Local Transactions Avg"] + metricdf["Global Transactions Avg"])
    metricdf["L1 Bytes Avg"] = metricdf["L1 Transactions Avg"] * 32

    ### L2: rows typed "total" (atomics & reductions) count twice
    is_l2_total = metricdf["Metric Name"].str.contains("lts__t_sectors_op") & (metricdf["Metric Type"] == "total")
    metricdf.loc[is_l2_total, "Metric Value"] *= 2

    l2df = _sum_metric_values(metricdf, metricdf["Metric Name"].str.contains("lts__t_sectors_op"),
                              selectkeys, "L2 Transactions Avg")
    l2df["L2 Bytes Avg"] = l2df["L2 Transactions Avg"] * 32
    metricdf = _merge_on_keys(metricdf, l2df, selectkeys, ["L2 Transactions Avg", "L2 Bytes Avg"])

    ### DRAM and host memory: reads + writes
    for metric_name, label in (("dram__sectors", "DRAM"),
                               ("lts__t_sectors_aperture_sysmem_op", "SYSMEM")):
        trafficdf = _read_plus_write(metricdf, metric_name, selectkeys, label)
        metricdf = _merge_on_keys(metricdf, trafficdf, selectkeys,
                                  [label + " Transactions Avg", label + " Bytes Avg"])

    ####### Clean up: without the metric columns all rows of a kernel are identical
    metricdf = metricdf.drop(columns=["Metric Value", "Metric Name", "Metric Type"])
    metricdf = metricdf.drop_duplicates(keep='first')

    ### Get performance
    metricdf["Performance GFlop/s"]      = metricdf["FLOPs Avg"]      / (metricdf["CUDA Time Avg"]*10**9)
    metricdf["FP32 Performance GFlop/s"] = metricdf["FP32 FLOPs Avg"] / (metricdf["CUDA Time Avg"]*10**9)
    metricdf["FP16 Performance GFlop/s"] = metricdf["FP16 FLOPs Avg"] / (metricdf["CUDA Time Avg"]*10**9)
    metricdf["TC Performance GFlop/s"]   = metricdf["TC FLOPs Avg"]   / (metricdf["TC Time Avg"]*10**9)

    ### Get AI = FLOPs / bytes for every memory level
    for level in ("L1", "L2", "DRAM", "SYSMEM"):
        level_bytes = metricdf[level + " Bytes Avg"]
        metricdf[level + " AI"]           = metricdf["FLOPs Avg"]      / level_bytes
        metricdf["FP32 " + level + " AI"] = metricdf["FP32 FLOPs Avg"] / level_bytes
        metricdf["FP16 " + level + " AI"] = metricdf["FP16 FLOPs Avg"] / level_bytes
        metricdf["TC " + level + " AI"]   = metricdf["TC FLOPs Avg"]   / level_bytes

    ### Final ordering: the sorted frame has to be assigned; chaining an
    ### inplace reset_index onto sort_values would only change a temporary.
    return metricdf.sort_values(by=selectkeys).reset_index(drop=True)

# Import Data

In [5]:
#collect every Nsight Compute report (*.ncu-rep) found in the data directories
files = []
for datadir in datadirs:
    files.extend(os.path.join(datadir, entry)
                 for entry in sorted(os.listdir(datadir))
                 if os.path.splitext(entry)[-1] == ".ncu-rep")

#fail early with a readable message instead of a KeyError from sort_values below
if not files:
    raise FileNotFoundError("no .ncu-rep reports found in {}".format(datadirs))

#one record per report: the file name without its extension plus the full path
records = [{"prefix": os.path.basename(report).rsplit(".", 1)[0], "file": report}
           for report in files]

#put in df
recorddf = pd.DataFrame(records).sort_values(["prefix"])
#with pd.option_context('display.max_rows', None, 'display.max_columns', None):

In [6]:
#sort by those keys:
sortkeys = ["Network Name", \
            "Batch Size", "Pass", \
            "Precision", "Device", "Name"]

#every report prefix has to carry a ".pass_<name>." component
pass_pattern = re.compile(r'.*\.pass_(.*?)\.')


def _pass_from_prefix(prefix):
    """Return the pass name encoded in a report prefix (the text after ".pass_")."""
    found = pass_pattern.match(prefix)
    if found is None:
        raise ValueError("cannot extract the pass name from report prefix '{}'".format(prefix))
    return found.groups()[0]


def _base_metric_name(name):
    """Strip the statistic suffix and the read/write markers from a metric name."""
    base = name.split(".")[0]
    for marker in ("_write", "_read", "_ld", "_st"):
        base = base.replace(marker, "")
    return base


#group by prefixes and passes; both are computed once instead of once per loop round
record_prefix = recorddf["prefix"].apply(lambda x: x.split(".pass")[0])
record_pass = recorddf["prefix"].apply(_pass_from_prefix)
all_prefixes = set(record_prefix)
all_passes = set(record_pass)

#substring found in the raw metric name -> metric type; later entries win
type_markers = (("_read", "read"), ("_ld", "read"),
                ("_write", "write"), ("_st", "write"),
                (".per_second", "rate"))

#metrics
df_profiles = []
raw_frames = []

#sorted() makes the processing order reproducible between kernel restarts
for pref in sorted(all_prefixes):

    df_metrics = []
    for pas in sorted(all_passes):

        #only the reports of this prefix AND this pass
        files = recorddf.loc[(record_prefix == pref) & (record_pass == pas), "file"].values

        #project the individual files
        metricfiles = [x for x in files if x.endswith(".ncu-rep")]

        for metricfile in metricfiles:

            #print the file
            print(metricfile)

            #get the parameters from the filename
            parameters = parse_filename_nsight(os.path.basename(metricfile))

            #metrics
            filedf = import_nsight_metric(metricfile, cuda_dir=cudadir)
            for key, value in parameters.items():
                filedf[key] = value

            #fuse read/write metrics together:
            unique_metrics = set(_base_metric_name(x) for x in filedf["Metric Name"].unique())

            #add the metric type; the markers are plain substrings, not regular expressions
            filedf["Metric Type"] = "total"
            for marker, metric_type in type_markers:
                filedf.loc[filedf["Metric Name"].str.contains(marker, regex=False), "Metric Type"] = metric_type

            #collapse every variant onto its base metric name
            for metric in sorted(unique_metrics):
                filedf.loc[filedf["Metric Name"].str.startswith(metric), "Metric Name"] = metric

            #append to DF:
            df_metrics.append(filedf)

    #concat the frames of this prefix
    prefixdf = pd.concat(df_metrics).reset_index(drop=True)
    raw_frames.append(prefixdf)

    #compute the profile
    df_profiles.append(transpose_frame(prefixdf))

if not df_profiles:
    raise ValueError("no profiles were built: recorddf does not list any report")

#raw metrics of all reports; the export cell writes this frame to metrics.csv
metricdf = pd.concat(raw_frames).reset_index(drop=True)

#concat everything
profiledf = pd.concat(df_profiles).reset_index(drop=True)

../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_read.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_dram__sectors_write.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_ld.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__data_pipe_lsu_wavefronts_mem_shared_op_st.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_ld.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_global_op_st.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_ld.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_sectors_pipe_lsu_mem_local_op_st.sum.ncu-rep
../data/pytorch_1.5/profile.pass_backward.batchsize_2.metric_l1tex__t_set_accesses_pipe_lsu_mem_global_op_ato

In [7]:
#display the per-kernel profile table built by the import cell
profiledf

Unnamed: 0,Name,Invocations,Network Name,Batch Size,Pass,Precision,CUDA Time Avg,TC Time Avg,FP32 FLOPs Avg,FP16 FLOPs Avg,...,FP16 L2 AI,TC L2 AI,DRAM AI,FP32 DRAM AI,FP16 DRAM AI,TC DRAM AI,SYSMEM AI,FP32 SYSMEM AI,FP16 SYSMEM AI,TC SYSMEM AI
0,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1...,384.0,deepCam,2,backward,mixed,0.000165,0.000165,3.382784e+06,0.0,...,0.000000,86.269259,460.004408,0.110112,0.000000,459.894296,6.308905e+07,1.510171e+04,0.000000,6.307394e+07
1,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1...,12.0,deepCam,2,backward,mixed,0.000120,0.000120,2.048000e+06,0.0,...,0.000000,175.980559,411.945348,0.117225,0.000000,411.828123,3.212944e+07,9.142857e+03,0.000000,3.212029e+07
2,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x1...,6.0,deepCam,2,backward,mixed,0.002004,0.002003,6.930432e+06,0.0,...,0.000000,421.062825,1362.422104,0.056049,0.000000,1362.366055,7.520624e+08,3.093943e+04,0.000000,7.520315e+08
3,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_128x2...,12.0,deepCam,2,backward,mixed,0.002617,0.002624,3.538944e+06,0.0,...,0.000000,133.586092,1491.376331,0.020842,0.000000,1491.355488,1.130491e+09,1.579886e+04,0.000000,1.130475e+09
4,Volta_hmma_implicit_gemm_wgrad_fp32_nhwc_64x32...,24.0,deepCam,2,backward,mixed,0.000211,0.000230,2.347008e+06,0.0,...,0.000000,30.525030,70.019151,0.017258,0.000000,70.001893,4.251066e+07,1.047771e+04,0.000000,4.250018e+07
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
110,volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f...,6.0,deepCam,2,forward,mixed,0.000080,0.000080,1.061683e+07,663552.0,...,0.010521,63.149830,262.910613,0.698876,0.043680,262.168057,1.783014e+07,4.739657e+04,2962.285714,1.777978e+07
111,volta_fp16_s884cudnn_fp16_256x64_ldg8_relu_f2f...,6.0,deepCam,2,forward,mixed,0.000303,0.000307,5.662310e+07,3538944.0,...,0.010270,57.673099,234.258467,0.665453,0.041591,233.551423,8.898636e+07,2.527817e+05,15798.857143,8.871778e+07
112,volta_fp16_s884gemm_fp16_128x128_ldg8_f2f_nt,36.0,deepCam,2,forward,mixed,0.000425,0.000469,5.573837e+07,0.0,...,0.000000,64.365833,246.901319,0.387155,0.000000,246.514164,1.586883e+08,2.488320e+05,0.000000,1.584394e+08
113,volta_fp16_scudnn_fp16_128x64_relu_interior_nn_v1,12.0,deepCam,2,forward,mixed,0.000171,0.000000,1.833173e+09,0.0,...,0.000000,0.000000,30.543450,30.543450,0.000000,0.000000,8.183808e+06,8.183808e+06,0.000000,0.000000e+00


# Compute AI Results

In [8]:
#sum over all kernels
combinedselectkeys = ["Precision", "Network Name", "Batch Size", "Pass"]

#copy profiledf so the per-kernel table stays untouched
combineddf = profiledf.copy()

#get the aggregated performance, including all kernels:
#compute weights: multiply all measures by the number of invocations
weighted = True
if weighted:
    #every column with "Avg" in its name is a per-invocation value
    metrics = [x for x in combineddf.columns if "Avg" in x]
    for metric in metrics:
        combineddf[metric] *= combineddf["Invocations"]

#sum up; numeric_only keeps text columns such as the kernel name out of the sum.
#The group keys become the index of the result.
combineddf = combineddf.groupby(by=combinedselectkeys).sum(numeric_only=True)

#everything below is a ratio: a sum of per-kernel ratios is meaningless,
#so each one is recomputed from the summed numerators and denominators

#the flop fractions need to be recomputed
combineddf["FP32 FLOPs Fraction Avg"] = combineddf["FP32 FLOPs Avg"] / combineddf["FLOPs Avg"]
combineddf["FP16 FLOPs Fraction Avg"] = combineddf["FP16 FLOPs Avg"] / combineddf["FLOPs Avg"]
combineddf["TC FLOPs Fraction Avg"]   = combineddf["TC FLOPs Avg"]   / combineddf["FLOPs Avg"]

### Get performance
combineddf["Performance GFlop/s"]      = combineddf["FLOPs Avg"]      / (combineddf["CUDA Time Avg"]*10**9)
combineddf["FP32 Performance GFlop/s"] = combineddf["FP32 FLOPs Avg"] / (combineddf["CUDA Time Avg"]*10**9)
combineddf["FP16 Performance GFlop/s"] = combineddf["FP16 FLOPs Avg"] / (combineddf["CUDA Time Avg"]*10**9)
combineddf["TC Performance GFlop/s"]   = combineddf["TC FLOPs Avg"]   / (combineddf["TC Time Avg"]*10**9)

### Get AI for every memory level, SYSMEM included
for level in ("L1", "L2", "DRAM", "SYSMEM"):
    level_bytes = combineddf[level + " Bytes Avg"]
    combineddf[level + " AI"]           = combineddf["FLOPs Avg"]      / level_bytes
    combineddf["FP32 " + level + " AI"] = combineddf["FP32 FLOPs Avg"] / level_bytes
    combineddf["FP16 " + level + " AI"] = combineddf["FP16 FLOPs Avg"] / level_bytes
    combineddf["TC " + level + " AI"]   = combineddf["TC FLOPs Avg"]   / level_bytes

#order by the group keys; they live in the index, so reset_index(drop=True)
#must not be used here - it would throw the keys away
combineddf = combineddf.sort_index()

# Export Data

In [9]:
# Write the raw metric table and the per-kernel profile into the current working directory.
# NOTE(review): `outputdir` from the configuration cell is not used here, and
# `metricdf` is whatever frame the import cell bound last - confirm both are intended.
for frame, csv_path in ((metricdf, "./metrics.csv"), (profiledf, "./profile.csv")):
    frame.to_csv(csv_path)