In [118]:
import pandas as pd
import gzip
import json
import csv
import re
import pandas as pd
import numpy as np
from tqdm import tqdm 

def parse_name(name):
  """Map an op name to one of the coarse kernel categories used in the reports.

  The name is lower-cased and stripped of digits, then tested against keyword
  rules in priority order; the first rule that matches decides the category.

  Args:
    name: op name string (the caller passes the quoted value found after
      ``metadata`` on an HLO line).

  Returns:
    One of 'Others', 'OPT', 'LayerNorm + Dropout', 'BN+ReLU',
    'Softmax+Dropout', or 'Fusion' when no keyword matches.
  """
  name = re.sub(r'[0-9]+', '', name.lower())

  def has(*keywords):
    # True when at least one keyword occurs anywhere in the normalised name.
    return any(keyword in name for keyword in keywords)

  if has('loss', 'entropy'):
    return 'Others'
  if has('optimizer', 'update', 'adam'):
    return 'OPT'
  # Explicit layer-norm names are tested before batch-norm so that a name
  # mentioning both (e.g. '.../LayerNorm/batchnorm/...') is still reported as
  # layer norm.
  if has('layernorm', 'layer_norm'):
    return 'LayerNorm + Dropout'
  # Batch-norm must be tested before the generic 'norm' rule below; otherwise
  # 'norm' swallows every batch-norm name and this category is unreachable.
  if has('batchnorm', 'batch_norm'):
    return 'BN+ReLU'
  if 'norm' in name:
    return 'LayerNorm + Dropout'
  if 'relu' in name:
    return 'BN+ReLU'
  # Disabled rule kept for reference:
  #   'cast' in name -> 'Fusion(Cast)'
  # `and` binds tighter than `or`; the parentheses only make that explicit.
  if ('dropout' in name and 'self' in name) or '/type_embeddings/matmul' in name:
    return 'Softmax+Dropout'
  if 'dropout' in name:
    return 'LayerNorm + Dropout'
  if 'softmax' in name:
    return 'Softmax+Dropout'
  return 'Fusion'

def find_kernel_metadata(name, hlo_file_name):
  """Look up a kernel in an HLO text dump and return its category.

  Args:
    name: kernel name. A name without any underscore is returned unchanged.
      Otherwise underscores are turned into dots and a trailing space is
      appended to form the search key (e.g. 'fusion_7' -> 'fusion.7 ').
    hlo_file_name: path of the HLO text file to scan.

  Returns:
    ``name`` itself when it contains no underscore; otherwise the result of
    ``parse_name`` applied to the fourth double-quote-delimited field after
    the word 'metadata' on the first line that contains the search key and
    has such a field; 'Fusion' when no line qualifies.
  """
  if '_' not in name:
    return name

  needle = name.replace('_', '.') + ' '
  with open(hlo_file_name, 'r') as hlo:
    for hlo_line in hlo.readlines():
      if needle not in hlo_line:
        continue
      around_metadata = hlo_line.split('metadata')
      if len(around_metadata) < 2:
        # The key is on this line but there is no metadata field; keep scanning.
        continue
      quoted_fields = around_metadata[1].split('"')
      if len(quoted_fields) > 3:
        return parse_name(quoted_fields[3])
    return 'Fusion'

# Category labels that are kept as-is in the NAME column; the setup_dataframe*
# functions relabel every NAME that is not in this list as 'Others'.
namespace = [
    'CONV+BN+ELEMWISE', 'NDP_OP', 'BN+ELEMWISE', 'CONV', 'FC', 'POOLING',
    'dxCONV', 'dwCONV', 'dxFC', 'dwFC', 'OPT',
    'Fusion(Loss)', 'Fusion(Optimizer)', 'Fusion(LayerNorm)',
    'Fusion(BatchNorm)', 'Fusion(Cast)', 'Fusion(Dropout)',
    'Fusion(Einsum)', 'Fusion(BiasAdd)', 'Fusion(Gelu)', 'Fusion(ReLU)',
    'Fusion(SelfAttention)', 'BN+ReLU', 'Fusion(Softmax)', 'LayerNorm',
    'GEMM', 'Dropout', 'LayerNorm + Dropout', 'Softmax+Dropout',
]

def setup_dataframe(df):
   """Normalise raw kernel names in ``df`` into report categories.

   ``df`` is modified in place and also returned.

   Steps:
     1. Rewrite ``df['NAME']`` with an ordered list of regex rules.
     2. Rename the CONFIG value 'NDPX_baseline_64' to 'NDPX_wc64'.
     3. Relabel every NAME that is not in the module-level ``namespace`` list
        as 'Others'.
     4. Replace the CYCLE placeholder '  NOT FOUND' with the string '0'.

   Args:
     df: DataFrame with at least the columns NAME, CONFIG and CYCLE.

   Returns:
     The same DataFrame object, with the columns above rewritten.
   """
   # (pattern, replacement) pairs applied one after another, so later rules see
   # the output of earlier ones. With regex=True only the matched portion of a
   # string is substituted, so a pattern needs `.*` on both sides to replace
   # the whole name.
   rename_rules = [
      ('fusion.*', 'fusion'),
      ('dgemm', 'GEMM'),
      ('wgemm', 'GEMM'),
      ('.*MetaKernel.*', 'elementwise'),
      ('.*add.*', 'elementwise'),
      ('.*mul.*', 'elementwise'),
      ('.*log.*', 'elementwise'),
      ('.*gemm_.*_tn', 'GEMM'),
      ('.*gemm_.*_nt', 'GEMM'),
      ('.*dgrad.*', 'dxCONV'),
      ('.*wgrad.*', 'dwCONV'),
      ('.*1688cudnn.*', 'CONV+BN+ELEMWISE'),
      ('.*convol.*', 'CONV+BN+ELEMWISE'),
      ('.*gemm_.*', 'GEMM'),
      ('gemm', 'GEMM'),
      ('conv', 'CONV'),
      ('.*conv2d.*', 'CONV+BN+ELEMWISE'),
      ('.*first_layer.*', 'CONV+BN+ELEMWISE'),
      ('pool', 'POOLING'),
      ('.*c1_k1.*', 'CONV+BN+ELEMWISE'),
      ('.*bn.*', 'BN+ELEMWISE'),
      # Was '.*adam*': the `*` applied only to the final 'm' and nothing
      # consumed the rest of the name, so 'x_adam_y' became 'OPT_y' and was
      # later relabelled 'Others' instead of 'OPT'.
      ('.*adam.*', 'OPT'),
   ]

   names = df['NAME']
   for pattern, label in rename_rules:
      names = names.replace(pattern, label, regex=True)
   df['NAME'] = names

   df['CONFIG'] = df['CONFIG'].replace('NDPX_baseline_64', 'NDPX_wc64')
   # Intermediate labels such as 'fusion' and 'elementwise' are not in
   # `namespace`, so they are collapsed into 'Others' here.
   df.loc[~df['NAME'].isin(namespace), 'NAME'] = 'Others'
   df.loc[df['CYCLE'] == '  NOT FOUND', 'CYCLE'] = '0'
   return df

def setup_dataframe_baseline(df, hlo_path):
   """Normalise raw kernel names of a baseline run into report categories.

   ``df`` is modified in place and also returned. As a side effect the final
   frame is written to './after.csv' in the current working directory.

   Steps:
     1. Rewrite ``df['NAME']`` with an ordered list of regex rules.
     2. For every remaining NAME that is not in the module-level ``namespace``
        list, ask ``find_kernel_metadata`` for a category using the HLO dump
        at ``hlo_path``.
     3. Relabel every NAME that is still not in ``namespace`` as 'Others'.
     4. Replace the CYCLE placeholder '  NOT FOUND' with the string '0'.

   Args:
     df: DataFrame with at least the columns NAME and CYCLE.
     hlo_path: path of the HLO text file passed to ``find_kernel_metadata``.

   Returns:
     The same DataFrame object, with NAME and CYCLE rewritten.
   """
   # (pattern, replacement) pairs applied one after another, so later rules see
   # the output of earlier ones. With regex=True only the matched portion of a
   # string is substituted, so a pattern needs `.*` on both sides to replace
   # the whole name.
   rename_rules = [
      ('dgemm', 'dxFC'),
      ('wgemm', 'GEMM'),
      ('.*MetaKernel.*', 'elementwise'),
      ('.*add.*', 'elementwise'),
      ('.*mul.*', 'elementwise'),
      ('.*log.*', 'elementwise'),
      ('.*gemm_.*_tn', 'GEMM'),
      ('.*gemm_.*_nt', 'GEMM'),
      ('.*dgrad.*', 'dxCONV'),
      ('.*wgrad.*', 'dwCONV'),
      ('.*1688cudnn.*', 'CONV'),
      ('.*convol.*', 'CONV'),
      ('.*gemm_.*', 'GEMM'),
      ('gemm', 'GEMM'),
      ('conv', 'CONV'),
      ('.*conv2d.*', 'CONV'),
      ('.*first_layer.*', 'CONV'),
      ('pool', 'POOLING'),
      ('.*c1_k1.*', 'CONV'),
      ('.*bn.*', 'BN+ELEMWISE'),
      # Was '.*adam*': the `*` applied only to the final 'm' and nothing
      # consumed the rest of the name, so 'x_adam_y' became 'OPT_y' instead
      # of 'OPT'.
      ('.*adam.*', 'OPT'),
   ]

   names = df['NAME']
   for pattern, label in rename_rules:
      names = names.replace(pattern, label, regex=True)
   df['NAME'] = names

   # Resolve each distinct unknown name once. Iterating row by row repeated
   # the HLO file scan for every duplicate even though the first pass had
   # already renamed all rows carrying that name.
   unresolved = df.loc[~df['NAME'].isin(namespace), 'NAME'].unique()
   for kernel_name in unresolved:
      opname = find_kernel_metadata(kernel_name, hlo_path)
      df.loc[df['NAME'] == kernel_name, 'NAME'] = opname

   df.loc[~df['NAME'].isin(namespace), 'NAME'] = 'Others'
   df.loc[df['CYCLE'] == '  NOT FOUND', 'CYCLE'] = '0'
   df.to_csv('./after.csv')
   return df


In [143]:
# Baseline cycle breakdown: for each (model, single_model) pair and batch size,
# load both baseline CSVs, label their kernels with setup_dataframe_baseline,
# and print the per-category CYCLE totals of each run followed by half of
# (model - single_model).
batch_size = [4]  # [16,64]
models = ['BERT_large_3_b']
single_models = ['BERT_large_1_b']
# models = ['resnet18']
sync = ['nosync']
GPUs = [2]
config = 'NDPX_baseline_64'
# HOPS ranges referenced only by the disabled NDPX section at the end of this cell.
NDPX_MIDDLE_ENCODER_FW_START = 11
NDPX_MIDDLE_ENCODER_FW_END = 19
NDPX_MIDDLE_ENCODER_BW_START = 42
NDPX_MIDDLE_ENCODER_BW_END = 50
# BASELINE_MIDDLE_ENCODER_FW_START = 8
# BASELINE_MIDDLE_ENCODER_FW_END = 13
# BASELINE_MIDDLE_ENCODER_BW_START = 30
# BASELINE_MIDDLE_ENCODER_BW_END = 35
N_ENCODERS = 24
for model, single_model in zip(models, single_models):
   total_result = pd.DataFrame()

   for b in batch_size:
      baseline_home = '/home/shared/CXL_memory_buffer/BASELINE_ISCA/csv_files/'
      ndp_csv_home = '/home/shared/CXL_memory_buffer/NDP_ISCA/csv_files/'
      baseline_path_3 = baseline_home + model + str(b) + '-NDPX_baseline_64.csv'
      baseline_path_single = baseline_home + single_model + str(b) + '-NDPX_baseline_64.csv'

      # NDPX result files (forward file, then '-bw' file, per GPU/sync combo).
      # Only the disabled NDPX section below reads this list.
      ndpx_file_paths = []
      for GPU in GPUs:
         for s in sync:
            if GPU == 1 and s == 'sync':
               continue
            file_path = ndp_csv_home + model + str(b) + '-' + config + '-' + str(GPU) + '-' + str(GPU) + '-' + s + '.csv'
            file_path_bw = ndp_csv_home + model + str(b) + '-' + config + '-' + str(GPU) + '-' + str(GPU) + '-' + s + '-bw.csv'
            ndpx_file_paths.append(file_path)
            ndpx_file_paths.append(file_path_bw)

      baseline_3 = pd.read_csv(baseline_path_3)
      baseline_3['GPUS'] = '1_GPU'
      baseline_3['CONFIG'] = 'BASELINE'
      baseline_1 = pd.read_csv(baseline_path_single)
      baseline_1['GPUS'] = '1_GPU'
      baseline_1['CONFIG'] = 'BASELINE'

      baseline_hlp_path = baseline_home + 'xla_hlo/' + model + str(b) + '.txt'
      baseline_3 = setup_dataframe_baseline(baseline_3, baseline_hlp_path)
      baseline_3['CYCLE'] = pd.to_numeric(baseline_3['CYCLE'])
      baseline_3_hops = pd.read_csv(f'/home/shared/CXL_memory_buffer/BASELINE_ISCA/traces/{model}{str(b)}/kernelslist.g.hops')
      baseline_3 = pd.merge(baseline_3, baseline_3_hops, how='left', on='ID')
      # Rows whose HOPS is missing after the left merge match neither mask and
      # keep DIRECTION = NaN.
      baseline_3.loc[baseline_3.HOPS > 20, 'DIRECTION'] = 'backward'
      baseline_3.loc[baseline_3.HOPS <= 20, 'DIRECTION'] = 'forward'

      baseline_hlp_path = baseline_home + 'xla_hlo/' + single_model + str(b) + '.txt'
      baseline_1 = setup_dataframe_baseline(baseline_1, baseline_hlp_path)
      baseline_1['CYCLE'] = pd.to_numeric(baseline_1['CYCLE'])

      # Select CYCLE before aggregating: summing every column first also tries
      # to add up non-numeric columns such as DIRECTION, which is wasteful and
      # can fail on pandas versions where numeric_only defaults to False.
      group_keys = ['CONFIG', 'GPUS', 'NAME']
      baseline_1 = baseline_1.groupby(group_keys)[['CYCLE']].sum()
      baseline_3 = baseline_3.groupby(group_keys)[['CYCLE']].sum()
      print(baseline_1)
      print(baseline_3)

      # Index-aligned difference; a category present in only one run yields NaN.
      baseline_large = baseline_3 - baseline_1
      print(baseline_large/2)

      # ---- Disabled: extrapolation to N_ENCODERS and NDPX comparison (kept for reference) ----
#       mask = baseline_large.CYCLE < 0
#       baseline_large.loc[mask, 'CYCLE'] = 0
#       df = (baseline_large / 2 ) * (N_ENCODERS - 1) + baseline_1

#       ndpx_hops = pd.read_csv(f'/home/shared/CXL_memory_buffer/NDP_ISCA/traces/{model}{str(b)}/kernelslist.g.hops')
#       ndpx = pd.read_csv(ndpx_file_paths[0])
#       ndpx = pd.concat([ndpx, pd.read_csv(ndpx_file_paths[1])])
#       ndpx = pd.merge(ndpx, ndpx_hops, how='left', on='ID')
#       ndpx['GPUS'] = ndpx_file_paths[0].split('-')[-2]+'_GPU'

#       for i in range(1,len(GPUs)):
#           ndpx1 = pd.read_csv(ndpx_file_paths[2*i])
#           ndpx1 = pd.concat([ndpx1, pd.read_csv(ndpx_file_paths[2*i+1])])
#           ndpx1 = pd.merge(ndpx1, ndpx_hops, how='left', on='ID')
#           ndpx1['GPUS'] = ndpx_file_paths[2*i].split('-')[-2]+'_GPU'
#           ndpx = pd.concat([ndpx, ndpx1])

#       ndpx = setup_dataframe(ndpx)
#       print(ndpx)
#       ndpx['CYCLE'] = pd.to_numeric(ndpx['CYCLE'])
#       ndpx.loc[ndpx.HOPS > 29, 'DIRECTION'] = 'backward'
#       ndpx.loc[ndpx.HOPS <= 29, 'DIRECTION'] = 'forward'
#       ndpx.loc[(ndpx.HOPS >= NDPX_MIDDLE_ENCODER_FW_START) & (ndpx.HOPS <= NDPX_MIDDLE_ENCODER_FW_END), 'CYCLE'] *= (N_ENCODERS -2)
#       ndpx.loc[(ndpx.HOPS >= NDPX_MIDDLE_ENCODER_BW_START) & (ndpx.HOPS <= NDPX_MIDDLE_ENCODER_BW_END), 'CYCLE'] *= (N_ENCODERS -2)
#       ndpx.loc[(ndpx.HOPS == 0) & (ndpx.HOPS != ndpx.ID.min()), 'CYCLE'] = 0

#       grouped = ndpx.groupby(['CONFIG','GPUS','NAME', "DIRECTION"]).sum()[[ 'CYCLE']]

#       baseline_runtime = pd.to_numeric(df['CYCLE']).sum()

#       df = pd.concat([df, grouped])
#       df['BATCH'] = b
#       total_result = pd.concat([total_result, df])
#       df.to_csv('./outputs/'+model+'_grouped_value-'+str(b)+'.csv')
#       normalized = df
#       normalized['CYCLE'] = df['CYCLE'] / baseline_runtime
#       normalized.to_csv('./outputs/'+model+'_total_normalized2-'+str(b)+'.csv')

#    total_result.to_csv(f'./outputs/{model}_total_batches.csv')


                                       CYCLE
CONFIG   GPUS  NAME                         
BASELINE 1_GPU GEMM                 26578592
               LayerNorm + Dropout   2368218
               OPT                  11837881
               Others               11331217
               Softmax+Dropout       7713032
                                       CYCLE
CONFIG   GPUS  NAME                         
BASELINE 1_GPU GEMM                 60659096
               LayerNorm + Dropout   6596630
               OPT                  21465303
               Others               21210773
               Softmax+Dropout      23130172
                                         CYCLE
CONFIG   GPUS  NAME                           
BASELINE 1_GPU GEMM                 17040252.0
               LayerNorm + Dropout   2114206.0
               OPT                   4813711.0
               Others                4939778.0
               Softmax+Dropout       7708570.0


In [122]:
# Baseline cycle breakdown: for each (model, single_model) pair and batch size,
# load both baseline CSVs, label their kernels with setup_dataframe_baseline,
# and print the per-category CYCLE totals of each run followed by half of
# (model - single_model).
batch_size = [2]  # [16,64]
models = ['BERT_large_3_b']
single_models = ['BERT_large_1_b']
# models = ['resnet18']
sync = ['nosync']
GPUs = [2]
config = 'NDPX_baseline_64'
# HOPS ranges referenced only by the disabled NDPX section at the end of this cell.
NDPX_MIDDLE_ENCODER_FW_START = 11
NDPX_MIDDLE_ENCODER_FW_END = 19
NDPX_MIDDLE_ENCODER_BW_START = 42
NDPX_MIDDLE_ENCODER_BW_END = 50
# BASELINE_MIDDLE_ENCODER_FW_START = 8
# BASELINE_MIDDLE_ENCODER_FW_END = 13
# BASELINE_MIDDLE_ENCODER_BW_START = 30
# BASELINE_MIDDLE_ENCODER_BW_END = 35
N_ENCODERS = 24
for model, single_model in zip(models, single_models):
   total_result = pd.DataFrame()

   for b in batch_size:
      baseline_home = '/home/shared/CXL_memory_buffer/BASELINE_ISCA/csv_files/'
      ndp_csv_home = '/home/shared/CXL_memory_buffer/NDP_ISCA/csv_files/'
      baseline_path_3 = baseline_home + model + str(b) + '-NDPX_baseline_64.csv'
      baseline_path_single = baseline_home + single_model + str(b) + '-NDPX_baseline_64.csv'

      # NDPX result files (forward file, then '-bw' file, per GPU/sync combo).
      # Only the disabled NDPX section below reads this list.
      ndpx_file_paths = []
      for GPU in GPUs:
         for s in sync:
            if GPU == 1 and s == 'sync':
               continue
            file_path = ndp_csv_home + model + str(b) + '-' + config + '-' + str(GPU) + '-' + str(GPU) + '-' + s + '.csv'
            file_path_bw = ndp_csv_home + model + str(b) + '-' + config + '-' + str(GPU) + '-' + str(GPU) + '-' + s + '-bw.csv'
            ndpx_file_paths.append(file_path)
            ndpx_file_paths.append(file_path_bw)

      baseline_3 = pd.read_csv(baseline_path_3)
      baseline_3['GPUS'] = '1_GPU'
      baseline_3['CONFIG'] = 'BASELINE'
      baseline_1 = pd.read_csv(baseline_path_single)
      baseline_1['GPUS'] = '1_GPU'
      baseline_1['CONFIG'] = 'BASELINE'

      baseline_hlp_path = baseline_home + 'xla_hlo/' + model + str(b) + '.txt'
      baseline_3 = setup_dataframe_baseline(baseline_3, baseline_hlp_path)
      baseline_3['CYCLE'] = pd.to_numeric(baseline_3['CYCLE'])
      baseline_3_hops = pd.read_csv(f'/home/shared/CXL_memory_buffer/BASELINE_ISCA/traces/{model}{str(b)}/kernelslist.g.hops')
      baseline_3 = pd.merge(baseline_3, baseline_3_hops, how='left', on='ID')
      # Rows whose HOPS is missing after the left merge match neither mask and
      # keep DIRECTION = NaN.
      baseline_3.loc[baseline_3.HOPS > 20, 'DIRECTION'] = 'backward'
      baseline_3.loc[baseline_3.HOPS <= 20, 'DIRECTION'] = 'forward'

      baseline_hlp_path = baseline_home + 'xla_hlo/' + single_model + str(b) + '.txt'
      baseline_1 = setup_dataframe_baseline(baseline_1, baseline_hlp_path)
      baseline_1['CYCLE'] = pd.to_numeric(baseline_1['CYCLE'])

      # Select CYCLE before aggregating: summing every column first also tries
      # to add up non-numeric columns such as DIRECTION, which is wasteful and
      # can fail on pandas versions where numeric_only defaults to False.
      group_keys = ['CONFIG', 'GPUS', 'NAME']
      baseline_1 = baseline_1.groupby(group_keys)[['CYCLE']].sum()
      baseline_3 = baseline_3.groupby(group_keys)[['CYCLE']].sum()
      print(baseline_1)
      print(baseline_3)

      # Index-aligned difference; a category present in only one run yields NaN.
      baseline_large = baseline_3 - baseline_1
      print(baseline_large/2)

      # ---- Disabled: extrapolation to N_ENCODERS and NDPX comparison (kept for reference) ----
#       mask = baseline_large.CYCLE < 0
#       baseline_large.loc[mask, 'CYCLE'] = 0
#       df = (baseline_large / 2 ) * (N_ENCODERS - 1) + baseline_1

#       ndpx_hops = pd.read_csv(f'/home/shared/CXL_memory_buffer/NDP_ISCA/traces/{model}{str(b)}/kernelslist.g.hops')
#       ndpx = pd.read_csv(ndpx_file_paths[0])
#       ndpx = pd.concat([ndpx, pd.read_csv(ndpx_file_paths[1])])
#       ndpx = pd.merge(ndpx, ndpx_hops, how='left', on='ID')
#       ndpx['GPUS'] = ndpx_file_paths[0].split('-')[-2]+'_GPU'

#       for i in range(1,len(GPUs)):
#           ndpx1 = pd.read_csv(ndpx_file_paths[2*i])
#           ndpx1 = pd.concat([ndpx1, pd.read_csv(ndpx_file_paths[2*i+1])])
#           ndpx1 = pd.merge(ndpx1, ndpx_hops, how='left', on='ID')
#           ndpx1['GPUS'] = ndpx_file_paths[2*i].split('-')[-2]+'_GPU'
#           ndpx = pd.concat([ndpx, ndpx1])

#       ndpx = setup_dataframe(ndpx)
#       print(ndpx)
#       ndpx['CYCLE'] = pd.to_numeric(ndpx['CYCLE'])
#       ndpx.loc[ndpx.HOPS > 29, 'DIRECTION'] = 'backward'
#       ndpx.loc[ndpx.HOPS <= 29, 'DIRECTION'] = 'forward'
#       ndpx.loc[(ndpx.HOPS >= NDPX_MIDDLE_ENCODER_FW_START) & (ndpx.HOPS <= NDPX_MIDDLE_ENCODER_FW_END), 'CYCLE'] *= (N_ENCODERS -2)
#       ndpx.loc[(ndpx.HOPS >= NDPX_MIDDLE_ENCODER_BW_START) & (ndpx.HOPS <= NDPX_MIDDLE_ENCODER_BW_END), 'CYCLE'] *= (N_ENCODERS -2)
#       ndpx.loc[(ndpx.HOPS == 0) & (ndpx.HOPS != ndpx.ID.min()), 'CYCLE'] = 0

#       grouped = ndpx.groupby(['CONFIG','GPUS','NAME', "DIRECTION"]).sum()[[ 'CYCLE']]

#       baseline_runtime = pd.to_numeric(df['CYCLE']).sum()

#       df = pd.concat([df, grouped])
#       df['BATCH'] = b
#       total_result = pd.concat([total_result, df])
#       df.to_csv('./outputs/'+model+'_grouped_value-'+str(b)+'.csv')
#       normalized = df
#       normalized['CYCLE'] = df['CYCLE'] / baseline_runtime
#       normalized.to_csv('./outputs/'+model+'_total_normalized2-'+str(b)+'.csv')

#    total_result.to_csv(f'./outputs/{model}_total_batches.csv')



Unnamed: 0,GPUS,CONFIG,SYNC,ID,DIRECTION,NAME,CYCLE,HOPS
71,1_GPU,NDPX_wc64,0,8296,forward,NDP_OP,138569464,11.0
319,1_GPU,NDPX_wc64,0,8403,backward,NDP_OP,60190878,47.0
465,1_GPU,NDPX_wc64,0,8490,backward,NDP_OP,15034162,50.0
387,1_GPU,NDPX_wc64,0,8451,backward,NDP_OP,7390561,60.0
31,1_GPU,NDPX_wc64,0,8276,forward,NDP_OP,6294379,2.0
111,1_GPU,NDPX_wc64,0,8316,forward,NDP_OP,6292216,20.0
285,1_GPU,NDPX_wc64,0,8379,backward,NDP_OP,2826039,38.0
353,1_GPU,NDPX_wc64,0,8427,backward,NDP_OP,2626641,56.0
445,1_GPU,NDPX_wc64,0,8480,backward,NDP_OP,1192253,32.0
499,1_GPU,NDPX_wc64,0,8507,backward,NDP_OP,678689,41.0


In [99]:
# Display the `df` frame currently held by the kernel.
# NOTE(review): every top-level assignment to `df` in the visible cells is
# commented out, so this presumably relies on state from an earlier run and
# would raise NameError on a fresh kernel — confirm before sharing.
df

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,Unnamed: 3_level_0,CYCLE
CONFIG,GPUS,NAME,DIRECTION,Unnamed: 4_level_1
BASELINE,1_GPU,GEMM,backward,0.292147
BASELINE,1_GPU,GEMM,forward,0.140975
BASELINE,1_GPU,LayerNorm + Dropout,backward,0.034847
BASELINE,1_GPU,LayerNorm + Dropout,forward,0.017427
BASELINE,1_GPU,OPT,backward,0.207918
BASELINE,1_GPU,OPT,forward,3.2e-05
BASELINE,1_GPU,Others,backward,0.026479
BASELINE,1_GPU,Others,forward,0.098756
BASELINE,1_GPU,Softmax+Dropout,backward,0.092706
BASELINE,1_GPU,Softmax+Dropout,forward,0.088712
