In [63]:
import subprocess
from matplotlib import pyplot as plt 
import numpy as np
import json 

In [80]:
# Sweep configuration shared by the profiling cells in this notebook.
batch_size = (1, 2, 4, 8, 16, 32, 64, 128, 256)
density = (
    0.7, 0.65, 0.6, 0.55, 0.5,
    0.475, 0.45, 0.425, 0.4,
    0.375, 0.35, 0.325, 0.3,
    0.275, 0.25, 0.225, 0.2,
    0.175, 0.15, 0.125, 0.1,
    0.075, 0.05, 0.025,
    0.02, 0.01,
    0.00175, 0.0015, 0.001,
)


def calculate_sparsity(density):
    """Convert a density fraction into a sparsity percentage, rounded to 5 decimals."""
    return round((1 - density) * 100, 5)


# Sparsity percentages, in the same order as `density`.
sparsity = [calculate_sparsity(d) for d in density]

# Layer name -> pair of dimensions; the profiling loops unpack each pair as (m, k).
problem = {
    "qkv": (7680, 2560),
    'attention_fc': (2560, 2560),
    'linear1': (10240, 2560),
    'linear2': (2560, 10240),
}

# Result tables; they start empty and are filled in by later cells.
execution_time = {}
compute_util = {}

In [81]:
# Peak-throughput constants used to normalise the measured FFMA rate.
peak_flop = 10496            # instructions per cycle used as the 100 % utilisation mark
peak_flops = 2 * peak_flop   # twice peak_flop (presumably 2 FLOPs per FFMA - confirm)
# NOTE(review): the original note labelled this "cycle/nsecond", but a value of 1.39e9
# only makes sense as cycles per second - confirm the intended unit.
frequency = 1.39e9
peak_perf = frequency * peak_flop   # peak instructions per unit time at `frequency`

In [82]:
# Pre-fill both result tables with zeros so every [layer][batch][density] slot exists
# before the profiling sweep writes into it. Each batch gets its own inner dict.
for layer in problem:
    execution_time[layer] = {batch: dict.fromkeys(density, 0) for batch in batch_size}
    compute_util[layer] = {batch: dict.fromkeys(density, 0) for batch in batch_size}

In [65]:
# Sanity check: list the (m, k) pair configured for every layer.
for m, k in problem.values():
    print(m, k)

7680 2560
2560 2560
10240 2560
2560 10240


In [7]:
# Serialise the current execution_time table to ./testdata.json.
with open("./testdata.json", mode='w') as handle:
    json.dump(execution_time, fp=handle)

In [16]:
# One-off profiling run used to inspect the raw ncu report before the full sweep.
# stdout is captured so the following cells can search it for the metric rows.
ncu_command = f"ncu --replay-mode kernel --metric smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,smsp__cycles_elapsed.avg.per_second,gpu__time_duration.sum ./cusparse_sgemm 1024 1 1024 0.3 0"
result = subprocess.run(
    ncu_command,
    shell=True,
    stdout=subprocess.PIPE,
)

In [44]:
# Locate the kernel's section in the captured report, then find both metric rows
# relative to that position and show the whitespace-split 120-character windows.
ncu_text = result.stdout.decode()
i = ncu_text.find("CsrMMPolicy")
after_kernel = ncu_text[i:]
idx_time = after_kernel.find("gpu__time_duration.sum")
idx_util = after_kernel.find("smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed")
time_at = i + idx_time
util_at = i + idx_util
ncu_text[time_at:time_at + 120].split(), ncu_text[util_at:util_at + 120].split()

(['gpu__time_duration.sum', 'usecond', '8.99'],
 ['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed',
  'inst/cycle',
  '25.20'])

In [84]:
# Profile ./cusparse_sgemm under ncu for every (layer, batch, density) combination.
# Kernel latency is stored in microseconds in `execution_time`; the FFMA rate is stored
# in `compute_util` as a percentage of `peak_flop`. Both tables are written to JSON at the end.

_SPARSE_FFMA_METRIC = "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed"
_SPARSE_TIME_METRIC = "gpu__time_duration.sum"
_SPARSE_KERNEL_TAG = "CsrMMPolicy"

# Multipliers that convert a reported duration into microseconds. The long spellings are
# the ones seen in this notebook's output; the short ones are assumed to be what other
# ncu versions print - TODO confirm.
_SPARSE_USEC_PER_UNIT = {
    "nsecond": 1e-3, "ns": 1e-3,
    "usecond": 1.0, "us": 1.0,
    "msecond": 1e3, "ms": 1e3,
    "second": 1e6, "s": 1e6,
}


def _sparse_metric_row(report, metric, start=0):
    """Return the whitespace-split row of `metric` found at or after `start` in `report`.

    The row is cut at the end of its own line rather than at a fixed character count,
    so it never spills into the next line regardless of column widths.

    Raises:
        ValueError: if the metric is missing or its row has fewer than three fields
            (name, unit, value).
    """
    pos = report.find(metric, start)
    if pos < 0:
        raise ValueError(f"metric {metric!r} not found in ncu output")
    line_end = report.find("\n", pos)
    row = report[pos:] if line_end < 0 else report[pos:line_end]
    tokens = row.split()
    if len(tokens) < 3:
        raise ValueError(f"unexpected ncu row for {metric!r}: {row!r}")
    return tokens


def _sparse_metric_value(tokens):
    """Parse the last field of a metric row as a float, dropping thousands separators."""
    return float(tokens[-1].replace(',', ''))


def _sparse_time_usec(tokens):
    """Convert a duration row (name, unit, ..., value) to microseconds.

    Raises:
        ValueError: if the unit is not one of the known time units.
    """
    unit = tokens[1]
    if unit not in _SPARSE_USEC_PER_UNIT:
        raise ValueError(f"unknown time unit {unit!r} in ncu row {tokens!r}")
    return _sparse_metric_value(tokens) * _SPARSE_USEC_PER_UNIT[unit]


for layer in problem:
    m, k = problem[layer]
    print("Profile: ",layer, m, k)
    for batch in batch_size:
        print("Batch_size:", m, batch, k)
        for sparse in density:
            print("Sparse: ", round(1 - sparse,4))
            result = subprocess.run(f"ncu --replay-mode kernel --metric {_SPARSE_FFMA_METRIC},smsp__cycles_elapsed.avg.per_second,{_SPARSE_TIME_METRIC} ./cusparse_sgemm {m} {batch} {k} {sparse} 0", stdout=subprocess.PIPE, shell=True)
            # Decode once; every lookup below works on the same text.
            report = result.stdout.decode()

            # Metrics are read from the first occurrence of the kernel tag onwards.
            kernel_at = report.find(_SPARSE_KERNEL_TAG)
            if kernel_at < 0:
                raise RuntimeError(
                    f"kernel {_SPARSE_KERNEL_TAG!r} not found in ncu output for "
                    f"{layer} m={m} batch={batch} k={k} density={sparse}"
                )

            time_row = _sparse_metric_row(report, _SPARSE_TIME_METRIC, kernel_at)
            ffma_row = _sparse_metric_row(report, _SPARSE_FFMA_METRIC, kernel_at)
            print(time_row)
            print(ffma_row)

            execution_time[layer][batch][sparse] = _sparse_time_usec(time_row)

            util_pct = (_sparse_metric_value(ffma_row) / float(peak_flop)) * 100
            print(util_pct)
            compute_util[layer][batch][sparse] = util_pct

with open("./latency2.json", 'w') as file:
    json.dump(execution_time, file)

with open("./compute_util2.json", 'w') as file:
    json.dump(compute_util, file)

Profile:  qkv 7680 2560
Batch_size: 7680 1 2560
Sparse:  0.3
['gpu__time_duration.sum', 'usecond', '139.23']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '70.91']
0.6755907012195121
Sparse:  0.35
['gpu__time_duration.sum', 'usecond', '129.98']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '70.70']
0.6735899390243902
Sparse:  0.4
['gpu__time_duration.sum', 'usecond', '120.90']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '70.03']
0.6672065548780488
Sparse:  0.45
['gpu__time_duration.sum', 'usecond', '114.37']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '68.03']
0.6481516768292682
Sparse:  0.5
['gpu__time_duration.sum', 'usecond', '103.26']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '68.50']
0.6526295731707317
Sparse:  0.525
['gpu__time_duration.sum', 'usecond', '97.70']
['s

In [79]:
# Display the compute_util table (layer -> batch -> density -> value) for inspection.
compute_util

{'qkv': {1: {0.7: 0.671875,
   0.65: 0.6737804878048781,
   0.6: 0.666920731707317,
   0.55: 0.6451028963414633,
   0.5: 0.6471989329268293,
   0.475: 0.6509146341463414,
   0.45: 0.6403391768292682,
   0.425: 0.6348132621951219,
   0.4: 0.6319550304878049,
   0.375: 0.6257621951219513,
   0.35: 0.6221417682926829,
   0.325: 0.6115663109756097,
   0.3: 0.6047065548780488,
   0.275: 0.5914634146341463,
   0.25: 0.5836509146341463,
   0.225: 0.5725038109756099,
   0.2: 0.5545922256097561,
   0.175: 0.5373475609756098,
   0.15: 0.506859756097561,
   0.125: 0.4876143292682927,
   0.1: 0.43730945121951215,
   0.075: 0.39453125,
   0.05: 0.3113567073170732,
   0.025: 0.27372332317073167,
   0.02: 0.2306592987804878,
   0.01: 0.15396341463414634,
   0.00175: 0.040110518292682924,
   0.0015: 0.036966463414634144,
   0.001: 0.02458079268292683},
  2: {0.7: 0,
   0.65: 0,
   0.6: 0,
   0.55: 0,
   0.5: 0,
   0.475: 0,
   0.45: 0,
   0.425: 0,
   0.4: 0,
   0.375: 0,
   0.35: 0,
   0.325: 0,
   0

In [62]:
# Write both result tables to disk, one JSON file per table.
for path, payload in (
    ("./compute_util1.json", compute_util),
    ("./latency1.json", execution_time),
):
    with open(path, 'w') as file:
        json.dump(payload, file)

In [85]:
# Baseline results: one empty per-layer dict under each of 'compute' and 'time';
# the baseline profiling cell fills them in keyed by batch size.
baseline = {
    'compute': {layer: {} for layer in problem},
    'time': {layer: {} for layer in problem},
}

In [118]:
# Display the baseline table to check its 'compute' / 'time' per-layer structure.
baseline

{'compute': {'qkv': {}, 'attention_fc': {}, 'linear1': {}, 'linear2': {}},
 'time': {'qkv': {}, 'attention_fc': {}, 'linear1': {}, 'linear2': {}}}

In [121]:
# Offsets of the FFMA metric (a) and the duration metric (b) in the last captured report.
# str.find yields -1 when a metric is absent.
profile_text = result.stdout.decode()
a = profile_text.find("smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed")
b = profile_text.find("gpu__time_duration.sum")

In [124]:
# Show the whitespace-split 120-character windows starting at offsets a and b.
decoded = result.stdout.decode()
decoded[a:a + 120].split(), decoded[b:b + 120].split()

(['sm__throughput.avg.pct_of_peak_sustained_elapsed',
  '%',
  '22.127333',
  '22.472679',
  '22.319877'],
 ['gpu__time_duration.sum', 'usecond', '34.240000', '35.552000', '34.867810'])

In [92]:
#cuBlas, Baseline
# Profile ./cublas_sgemm under ncu for every (layer, batch) combination. Latency is stored
# in microseconds under baseline['time']; the FFMA rate is stored under baseline['compute']
# as a percentage of `peak_flop`.

_BASE_FFMA_METRIC = "smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed"
_BASE_TIME_METRIC = "gpu__time_duration.sum"

# Multipliers that convert a reported duration into microseconds, so the baseline uses the
# same unit as the sparse sweep. The long spellings are the ones seen in this notebook's
# output; the short ones are assumed to be what other ncu versions print - TODO confirm.
_BASE_USEC_PER_UNIT = {
    "nsecond": 1e-3, "ns": 1e-3,
    "usecond": 1.0, "us": 1.0,
    "msecond": 1e3, "ms": 1e3,
    "second": 1e6, "s": 1e6,
}


def _baseline_metric_row(report, metric):
    """Return the whitespace-split row of the first occurrence of `metric` in `report`.

    The row ends at the end of its own line instead of a fixed character window.

    Raises:
        ValueError: if the metric is missing or its row has fewer than three fields
            (name, unit, value).
    """
    pos = report.find(metric)
    if pos < 0:
        raise ValueError(f"metric {metric!r} not found in ncu output")
    line_end = report.find("\n", pos)
    row = report[pos:] if line_end < 0 else report[pos:line_end]
    tokens = row.split()
    if len(tokens) < 3:
        raise ValueError(f"unexpected ncu row for {metric!r}: {row!r}")
    return tokens


def _baseline_metric_value(tokens):
    """Parse the last field of a metric row as a float, dropping thousands separators."""
    return float(tokens[-1].replace(',', ''))


for layer in problem:
    m, k = problem[layer]
    print("Profile: ",layer, m, k)
    for batch in batch_size:
        print("Batch_size:", m, batch, k)
        result = subprocess.run(f"ncu --replay-mode kernel --metric {_BASE_FFMA_METRIC},smsp__cycles_elapsed.avg.per_second,{_BASE_TIME_METRIC} ./cublas_sgemm {m} {batch} {k} 0", stdout=subprocess.PIPE, shell=True)
        # Decode once; both lookups below work on the same text.
        report = result.stdout.decode()

        time_row = _baseline_metric_row(report, _BASE_TIME_METRIC)
        time_unit = time_row[1]
        if time_unit not in _BASE_USEC_PER_UNIT:
            raise ValueError(f"unknown time unit {time_unit!r} in ncu row {time_row!r}")
        baseline['time'][layer][batch] = _baseline_metric_value(time_row) * _BASE_USEC_PER_UNIT[time_unit]

        ffma_row = _baseline_metric_row(report, _BASE_FFMA_METRIC)
        baseline['compute'][layer][batch] = (_baseline_metric_value(ffma_row) / float(peak_flop)) * 100.0
        print(time_row)
        print(ffma_row)

Profile:  qkv 7680 2560
Batch_size: 7680 1 2560
['gpu__time_duration.sum', 'usecond', '97.02']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '145.58']
Batch_size: 7680 2 2560
['gpu__time_duration.sum', 'usecond', '99.87']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '282.54']
Batch_size: 7680 4 2560
['gpu__time_duration.sum', 'usecond', '99.26']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '568.84']
Batch_size: 7680 8 2560
['gpu__time_duration.sum', 'usecond', '115.74']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '3,906.19']
Batch_size: 7680 16 2560
['gpu__time_duration.sum', 'usecond', '116.54']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '3,877.65']
Batch_size: 7680 32 2560
['gpu__time_duration.sum', 'usecond', '126.85']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.s

In [89]:
# Scratch check: strip the thousands separator from an ncu-style number string.
tt = ''.join('3,910.80'.split(','))

In [90]:
# Confirm the comma-free string now parses as a float.
float(tt)

3910.8

In [93]:
# Save the baseline table ('compute' and 'time' per layer) to ./baseline2.json.
with open("./baseline2.json", mode='w') as handle:
    json.dump(baseline, fp=handle)

In [131]:
# Print the batch -> latency mapping recorded for each layer, one layer per line.
for values in baseline['time'].values():
    print(values)

{1: 97.383619, 2: 100.068571, 4: 99.108571, 8: 116.091429, 16: 116.592762, 32: 126.832762, 64: 164.941714, 128: 312.466286, 256: 547.608381}
{1: 34.930286, 2: 41.426286, 4: 41.369905, 8: 41.475048, 16: 41.641143, 32: 59.943619, 64: 65.854476, 128: 114.140952, 256: 201.237333}
{1: 127.856762, 2: 131.683048, 4: 131.707429, 8: 131.72419, 16: 131.919238, 32: 137.61981, 64: 206.134857, 128: 368.323048, 256: 735.75619}
{1: 138.328381, 2: 138.247619, 4: 131.983238, 8: 132.120381, 16: 140.341333, 32: 136.356571, 64: 247.247238, 128: 386.706286, 256: 754.369524}
