In [1]:
import subprocess
from matplotlib import pyplot as plt 
import numpy as np
import json 

In [52]:
batch_size = (1, 2, 4, 8, 16, 32, 64, 128, 256)
density = (0.7, 0.65, 0.6, 0.55, 0.5, 0.475, 0.45, 0.425, 0.4, 0.375, 0.35, 0.325, 0.3, 0.275, 0.25, 0.225, 0.2, 0.175, 0.15, 0.125, 0.1, 0.075, 0.05, 0.025, 0.02, 0.01, 0.00175, 0.0015, 0.001)
calculate_sparsity = lambda density: round((1 - density) * 100, 5)
sparsity = list(map(calculate_sparsity, density))
problem = {}
problem["qkv"] = (7680, 2560)
problem['attention_fc'] = (2560, 2560)
problem['linear1'] = (10240, 2560)
problem['linear2'] = (2560,10240)
execution_time = {}
compute_util = {}

In [53]:
peak_flop = 10496 #  Inst/cycle
peak_flops =  10496 * 2
frequency = 1.39e9    # cycle/nsecond
peak_perf = peak_flop * frequency

In [54]:
for batch in batch_size:
    execution_time[batch] = {}
    compute_util[batch] = {}
    for sparse in sparsity:
        execution_time[batch][sparse] = 0
        compute_util[batch][sparse] = 0

In [56]:
for layer in problem:
    m, k = problem[layer]
    print(m,k)

7680 2560
2560 2560
10240 2560
2560 10240


In [7]:
with open("./testdata.json", 'w') as json_file:
    json.dump(execution_time, json_file)

In [16]:
result = subprocess.run(f"ncu --replay-mode kernel --metric smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,smsp__cycles_elapsed.avg.per_second,gpu__time_duration.sum ./cusparse_sgemm 1024 1 1024 0.3 0", stdout=subprocess.PIPE, shell=True)

In [44]:
i = result.stdout.decode().find("CsrMMPolicy")
idx_time = result.stdout.decode()[i:].find("gpu__time_duration.sum")
idx_util = result.stdout.decode()[i:].find("smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed")
result.stdout.decode()[i+idx_time:i+idx_time+120].split(), result.stdout.decode()[i+idx_util:i+idx_util+120].split()

(['gpu__time_duration.sum', 'usecond', '8.99'],
 ['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed',
  'inst/cycle',
  '25.20'])

In [57]:
for layer in problem:
    m, k = problem[layer]
    print("Profile: ",layer, m, k)
    for batch in batch_size:
        print("Batch_size:", m, batch, k)
        for sparse in density:
            print("Sparse: ", round(1 - sparse,4))
            result = subprocess.run(f"ncu --replay-mode kernel --metric smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,smsp__cycles_elapsed.avg.per_second,gpu__time_duration.sum ./cusparse_sgemm {m} {batch} {k} {sparse} 0", stdout=subprocess.PIPE, shell=True)  
            idx = result.stdout.decode().find("CsrMMPolicy")
            idx_time = result.stdout.decode()[idx:].find("gpu__time_duration.sum")
            idx_compute = result.stdout.decode()[idx:].find("smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed")

            time = idx + idx_time 
            compute = idx + idx_compute

            print(result.stdout.decode()[time:time+120].split())
            print(result.stdout.decode()[compute:compute+120].split())

            if result.stdout.decode()[time:time+120].split()[1] == 'msecond':
                execution_time[batch][sparse] = float(result.stdout.decode()[time:time+120].split()[-1]) * 1000.0
            else:
                execution_time[batch][sparse] = float(result.stdout.decode()[time:time+120].split()[-1])
            print((float(result.stdout.decode()[compute:compute+117].split()[-1]) / float(peak_flop)) * 100)
            compute_util[batch][sparse] = (float(result.stdout.decode()[compute:compute+117].split()[-1]) / float(peak_flop)) * 100

with open("./compute_util", 'w') as file:
    json.dump(compute_util, file)

with open("./latency", 'w') as file:
    json.dump(execution_time, file)

Profile:  qkv 7680 2560
Batch_size: 7680 1 2560
Sparse:  0.3
['gpu__time_duration.sum', 'usecond', '139.46']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '70.80']
Sparse:  0.35
['gpu__time_duration.sum', 'usecond', '130.43']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '70.30']
Sparse:  0.4
['gpu__time_duration.sum', 'usecond', '121.44']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '69.71']
Sparse:  0.45
['gpu__time_duration.sum', 'usecond', '113.70']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '68.25']
Sparse:  0.5
['gpu__time_duration.sum', 'usecond', '103.68']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '68.03']
Sparse:  0.525
['gpu__time_duration.sum', 'usecond', '98.30']
['smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed', 'inst/cycle', '68.18']
S

In [61]:
compute_util[1]

{30.0: 0,
 35.0: 0,
 40.0: 0,
 45.0: 0,
 50.0: 0,
 52.5: 0,
 55.0: 0,
 57.5: 0,
 60.0: 0,
 62.5: 0,
 65.0: 0,
 67.5: 0,
 70.0: 0,
 72.5: 0,
 75.0: 0,
 77.5: 0,
 80.0: 0,
 82.5: 0,
 85.0: 0,
 87.5: 0,
 90.0: 0,
 92.5: 0,
 95.0: 0,
 97.5: 0,
 98.0: 0,
 99.0: 0,
 99.825: 0,
 99.85: 0,
 99.9: 0,
 0.7: 0.6737804878048781,
 0.65: 0.661204268292683,
 0.6: 0.6564405487804879,
 0.55: 0.6700647865853658,
 0.5: 0.6644435975609756,
 0.475: 0.6500571646341464,
 0.45: 0.6529153963414633,
 0.425: 0.6453887195121951,
 0.4: 0.6545350609756098,
 0.375: 0.6492949695121952,
 0.35: 0.6479611280487806,
 0.325: 0.6419588414634146,
 0.3: 0.6405297256097562,
 0.275: 0.6306211890243902,
 0.25: 0.6231897865853658,
 0.225: 0.6223323170731707,
 0.2: 0.5938452743902439,
 0.175: 0.5901295731707317,
 0.15: 0.5804115853658537,
 0.125: 0.551829268292683,
 0.1: 0.5249618902439025,
 0.075: 0.4771341463414634,
 0.05: 0.41873094512195125,
 0.025: 0.29868521341463417,
 0.02: 0.26171875,
 0.01: 0.17444740853658536,
 0.00175:

In [62]:
with open("./compute_util1.json", 'w') as file:
    json.dump(compute_util, file)

with open("./latency1.json", 'w') as file:
    json.dump(execution_time, file)

In [117]:
baseline = {}
baseline['compute'] = {}
baseline['time'] = {}
for layer in problem:
    baseline['compute'][layer] = {}
    baseline['time'][layer] = {}

In [118]:
baseline

{'compute': {'qkv': {}, 'attention_fc': {}, 'linear1': {}, 'linear2': {}},
 'time': {'qkv': {}, 'attention_fc': {}, 'linear1': {}, 'linear2': {}}}

In [121]:
a, b = result.stdout.decode().find("smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed"),result.stdout.decode().find("gpu__time_duration.sum")

In [124]:
result.stdout.decode()[a:a+120].split(), result.stdout.decode()[b:b+120].split()

(['sm__throughput.avg.pct_of_peak_sustained_elapsed',
  '%',
  '22.127333',
  '22.472679',
  '22.319877'],
 ['gpu__time_duration.sum', 'usecond', '34.240000', '35.552000', '34.867810'])

In [125]:
#cuBlas, Baseline
for layer in problem:
    m, k = problem[layer]
    print("Profile: ",layer, m, k)
    for batch in batch_size:
        print("Batch_size:", m, batch, k)
        result = subprocess.run(f"ncu --replay-mode kernel --metric smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed,smsp__cycles_elapsed.avg.per_second,gpu__time_duration.sum ./cublas_sgemm {m} {batch} {k} 0", stdout=subprocess.PIPE, shell=True)  
        idx1 = result.stdout.decode().find("gpu__time_duration.sum")
        baseline['time'][layer][batch] = float(result.stdout.decode()[idx1:idx1+120].split()[-1])
        
        idx2 = result.stdout.decode().find("smsp__sass_thread_inst_executed_op_ffma_pred_on.sum.per_cycle_elapsed")
        baseline['compute'][layer][batch] = (float(result.stdout.decode()[idx2:idx2+117].split()[-1]) / float(peak_flop)) * 100.0
        print(result.stdout.decode()[idx1:idx1+120].split())
        print(result.stdout.decode()[idx2:idx2+120].split())

Profile:  qkv 7680 2560
Batch_size: 7680 1 2560
['gpu__time_duration.sum', 'usecond', '96.672000', '98.336000', '97.383619']
['sm__throughput.avg.pct_of_peak_sustained_elapsed', '%', '23.406406', '23.772617', '23.610685']
Batch_size: 7680 2 2560
['gpu__time_duration.sum', 'usecond', '99.648000', '100.416000', '100.068571']
['sm__throughput.avg.pct_of_peak_sustained_elapsed', '%', '17.934592', '18.029137', '17.974133']
Batch_size: 7680 4 2560
['gpu__time_duration.sum', 'usecond', '98.656000', '99.680000', '99.108571']
['sm__throughput.avg.pct_of_peak_sustained_elapsed', '%', '24.671530', '25.017670', '24.875442']
Batch_size: 7680 8 2560
['gpu__time_duration.sum', 'usecond', '115.584000', '116.896000', '116.091429']
['sm__throughput.avg.pct_of_peak_sustained_elapsed', '%', '46.108100', '46.389750', '46.276350']
Batch_size: 7680 16 2560
['gpu__time_duration.sum', 'usecond', '116.192000', '117.408000', '116.592762']
['sm__throughput.avg.pct_of_peak_sustained_elapsed', '%', '45.745377', '46

In [126]:
with open("./baseline.json", 'w') as file:
    json.dump(baseline, file)

In [131]:
for key in baseline['time']:
    values = baseline['time'][key]
    print(values)

{1: 97.383619, 2: 100.068571, 4: 99.108571, 8: 116.091429, 16: 116.592762, 32: 126.832762, 64: 164.941714, 128: 312.466286, 256: 547.608381}
{1: 34.930286, 2: 41.426286, 4: 41.369905, 8: 41.475048, 16: 41.641143, 32: 59.943619, 64: 65.854476, 128: 114.140952, 256: 201.237333}
{1: 127.856762, 2: 131.683048, 4: 131.707429, 8: 131.72419, 16: 131.919238, 32: 137.61981, 64: 206.134857, 128: 368.323048, 256: 735.75619}
{1: 138.328381, 2: 138.247619, 4: 131.983238, 8: 132.120381, 16: 140.341333, 32: 136.356571, 64: 247.247238, 128: 386.706286, 256: 754.369524}


In [136]:
#compute_util, execution_time, baseline

In [147]:
for batch_size in compute_util:
    for dense in compute_util[batch_size]:
        print(compute_util[batch_size][dense])
print(len(compute_util))

17.48279
17.466215
17.477086
17.48241
17.449177
17.489151
17.413046
17.546226
17.494218
17.475552
17.560623
17.561836
17.544823
17.552697
17.559802
17.558135
17.455843
17.542881
17.47784
17.483838
17.472221
17.472899
17.447281
17.6142
17.399356
17.460912
17.527381
17.536464
17.478566
23.644179
23.442428
23.689058
23.465388
23.400078
23.536178
23.407007
23.530324
23.53157
23.552486
23.740037
23.452384
23.524452
23.762934
23.581654
23.421938
23.490457
23.579126
23.628488
23.498838
23.508242
23.640046
23.487371
23.435753
23.386332
23.548024
23.496986
23.336006
23.44543
34.670585
34.737281
34.876667
34.923948
34.94823
34.932292
34.875987
34.714458
34.772252
34.772115
34.914939
34.931937
34.923679
34.912785
34.939024
34.938583
34.837339
34.742289
34.829355
34.891085
34.796904
34.7341
34.867554
34.861591
34.804178
34.829748
34.632039
34.964606
35.093864
35.317934
35.346287
35.261547
35.283826
34.957593
35.147997
35.07874
35.182855
35.335845
35.330579
35.193964
35.258832
35.3721
35.167053
35.