In [1]:
import pandas as pd

pd.set_option("display.max_columns",0)
pd.set_option("display.max_colwidth",0)

# Load Caliper JSON Data
Drop data with empty function annotations and split function nestings into tuples "main/foo/bar" -> (main, foo, bar)

In [2]:
# Large files (>1gb) should be read in 1000 lines at a time and concatenated together, like so:
df = pd.concat(
    pd.read_json('../datasets/quicksilver_topdown_counters.json', orient='records', lines=True, chunksize=1000)
)
df = df.dropna(subset=['function'])
df['function'] = df['function'].transform(lambda l: tuple(l.split('/')))
df.head()

Unnamed: 0,cali.caliper.version,cali.event.begin,cali.event.end,cali.event.set,event.begin#function,event.end#function,event.set#pthread.id,function,libpfm.counter.BR_MISP_RETIRED:ALL_BRANCHES,libpfm.counter.CPU_CLK_UNHALTED:THREAD_P,libpfm.counter.CYCLE_ACTIVITY:CYCLES_NO_EXECUTE,libpfm.counter.CYCLE_ACTIVITY:STALLS_L1D_PENDING,libpfm.counter.CYCLE_ACTIVITY:STALLS_L2_PENDING,libpfm.counter.CYCLE_ACTIVITY:STALLS_LDM_PENDING,libpfm.counter.IDQ:MS_UOPS,libpfm.counter.IDQ_UOPS_NOT_DELIVERED:CORE,libpfm.counter.INT_MISC:RECOVERY_CYCLES,libpfm.counter.MACHINE_CLEARS:COUNT,libpfm.counter.MEM_LOAD_UOPS_RETIRED:L3_HIT,libpfm.counter.MEM_LOAD_UOPS_RETIRED:L3_MISS,libpfm.counter.RESOURCE_STALLS:SB,libpfm.counter.RS_EVENTS:EMPTY_CYCLES,libpfm.counter.UOPS_EXECUTED:CORE_CYCLES_GE_1,libpfm.counter.UOPS_EXECUTED:CORE_CYCLES_GE_2,libpfm.counter.UOPS_EXECUTED:THREAD,libpfm.counter.UOPS_ISSUED:ANY,libpfm.counter.UOPS_RETIRED:RETIRE_SLOTS,mpi.rank,mpi.size,time.inclusive.duration,time.offset
114,1.7.0-dev,334.0,,,isInside,,,"(main, initMC, initMesh, MC_Domain, findMaterial)",0,0,0,0,0,0,9479,30267,127,4,0,0,0,0,0,0,0,0,0,10.0,16.0,,2813734
115,1.7.0-dev,,334.0,,,SumTasks,,"(main, cycleTracking, SumTasks)",0,0,0,0,0,0,0,0,0,0,0,0,2831,4915,0,0,112918,0,0,1.0,16.0,17.0,32336221
117,1.7.0-dev,334.0,,,collapse,,,"(main, cycleTracking)",0,0,0,0,0,27065,17530,89148,1179,0,0,0,0,0,0,0,0,0,0,0.0,16.0,,29924061
119,1.7.0-dev,334.0,,,collapse,,,"(main, cycleTracking)",1005,13128535,3867463,299,181642,2182242,1948881,2615597,52022,0,0,5,6931,788101,9250136,7512747,24753559,29415199,29398111,1.0,16.0,,32380275
120,1.7.0-dev,,334.0,,,isInside,,"(main, initMC, initMesh, MC_Domain, findMaterial, isInside)",0,0,0,0,0,0,11382,51608,437,7,0,0,0,0,0,0,0,0,0,10.0,16.0,21.0,2813755


# Perform Inclusive Aggregation (Derive the "Call Tree")

In [6]:
from cali_analysis import hatchet

## Define per-column aggregators

In [7]:
aggregators = dict([ (c, 'sum') for c in filter(lambda col: 'libpfm' in col, df.columns)])
aggregators['time.inclusive.duration'] = 'max'
aggregators['cali.caliper.version'] = 'count'
aggregators

{'cali.caliper.version': 'count',
 'libpfm.counter.BR_MISP_RETIRED:ALL_BRANCHES': 'sum',
 'libpfm.counter.CPU_CLK_UNHALTED:THREAD_P': 'sum',
 'libpfm.counter.CYCLE_ACTIVITY:CYCLES_NO_EXECUTE': 'sum',
 'libpfm.counter.CYCLE_ACTIVITY:STALLS_L1D_PENDING': 'sum',
 'libpfm.counter.CYCLE_ACTIVITY:STALLS_L2_PENDING': 'sum',
 'libpfm.counter.CYCLE_ACTIVITY:STALLS_LDM_PENDING': 'sum',
 'libpfm.counter.IDQ:MS_UOPS': 'sum',
 'libpfm.counter.IDQ_UOPS_NOT_DELIVERED:CORE': 'sum',
 'libpfm.counter.INT_MISC:RECOVERY_CYCLES': 'sum',
 'libpfm.counter.MACHINE_CLEARS:COUNT': 'sum',
 'libpfm.counter.MEM_LOAD_UOPS_RETIRED:L3_HIT': 'sum',
 'libpfm.counter.MEM_LOAD_UOPS_RETIRED:L3_MISS': 'sum',
 'libpfm.counter.RESOURCE_STALLS:SB': 'sum',
 'libpfm.counter.RS_EVENTS:EMPTY_CYCLES': 'sum',
 'libpfm.counter.UOPS_EXECUTED:CORE_CYCLES_GE_1': 'sum',
 'libpfm.counter.UOPS_EXECUTED:CORE_CYCLES_GE_2': 'sum',
 'libpfm.counter.UOPS_EXECUTED:THREAD': 'sum',
 'libpfm.counter.UOPS_ISSUED:ANY': 'sum',
 'libpfm.counter.UOPS_R

## Create multi-rooted tree from the samples, using our aggregators

In [14]:
mrt = hatchet.MultiRootTree.from_samples(df, 'function', aggregators)
mrt.df_nodes[['function', 'time.inclusive.duration', 'libpfm.counter.UOPS_EXECUTED:THREAD']].head()

Unnamed: 0,function,time.inclusive.duration,libpfm.counter.UOPS_EXECUTED:THREAD
50,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, addNbrsToQueue, addTupleToQueue, tupleToIndex)",79.0,2215438952
14,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, minDist2, indexToTuple)",48.0,2035192437
21,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, addTupleToQueue, tupleToIndex)",148.0,682603152
26,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, minDist2, whichCellTuple)",99.0,2043227374
52,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, addNbrsToQueue, addTupleToQueue)",171.0,6646474530


# Derive Topdown Analysis metrics
Calculates the topdown derived metrics as described in http://ieeexplore.ieee.org/document/6844459/

In [9]:
from cali_analysis import topdown

In [15]:
df_td = topdown.derive_topdown_ivb(mrt.df_nodes)
df_td.head()

Unnamed: 0,hash#function,time.inclusive.duration,cali.caliper.version,function,parent_hash#function,depth#function,retiring,bad_speculation,frontend_bound,backend_bound,branch_mispredict,machine_clear,frontend_latency,frontend_bandwidth,memory_bound,core_bound,mem_bound,l1_bound,l2_bound,l3_bound,uncore_bound
50,-647878498448900250,79.0,93888,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, addNbrsToQueue, addTupleToQueue, tupleToIndex)",28494405572918431,9,0.537467,0.006472,0.151175,0.304887,0.854832,0.145168,0.6047,0.3953,0.111491,0.250971,9.8e-05,0.105106,0.004113,0.002175,0.002272
14,-6803774206164476110,48.0,1,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, minDist2, indexToTuple)",9221050682849230756,8,0.536153,0.005928,0.149674,0.308245,0.841537,0.158463,0.598696,0.401304,0.112065,0.250568,7.7e-05,0.105379,0.004169,0.002439,0.002516
21,-5098735054641025010,148.0,1,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, addTupleToQueue, tupleToIndex)",-8809031931515923421,8,0.531663,0.007423,0.151012,0.309902,0.864017,0.135983,0.604047,0.395953,0.111637,0.250623,7.6e-05,0.10476,0.004543,0.002258,0.002334
26,-3897821748422853644,99.0,1,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, minDist2, whichCellTuple)",9221050682849230756,8,0.533805,0.007674,0.151219,0.307302,0.865206,0.134794,0.604877,0.395123,0.112407,0.251637,5.6e-05,0.105567,0.004236,0.002548,0.002604
52,28494405572918431,171.0,2,"(main, initMC, initMesh, buildMeshPartition, assignCellsToDomain, nearestCenter, addNbrsToQueue, addTupleToQueue)",6278066493267320591,8,0.536578,0.006327,0.151955,0.305139,0.858711,0.141289,0.607822,0.392178,0.112044,0.251388,9.5e-05,0.105458,0.00423,0.002261,0.002356


# Determine Topdown boundedness
Using derived metrics, determines which function paths are bound by which resources

In [11]:
pd.set_option("display.max_rows",999)

dfb = topdown.analyze_topdown_metrics(df_td)
sorted_dfb = dfb.sort_values(['depth#function', 'time.inclusive.duration'], ascending=[True, False])
sorted_dfb[['function','time.inclusive.duration', 'boundedness', 'memory_bound', 'l1_bound', 'l2_bound', 'l3_bound']].dropna()

Unnamed: 0,function,time.inclusive.duration,boundedness,memory_bound,l1_bound,l2_bound,l3_bound
0,"(main,)",37213720.0,[retiring 48.85%],0.163454,0.137907,0.003898,0.02121
10,"(main, cycleTracking)",3572462.0,[retiring 46.84%],0.185443,0.15152,0.003671,0.030098
13,"(main, initMC)",2829809.0,[retiring 53.37%],0.11897,0.110911,0.004571,0.002979
9,"(main, cycleInit)",128275.0,[retiring 53.45%],0.168545,0.161756,0.001018,0.005548
1,"(main, parseInputFile)",17550.0,[retiring 61.06%],0.149088,0.141534,-4.5e-05,0.006747
7,"(main, cycleFinalize)",2597.0,"[backend_bound 51.33%, memory_bound 7.32%, l1_bound 5.44%]",0.073193,0.054443,0.003145,0.015419
12,"(main, freeArgs)",1045.0,"[backend_bound 71.21%, memory_bound 16.85%, l1_bound 15.60%]",0.168471,0.156003,0.006754,0.005714
11,"(main, gameOver)",623.0,"[backend_bound 49.21%, memory_bound 62.30%, uncore_bound 44.30%]",0.622959,0.314746,-0.134761,0.174897
5,"(main, ~MonteCarlo)",618.0,"[backend_bound 61.74%, memory_bound 128.95%, l1_bound 68.60%]",1.28946,0.686002,0.071766,0.127417
8,"(main, coralBenchmarkCorrectness)",598.0,"[backend_bound 65.56%, memory_bound 35.20%, l1_bound 33.02%]",0.351969,0.330192,0.001093,0.015224


## Investigate the Hot Path

In [12]:
tree = hatchet.Tree(mrt.roots[0])
tree.hot_path('time.inclusive.duration')

Unnamed: 0,hash#function,time.inclusive.duration,cali.caliper.version,function,parent_hash#function,depth#function,retiring,bad_speculation,frontend_bound,backend_bound,branch_mispredict,machine_clear,frontend_latency,frontend_bandwidth,memory_bound,core_bound,mem_bound,l1_bound,l2_bound,l3_bound,uncore_bound,boundedness
0,-1605492503923617678,37213720.0,15,"(main,)",3527539,1,0.488457,0.017789,0.098757,0.394996,0.971668,0.028332,0.395029,0.604971,0.163454,0.250706,0.00044,0.137907,0.003898,0.02121,0.021649,[retiring 48.85%]
10,5784617285950147367,3572462.0,3,"(main, cycleTracking)",-1605492503923617678,2,0.468424,0.023266,0.069226,0.439084,0.989538,0.010462,0.276904,0.723096,0.185443,0.247159,0.000154,0.15152,0.003671,0.030098,0.030252,[retiring 46.84%]
28,2971441756989713502,379.0,1,"(main, cycleTracking, collapse)",5784617285950147367,3,0.346102,0.004715,0.094345,0.554837,0.90466,0.09534,0.377381,0.622619,0.23332,0.298074,0.000939,0.130255,0.021011,0.081114,0.082053,"[backend_bound 55.48%, core_bound 29.81%]"


## Find All Core-Bound Functions

In [21]:
dfb['core_bound'] = dfb['boundedness'].transform(lambda l: any('core_bound' in s for s in l))
dfb[dfb['core_bound']].sort_values('time.inclusive.duration', ascending=False)

Unnamed: 0,hash#function,time.inclusive.duration,cali.caliper.version,function,parent_hash#function,depth#function,retiring,bad_speculation,frontend_bound,backend_bound,branch_mispredict,machine_clear,frontend_latency,frontend_bandwidth,memory_bound,core_bound,mem_bound,l1_bound,l2_bound,l3_bound,uncore_bound,boundedness
9,-4045883957366341701,2945.0,2,"(main, cycleInit, PopulationControl)",5443861434990854893,3,0.355141,0.045296,0.110011,0.489552,0.783818,0.216182,0.440045,0.559955,0.180865,True,0.001987,0.165235,-0.000203,0.013846,0.015833,"[backend_bound 48.96%, core_bound 32.87%]"
63,7979470124698356043,2716.0,1,"(main, cycleInit, PopulationControl, PopulationControlGuts)",-4045883957366341701,4,0.357865,0.045741,0.110508,0.485886,0.778182,0.221818,0.442031,0.557969,0.178832,True,0.00209,0.16758,-0.000698,0.00986,0.01195,"[backend_bound 48.59%, core_bound 33.99%]"
66,5054657606233141787,2096.0,1,"(main, initMC, initMesh, MC_Mesh_Domain, bootstrapNodeMap)",-946492471260909706,5,0.339579,0.146324,0.002563,0.511534,1.0,0.0,0.010253,0.989747,0.042986,True,0.033056,-0.013019,0.022949,0.0,0.033056,"[backend_bound 51.15%, core_bound 32.49%]"
17,-1098579940061278695,1377.0,2,"(main, initMC, initTallies)",8473569804901588989,3,0.138792,0.020233,0.02463,0.816344,0.725058,0.274942,0.098522,0.901478,0.246026,True,0.033302,0.184536,0.011602,0.016586,0.049887,"[backend_bound 81.63%, core_bound 83.99%]"
33,-517809402347088441,1325.0,1,"(main, initMC, initTallies, InitializeTallies)",-1098579940061278695,4,0.13526,0.01858,0.023857,0.822304,0.719841,0.280159,0.095427,0.904573,0.248061,True,0.033044,0.189065,0.010859,0.015093,0.048137,"[backend_bound 82.23%, core_bound 85.40%]"
42,2661260046714118370,1258.0,3,"(main, cycleFinalize, CycleFinalize, PrintSummary)",-8741706490191732344,4,0.471599,-0.05187,0.023714,0.556557,0.973394,0.026606,0.094858,0.905142,0.038029,True,0.000232,0.019415,0.003824,0.014559,0.014791,"[backend_bound 55.66%, core_bound 4.11%]"
11,6661188470843515086,623.0,2,"(main, gameOver)",-1605492503923617678,2,0.288573,-0.096654,0.315943,0.492139,0.660595,0.339405,1.263771,-0.263771,0.622959,True,0.268077,0.314746,-0.134761,0.174897,0.442974,"[backend_bound 49.21%, memory_bound 62.30%, uncore_bound 44.30%]"
45,8853945655847356208,588.0,3,"(main, gameOver, Cumulative_Report)",6661188470843515086,3,0.262541,-0.07724,0.333548,0.481151,0.649317,0.350683,1.334191,-0.334191,0.645916,True,0.303523,0.312159,-0.154107,0.184342,0.487864,"[backend_bound 48.12%, memory_bound 64.59%, uncore_bound 48.79%]"
14,9123190344316702200,503.0,2,"(main, processArgs)",-1605492503923617678,2,0.211555,-0.098959,0.248954,0.63845,0.867175,0.132825,0.995816,0.004184,0.142262,True,0.001072,0.108707,0.009489,0.022993,0.024065,"[backend_bound 63.84%, core_bound 21.40%]"
28,2971441756989713502,379.0,1,"(main, cycleTracking, collapse)",5784617285950147367,3,0.346102,0.004715,0.094345,0.554837,0.90466,0.09534,0.377381,0.622619,0.23332,True,0.000939,0.130255,0.021011,0.081114,0.082053,"[backend_bound 55.48%, core_bound 29.81%]"
