# Import Pandas and Plotly (offline mode)

In [21]:
import pandas as pd

import plotly.graph_objs as go
from plotly.offline import download_plotlyjs, init_notebook_mode, plot, iplot

pd.set_option("display.max_columns", 0) # don't truncate columns
pd.set_option("display.max_colwidth", 0) # don't truncate column values

init_notebook_mode(connected=True)

# Read in Caliper Data 

Caliper key-value data formatted in JSON can be read directly with Pandas.

This particular dataset was generated using the configuration file [configs/data_centric_PEBS.config](https://github.com/LLNL/caliper-examples/blob/master/configs/data_centric_PEBS.conf)

Now, create a Pandas dataframe from the JSON data using `read_json` as follows:

In [22]:
df = pd.read_json('../datasets/lulesh_latency_analysis.json')
df[['function','libpfm.operation','libpfm.memory_level','libpfm.hit_type','libpfm.weight']].head()

Unnamed: 0,function,libpfm.operation,libpfm.memory_level,libpfm.hit_type,libpfm.weight
0,main/LagrangeLeapFrog/LagrangeNodal/CalcForceForNodes,,,,
1,main/LagrangeLeapFrog/LagrangeNodal/CalcForceForNodes/CalcVolumeForceForElems/IntegrateStressForElems,Load,L1,Hit,15.0
2,main/LagrangeLeapFrog/LagrangeNodal/CalcForceForNodes/CalcVolumeForceForElems/CalcHourglassControlForElems,Load,L1,Hit,319.0
3,main/LagrangeLeapFrog/LagrangeNodal/CalcForceForNodes/CalcVolumeForceForElems/CalcHourglassControlForElems/CalcFBHourglassForceForElems,Load,L1,Hit,12.0
4,main/LagrangeLeapFrog/LagrangeNodal/CalcForceForNodes/CalcVolumeForceForElems/CalcHourglassControlForElems/CalcFBHourglassForceForElems,Load,L2,Hit,16.0


In [23]:
df.columns

Index(['alloc.index#libpfm.addr', 'alloc.label#libpfm.addr',
       'alloc.total_size', 'alloc.uid', 'alloc.uid#libpfm.addr',
       'cali.caliper.version', 'cali.event.begin', 'cali.event.end',
       'cali.event.set', 'callpath.address', 'event.begin#function',
       'event.begin#iteration#lulesh.cycle', 'event.begin#loop',
       'event.begin#lulesh.region', 'event.end#function',
       'event.end#iteration#lulesh.cycle', 'event.end#loop',
       'event.end#lulesh.region', 'event.set#pthread.id', 'function',
       'instruction.op#callpath.address',
       'instruction.read_size#callpath.address',
       'instruction.write_size#callpath.address', 'iteration#lulesh.cycle',
       'libpfm.addr',
       'libpfm.counter.MEM_TRANS_RETIRED:LATENCY_ABOVE_THRESHOLD',
       'libpfm.cpu', 'libpfm.data_src', 'libpfm.event_sample_name',
       'libpfm.hit_type', 'libpfm.memory_level', 'libpfm.operation',
       'libpfm.snoop', 'libpfm.tid', 'libpfm.time', 'libpfm.tlb',
       'libpfm.weight',

## Deriving the Base Function Name

In [24]:
import os
df = df.sort_values('libpfm.time')
df['base_function'] = df['function'].transform(lambda s: os.path.basename(str(s)))

# Data Aggregations

## Data-Centric Profiles

First, we plot the sum of latencies per allocation:

In [25]:
df_data_centric = df[['alloc.label#libpfm.addr', 'libpfm.weight']].groupby('alloc.label#libpfm.addr').sum()
data = [
    go.Bar(
        name='Allocation Quantity per Function',
        x=df_data_centric.index, 
        y=df_data_centric['libpfm.weight']
    )
]
iplot(data)

Looks like `fy`, `fz`, `nodelist`, `z`, and `zd`  stand out, but they may have just been accessed more frequently, rather than inefficiently.

Lets do the same plot, but average the latencies instead of sum them:

In [26]:
df_data_centric = df[['alloc.label#libpfm.addr', 'libpfm.weight']].groupby('alloc.label#libpfm.addr').mean()
data = [
    go.Bar(
        name='Allocation Quantity per Function',
        x=df_data_centric.index, 
        y=df_data_centric['libpfm.weight']
    )
]
iplot(data)

Now it looks like `dzz`, `dxx`, stand out. Lets take a look at that data:

In [33]:
df_dzz = df[df['alloc.label#libpfm.addr'].isin(['dzz', 'dxx', 'delx_xi', 'delx_eta'])]

df_dzz[['function',
        'alloc.label#libpfm.addr',
        'libpfm.operation',
        'libpfm.memory_level', 
        'libpfm.hit_type', 
        'libpfm.weight',
        'libpfm.snoop']].sort_values('libpfm.weight', ascending=False)

Unnamed: 0,function,alloc.label#libpfm.addr,libpfm.operation,libpfm.memory_level,libpfm.hit_type,libpfm.weight,libpfm.snoop
10796,main/LagrangeLeapFrog/LagrangeElements/CalcLagrangeElements,dzz,Load,LFB,Hit,1659.0,
5122,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_xi,Load,Remote RAM 1 Hop,Hit,1499.0,
10357,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_eta,Load,Remote RAM 1 Hop,Hit,1383.0,
7819,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_xi,Load,Remote Cache 1 Hops,Hit,1362.0,Hit Modified
8210,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_xi,Load,Remote RAM 1 Hop,Hit,1333.0,
13645,main/LagrangeLeapFrog/LagrangeElements/CalcLagrangeElements,dxx,Load,LFB,Hit,1108.0,
5964,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_xi,Load,Remote RAM 1 Hop,Hit,898.0,
4097,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_eta,Load,Remote RAM 1 Hop,Hit,814.0,
14634,main/LagrangeLeapFrog/LagrangeElements/CalcQForElems/CalcMonotonicQRegionForElems,delx_eta,Load,Local RAM,Hit,597.0,
4946,main/LagrangeLeapFrog/LagrangeElements/CalcLagrangeElements,dxx,Load,LFB,Hit,556.0,


## Call Tree Aggregations of Data-Centric Profiles

We are first and foremost interested in functions that contribute most to execution time.

We can combine our allocation label information with call tree aggregation in Hatchet to narrow down accesses inside of long-executing functions.

In [44]:
from cali_analysis import hatchet

pd.options.mode.chained_assignment = None  # default='warn'

# Derive a tuple from the function path by splitting on '/'
df_callpaths = df.dropna(subset=['function'])
df_callpaths['function'] = df_callpaths['function'].transform(lambda l: tuple(l.split('/')))
df_callpaths['alloc.label'] = df_callpaths['alloc.label#libpfm.addr'].transform(lambda l: set([l]) )

# Construct a call tree, aggregating allocation sizes at each node
mrt = hatchet.MultiRootTree.from_samples(df_callpaths, 'function', {
    'time.inclusive.duration' : 'max',
    'libpfm.weight' : 'sum',
    'alloc.label' : lambda s: set.union(*s)
})
mrt.df_nodes[['depth#function', 'function', 'time.inclusive.duration', 'libpfm.weight']].sort_values(['depth#function', 'time.inclusive.duration'], ascending=[True, False])

Unnamed: 0,depth#function,function,time.inclusive.duration,libpfm.weight
0,1,"(main,)",7356803.0,1498571.0
2,2,"(main, LagrangeLeapFrog)",426843.0,1494949.0
0,2,"(main, TimeIncrement)",779.0,0.0
1,3,"(main, LagrangeLeapFrog, LagrangeNodal)",268656.0,1133676.0
0,3,"(main, LagrangeLeapFrog, LagrangeElements)",162764.0,349359.0
3,3,"(main, LagrangeLeapFrog, CalcTimeConstraintsForElems)",71942.0,11914.0
10,4,"(main, LagrangeLeapFrog, LagrangeNodal, CalcForceForNodes)",247724.0,1081581.0
5,4,"(main, LagrangeLeapFrog, LagrangeElements, ApplyMaterialPropertiesForElems)",100833.0,202305.0
3,4,"(main, LagrangeLeapFrog, LagrangeElements, CalcQForElems)",56844.0,89149.0
9,4,"(main, LagrangeLeapFrog, LagrangeElements, CalcLagrangeElements)",29421.0,57344.0


In [45]:
# Get the first and only tree, rooted at 'main'
tree = hatchet.Tree(mrt.roots[0])
hp = tree.hot_path('time.inclusive.duration')
hp[['time.inclusive.duration']]

Unnamed: 0,hash#function,time.inclusive.duration,libpfm.weight,alloc.label,function,parent_hash#function,depth#function
0,3257255658422958958,7356803.0,1498571.0,"{nan, letam, ql, dyy, ss, dxx, elemMass, delv, fz, lzetap, fx, volo, delv_xi, v, lxip, x, delx_eta, zdd, zd, qq, xd, delx_zeta, y, elemBC, fy, ydd, arealg, lxim, xdd, nodalMass, vdov, z, delv_zeta, e, dzz, delv_eta, yd, q, letap, p, lzetam, nodelist, delx_xi}","(main,)",3527539,1
2,7521264667305038957,426843.0,1494949.0,"{nan, letam, ql, dyy, ss, dxx, elemMass, delv, fz, lzetap, fx, volo, delv_xi, v, lxip, x, delx_eta, zdd, zd, qq, xd, delx_zeta, elemBC, y, fy, ydd, arealg, lxim, xdd, nodalMass, vdov, z, delv_zeta, e, dzz, delv_eta, yd, q, letap, p, lzetam, nodelist, delx_xi}","(main, LagrangeLeapFrog)",3257255658422958958,2
1,-3352412037358043775,268656.0,1133676.0,"{nan, ss, elemMass, delv, fz, fx, volo, v, x, zdd, zd, xd, qq, y, fy, ydd, xdd, nodalMass, z, yd, q, p, nodelist}","(main, LagrangeLeapFrog, LagrangeNodal)",7521264667305038957,3
10,7232988268055804628,247724.0,1081581.0,"{nan, fy, ss, xd, nodalMass, elemMass, z, delv, fz, fx, volo, v, yd, q, p, x, nodelist, zd, qq, y}","(main, LagrangeLeapFrog, LagrangeNodal, CalcForceForNodes)",-3352412037358043775,4
16,8298605949823600296,247001.0,1081379.0,"{nan, fy, ss, nodalMass, elemMass, z, delv, fz, fx, volo, v, yd, q, p, qq, x, nodelist, zd, xd, y}","(main, LagrangeLeapFrog, LagrangeNodal, CalcForceForNodes, CalcVolumeForceForElems)",7232988268055804628,5
5,-4055909081709070579,186653.0,712526.0,"{nan, fy, ss, elemMass, delv, z, fz, fx, volo, v, yd, q, qq, x, nodelist, zd, xd, y}","(main, LagrangeLeapFrog, LagrangeNodal, CalcForceForNodes, CalcVolumeForceForElems, CalcHourglassControlForElems)",8298605949823600296,6
12,2825115130409026931,93395.0,361098.0,"{nan, fx, fy, yd, q, ss, xd, nodelist, elemMass, zd, qq, fz}","(main, LagrangeLeapFrog, LagrangeNodal, CalcForceForNodes, CalcVolumeForceForElems, CalcHourglassControlForElems, CalcFBHourglassForceForElems)",-4055909081709070579,7
