In [1]:
import networkx as nx
import pandas
from hatchet import *
import os
import matplotlib.pyplot as plt
import seaborn.apionly as sns
import igraph
import platform

examples.directory is deprecated; in the future, examples will be found relative to the 'datapath' directory.
  "found relative to the 'datapath' directory.".format(key))
The text.latex.unicode rcparam was deprecated in Matplotlib 2.2 and will be removed in 3.1.
  "2.2", name=key, obj_type="rcparam", addendum=addendum)


In [2]:
# libraries for gromov distance computation.
import numpy as np
import scipy as sp
import ot
from sklearn.decomposition import PCA
from sklearn import manifold

In [3]:

from actions.groupBy import groupBy
from state import State
from callgraph import CallGraph
from preprocess import PreProcess

In [4]:
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:,.2f}'.format
plt.rcParams['figure.figsize'] = (16, 12)

In [5]:
# Linux path
if platform.system() == "Linux":
	callflow_path = "/home/vidi/Work/llnl/CallFlow/"
else:
	#Mac OSx path
	callflow_path = "/Users/jarus/ucd/Research/Visualisation/projects/CallFlow"

dataset_path = ["data/lulesh-1/db-ampi4-100-1", "data/lulesh-1/db-ampi4-100-8"]
dataset = ['db-ampi4-100-1', 'db-ampi4-100-8']  

In [6]:
# Create Graphframes.
def create_gfs(file_format, paths):    
	print("Creating graphframes....")                                                                                             
	ret = []                                                                                                                         
	for idx, path in enumerate(paths):
		path = os.path.abspath(os.path.join(callflow_path, path)) 
		gf = GraphFrame()   
		gf.from_hpctoolkit(path, 3)                                                                            
		ret.append(gf) 
		print(str(idx) + ":" + path)                                                                                              
	return ret 

In [7]:
# util functions
def lookup(df, node):                                                                                                                    
	return df.loc[df['node'] == node] 

def lookup_with_name(df, name):
	return df.loc[df['name'] == name]

def getMaxIncTime(gf):                                                                                                                   
	ret = 0.0                                                                                                                            
	for root in gf.graph.roots:                                                                                                          
		ret = max(ret, lookup(gf.dataframe, root)['CPUTIME (usec) (I)'].max())                                                           
	return ret                                                                                                                           
																																		 
def getMaxExcTime(gf):                                                                                                                   
	ret  = gf.dataframe['CPUTIME (usec) (E)'].max()                                                                                      
	return ret                                                                                                                           
			   
def special_lookup(gf, df_index):   
	return gf.dataframe.loc[gf.dataframe['name'] == df_index] 

In [8]:
# Filter graphframe and graph
def filter_gfs(gfs, filterBy):                                                                                                   
	# Create the graph frames from the paths and corresponding format using hatchet                                                  
	fgfs = []                                                                                                                        
	# Filter graphframes based on threshold                                                                                          
	for idx, gf in enumerate(gfs):                                                                                              
		print("Filtering the dataframe!")                                                                                         
		if filterBy == "IncTime":                                                                                          
			max_inclusive_time = getMaxIncTime(gf)                                                                             
			filter_gf = gf.filter(lambda x: True if(x['CPUTIME (usec) (I)'] > 0.01*max_inclusive_time) else False)                   
		elif self.args.filterBy == "ExcTime":                                                                                        
			max_exclusive_time = getMaxExcTime(gf)                                                                             
			print('[Filter] By Exclusive time = {0})'.format(max_exclusive_time))                                                 
			filter_gf = gf.filter(lambda x: True if (x['CPUTIME (usec) (E)'] > 0.01*max_exclusive_time) else False)                  
		else:                                                                                                                        
			print("Not filtering.... Can take forever. Thou were warned")                                                         
			filter_gf = gf                                                                                                           
		print('[Filter] Removed {0} rows.)'.format(gf.dataframe.shape[0] - filter_gf.dataframe.shape[0]))                                                                                                                            
		print("Grafting the graph!")                                                                                            
		filter_gf = filter_gf.graft()                                                                                                
		print("[Graft] {0} rows left".format(filter_gf.dataframe.shape[0]))                           
		fgfs.append(filter_gf)                                                                                                       
	return fgfs

In [12]:
# add n_index to the dataframe.
def add_n_index(gf):
	gf.dataframe['n_index'] = gf.dataframe.groupby('nid').ngroup()

def df_index_name_mapper(graph, df):
	ret = {}
	node_count = 0
	root = graph.roots[0]
	node_gen = graph.roots[0].traverse()
	try:
		while root.callpath != None:
			node_count += 1
			root = next(node_gen)
			ret[root.callpath[-1]] = root.df_index
	except StopIteration:
		pass
	finally:
		print("Total nodes in graph: ", node_count)
		del root
	return ret

# add df_index to the dataframe
def add_df_index(gf):
	df_index_name_map = df_index_name_mapper(gf.graph, gf.dataframe)
	gf.dataframe['df_index'] = gf.dataframe['name'].apply(lambda node: df_index_name_map[node] if node in df_index_name_map else 'as ')   

In [11]:
# add callee and caller data into the dataframe
def add_callers_and_callee(graph, df):
	callees = {}
	callers = {}
	root = graph.roots[0]
	node_gen = graph.roots[0].traverse()
	root_df = root.callpath[-1]
	callers[root_df] = []
	callees[root_df] = []
	try:                                                                                                                        
		while root.callpath != None:                                                                                            
			root = next(node_gen)                                                                                               
			if root.parent:                                                                                                     
				root_df = root.callpath[-1]                                                                                     
				parent_df = root.parent.callpath[-1]                                                                            
				if parent_df not in callees:                                                                                    
					callees[parent_df] = []              
				callees[parent_df].append(root_df)                                                                              
																																		
				if root_df not in callers:                                                                                      
					callers[root_df] = []                                                                                       
				callers[root_df].append(parent_df)                                                                              
																																		
	except StopIteration:                                                                                                       
		pass                                                                                                                    
	finally:                                                                                                                    
		del root                                                                                                                
																																		
	df['callees'] = df['name'].apply(lambda node: callees[node] if node in callees else [])                           
	df['callers'] = df['name'].apply(lambda node: callers[node] if node in callers else []) 

In [10]:
#pre-process dataframe to add more information. 
def preprocess(state):
	preprocess = PreProcess.Builder(state).add_df_index().add_n_index().add_mod_index().add_path().add_callers_and_callees().add_show_node().add_vis_node_name().update_module_name().clean_lib_monitor().add_max_incTime().add_incTime().add_excTime().add_avg_incTime().add_imbalance_perc().build() 


#NetworkX stuff.
def create_nx_graph(state):
	g = nx.DiGraph()
	return g

In [40]:
def main(dataset_path):
	dataset = []
	for idx, path in enumerate(dataset_path):
		dataset.append(path.split('/')[0])

	gfs = create_gfs('hpctoolkit', dataset_path)
	# filtered graph frames.
	fgfs = filter_gfs(gfs, 'IncTime')  
	
	states = []
	for idx, fgf in enumerate(fgfs):
		print("Shape of the dataframe from graph ({0}): {1}".format(dataset[idx], fgf.dataframe.shape))
		state = State(fgf)
		preprocess(state)
		#groupBy(state, 'module')
		#create_nx_graph(state)
		states.append(state)
	return states

In [41]:
dataset_path = ["data/lulesh-1/db-ampi4-100-1"]
states = main(dataset_path)

Creating graphframes....
0:/home/vidi/Work/llnl/CallFlow/data/lulesh-1/db-ampi4-100-1
Filtering the dataframe!
(185290, 10)
[Filter] Removed 184465 rows.)
Grafting the graph!
[Graft] 825 rows left
Shape of the dataframe from graph (data): (825, 10)
Total nodes in the graph 114


In [21]:
props = {
    "lulesh2.0" :{
        "name" : "lulesh",
        "files" : [ "mpi-linux-x86_64-ifort-mpicxx/tmp/", "mpi-linux-x86_64-ifort-mpicxx/", "mpi-linux-x86_64-ifort-mpicxx/tmp/libs/ck-libs/", "mpi-linux-x86_64-ifort-mpicxx/tmp/libs/conv-libs/"],
        "functions" : [] 
    },
    "libmpi.so.12.0.5" :{
        "name" : "MPI",
        "files" : ["../src/mpi/"],
        "functions" : []
    }
}

In [23]:
def groupby(df, keys, metric = 'mean'):
    # Groups data by the keys provided
    groups = df.groupby(keys)
    measure = getattr(groups, metric)
    data = measure() 
    return data

In [68]:
df = states[0].df

def rename_module_names(df, props):
    module_names = list(props.keys())
    for module_name in module_names:
        rename_to = props[module_name]['name']
        files = props[module_name]['files']
        function = props[module_name]['functions']
        for idx, row in df.iterrows():
            if row.module:
                if(row.module == module_name):
                    df.set_value(idx, 'module', rename_to)
                else:
                    pass
                
        print(df['file'].str.contains(files[0], regex=True))

    print(df['module'].unique().tolist())


rename_module_names(df, props)
print(df[df['module']=='Unkno']['name'].unique().tolist())
group_df = groupby(df, ['name', 'rank'])
print(group_df)
for idx, row in df.iterrows():
    pass

node                  rank
<partial call paths>  0.00    False
                      1.00    False
                      2.00    False
                      3.00    False
                      4.00    False
                              ...  
Loop@lulesh.cc:2327   0.00    False
                      2.00    False
                      4.00    False
                      5.00    False
                      6.00    False
Name: file, Length: 825, dtype: bool
node                  rank
<partial call paths>  0.00    False
                      1.00    False
                      2.00    False
                      3.00    False
                      4.00    False
                              ...  
Loop@lulesh.cc:2327   0.00    False
                      2.00    False
                      4.00    False
                      5.00    False
                      6.00    False
Name: file, Length: 825, dtype: bool
['<unknown load module>', 'libc-2.17.so', 'lulesh', 'Unkno']
['Loop@lulesh.cc:27

Defaulting to column but this will raise an ambiguity error in a future version
  This is separate from the ipykernel package so we can avoid doing imports until
