In [9]:
import networkx as nx
import pandas
from hatchet import *
import os
import matplotlib.pyplot as plt
import platform
import json
from ast import literal_eval as make_tuple
from state import State

In [2]:
%matplotlib inline

In [3]:
pd.options.display.max_rows = 10
pd.options.display.float_format = '{:,.2f}'.format
plt.rcParams['figure.figsize'] = (16, 12)

## Load the datasets and create Dataframes

In [16]:
dataset = 'kripke-mvapich2'
dir_name = "/Users/jarus/ucd/Research/Visualisation/projects/CallFlow/.callflow/"

In [17]:
def replace_str_with_Node(df, graph):
    mapper = {}
    def dfs_recurse(root):
        for node in root.children: 
            mapper[node.callpath[-1]] = Node(node.nid, node.callpath, None)
            dfs_recurse(node)
    for root in graph.roots:
        mapper[root.callpath[-1]] = Node(root.nid, root.callpath, None)
        dfs_recurse(root)
    df['node'] = df['node'].apply(lambda node: mapper[node] if node in mapper else '')
    return df

In [20]:
def read_gf(name):
    state = State()
    path = dir_name + dataset
    df_filepath = path + '/group_df.csv'
    entire_df_filepath = path + '/entire_df.csv'
    graph_filepath = path + '/filter_graph.json'
    entire_graph_filepath = path + '/entire_graph.json'   

    with open(graph_filepath, 'r') as graphFile:
        data = json.load(graphFile)

    state.gf = GraphFrame()
    state.gf.from_literal(data)

    with open(entire_graph_filepath, 'r') as entire_graphFile:
        entire_data = json.load(entire_graphFile)
            
    state.entire_gf = GraphFrame()
    state.entire_gf.from_literal(entire_data)

    state.df = pd.read_csv(df_filepath)
    state.entire_df = pd.read_csv(entire_df_filepath)

    state.graph = state.gf.graph
    state.entire_graph = state.entire_gf.graph


    # replace df['node'] from str to the Node object.
    state.df = replace_str_with_Node(state.df, state.graph)
    state.entire_df = replace_str_with_Node(state.entire_df, state.entire_graph)

    # add path to the dataframes. 
    # state.df['path'] = state.df['node'].apply(lambda node: node.callpath)
    # state.entire_df['path'] = state.entire_df['node'].apply(lambda node: node.callpath if node else [])

    return state

In [21]:
states = {}
states[name] = read_gf(name)
df = states[dataset].df
graph = states[dataset].graph

## Gets the paths of functions inside the module and converts to a dataframe.


In [28]:
def getHierarchyfromdf(state, module):
    df = state.df
    paths = []
    func_in_module = df.loc[df['module'] == module]['name'].unique().tolist()
    print("Number of functions inside the {0} module: {1}".format(module, len(func_in_module)))
    for idx, func in enumerate(func_in_module):
        mean_inc_time = df.loc[df['name'] == func]['time (inc)'].mean()
        mean_exc_time = df.loc[df['name'] == func]['time'].mean()
        paths.append({
            "module": module,
            "opath": df.loc[df['name'] == func]['path'].unique().tolist()[0],
            "path": df.loc[df['name'] == func]['component_path'].unique().tolist()[0],
            "inc_time" : df.loc[df['name'] == func]['time (inc)'].mean(),
            "exclusive" : df.loc[df['name'] == func]['time'].mean(),
#             "imbalance_perc" : df.loc[df['name'] == func]['imbalance_perc'].mean(),
            "component_level": df.loc[df['name'] == func]['component_level'].unique().tolist()[0],
        })
    return pd.DataFrame(paths)

In [29]:
for idx, state_name in enumerate(states):
    state = states[state_name]
    modules = state.df['module'].unique().tolist()
    for idx, module in enumerate(modules):
        paths = getHierarchyfromdf(state, module)
        state.paths_df = paths 
        #paths.to_csv(str(module + ".csv"))

Number of functions inside the libmonitor.so.0.0.0 module: 1
Number of functions inside the kripke module: 10
Number of functions inside the Unknown(NA) module: 2
Number of functions inside the libc-2.17.so module: 1
Number of functions inside the libmpi.so.12.0.5 module: 9


## Methods to add data into NxGraph

In [30]:
def add_levels(state):
    levelMap = {}
    track_level = 0
    nodes = state.g.nbunch_iter(state.root)
    
    for start_node in nodes:
        print("Start node", start_node)
        active_nodes = [start_node]
        levelMap[state.root] = 0
        
        for edge in nx.edge_dfs(state.g, start_node, 'original'):
            #rint("Edge {0}".format(edge))
            head_level = None
            tail_level = None
            head, tail = edge[0], edge[1]
            
            if head != start_node:
                active_nodes.append(head)
                
            if head in active_nodes and head != start_node and tail in active_nodes:
                #rint("Cycle", edge)
                edge_data = state.g.get_edge_data(*edge)                                                                             
                state.g.add_node(tail+'_')                                                                                           
                state.g.add_edge(head, tail+'_', data=edge_data)                                                                     
                state.g.node[tail+'_']['name'] = [tail + '_']                                                                        
                #state.g.node[tail+'_']['weight'] = state.g.node[tail]['weight']   
                state.g.remove_edge(edge[0], edge[1])
    return levelMap

In [31]:
 def flow_map(state):                                                                                                                 
        flowMap = {}                                                                                                                    
        nodes = state.g.nbunch_iter(state.root)                                                                                           
        for start_node in nodes:                                                                                                        
            for edge in nx.edge_dfs(state.g, start_node, 'original'):                                                                    
                head_level = None                                                                                                       
                tail_level = None                                                                                                       
                head, tail = self.tailhead(edge)                                                                                        
                                                                                                                                        
                # Check if there is an existing level mapping for the head node and assign.                                             
                if head in self.level_mapping.keys():                                                                                   
                    head_level =  self.level_mapping[head]                                                                              
                                                                                                                                        
                # Check if there is an existing level mapping for the tail node and assign.                                             
                if tail in self.level_mapping.keys():                                                                                   
                    tail_level = self.level_mapping[tail]                                                                               
                                                                                                                                        
                flowMap[(edge[0], edge[1])] = (int(head_level), int(tail_level))                                                        
        return flowMap 

In [32]:
def calculate_flows(state):
    graph = state.g
    ret = {}                                                                                                                                                                                                                                                                          
    edges = graph.edges()                                                                                                                                                                                                                                                             
    additional_flow = {}                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                          
    # Calculates the costs in cycles and aggregates to one node.                                                                                                                                                                                                                      
    for edge in edges:                                                                                                                                                                                                                                                                
        source = edge[0]                                                                                                                                                                                                                                                              
        target = edge[1]                                                                                                                                                                                                                                                              
                                                                                                                                                                                                                                                                                          
        if source.endswith('_'):                                                                                                                                                                                                                                                      
            cycle_node = source                                                                                                                                                                                                                                                       
            cycle_node_df = self.state.lookup_with_nodeName(cycle_node[:-1])                                                                                                                                                                                                      
            additional_flow[cycle_node] = cycle_node_df['CPUTIME (usec) (I)'].max()                                                                                                                                                                                                   
        elif target.endswith('_'):                                                                                                                                                                                                                                                    
            cycle_node = target                                                                                                                                                                                                                                                       
            cycle_node_df = state.lookup_with_nodeName(cycle_node[:-1])                                                                                                                                                                                                      
            additional_flow[cycle_node] = cycle_node_df['CPUTIME (usec) (I)'].max()                                                                                                                                                                                                   
                                                                                                                                                                                                                                                                                          
    for edge in edges:                                                                                                                                                                                                                                                                
        added_flow = 0                                                                                                                                                                                                                                                                
        if edge[0].endswith('_'):                                                                                                                                                                                                                                                     
            ret[edge] = additional_flow[edge[0]]                                                                                                                                                                                                                                      
            continue                                                                                                                                                                                                                                                                  
        elif edge[1].endswith('_'):                                                                                                                                                                                                                                                   
            ret[edge] = additional_flow[edge[1]]                                                                                                                                                                                                                                      
            continue                                                                                                                                                                                                                                                                  
        source = state.lookup_with_nodeName(edge[0])                                                                                                                                                                                                                         
        target = state.lookup_with_nodeName(edge[1])                                                                                                                                                                                                                         
                                                                                                                                                                                                                                                                                          
        source_inc = source['time (inc)'].max()                                                                                                                                                                                                                               
        target_inc = target['time (inc)'].max()                                                                         
                                                                                                                                                                                                                                                                                          
        if source_inc == target_inc:                                                                                                                                                                                                                                                  
            ret[edge] = source_inc                                                                                                                                                                                                                                                    
        else:                                                                                                                                                                                                                                                                         
            ret[edge] = target_inc    
    return ret   

In [34]:
def add_edge_attributes(state):
    capacity_mapping = calculate_flows(state)    
    nx.set_edge_attributes(state.g, name='weight', values=capacity_mapping)

In [114]:
def add_node_attributes(state):
    module_mapping = generic_map(state.df, state.g.nodes(), 'module')
    nx.set_node_attributes(state.g, name='module', values=module_mapping)
    
    time_inc_mapping = generic_map(state.df, state.g.nodes(), 'time (inc)')
    nx.set_node_attributes(state.g, name='time (inc)', values=time_inc_mapping)

    time_mapping = generic_map(state.df, state.g.nodes(), 'time')
    nx.set_node_attributes(state.g, name="time", values=time_mapping)

#     name_mapping = generic_map(state.df, state.g.nodes(), 'vis_node_name')
#     nx.set_node_attributes(state.g, name='name', values=name_mapping)

#     type_mapping = generic_map(state.df, state.g.nodes(), 'type')
#     nx.set_node_attributes(state.g, name='type', values=type_mapping)

#     n_index_mapping = generic_map(state.df, state.g.nodes(), 'n_index')
#     nx.set_node_attributes(state.g, name='n_index', values=n_index_mapping)

#     module_mapping = generic_map(state.df, state.g.nodes(), 'module')
#     nx.set_node_attributes(state.g, name='module', values=module_mapping)

#     mod_index_mapping = generic_map(state.df, state.g.nodes(), 'mod_index')
#     nx.set_node_attributes(state.g, name='mod_index', values=mod_index_mapping)

#     children_mapping = immediate_children()
#     nx.set_node_attributes(self.g, name='children', values=children_mapping)
        
#     entry_function_mapping = self.generic_map(self.g.nodes(), 'entry_functions')
#     nx.set_node_attributes(self.g, name='entry_functions', values=entry_function_mapping)

In [80]:
def no_cycle_path(path):
    ret = []
    mapper = {}
    path = make_tuple(path)
    for idx, elem in enumerate(path):
        if elem not in mapper:
            mapper[elem] = 1
            ret.append(elem)
        else:
            ret.append(elem + "_" + str(mapper[elem]))
            mapper[elem] += 1
    return tuple(ret)

In [73]:
def add_paths(state, path_name):
    for idx, row in state.df.iterrows():
        path = row[path_name]
        corrected_path = no_cycle_path(path)
        state.g.add_path(corrected_path)

## Create NxGraph

In [136]:
def callgraph_init():
    state = states[name]
    state.g = nx.DiGraph()
    state.root = state.lookup_with_node(state.graph.roots[0])['name'][0]
    state.rootInc = state.lookup_with_node(state.graph.roots[0])['time (inc)'].max()
        
    add_paths(state, 'group_path')
    add_node_attributes(state)
    
callgraph_init()

libmonitor.so.0.0.0
kripke
Unknown(NA)
kripke:Kernel_3d_DGZ::LTimes
libc-2.17.so
kripke:Kernel_3d_DGZ::scattering
kripke:Kernel_3d_DGZ::LPlusTimes
kripke:SweepSubdomains
Unknown(NA):Loop@<unknown file> [kripke]:0
kripke:SweepComm::readySubdomains
Unknown(NA):Loop@<unknown file> [kripke]:0_1
libmpi.so.12.0.5
kripke:Kernel_3d_DGZ::sweep
kripke:Grid_Data::particleEdit


In [137]:
def generic_map(df, nodes, attr):
        ret = {}
        group_by = 'module'
        for node in nodes:
            if attr == 'time (inc)':
                if group_by == 'module':
                    group_df = df.groupby([group_by]).mean()
                elif group_by == 'name':
                    group_df = df.groupby([group_by]).mean()
                    
                if ':' in node:
                    corrected_node = node.split(':')[0]
                else:
                    corrected_node = node
                print(node)
                
                ret[node] = group_df.loc[corrected_node, 'time (inc)']
            
            elif attr == 'entry_functions':
                module_df = df.loc[df['module'] == node]
                entry_func_df = module_df.loc[(module_df['component_level'] == 2)]
                if(entry_func_df.empty):
                    ret[node] = json.dumps({
                        'name': '',
                        'time': [],
                        'time (inc)': []
                    })
                else:
                    name = entry_func_df['name'].unique().tolist()
                    time = entry_func_df['time'].mean().tolist()
                    time_inc = entry_func_df['time (inc)'].mean().tolist()
                
                    ret[node] = json.dumps({
                        'name': entry_func_df['name'].unique().tolist(),
                        'time': entry_func_df['time'].mean().tolist(),
                        'time (inc)': entry_func_df['time (inc)'].mean().tolist()
                    })

            elif attr == 'imbalance_perc':
                module_df = df.loc[df['module'] == node]
                max_incTime = module_df['time (inc)'].max()
                min_incTime = module_df['time (inc)'].min()
                avg_incTime = module_df['time (inc)'].mean()

                ret[node] = (max_incTime - avg_incTime)/max_incTime

            elif attr == 'time':
                module_df = df.loc[df['module'] == node]
                if group_by == 'module':
                    group_df = df.groupby([group_by]).max()
                elif self.group_by == 'name':
                    group_df = df.groupby([group_by]).mean()
                    
                if ':' in node:
                    corrected_node = node.split(':')[0]
                else:
                    corrected_node = node
                ret[node] = group_df.loc[corrected_node, 'time']
                
            elif attr == 'module':
                module_df = df.loc[df[group_by] == node][attr]
                ret[node] = list(set(module_df.tolist()))
                
            else:
                df = df.loc[df['vis_node_name'] == node][attr]
                if df.empty:
                    ret[node] = df[df[groupby] == node][attr]
                else:
                    ret[node] = list(set(df[df['vis_node_name'] == node][attr].tolist()))            
        return ret

In [138]:
for node in states[dataset].g.nodes(data=True):
    print(node)

('libmonitor.so.0.0.0', {'module': ['libmonitor.so.0.0.0'], 'time (inc)': 194516926.140625, 'time': 0.0})
('kripke', {'module': ['kripke'], 'time (inc)': 65521685.884183235, 'time': 78005891.0})
('Unknown(NA)', {'module': ['Unknown(NA)'], 'time (inc)': 43454729.45870536, 'time': 61528596.0})
('kripke:Kernel_3d_DGZ::LTimes', {'module': [], 'time (inc)': 65521685.884183235, 'time': 78005891.0})
('libc-2.17.so', {'module': ['libc-2.17.so'], 'time (inc)': 2276253.6171875, 'time': 0.0})
('kripke:Kernel_3d_DGZ::scattering', {'module': [], 'time (inc)': 65521685.884183235, 'time': 78005891.0})
('kripke:Kernel_3d_DGZ::LPlusTimes', {'module': [], 'time (inc)': 65521685.884183235, 'time': 78005891.0})
('kripke:SweepSubdomains', {'module': [], 'time (inc)': 65521685.884183235, 'time': 78005891.0})
('Unknown(NA):Loop@<unknown file> [kripke]:0', {'module': [], 'time (inc)': 43454729.45870536, 'time': 61528596.0})
('kripke:SweepComm::readySubdomains', {'module': [], 'time (inc)': 65521685.884183235,

In [69]:
print(states[dataset].df['vis_node_name'])

0       libmonitor.so.0.0.0
1       libmonitor.so.0.0.0
2       libmonitor.so.0.0.0
3       libmonitor.so.0.0.0
4       libmonitor.so.0.0.0
               ...         
5246            Unknown(NA)
5247            Unknown(NA)
5248            Unknown(NA)
5249            Unknown(NA)
5250            Unknown(NA)
Name: vis_node_name, Length: 5251, dtype: object
