In [30]:
import json
import os

from hatchet import *
import matplotlib.cm as cm
import matplotlib.pyplot as plt
import mpld3
import numpy as np
import pandas
import pandas as pd
import seaborn.apionly as sns

In [18]:
#from actions.groupBy import groupBy
from state import State
from preprocess import PreProcess

In [19]:
%matplotlib inline

In [20]:
# Notebook-wide display settings: print at most 10 DataFrame rows, render
# floats with a thousands separator and two decimals, and use 16x12 figures.
pd.set_option('display.max_rows', 10)
pd.set_option('display.float_format', '{:,.2f}'.format)
plt.rcParams.update({'figure.figsize': (16, 12)})

# Change the directory name according to your system

In [21]:
# Directory that contains one sub-directory per dataset (see read_gf).
# Set the CALLFLOW_DIR environment variable to point at your own copy; when it
# is unset the original hard-coded location is used, so existing setups keep
# working unchanged.
dirname = os.environ.get(
    "CALLFLOW_DIR",
    "/Users/jarus/ucd/Research/Visualisation/projects/CallFlow/.callflow",
)
# dirname = "/Users/padmanabanke1/CallFlow/.callflow"
# dirname = "/home/vidi/Work/llnl/CallFlow/.callflow"

## Load the datasets and create DataFrames

In [22]:
def replace_str_with_Node(df, graph):
    """Swap the string entries of ``df['node']`` for ``Node`` objects.

    Every node reachable from ``graph.roots`` (visited parent first, then its
    children in order) is registered under the last element of its callpath as
    a fresh ``Node(nid, callpath, None)``. When two nodes share that last
    element, the one visited later replaces the earlier one.

    Args:
        df: DataFrame with a ``'node'`` column; it is modified in place.
        graph: object exposing ``roots``; each node needs ``nid``,
            ``callpath`` and ``children``.

    Returns:
        The same ``df`` object, with unmatched ``'node'`` values set to ``''``.
    """
    name_to_node = {}

    def register(node):
        # Record the node itself before descending, so deeper duplicates win.
        name_to_node[node.callpath[-1]] = Node(node.nid, node.callpath, None)
        for child in node.children:
            register(child)

    for root in graph.roots:
        register(root)

    df['node'] = df['node'].apply(lambda name: name_to_node.get(name, ''))
    return df

In [23]:
def read_gf(name, base_dir=None):
    """Load one dataset's filtered and entire graph/dataframe into a State.

    The dataset directory must contain ``filter_graph.json``,
    ``entire_graph.json``, ``filter_df.csv`` and ``entire_df.csv``.

    Args:
        name: name of the dataset sub-directory.
        base_dir: directory that contains the dataset sub-directory. When
            omitted, the notebook-level ``dirname`` is used, so existing
            ``read_gf(name)`` calls behave as before.

    Returns:
        A ``State`` with ``gf``/``entire_gf`` (GraphFrames), ``graph``/
        ``entire_graph`` (their ``.graph`` attributes) and ``df``/``entire_df``
        (DataFrames whose ``'node'`` column went through
        ``replace_str_with_Node``).

    Raises:
        OSError: if one of the four files cannot be opened.
        json.JSONDecodeError: if a graph file is not valid JSON.
    """
    # os.path.join instead of manual '/' concatenation keeps the paths valid
    # regardless of trailing separators in the base directory.
    dataset_dir = os.path.join(dirname if base_dir is None else base_dir, name)

    def load_graphframe(filename):
        # Read a JSON graph literal and feed it to a new GraphFrame.
        with open(os.path.join(dataset_dir, filename), 'r') as graph_file:
            literal = json.load(graph_file)
        graphframe = GraphFrame()
        graphframe.from_literal(literal)
        return graphframe

    state = State()
    state.gf = load_graphframe('filter_graph.json')
    state.entire_gf = load_graphframe('entire_graph.json')

    state.df = pd.read_csv(os.path.join(dataset_dir, 'filter_df.csv'))
    state.entire_df = pd.read_csv(os.path.join(dataset_dir, 'entire_df.csv'))

    state.graph = state.gf.graph
    state.entire_graph = state.entire_gf.graph

    # Replace df['node'] from str to the Node object.
    state.df = replace_str_with_Node(state.df, state.graph)
    state.entire_df = replace_str_with_Node(state.entire_df, state.entire_graph)

    return state

In [24]:
# datasets = ["kripke-mvapich2", "kripke-openmpi"]
datasets = ['calc-pi', 'calc-pi-half','calc-pi-random-1']
# Map every dataset name to the State returned by read_gf; a comprehension
# avoids the unused enumerate index and leaves no loop temporaries behind.
states = {dataset_name: read_gf(dataset_name) for dataset_name in datasets}
print(states)

{'calc-pi': <state.State object at 0x1a196ac208>, 'calc-pi-half': <state.State object at 0x1a196ac1d0>, 'calc-pi-random-1': <state.State object at 0x1a196db0b8>}


# Code to make mpld3 (matplotlib-to-D3) work with NumPy arrays

In [25]:
class NumpyEncoder(json.JSONEncoder):
    """JSON encoder that also accepts NumPy arrays and NumPy scalars.

    ``numpy.ndarray`` values are emitted as (nested) lists and NumPy scalar
    types (``np.int64``, ``np.float32``, ``np.bool_``, ...) as the matching
    built-in Python value. Anything else is delegated to the base encoder,
    which raises ``TypeError`` for unsupported objects.
    """

    def default(self, obj):
        """Return a JSON-serializable equivalent of ``obj``."""
        import numpy as np
        if isinstance(obj, np.ndarray):
            return obj.tolist()
        # NumPy scalars are not all subclasses of the Python number types, so
        # the stock encoder rejects them; .item() yields the built-in value.
        if isinstance(obj, np.generic):
            return obj.item()
        return super().default(obj)
from mpld3 import _display
# Rebind the NumpyEncoder attribute of mpld3's _display module to the class
# defined in this cell. NOTE(review): presumably so mpld3 uses it when it
# serializes figures -- confirm against the installed mpld3 version.
_display.NumpyEncoder = NumpyEncoder

# Bland-Altman plot

In [26]:
# First dataset: mean of 'time (inc)' inside every 'time (inc)' group.
# NOTE(review): the grouping key and the averaged column are the same, so each
# printed line shows the same number twice -- confirm whether grouping by
# 'name' was intended here.
group_df = states[datasets[0]].df.groupby(['time (inc)'], sort=True)
for key, rows in group_df:
    print(key, rows['time (inc)'].mean())

# Second dataset: mean of 'time' inside every 'name' group.
group_df = states[datasets[1]].df.groupby(['name'])
for key, rows in group_df:
    print(key, rows['time'].mean())

173876.0 173876.0
197718.0 197718.0
221846.0 221846.0
244939.0 244939.0
293767.0 293767.0
298806.0 298806.0
310812.0 310812.0
316759.0 316759.0
316840.0 316840.0
317865.0 317865.0
376650.0 376650.0
377648.0 377648.0
921289.0 921289.0
940345.0 940345.0
945419.0 945419.0
975332.0 975332.0
999238.0 999238.0
999308.0 999308.0
999390.0 999390.0
1000306.0 1000306.0
162:MPIDI_CH3_Finalize 0.0
230:psm_dofinalize 0.0
294:MPID_Finalize 0.0
36:<unknown procedure> 0.0
62:MPI_Finalize 0.0
<program root> 0.0
<unknown file>:0 143646.91666666666
<unknown procedure> 0.0
PMPI_Finalize 0.0
main 0.0


In [13]:
# Per-'name' mean of the numeric columns of the second dataset.
# numeric_only=True spells out the column selection so the call also works on
# pandas versions that no longer drop non-numeric columns silently.
group_df = states[datasets[1]].df.groupby(['name']).mean(numeric_only=True)
# DataFrame.info() writes its report itself and returns None, so wrapping it
# in print() only appended a stray "None" to the output.
group_df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 10 entries, 162:MPIDI_CH3_Finalize to main
Data columns (total 13 columns):
Unnamed: 0        10 non-null float64
rank              10 non-null float64
time (inc)        10 non-null float64
time              10 non-null float64
nid               10 non-null float64
rank.1            10 non-null float64
line              10 non-null float64
n_index           10 non-null float64
mod_index         10 non-null float64
show_node         10 non-null bool
max_incTime       10 non-null float64
avg_incTime       10 non-null float64
imbalance_perc    10 non-null float64
dtypes: bool(1), float64(12)
memory usage: 1.0+ KB
None


In [27]:
def checkequivalence(gdf1, gdf2, col, catcol):
    """Align the per-group means of ``col`` from two groupby objects.

    Every group key found in either ``gdf1`` or ``gdf2`` gets one row in both
    results. A key missing from one side gets the value ``0`` on that side.
    Rows follow the iteration order of ``gdf1``, then the keys only present in
    ``gdf2``, so both results are aligned and the order is reproducible.

    Args:
        gdf1: first groupby object (iterable of ``(key, group)``, supporting
            ``get_group``).
        gdf2: second groupby object.
        col: column whose per-group mean is computed.
        catcol: unused; kept for interface compatibility.

    Returns:
        ``[data_df1, data_df2]``: two DataFrames with columns ``'name'`` and
        ``col``. Both are empty (but still carry those columns) when neither
        input has any group.
    """
    keys_1 = [key for key, _ in gdf1]
    keys_2 = [key for key, _ in gdf2]
    # Sets give O(1) membership tests; the lists keep a stable order.
    in_1 = set(keys_1)
    in_2 = set(keys_2)

    data1 = {}
    data2 = {}
    # dict.fromkeys de-duplicates while preserving first-seen order.
    for func in dict.fromkeys(keys_1 + keys_2):
        if func in in_1 and func in in_2:
            print('Exist')
        elif func in in_2:
            print('Not in df1')
        else:
            print('Not in df2')
        data1[func] = gdf1.get_group(func)[col].mean() if func in in_1 else 0
        data2[func] = gdf2.get_group(func)[col].mean() if func in in_2 else 0

    # Passing columns= up front also yields a well-formed frame for empty
    # input, where assigning two column names afterwards would raise.
    data_df1 = pd.DataFrame(list(data1.items()), columns=['name', col])
    data_df2 = pd.DataFrame(list(data2.items()), columns=['name', col])

    return [data_df1, data_df2]

In [28]:
def bland_altman_plot(df1, df2, col, catcol, *args, **kwargs):
    """Draw a Bland-Altman plot of the per-'name' means of ``col``.

    Both frames are grouped by ``'name'`` and aligned with
    ``checkequivalence``. For every name the mean of the two values goes on
    the x axis and their difference (``df1 - df2``) on the y axis. Dashed
    horizontal lines mark the mean difference and the mean difference
    +/- 1.96 standard deviations of the differences.

    Args:
        df1: first DataFrame; needs the columns ``'name'`` and ``col``.
        df2: second DataFrame with the same columns.
        col: numeric column to compare.
        catcol: only forwarded to ``checkequivalence``; not used for coloring.
        *args, **kwargs: forwarded to the scatter call.

    Returns:
        None. The figure is shown through ``mpld3.show()``.
    """
    gdf1 = df1.groupby(['name'])
    gdf2 = df2.groupby(['name'])
    aligned_1, aligned_2 = checkequivalence(gdf1, gdf2, col, catcol)

    data1 = np.asarray(aligned_1[col])
    data2 = np.asarray(aligned_2[col])
    print(data1, data2)
    # Plain Python list so the tooltip labels serialize without a custom encoder.
    labels = np.asarray(aligned_1['name']).tolist()
    mean      = np.mean([data1, data2], axis=0)
    diff      = data1 - data2                   # Difference between data1 and data2
    md        = np.mean(diff)                   # Mean of the difference
    sd        = np.std(diff, axis=0)            # Standard deviation of the difference
    print(mean, diff)

    fig, ax = plt.subplots()

    # The reference lines below are horizontal at *difference* values, so the
    # differences belong on the y axis and the means on the x axis.
    scatter = ax.scatter(mean, diff, *args, **kwargs)
    ax.axhline(md,           color='gray', linestyle='--')
    ax.axhline(md + 1.96*sd, color='gray', linestyle='--')
    ax.axhline(md - 1.96*sd, color='gray', linestyle='--')

    ax.grid(color='#aaaaaa', linestyle='solid')
    ax.set_title("Bland-altman plot", size=20)
    ax.set_xlabel('Mean of the two datasets ({})'.format(col))
    ax.set_ylabel('Difference between the two datasets ({})'.format(col))

    tooltip = mpld3.plugins.PointLabelTooltip(scatter, labels=labels)
    mpld3.plugins.connect(fig, tooltip)

    # NOTE(review): mpld3.show() serves the figure over a local web server;
    # inside a notebook mpld3.display() may be the better fit -- confirm.
    mpld3.show()

In [None]:
# Compare the 'time' column of the first two datasets' entire dataframes.
bland_altman_plot(
    df1=states[datasets[0]].entire_df,
    df2=states[datasets[1]].entire_df,
    col='time',
    catcol='name',
)

Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
Exist
[     0.    157599.375      0.         0.     26982.125      0.
      0.         0.         0.         0.         0.         0.
      0.         0.         0.         0.         0.         0.
      0.         0.   ] [    0.     78799.6875     0.         0.     13491.0625     0.
     0.         0.         0.         0.         0.         0.
     0.         0.         0.         0.         0.         0.
     0.         0.    ]
[     0.      118199.53125      0.           0.       20236.59375
      0.           0.           0.           0.           0.
      0.           0.           0.           0.           0.
      0.           0.           0.           0.           0.     ] [    0.     78799.6875     0.         0.     13491.0625     0.
     0.         0.         0.         0.         0.         0.
     0.         0.         0.         0.         0.         0.
     0.

127.0.0.1 - - [21/Aug/2019 13:52:52] "GET / HTTP/1.1" 200 -
127.0.0.1 - - [21/Aug/2019 13:52:52] "GET /d3.js HTTP/1.1" 200 -
127.0.0.1 - - [21/Aug/2019 13:52:53] "GET /mpld3.js HTTP/1.1" 200 -
