In [1]:
import sys
from glob import glob

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np

import thicket as th
import os

pd.set_option("display.max_rows", None)
pd.set_option("display.max_columns", None)



In [34]:
# Sanity check: list every expected Caliper output file that is not on disk.
curr_paths = glob("caliper_files/*/*.cali")
print(len(curr_paths))

# Normalise separators on both sides so the comparison does not depend on the
# platform's path separator (hard-coded backslashes only match on Windows).
found_paths = {os.path.normpath(path) for path in curr_paths}

# One expected file per (input type, process count, input size) combination.
all_paths = [
    os.path.join("caliper_files", input_type, f"p{num_procs}-a{input_size}-{input_type}.cali")
    for input_type in ["1_perc_perturbed", "Random", "ReverseSorted", "Sorted"]
    for num_procs in [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]
    for input_size in [65536, 262144, 1048576, 4194304, 16777216, 67108864, 268435456]
]

# Print each expected path that glob did not find.
for fpath in all_paths:
    if fpath not in found_paths:
        print(fpath)

# for key, value in tkRandom.metadata.items():
#     if (key == "launchdate" 
#         or key == "libraries"
#         or key == "cmdline"
#         or key == "cluster"
#         or key == "algorithm"
#         or key == "programming_model"
#         or key == "data_type"
#         or key == "size_of_data_type"
#         or key == "input_size"
#         or key == "input_type"
#         or key == "num_procs"
#         or key == "scalability"
#         or key == "group_num"
#         or key == "implementation_source"):
#         print(f"{key}: ", end="")
#         if (key == "libraries"):
#             print(value.iloc[0][0] + " + more...")
#         else:
#             print(value.iloc[0])

Read all files

In [None]:
# Build one Thicket per input ordering from the Caliper files in that
# ordering's directory.
sorted_cali_files = glob("caliper_files/Sorted/*.cali")
tkSorted = th.Thicket.from_caliperreader(sorted_cali_files)

random_cali_files = glob("caliper_files/Random/*.cali")
tkRandom = th.Thicket.from_caliperreader(random_cali_files)

reverse_cali_files = glob("caliper_files/ReverseSorted/*.cali")
tkReverse = th.Thicket.from_caliperreader(reverse_cali_files)

perturbed_cali_files = glob("caliper_files/1_perc_perturbed/*.cali")
tkPerturbed = th.Thicket.from_caliperreader(perturbed_cali_files)
                

In [36]:

# print(tkSorted.tree(metric_column="Avg time/rank", precision=5))
# print(tkRandom.tree(metric_column="Avg time/rank", precision=5))
# print(tkReverse.tree(metric_column="Avg time/rank", precision=5))
# print(tkPerturbed.tree(metric_column="Avg time/rank", precision=5))

Group performance data by `num_procs` and `input_size` from the Thicket metadata table.

In [37]:
# Re-index each Thicket's performance table by (node, num_procs, input_size).
# The Thickets are handled in the same order as before:
# Random, Sorted, Reverse, Perturbed.
for _tk in (tkRandom, tkSorted, tkReverse, tkPerturbed):
    # Both columns are requested before re-indexing because set_index below
    # uses them as index levels.
    _tk.metadata_column_to_perfdata("num_procs")
    _tk.metadata_column_to_perfdata("input_size")

    _tk.dataframe = (
        _tk.dataframe
        .reset_index()
        .set_index(["node", "num_procs", "input_size"])
        .sort_index()
    )

In [None]:
# Legend entries, listed in the order plot_mplRandom draws its three lines.
leg = ["MIN", "AVG", "MAX"]

# Process counts used as x positions and x tick labels by the plots below.
processes = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

def plot_mplRandom(df, df2, df3, title, xlabel, ylabel):
    """Draw three series against the process counts for input size 268435456.

    Columns of ``df`` other than 268435456 are skipped, so at most one figure
    is produced per call.

    Parameters
    ----------
    df, df2, df3 :
        Frames with one column per input size. Their 268435456 columns are
        drawn in this order and labelled with the module-level ``leg``.
    title :
        Text placed at the start of the figure title.
    xlabel, ylabel :
        Axis labels.
    """
    frames = (df, df2, df3)
    for input_size in df.columns:
        if input_size == 268435456:
            fig, ax = plt.subplots(figsize=(15, 7))
            for frame in frames:
                ax.plot(processes, frame[input_size].values, marker='o')

            heading = title + ", Random, Input size: " + str(input_size)
            ax.set_title(heading)
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            ax.legend(leg)
            # Log-2 x axis with one tick per process count, labelled with the
            # plain number.
            ax.set_xscale("log", base=2)
            ax.set_xticks(processes)
            ax.set_xticklabels(processes)
            plt.show()
        
        
# Min / avg / max time per rank for each region of interest.
for node in tkRandom.graph.traverse():
    # Skip regions that are not plotted instead of stopping the traversal:
    # with `break`, the first non-listed node would prevent any later listed
    # region from being reached.
    if node.frame["name"] not in ['main', 'comp_large', 'comm']:
        continue
    plot_mplRandom(
        df=tkRandom.dataframe.loc[node, "Min time/rank"].unstack(level="input_size"),
        df2=tkRandom.dataframe.loc[node, "Avg time/rank"].unstack(level="input_size"),
        df3=tkRandom.dataframe.loc[node, "Max time/rank"].unstack(level="input_size"),
        title=node.frame["name"],
        xlabel="Processes",
        ylabel="Time (seconds)",
    )

def plot_mplRandomOne(df, title, xlabel, ylabel, typ):
    """Draw a single series against the process counts for input size 268435456.

    Columns of ``df`` other than 268435456 are skipped. The x positions come
    from the module-level ``processes`` list.

    Parameters
    ----------
    df :
        Frame with one column per input size.
    title, typ :
        Joined with a space to form the start of the figure title.
    xlabel, ylabel :
        Axis labels.
    """
    for input_size in df.columns:
        if input_size == 268435456:
            fig, ax = plt.subplots(figsize=(15, 7))
            ax.plot(processes, df[input_size], marker='o')

            heading = title + " " + typ + ", Random, Input size: " + str(input_size)
            ax.set_title(heading)
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            # Log-2 x axis with one plainly labelled tick per process count.
            ax.set_xscale("log", base=2)
            ax.set_xticks(processes)
            ax.set_xticklabels(processes)
            plt.show()
    
    
# Single-metric plots for the regions of interest: "Total time" is plotted for
# every region first, then "Variance time/rank" for every region.
for _column, _typ in (("Total time", "Total Time"), ("Variance time/rank", "Variance")):
    for node in tkRandom.graph.traverse():
        if node.frame["name"] in ['main', 'comp_large', 'comm']:
            plot_mplRandomOne(
                df=tkRandom.dataframe.loc[node, _column].unstack(level="input_size"),
                title=node.frame["name"],
                xlabel="Processes",
                ylabel="Time (seconds)",
                typ=_typ,
            )
    
    
def plot_mplRandomSpeedup(df, title, xlabel, ylabel, typ):
    """Plot the ratio of the row labelled 2 to every row, for input size 268435456.

    Columns of ``df`` other than 268435456 are skipped. The x positions come
    from the module-level ``processes`` list.

    Parameters
    ----------
    df :
        Frame with one column per input size; it must contain a row labelled 2,
        which serves as the numerator of the ratio.
    title, typ :
        Joined with a space to form the start of the figure title.
    xlabel, ylabel :
        Axis labels.
    """
    two_proc_row = df.loc[2]

    for input_size in df.columns:
        if input_size == 268435456:
            ratio = two_proc_row[input_size] / df[input_size]

            fig, ax = plt.subplots(figsize=(15, 7))
            ax.plot(processes, ratio, marker='o')
            ax.set_title(title + " " + typ + " Random, Input size: " + str(input_size))
            ax.set_xlabel(xlabel)
            ax.set_ylabel(ylabel)
            # Log-2 x axis with one plainly labelled tick per process count.
            ax.set_xscale("log", base=2)
            ax.set_xticks(processes)
            ax.set_xticklabels(processes)
            plt.show()
    
# Speedup of the maximum per-rank time for each region of interest.
for node in tkRandom.graph.traverse():
    if node.frame["name"] in ('main', 'comp_large', 'comm'):
        max_time_by_size = tkRandom.dataframe.loc[node, "Max time/rank"].unstack(level="input_size")
        plot_mplRandomSpeedup(
            df=max_time_by_size,
            title=node.frame["name"],
            xlabel="Processes",
            ylabel="Speedup",
            typ="Speedup",
        )
    

**For comp_large, comm, main: Strong scaling plots for each input_size with lines for input_type (7 plots - 4 lines each)**

In [None]:
leg = ["Random", "Sorted", "Reverse", "Perturbed"]
processes = [2, 4, 8, 16, 32, 64, 128, 256, 512, 1024]

def plot_comparison(df_random, df_sorted, df_reverse, df_perturbed, title, xlabel, ylabel):
    """Plot the four input types against each other, one figure per input size.

    For every column of ``df_random`` a figure with four lines (Random, Sorted,
    Reverse, Perturbed) is drawn against the module-level ``processes`` list,
    saved to ``main_graphs/{title}_input_size_{input_size}.png``, shown, and
    closed.

    Parameters
    ----------
    df_random, df_sorted, df_reverse, df_perturbed :
        Frames with one column per input size. Every column label of
        ``df_random`` must also exist in the other three frames.
    title :
        Start of the figure title; also used in the output file name.
    xlabel, ylabel :
        Axis labels.
    """
    # savefig does not create missing directories, so make sure the output
    # directory exists before the first figure is written.
    os.makedirs("main_graphs", exist_ok=True)

    series = (
        (df_random, 'o', 'Random'),
        (df_sorted, 's', 'Sorted'),
        (df_reverse, '^', 'Reverse'),
        (df_perturbed, 'D', 'Perturbed'),
    )

    for input_size in df_random.columns:
        fig, ax = plt.subplots(figsize=(15, 7))

        # One line per input type, each with its own marker and legend label.
        for frame, marker, label in series:
            ax.plot(processes, frame[input_size].values, marker=marker, label=label)

        ax.set_title(f"{title}, Input size: {input_size}")
        ax.set_xlabel(xlabel)
        ax.set_ylabel(ylabel)
        ax.legend()
        ax.set_xscale("log", base=2)
        ax.set_xticks(processes)
        ax.set_xticklabels(processes)
        ax.grid(True, which="both", ls="-", alpha=0.2)

        filename = f"main_graphs/{title}_input_size_{input_size}.png"
        fig.savefig(filename)  # The extension selects the output format (.png here).

        # Save before showing, then close so figures do not pile up across
        # the loop.
        plt.show()
        plt.close(fig)
        

# Draw the input-type comparison for every region of interest.
# `node`, `node_sorted`, `node_reverse` and `node_perturbed` keep their names
# because they stay defined after the loop and are read again further down.
for node in tkRandom.graph.traverse():
    region = node.frame["name"]
    if region not in ['main', 'comp_large', 'comm']:
        continue

    # Each Thicket has its own graph, so find the node with the same region
    # name in each of the other three.
    node_sorted = next(n for n in tkSorted.graph.traverse() if n.frame["name"] == region)
    node_reverse = next(n for n in tkReverse.graph.traverse() if n.frame["name"] == region)
    node_perturbed = next(n for n in tkPerturbed.graph.traverse() if n.frame["name"] == region)

    # Average time per rank, with one column per input size, for each variant.
    avg_frames = {
        "df_random": tkRandom.dataframe.loc[node, "Avg time/rank"].unstack(level="input_size"),
        "df_sorted": tkSorted.dataframe.loc[node_sorted, "Avg time/rank"].unstack(level="input_size"),
        "df_reverse": tkReverse.dataframe.loc[node_reverse, "Avg time/rank"].unstack(level="input_size"),
        "df_perturbed": tkPerturbed.dataframe.loc[node_perturbed, "Avg time/rank"].unstack(level="input_size"),
    }

    plot_comparison(
        title=region,
        xlabel="Processes",
        ylabel="Time (seconds)",
        **avg_frames,
    )

**Strong scaling speedup plot for each input_type (4 plots)**

In [None]:
def plot_strong_scaling_speedup(df_random, df_sorted, df_reverse, df_perturbed, processes):
    """Draw a 2x2 grid of speedup curves, one panel per input type.

    In each panel, every column of that panel's frame becomes one curve: the
    column's first value divided by each of its values, plotted against
    ``processes`` on log-2 axes with an equal aspect ratio.

    Parameters
    ----------
    df_random, df_sorted, df_reverse, df_perturbed :
        Frames with one column per input size. Rows are assumed to follow the
        order of ``processes`` -- TODO confirm against the caller.
    processes :
        x positions and tick labels, one per row of each frame.

    Notes
    -----
    Curve colours are sampled from viridis using the number of columns in
    ``df_random``; the other frames index into that same palette.
    """
    fig, axes = plt.subplots(2, 2, figsize=(20, 16))
    fig.suptitle('Strong Scaling Speedup by Input Type', fontsize=16)

    # Work with a flat sequence of the four axes.
    axes = axes.flatten()

    panels = (
        (df_random, 'Random Input'),
        (df_sorted, 'Sorted Input'),
        (df_reverse, 'Reverse Input'),
        (df_perturbed, 'Perturbed Input'),
    )

    # One colour per input size.
    palette = plt.cm.viridis(np.linspace(0, 1, len(df_random.columns)))

    for panel_index, (frame, panel_title) in enumerate(panels):
        ax = axes[panel_index]

        for i, input_size in enumerate(frame.columns):
            times = frame[input_size]
            first_time = times.iloc[0]  # Value in the first row (the baseline).
            speedup = [first_time / t for t in times]

            ax.plot(processes, speedup, marker='o', label=f'N={input_size}',
                    color=palette[i], linewidth=2, markersize=8)

            # Disabled: ideal-speedup reference line, p / processes[0],
            # drawn once per panel as a dashed black line.

        # Panel cosmetics.
        ax.set_title(panel_title, fontsize=14, pad=10)
        ax.set_xlabel('Number of Processes', fontsize=12)
        ax.set_ylabel('Speedup', fontsize=12)
        ax.set_xscale('log', base=2)
        ax.set_yscale('log', base=2)
        ax.grid(True, which="both", ls="-", alpha=0.2)
        ax.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=10)

        ax.set_xticks(processes)
        ax.set_xticklabels(processes, rotation=45)

        # Same scale on both axes.
        ax.set_aspect('equal', adjustable='box')

    plt.tight_layout()
    plt.show()

# All four panels must show the same call-tree region. Look the node up by name
# in each Thicket instead of reusing `node` / `node_sorted` / ... left behind by
# an earlier loop: after that loop, `node` is the last node of the traversal,
# which need not be the region the other three variables refer to.
# NOTE(review): "main" is chosen here; 'comp_large' and 'comm' are the other
# regions plotted in this notebook -- confirm which one this figure should show.
speedup_region = "main"

speedup_frames = {}
for _arg_name, _tk in (
    ("df_random", tkRandom),
    ("df_sorted", tkSorted),
    ("df_reverse", tkReverse),
    ("df_perturbed", tkPerturbed),
):
    _region_node = next(n for n in _tk.graph.traverse() if n.frame["name"] == speedup_region)
    speedup_frames[_arg_name] = _tk.dataframe.loc[_region_node, "Avg time/rank"].unstack(level="input_size")

plot_strong_scaling_speedup(processes=processes, **speedup_frames)

In [None]:
def plot_weak_scaling(df, input_type, processes, node_name):
    """Plot an approximate weak-scaling curve from a grid of measured runs.

    The work per process is taken from the smallest run
    (``min(df.columns) / min(processes)``). For every process count ``p`` the
    ideal input size is that value times ``p``, and the measurement used is the
    one whose input size is closest to the ideal. When the available input sizes
    are spaced more coarsely than the process counts, neighbouring process
    counts can map to the same input size, so the curve is only an
    approximation of constant work per process.

    Parameters
    ----------
    df :
        Frame indexed by process count with one column per input size.
    input_type, node_name :
        Used in the figure title only.
    processes :
        Process counts to plot; each must be a row label of ``df``.
    """
    input_sizes = df.columns
    input_size_per_proc = min(input_sizes) / min(processes)

    times = []
    for num_procs in processes:
        target_size = input_size_per_proc * num_procs
        # The nearest size is picked from df's own columns, so the lookup below
        # always finds a column; no missing-column fallback is needed.
        nearest_size = min(input_sizes, key=lambda size: abs(size - target_size))
        times.append(df[nearest_size].loc[num_procs])

    fig, ax = plt.subplots(figsize=(10, 6))
    ax.plot(processes, times, marker='o', label='Weak Scaling')

    ax.set_title(f"Weak Scaling - {input_type} - {node_name}")
    ax.set_xlabel("Number of Processes")
    ax.set_ylabel("Average Time per Rank (seconds)")
    ax.set_xscale('log', base=2)
    ax.set_xticks(processes)
    # Show tick labels as plain numbers rather than powers of two.
    ax.get_xaxis().set_major_formatter(plt.ScalarFormatter())
    ax.legend()
    ax.grid(True)
    plt.tight_layout()
    plt.show()

# Weak-scaling plot for every (input type, region of interest) pair.
thickets_by_input_type = (
    ('Random', tkRandom),
    ('Sorted', tkSorted),
    ('Reverse', tkReverse),
    ('Perturbed', tkPerturbed),
)
for input_type, tk in thickets_by_input_type:
    for node in tk.graph.traverse():
        if node.frame["name"] in ('main', 'comp_large', 'comm'):
            node_name = node.frame["name"]
            avg_time_by_size = tk.dataframe.loc[node, "Avg time/rank"].unstack(level="input_size")
            # Select the rows in the order given by `processes`.
            df = avg_time_by_size.loc[processes]

            plot_weak_scaling(df, input_type, processes, node_name)

In [None]:
def plot_strong_scaling_speedup(df, input_type, processes, node_name):
    """Plot strong-scaling speedup curves, one per input size, on a single figure.

    For each column the speedup is ``T(p0) / T(p) * p0``, where ``p0`` is
    ``processes[0]`` and ``T(p0)`` is the column's first row. Multiplying by
    ``p0`` expresses the curve relative to one process under the assumption
    that the baseline run itself scaled ideally.

    Note: this definition reuses the name of the four-frame
    ``plot_strong_scaling_speedup`` defined earlier in the notebook and
    replaces it once this cell runs.

    Parameters
    ----------
    df :
        Frame with one column per input size whose rows follow the order of
        ``processes`` (row 0 must be the ``processes[0]`` run).
    input_type, node_name :
        Used in the figure title only.
    processes :
        Process counts; x positions and tick locations.
    """
    input_sizes = sorted(df.columns)
    fig = plt.figure(figsize=(15,7), facecolor=(1, 1, 1))
    ax = fig.add_subplot(1, 1, 1)

    colors = plt.cm.viridis(np.linspace(0, 1, len(input_sizes)))

    # Process count of the baseline run. This was previously a hard-coded 2,
    # which is only right when the smallest run used two processes.
    baseline_procs = processes[0]

    for i, input_size in enumerate(input_sizes):
        baseline = df[input_size].iloc[0]  # Time of the baseline (first-row) run.
        speedup = (baseline / df[input_size]) * baseline_procs

        ax.plot(processes, speedup, marker='o', label=f'N={input_size}',
                color=colors[i], linewidth=2, markersize=6)

    ax.set_title(f"Strong Scaling Speedup - {input_type} - {node_name}")
    ax.set_xlabel("Number of Processes")
    ax.set_ylabel("Speedup")
    ax.set_xscale('log', base=2)
    # Log-2 y axis as well, matching the x axis.
    ax.set_yscale('log', base=2)
    ax.set_xticks(processes)
    # Show tick labels as plain numbers rather than powers of two.
    ax.get_xaxis().set_major_formatter(plt.ScalarFormatter())
    ax.legend()
    ax.grid(True, which="both", ls="-", alpha=0.3)
    plt.tight_layout()
    plt.show()

# Strong-scaling speedup plot for every (input type, region of interest) pair.
thickets_by_input_type = (
    ('Random', tkRandom),
    ('Sorted', tkSorted),
    ('Reverse', tkReverse),
    ('Perturbed', tkPerturbed),
)
for input_type, tk in thickets_by_input_type:
    for node in tk.graph.traverse():
        if node.frame["name"] in ('main', 'comp_large', 'comm'):
            node_name = node.frame["name"]
            avg_time_by_size = tk.dataframe.loc[node, "Avg time/rank"].unstack(level="input_size")
            # Select the rows in the order given by `processes`.
            df = avg_time_by_size.loc[processes]

            plot_strong_scaling_speedup(df, input_type, processes, node_name)