# Visualization of results
## Dependencies

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import seaborn as sns
import glob
import inspect
import os
import csv

## Utility methods

In [None]:
# Define a method to compute the geometric mean
def geometric_mean(x):
    """Return the geometric mean of `x`, computed as exp(mean(log(x)))."""
    log_values = np.log(x)
    return np.exp(log_values.mean())

# Define a method to compute the average
def average(x):
    """Return the arithmetic mean of `x` by delegating to its own ``mean`` method."""
    mean_value = x.mean()
    return mean_value

def plot_distribution_by_algorithm(df, algorithms_list, fields, filter_condition_1=None, filter_condition_2=None, fontsize=13, num_bins=30):
    """Plot log-scaled histograms of `fields` per algorithm and print run statistics.

    For every algorithm one figure with one subplot per field is created, saved
    to ``./{algorithm}/dist_{algorithm}.pdf`` and shown. Each subplot contains
    the histogram of all runs and, optionally, of the runs selected by the
    filter conditions.

    Args:
        df: DataFrame with at least the columns 'algorithm', 'time', 'memory',
            'is_exact', 'mincut', 'naive_mincut' and every entry of `fields`.
        algorithms_list: Names of the algorithms to plot.
        fields: Column names to draw a histogram for.
        filter_condition_1: Optional callable mapping the per-algorithm subset
            to a boolean mask; the selected runs form the second histogram.
        filter_condition_2: Optional callable like `filter_condition_1` for a
            third histogram. It is only used when `filter_condition_1` is set.
        fontsize: Font size of titles and axis labels.
        num_bins: Number of logarithmic bin edges.
    """
    for algorithm in algorithms_list:
        subset = df[df['algorithm'] == algorithm]

        # The second filter is only honoured together with the first one
        filtered_subset_1 = None
        filtered_subset_2 = None
        if filter_condition_1:
            filtered_subset_1 = subset[filter_condition_1(subset)]
            if filter_condition_2:
                filtered_subset_2 = subset[filter_condition_2(subset)]

        # squeeze=False keeps `axes` two-dimensional even for a single field
        fig, axes = plt.subplots(1, len(fields), figsize=(16, 4), squeeze=False)

        for ax, field in zip(axes[0], fields):
            # Logarithmic bins are only defined for positive values; zeros,
            # negative values and NaN must not determine the bin range
            positive_values = subset[field][subset[field] > 0]

            if not positive_values.empty:
                log_bins = np.logspace(np.log10(positive_values.min() / 1.1), np.log10(positive_values.max() * 1.1), num=num_bins)

                # Plot first histogram (original subset)
                ax.hist(subset[field], bins=log_bins, color='lightblue', alpha=0.7, edgecolor='darkblue', label='All Runs')

                # Plot second histogram (filtered subset)
                if filtered_subset_1 is not None:
                    ax.hist(filtered_subset_1[field], bins=log_bins, color='lightgreen', alpha=0.7, edgecolor='darkgreen', label='Filtered Runs')

                # Plot third histogram (second filtered subset)
                if filtered_subset_2 is not None:
                    ax.hist(filtered_subset_2[field], bins=log_bins, color='lightpink', alpha=0.7, edgecolor='darkred', label='Second Filtered Runs')

                # Add legend to differentiate between the histograms
                ax.legend()

            # Set plot properties
            ax.set_xscale('log')  # Set x-axis to logarithmic scale
            ax.set_title(f'{field} distribution for {algorithm}', fontsize=fontsize)
            ax.set_xlabel(f'{field} (log scale)', fontsize=fontsize)
            ax.set_ylabel('Number of Runs', fontsize=fontsize)
            ax.grid(True, linestyle=':')

        num_total_runs = len(subset)

        mincut = subset['mincut']
        # Runs that finished with a cut strictly better than the naive one
        beats_naive = (mincut < subset['naive_mincut']) & mincut.notnull()

        relative_stats = {
            'num_runs_less_1_sec': (subset['time'] <= 1).sum(),
            'num_runs_more_1_min': (subset['time'] >= 60).sum(),
            #'num_runs_timeout': (subset['time'] >= 2 * 60 * 60).sum(),
            'num_runs_exact': (subset['is_exact'] == 1).sum(),
            'num_runs_!naive': beats_naive.sum(),
            # The comparison needs its own parentheses: `&` binds tighter than `!=`
            'num_runs_!naive_&_!0': (beats_naive & (mincut != 0)).sum(),
            'num_runs_aborted': mincut.isnull().sum()
        }

        print(f"#################### {algorithm} ####################")
        print(f"num_total_runs: \t\t {num_total_runs}")
        for field in ['time', 'memory']:
            print(f"avg_total_{field}: \t\t {average(subset[field]):.3f}")
            print(f"geo_mean_total_{field}: \t\t {geometric_mean(subset[field]):.3f}")
        for name, stat in relative_stats.items():
            print(f"{name}: \t\t {stat} \t ({stat/num_total_runs * 100:.3f}%)")

        fig.tight_layout()
        fig.savefig(f'./{algorithm}/dist_{algorithm}.pdf')
        plt.show()

def plot_reduction_by_algorithm(algorithms_list, num_rounds=None, hypergraph_list=[], num_suffix_param_columns=4, fontsize=13):
    """Plot the reduction progress of each algorithm and print summary statistics.

    For every algorithm the file ``./{algorithm}/all_reductions.csv`` is read
    (semicolon separated, rows may differ in length). One figure per algorithm
    is created, saved to ``./{algorithm}/reduction_{algorithm}.pdf`` and shown:

    * top row: percentage of remaining edges / nodes after each reduction step,
    * middle row: time per reduction step (logarithmic y-axis),
    * bottom row: one bar chart per metric (edges, nodes, time) with the
      average effect of every reduction rule.

    Algorithms without such a file are reported and skipped.

    Row layout as it is read by the code below: columns 0 and 1 hold the
    initial number of edges and nodes, followed by one (edges, nodes, time)
    triple per reduction step, followed by `num_suffix_param_columns` trailing
    columns. The first trailing column is treated as the base solver time and
    the third-to-last column is matched against `hypergraph_list`.

    Args:
        algorithms_list: Names of the algorithms (= result directories).
        num_rounds: Optional limit used to cut off the reduction columns;
            None keeps all of them.
        hypergraph_list: If non-empty, only rows whose third-to-last column is
            contained in this list are kept. The default list is only read,
            never modified.
        num_suffix_param_columns: Number of trailing non-reduction columns.
        fontsize: Font size of titles and axis labels.
    """
    # Create plots for each algorithm
    for algorithm in algorithms_list:    
        # Define the path of the "all_reductions.csv" file
        all_reductions_path = f'./{algorithm}/all_reductions.csv'
        # Store the number of runs where no reduction rule was applied (i.e. naive mincut is already 0)
        num_runs_instant_exit = 0
        # Get the number of reductions in the preprocessing
        num_reductions_preprocessing = 1
        # Get the number of reductions per round
        # (4 rules if the algorithm name contains "_IT0", otherwise 5)
        num_reductions_per_round = 4 if "_IT0" in algorithm else 5

        if os.path.exists(all_reductions_path):
            with open(all_reductions_path, 'r') as file:
                reader = csv.reader(file, delimiter=';')
                # Store all rows of the CSV file
                all_rows = [row for row in reader]
                # Find the maximum number of columns in any row
                max_num_columns = max(len(row) for row in all_rows)
                # Cut off some reductions if a limit is set
                # NOTE(review): the first summand inside the parentheses is num_reductions_per_round,
                # not num_reductions_preprocessing -- confirm that this is intended
                if num_rounds is not None:
                    max_num_columns = min(max_num_columns, 2 + 3 * (num_reductions_per_round + num_rounds * num_reductions_per_round) + num_suffix_param_columns)
                # The rows can have different number of columns
                # Adjust each row so that the last three columns are aligned
                aligned_rows = []
                for row in all_rows:
                    # Ignore those rows where no reduction rule was applied (i.e. naive mincut is already 0)
                    if row[2] == '':
                        num_runs_instant_exit += 1
                        continue
                    # Align the row if necessary
                    if max_num_columns == len(row):
                        aligned_rows.append(row)
                    elif max_num_columns < len(row):
                        # Too long: drop the surplus reduction columns but keep the trailing columns
                        aligned_rows.append(row[:(max_num_columns-num_suffix_param_columns)] + row[-num_suffix_param_columns:])
                    else:
                        # Too short: pad with NA triples and repeat the row's final (edges, nodes) with an
                        # NA time, so that the final sizes sit directly in front of the trailing columns
                        aligned_rows.append(row[:-num_suffix_param_columns] + [pd.NA, pd.NA, pd.NA] * ((max_num_columns - len(row))//3 - 1) + [row[-num_suffix_param_columns-3], row[-num_suffix_param_columns-2], pd.NA] + row[-num_suffix_param_columns:])
            all_reductions_df = pd.DataFrame(aligned_rows)
            # Optionally restrict the rows to the requested hypergraphs (third-to-last column)
            if len(hypergraph_list) > 0:
                all_reductions_df = all_reductions_df[all_reductions_df.iloc[:, -3].isin(hypergraph_list)]
        else:
            print(f"File does not exist: {all_reductions_path}")
            continue

        # For the base_solver_time, replace empty fields with NaN and convert to a nullable float (= Float64)
        all_reductions_df[all_reductions_df.columns[-num_suffix_param_columns]] = all_reductions_df[all_reductions_df.columns[-num_suffix_param_columns]].replace('', pd.NA).astype('Float64')

        # Copy the dataframe to compute the relative reductions to the initial size (in percentages)
        relative_all_reductions_df = all_reductions_df.copy()

        # Create three figures (reduction of edges, reduction of nodes, time per reduction)
        fig = plt.figure(figsize=(16, 10))
        gs = gridspec.GridSpec(3, 6, height_ratios=[1, 0.5, 0.75])

        # Perform the following operations for the edges (i = 0) and for the vertices (i = 1)
        for i, metric in enumerate(['edges', 'nodes']):
            # Get the columns containing the reductions (including the initial number)
            columns = all_reductions_df.columns[i:i+1].tolist() + all_reductions_df.columns[2+i:-num_suffix_param_columns:3].tolist()

            # Divide each column by the initial size to get the relative size (in percentages)
            for col in columns: 
                all_reductions_df[col] = all_reductions_df[col].astype('Int64')
                relative_all_reductions_df[col] = (all_reductions_df[col] / all_reductions_df[columns[0]]) * 100    

            # Plot the relative reductions of all runs of the current algorithm
            # (edges on the left half of the top row, nodes on the right half)
            positions = [gs[0, :3], gs[0, 3:]]
            ax = fig.add_subplot(positions[i])
            for j in range(len(relative_all_reductions_df)):
                ax.plot(range(len(columns)), relative_all_reductions_df[columns].iloc[j], alpha=0.4)

            ax.set_title(f'% of remaining {metric} per reduction step for {algorithm}', fontsize=fontsize)
            ax.set_xlabel('reduction step', fontsize=fontsize)
            ax.set_ylabel(f'% of remaining {metric}', fontsize=fontsize)
            ax.set_ylim([-10, 110])
            ax.set_xlim([0, len(columns) - 1])
            ax.grid(True, linestyle=':')

        # Get the columns containing the reduction times
        columns = all_reductions_df.columns[4:-num_suffix_param_columns:3].tolist()
        # Plot the time of all runs of the current algorithm
        ax3 = fig.add_subplot(gs[1, :])
        for col in columns: 
            all_reductions_df[col] = all_reductions_df[col].astype('Float64')
        for j in range(len(all_reductions_df)):
            ax3.plot(range(1, len(columns) + 1), all_reductions_df[columns].iloc[j], alpha=0.4)  
        ax3.set_title(f'time per reduction step for {algorithm}', fontsize=fontsize)
        ax3.set_xlabel('reduction step', fontsize=fontsize)
        ax3.set_ylabel(f'time', fontsize=fontsize)
        ax3.set_xlim([1, len(columns)])
        ax3.set_yscale('log')
        ax3.grid(True, linestyle=':')

        # Compute the effectiveness of each reduction rule for all metrics
        # NB: For the metric time, the effectiveness is simply the time used
        for i, metric in enumerate(['edges', 'nodes', 'time']):
            effectiveness_list = []
            for j in range(num_reductions_per_round):
                # Get the base indices of each occurrence of the given reduction rule 
                # NB1: This is always the #edges after the reduction rule
                # NB2: The preprocessing triples are skipped; the rule then recurs every 3 * num_reductions_per_round columns
                base_indices = np.array(range(2 + 3 * (num_reductions_preprocessing + j), all_reductions_df.shape[1] - num_suffix_param_columns, 3 * num_reductions_per_round))
                # Compute the effectiveness (i.e. size after reduction rule relative to before when metric is edges or nodes, otherwise simply the time)
                # NB1: Now we can drop the first column
                # NB2: We need to make sure that we do not divide by zero
                if metric == 'time':
                    effectiveness_df = all_reductions_df.iloc[:, base_indices + i].astype('Float64')
                else:
                    # "base_indices + i - 3" addresses the same metric in the previous triple, i.e. the size before the rule
                    effectiveness_df = 100 * (1 - all_reductions_df.iloc[:, base_indices + i].astype('Int64').div(all_reductions_df.iloc[:, base_indices + i - 3].astype('Int64').replace(0, pd.NA).values))
                # Since we previously aligned the rows, we need to make sure that we only keep the columns which do not act as a placeholder (i.e. time is not NaN)
                validity_df = (all_reductions_df.iloc[:, base_indices + 2].notna())
                # Rename the mask columns so that `where` aligns them with the effectiveness columns
                validity_df.columns = effectiveness_df.columns                
                # NB: It is important to use 'float' instead of 'Float64' because afterwards we use np.round which needs np.nan and not pd.NA
                effectiveness_df = effectiveness_df.where(validity_df).astype('float')
                effectiveness_list.append(np.round(average(effectiveness_df.stack()), 3))

            # Plot the effectiveness of each reduction rule
            positions = [gs[2, :2], gs[2, 2:4], gs[2, 4:]]
            ax_eff = fig.add_subplot(positions[i])
            effectiveness_labels = ['no_heavy_edges', 'no_heavy_overlaps', 'no_shiftable_2-edges', 'no_triangle_2-edges']
            # Algorithms without "_IT0" in their name have label propagation as additional first rule
            if not "_IT0" in algorithm:
                effectiveness_labels.insert(0, 'label_propagation')
            bars = ax_eff.bar(effectiveness_labels, effectiveness_list, color='lightblue', alpha=0.7, edgecolor='darkblue')
            ax_eff.set_xlabel('reduction rule', fontsize=fontsize)

            if metric == 'time':
                ax_eff.set_title('average time usage (s) for each rule', fontsize=fontsize)
                ax_eff.set_ylabel('average time usage (s)', fontsize=fontsize)
            else:
                ax_eff.set_title(f'% of average {metric} reduction for each rule', fontsize=fontsize)
                ax_eff.set_ylabel(f'% of average {metric} reduction', fontsize=fontsize)
                ax_eff.set_ylim([0, 100])
                ax_eff.set_yticks(ticks=range(0, 101, 10))
            ax_eff.grid(True, linestyle=':')
            # Set font size and rotation of x tick labels
            for label in ax_eff.get_xticklabels():
                label.set_fontsize(10)
                label.set_rotation(10)

            # Add value labels on top of each bar
            for bar in bars:
                height = bar.get_height()
                text = ax_eff.text(bar.get_x() + bar.get_width()/2, height, f'{height:.2f}', ha='center', va='center', color='white', fontsize=8, weight='bold')
                text.set_bbox(dict(boxstyle='square,pad=0.2', facecolor='darkblue',edgecolor='darkblue'))

        # Count statistics; the final (edges, nodes) pair sits directly in front of the trailing columns
        relative_stats = {
            'num_runs_instant_exit': num_runs_instant_exit,
            'num_runs_max_reduced': ((all_reductions_df.iloc[:, -num_suffix_param_columns-3] == 0) | (all_reductions_df.iloc[:, -num_suffix_param_columns-2] == 1)).sum(),
            'num_runs_base_solver': (all_reductions_df.iloc[:, -num_suffix_param_columns].notna()).sum()
        }

        # NOTE(review): instant exits are counted before the hypergraph_list filter is applied,
        # so with a non-empty hypergraph_list the total mixes filtered and unfiltered rows -- confirm
        num_total_runs = len(all_reductions_df) + num_runs_instant_exit
        print(f"#################### {algorithm} ####################")
        print(f"num_total_runs: \t\t {num_total_runs}")
        for name, stat in relative_stats.items():
            print(f"{name}: \t\t {stat} \t ({stat/num_total_runs * 100:.3f}%)")
        # The base solver statistics only consider the rows with a base solver time
        print(f"base_solver_avg_edges: \t\t {average(all_reductions_df[all_reductions_df.iloc[:, -num_suffix_param_columns].notna()].iloc[:, -num_suffix_param_columns-3]):.3f}")
        print(f"base_solver_geo_mean_edges: \t {geometric_mean(all_reductions_df[all_reductions_df.iloc[:, -num_suffix_param_columns].notna()].iloc[:, -num_suffix_param_columns-3]):.3f}")
        print(f"base_solver_avg_nodes: \t\t {average(all_reductions_df[all_reductions_df.iloc[:, -num_suffix_param_columns].notna()].iloc[:, -num_suffix_param_columns-2]):.3f}")
        print(f"base_solver_geo_mean_nodes: \t {geometric_mean(all_reductions_df[all_reductions_df.iloc[:, -num_suffix_param_columns].notna()].iloc[:, -num_suffix_param_columns-2]):.3f}")
        print(f"base_solver_avg_time: \t\t {average(all_reductions_df[all_reductions_df.iloc[:, -num_suffix_param_columns].notna()].iloc[:, -num_suffix_param_columns]):.3f}")
        print(f"base_solver_geo_mean_time: \t {geometric_mean(all_reductions_df[all_reductions_df.iloc[:, -num_suffix_param_columns].notna()].iloc[:, -num_suffix_param_columns]):.3f}")

        plt.tight_layout()
        plt.savefig(f'./{algorithm}/reduction_{algorithm}.pdf')
        plt.show()

def plot_performance_profile(df, naive_mincuts_df, algorithms_list, objectives, xlogscales=None, xmaxvals=None, fontsize=13):
    """Plot one performance profile per objective and save them to a PDF.

    For every objective and every (hypergraph, seed) instance the best value
    over all algorithms in `algorithms_list` is determined. Each algorithm is
    then drawn as the curve of its sorted ratios to that best value. The figure
    is saved to ``./performance_plots.pdf`` and shown.

    Args:
        df: DataFrame with the columns 'algorithm', 'hypergraph', 'seed' and
            every entry of `objectives`.
        naive_mincuts_df: DataFrame whose length is used as a lower bound for
            the number of instances when normalising the y-axis; may be None.
        algorithms_list: Names of the algorithms to compare.
        objectives: Column names to create a profile for (smaller is better).
        xlogscales: Optional list of flags; a truthy entry switches the x-axis
            of the corresponding objective to a logarithmic scale.
        xmaxvals: Optional list of upper x-limits per objective; a falsy entry
            falls back to the largest observed ratio (at least 2).
        fontsize: Font size of titles, axis labels and the legend.
    """
    styles = [['blue', '--'], ['green', '-.'], ['orange', '--'], ['magenta', '-.'], ['red', '--'], ['purple', '-.'], ['brown', '--'], ['cyan', '--']]

    # squeeze=False keeps the axes array two-dimensional, so a single objective works as well
    fig, axes_grid = plt.subplots(1, len(objectives), figsize=(16, 4), squeeze=False)
    axes = axes_grid[0]

    # The naive mincuts are optional (the file might not exist)
    num_naive_instances = 0 if naive_mincuts_df is None else len(naive_mincuts_df)

    for i, objective in enumerate(objectives):
        ax = axes[i]

        # Compute the minimum objective per instance
        min_objective_per_instance = (
            df[df['algorithm'].isin(algorithms_list)]
            .groupby(['hypergraph', 'seed'])[objective]
            .min()
            .reset_index()
            .rename(columns={objective: 'min_objective'})
        )

        # Initialize the overall max ratio (used for the x limits of the plot)
        overall_max_ratio = 2

        for j, algorithm in enumerate(algorithms_list):
            subset = df[df['algorithm'] == algorithm]

            # Add the minimum objective to every instance of the subset
            subset = pd.merge(subset, min_objective_per_instance, on=['hypergraph', 'seed'], how='inner')

            # Compute the ratio to the best for each entry
            # NB: If the best value is 0, the ratio is 1 for a value of 0 and 1000 otherwise
            value = subset[objective]
            best = subset['min_objective']
            ratio_if_best_is_zero = np.where(value == 0, 1.0, 1000.0)
            subset['ratio_to_best'] = np.where(best == 0, ratio_if_best_is_zero, value / best)

            # Update the overall max ratio (if necessary)
            max_ratio = subset['ratio_to_best'].max(skipna=True)
            if max_ratio > overall_max_ratio:
                overall_max_ratio = max_ratio

            # Sort the values by the ratio
            subset = subset.sort_values(by='ratio_to_best').reset_index(drop=True)

            # Compute the sorted index (floating value ranging from 0 to 1)
            # NB: The denominator is clamped to 1 to avoid a division by zero for a single instance
            denominator = max(len(subset), num_naive_instances) - 1
            subset['ind'] = subset.index / max(denominator, 1)

            # Plot the algorithm to the performance profile
            # NB: The styles are reused cyclically if there are more algorithms than styles
            color, linestyle = styles[j % len(styles)]
            ax.plot(subset['ratio_to_best'], subset['ind'], label=algorithm, linewidth=3, linestyle=linestyle, color=color)

        # Set x-axis to logarithmic scale if requested for this objective
        if xlogscales and xlogscales[i]:
            ax.set_xscale('log')
        ax.set_title(f'{objective} performance profile', fontsize=fontsize)
        ax.set_xlabel(r'$\tau$', fontsize=fontsize)
        ax.set_xlim([0.99, xmaxvals[i] if xmaxvals and xmaxvals[i] else overall_max_ratio])
        ax.set_ylim([0, 1])
        ax.set_ylabel(r'% instances $\leq \tau \cdot$ best instance', fontsize=fontsize)
        ax.grid(True, linestyle=':')

    # Hide y-ticks for all but the first subplot
    for ax in axes[1:]:
        ax.yaxis.set_tick_params(labelleft=False)
        ax.set_ylabel('')

    # Collect legend handles and labels from the first subplot
    handles, labels = axes[0].get_legend_handles_labels()

    # Add legend to differentiate between the algorithms
    fig.legend(handles, labels, loc='lower center', ncol=len(algorithms_list), frameon=True, fontsize=fontsize)

    # Adjust layout to make space for the legend
    plt.tight_layout(rect=[0, 0.1, 1, 1])  # Leaves space at the bottom for the legend

    plt.savefig('./performance_plots.pdf')
    plt.show()


def plot_strong_scalability(df, algorithms_list, sequential_algorithm, fields, hypergraphs=None, fontsize=13):
    """Plot the strong scalability of the parallel versions of an algorithm.

    The sequential configurations are the entries of `algorithms_list` that
    start with `sequential_algorithm` and do not contain "parallel"; what
    follows the prefix is the parameter suffix. The matching parallel versions
    are named ``{sequential_algorithm}_parallel{suffix}_T{threads}``. For every
    field the geometric mean (first row) and the average (second row) are
    plotted against the number of threads. The sequential version is drawn at
    x = 0.5. The figure is saved to
    ``./strong_scalability_{sequential_algorithm}.pdf`` and shown.

    The parameter order follows the commented-out call of this function in the
    notebook.

    Args:
        df: DataFrame with the columns 'algorithm', 'hypergraph' and every
            entry of `fields`.
        algorithms_list: Names of all available algorithms.
        sequential_algorithm: Name prefix of the sequential algorithm.
        fields: Column names to plot.
        hypergraphs: Optional list of hypergraphs to restrict the data to.
        fontsize: Font size of titles and axis labels.
    """
    # Get the list of the different parameter configurations/suffixes of the sequential version of the algorithm
    parameter_suffix_list = [a.removeprefix(sequential_algorithm) for a in algorithms_list if a.startswith(sequential_algorithm) and "parallel" not in a]

    if not parameter_suffix_list:
        print("Could not find the sequential version of the algorithm")
        return

    # Filter for specific hypergraphs (if necessary)
    if hypergraphs:
        df = df[df['hypergraph'].isin(hypergraphs)]

    group_methods = [{"name": "geometric mean", "func": geometric_mean}, {"name": "average", "func": average}]

    # Create a grid of subplots (one row per group method, one column per field)
    # NB: squeeze=False keeps the axes array two-dimensional even for a single field
    fig, axes = plt.subplots(len(group_methods), len(fields), figsize=(16, 8), squeeze=False)

    for i, group_method in enumerate(group_methods):
        for j, field in enumerate(fields):
            ax = axes[i, j]
            for param_suffix in parameter_suffix_list:
                # Build the common name for the parallel versions of the algorithm
                parallel_common_name = sequential_algorithm + "_parallel" + param_suffix + "_T"
                # Get the list of threads used by the parallel version of the algorithm
                parallel_threads_list = sorted(int(a.removeprefix(parallel_common_name)) for a in algorithms_list if a.startswith(parallel_common_name))

                # Build the list of the geometric means / averages for the passed field starting with the sequential algorithm
                yvals = [group_method["func"](df[df['algorithm'] == (sequential_algorithm + param_suffix)][field])]
                for t in parallel_threads_list:
                    subset = df[df['algorithm'] == (parallel_common_name + str(t))]
                    yvals.append(group_method["func"](subset[field]))

                # The sequential version is placed at x = 0.5, left of the single-threaded parallel run
                ax.plot([0.5] + parallel_threads_list, yvals, label=param_suffix[1:], linewidth=3)

            # Set plot properties
            ax.set_xscale('log', base=2)  # Set x-axis to logarithmic scale
            ax.set_title(f'{field} strong scalability for {sequential_algorithm}', fontsize=fontsize)
            ax.set_ylabel(f'{field} {group_method["name"]}', fontsize=fontsize)
            ax.set_xlabel('#threads (log scale)', fontsize=fontsize)
            ax.grid(True, linestyle=':')
            # Add legend to differentiate between the parameter configurations
            ax.legend()

    plt.tight_layout()
    plt.savefig(f'./strong_scalability_{sequential_algorithm}.pdf')
    plt.show()

def print_submodular_stats(algorithms_list):
    """Print iteration statistics of the submodular algorithms.

    For every algorithm the file ``./{algorithm}/all_iterations.csv``
    (semicolon separated, no header) is read and the number of runs, the
    average / geometric mean of the contractions per iteration and the number
    of finished runs are printed. A warning is printed for algorithms without
    such a file.

    Args:
        algorithms_list: Names of the algorithms (= result directories).
    """
    # Define the structure of the "all_iterations.csv" files
    all_iterations_columns = ['initial_num_nodes', 'num_iterations', 'num_contractions_per_it', 'hypergraph', 'seed', 'algorithm']
    numeric_columns = ['initial_num_nodes', 'num_iterations', 'num_contractions_per_it']

    # Print the statistics for each algorithm
    for algorithm in algorithms_list:
        # Define the path of the "all_iterations.csv" file
        all_iterations_path = f'./{algorithm}/all_iterations.csv'

        print(f"#################### {algorithm} ####################")

        if not os.path.exists(all_iterations_path):
            print(f'Warning: No all_iterations.csv file found for {algorithm}.')
            continue

        all_iterations_df = pd.read_csv(all_iterations_path, sep=';', header=None, names=all_iterations_columns)
        # Convert the columns to numeric if not already (invalid entries become NaN)
        for column in numeric_columns:
            all_iterations_df[column] = pd.to_numeric(all_iterations_df[column], errors='coerce')
        # Report any NaN values introduced during type conversion
        if all_iterations_df[numeric_columns].isna().any().any():
            print("Warning: NaN values found in columns after conversion.")

        num_total_runs = len(all_iterations_df)

        # A run counts as finished if its number of iterations is available
        relative_stats = {
            'num_runs_finished': (all_iterations_df['num_iterations'].notna()).sum(),
        }

        print(f"num_total_runs: \t\t {num_total_runs}")
        print(f"avg_contractions_per_it: \t {average(all_iterations_df['num_contractions_per_it']):.3f}")
        print(f"geo_mean_contractions_per_it: \t {geometric_mean(all_iterations_df['num_contractions_per_it']):.3f}")
        for name, stat in relative_stats.items():
            print(f"{name}: \t\t {stat} \t ({stat/num_total_runs * 100:.3f}%)")


def print_hypercactus_stats(algorithms_list):
    """Print size, time and memory statistics of the hypercactus runs.

    For every algorithm the file ``./{algorithm}/all_hypercactus_results.csv``
    (semicolon separated, no header) is read and the average / geometric mean
    of the initial, kernel and hypercactus sizes as well as the average,
    geometric mean and maximum of time and memory are printed. A warning is
    printed for algorithms without such a file.

    Args:
        algorithms_list: Names of the algorithms (= result directories).
    """
    # Define the structure of the "all_hypercactus_results.csv" files
    all_hypercactus_columns = ['initial_num_edges', 'initial_num_nodes', 'kernel_num_edges', 'kernel_num_nodes', 'hypercactus_num_edges', 'hypercactus_num_nodes', 'time', 'memory', 'hypergraph', 'seed', 'algorithm']
    numeric_columns = ['initial_num_edges', 'initial_num_nodes', 'kernel_num_edges', 'kernel_num_nodes', 'hypercactus_num_edges', 'hypercactus_num_nodes', 'time', 'memory']

    # Print the statistics for each algorithm
    for algorithm in algorithms_list:
        # Define the path of the "all_hypercactus_results.csv" file
        all_hypercactus_results_path = f'./{algorithm}/all_hypercactus_results.csv'

        print(f"#################### {algorithm} ####################")

        if not os.path.exists(all_hypercactus_results_path):
            print(f'Warning: No all_hypercactus_results.csv file found for {algorithm}.')
            continue

        all_hypercactus_results_df = pd.read_csv(all_hypercactus_results_path, sep=';', header=None, names=all_hypercactus_columns)
        # Convert the columns to numeric if not already (invalid entries become NaN)
        for column in numeric_columns:
            all_hypercactus_results_df[column] = pd.to_numeric(all_hypercactus_results_df[column], errors='coerce')
        # Report any NaN values introduced during type conversion
        if all_hypercactus_results_df[['hypercactus_num_edges', 'hypercactus_num_nodes', 'time']].isna().any().any():
            print("Warning: NaN values found in columns after conversion.")

        num_total_runs = len(all_hypercactus_results_df)

        # A run counts as finished if its time is available
        relative_stats = {
            'num_runs_finished': (all_hypercactus_results_df['time'].notna()).sum(),
        }

        print(f"num_total_runs: \t\t {num_total_runs}")
        for name, stat in relative_stats.items():
            print(f"{name}: \t\t {stat} \t ({stat/num_total_runs * 100:.3f}%)")
        print(f"avg_initial_num_edges: \t\t {average(all_hypercactus_results_df['initial_num_edges']):.3f}")
        print(f"geo_mean_initial_num_edges: \t {geometric_mean(all_hypercactus_results_df['initial_num_edges']):.3f}")
        print(f"avg_initial_num_nodes: \t\t {average(all_hypercactus_results_df['initial_num_nodes']):.3f}")
        print(f"geo_mean_initial_num_nodes: \t {geometric_mean(all_hypercactus_results_df['initial_num_nodes']):.3f}")
        print()
        print(f"avg_kernel_num_edges: \t\t {average(all_hypercactus_results_df['kernel_num_edges']):.3f}")
        print(f"geo_mean_kernel_num_edges: \t {geometric_mean(all_hypercactus_results_df['kernel_num_edges']):.3f}")
        print(f"avg_kernel_num_nodes: \t\t {average(all_hypercactus_results_df['kernel_num_nodes']):.3f}")
        print(f"geo_mean_kernel_num_nodes: \t {geometric_mean(all_hypercactus_results_df['kernel_num_nodes']):.3f}")
        print()
        print(f"avg_hypercactus_num_edges: \t {average(all_hypercactus_results_df['hypercactus_num_edges']):.3f}")
        print(f"geo_mean_hypercactus_num_edges:  {geometric_mean(all_hypercactus_results_df['hypercactus_num_edges']):.3f}")
        print(f"avg_hypercactus_num_nodes: \t {average(all_hypercactus_results_df['hypercactus_num_nodes']):.3f}")
        print(f"geo_mean_hypercactus_num_nodes:  {geometric_mean(all_hypercactus_results_df['hypercactus_num_nodes']):.3f}")
        print()
        print(f"avg_time: \t\t\t {average(all_hypercactus_results_df['time']):.3f}")
        print(f"geo_mean_time: \t\t\t {geometric_mean(all_hypercactus_results_df['time']):.3f}")
        print(f"max_time: \t\t\t {all_hypercactus_results_df['time'].max():.3f}")
        print(f"avg_memory: \t\t\t {average(all_hypercactus_results_df['memory']):.3f}")
        print(f"geo_mean_memory: \t\t {geometric_mean(all_hypercactus_results_df['memory']):.3f}")
        print(f"max_memory: \t\t\t {all_hypercactus_results_df['memory'].max():.3f}")

## Read all_results.csv and all_naive_mincuts.csv

In [None]:
# Define the structure and path of the "all_results.csv" files
all_results_columns = ['mincut', 'time', 'memory', 'is_exact', 'hypergraph', 'seed', 'algorithm']
all_results_path = './*/all_results.csv'
# Define the structure and path of the "all_naive_mincuts.csv" file
all_naive_mincuts_columns = ['naive_mincut', 'hypergraph']
all_naive_mincuts_path = './all_naive_mincuts.csv'

# Find the "all_results.csv" file of every algorithm directory
# NB: The matches are sorted because glob returns them in arbitrary order
all_results_csv = sorted(glob.glob(all_results_path, recursive=True))

# Initialize an empty list to hold all results
all_results_df_list = []

# Loop through each file and read it into a DataFrame
for file in all_results_csv:
    try:
        df = pd.read_csv(file, sep=';', header=None, names=all_results_columns)
        # Convert 'time' and 'memory' to numeric if not already
        df['time'] = pd.to_numeric(df['time'], errors='coerce')
        df['memory'] = pd.to_numeric(df['memory'], errors='coerce')
        # Report any NaN values introduced during type conversion
        if df[['time', 'memory']].isna().any().any():
            print("Warning: NaN values found in time or memory columns after conversion.")
        # Add the dataframe to the list
        all_results_df_list.append(df)
    except Exception as e:
        # Best effort: an unreadable file is reported and skipped
        print(f"Error reading file {file}: {e}")

# pd.concat fails with an unspecific ValueError on an empty list, so fail early with a clear message
if not all_results_df_list:
    raise FileNotFoundError(f"No readable result file matches {all_results_path}")

# Combine all DataFrames into a single DataFrame
all_results_df = pd.concat(all_results_df_list, ignore_index=True)

# Attach the naive mincut of each hypergraph (if the file is available)
all_naive_mincuts_df = None
if os.path.exists(all_naive_mincuts_path):
    all_naive_mincuts_df = pd.read_csv(all_naive_mincuts_path, sep=';', header=None, names=all_naive_mincuts_columns)
    all_results_df = pd.merge(all_results_df, all_naive_mincuts_df, on='hypergraph', how='left')
else:
    print(f"File does not exist: {all_naive_mincuts_path}")

# Get the list of algorithms
algorithms_list = sorted(all_results_df['algorithm'].unique())

print("Algorithms:", algorithms_list)
print()
# NB: DataFrame.info() prints its summary itself and returns None
all_results_df.info()

## Distribution by algorithm

In [None]:
# Plot the time and memory distribution of every algorithm.
# The first filter keeps the runs whose mincut is available and strictly below the naive mincut,
# the second filter additionally requires the mincut to be non-zero.
plot_distribution_by_algorithm(
    all_results_df, 
    algorithms_list, 
    ["time", "memory"], 
    lambda df: (df['mincut'] < df['naive_mincut']) & df['mincut'].notnull(),
    lambda df: (df['mincut'] < df['naive_mincut']) & df['mincut'].notnull() & (df['mincut'] != 0)
)

In [None]:
# Plot the reduction progress of every algorithm (read from "./{algorithm}/all_reductions.csv")
# NB: Setting num_rounds to None shows all rounds, i.e. all reductions
plot_reduction_by_algorithm(algorithms_list, num_rounds=None)

In [None]:
# Plot the performance profiles for mincut, time and memory.
# Only the mincut profile uses a logarithmic x-axis; its upper x-limit is the largest observed ratio,
# whereas the x-axis is limited to 2 for the time and to 1.1 for the memory.
plot_performance_profile(all_results_df, all_naive_mincuts_df, algorithms_list, ['mincut', 'time', 'memory'], [True, False, False], [None, 2, 1.1])

In [None]:
#plot_strong_scalability(all_results_df,algorithms_list, 'submodular', ['time', 'memory'], None)

In [None]:
# Print the iteration statistics of every algorithm that has an "all_iterations.csv" file
print_submodular_stats(algorithms_list)

In [None]:
# Print the hypercactus statistics for the 'hypercactus_generator' result directory
print_hypercactus_stats(['hypercactus_generator'])