In [48]:
import pandas as pd

## Helper Functions For Plotting

In [49]:
def calculate_speedup_and_efficiency(df: pd.DataFrame, num_threads_column: str = 'num_threads', duration_column: str = 'save_duration_seconds') -> pd.DataFrame:
    """
    Calculates speedup and efficiency for a DataFrame of benchmark results.

    The first row whose thread count equals 1 is used as the baseline:

        speedup    = baseline duration / duration of the row
        efficiency = speedup / number of threads of the row

    The input DataFrame is never modified; the metrics are added to a copy.

    Args:
        df (pd.DataFrame): The input DataFrame containing benchmark results.
                          Must have columns for thread count and duration.
        num_threads_column (str): The name of the column containing the number of threads.
        duration_column (str): The name of the column containing the duration (runtime) in seconds.

    Returns:
        pd.DataFrame: A copy of ``df`` with 'speedup' and 'efficiency' columns added,
                      or an empty DataFrame if the input is empty, no single-thread
                      baseline is found, or the baseline duration is zero.
    """
    if df.empty:
        print("Warning: Input DataFrame is empty.")
        return pd.DataFrame()

    # Durations of all single-thread runs; the first one is the baseline.
    baseline_durations = df.loc[df[num_threads_column] == 1, duration_column]
    if baseline_durations.empty:
        print(f"Error: Baseline (1 {num_threads_column}) not found in the DataFrame. Cannot calculate speedup.")
        return pd.DataFrame()

    baseline_runtime = baseline_durations.iloc[0]
    if baseline_runtime == 0:
        print("Warning: Baseline runtime is zero. Speedup and efficiency calculations will result in division by zero.")
        return pd.DataFrame()

    # Work on a copy so the caller's frame (which may itself be a slice of
    # another frame) is left untouched.
    result = df.copy()
    result['speedup'] = baseline_runtime / result[duration_column]
    result['efficiency'] = result['speedup'] / result[num_threads_column]
    return result

In [50]:
def extract_efficiency_and_speedup_at_threads(dfs: list[pd.DataFrame], threads = 2):
    """
    Collects the speedup and efficiency measured at one thread count from
    several per-workload DataFrames.

    For every DataFrame the first row with ``num_threads == threads`` is used.
    DataFrames that are empty, lack one of the required columns, or have no
    row for the requested thread count are skipped with a warning instead of
    raising.

    Args:
        dfs (list[pd.DataFrame]): Per-workload frames containing 'num_threads',
                                  'value_size', 'speedup' and 'efficiency' columns.
        threads: The thread count to extract (default 2).

    Returns:
        pd.DataFrame: One row per usable workload with the columns 'value_size',
                      'efficiency_at_{threads}_threads' and
                      'speedup_at_{threads}_threads', sorted by 'value_size'.
                      Empty (but with those columns) if nothing could be extracted.
    """
    efficiency_col = f'efficiency_at_{threads}_threads'
    speedup_col = f'speedup_at_{threads}_threads'
    required_cols = {'num_threads', 'value_size', 'speedup', 'efficiency'}

    extracted_data = []
    for position, df in enumerate(dfs):
        if not required_cols.issubset(df.columns):
            print(f"Warning: Skipping DataFrame {position}: missing one of {sorted(required_cols)}.")
            continue

        df_at_threads = df[df['num_threads'] == threads]
        if df_at_threads.empty:
            print(f"Warning: Skipping DataFrame {position}: no row with num_threads == {threads}.")
            continue

        extracted_data.append({
            'value_size': df_at_threads['value_size'].iloc[0],
            efficiency_col: df_at_threads['efficiency'].iloc[0],
            speedup_col: df_at_threads['speedup'].iloc[0],
        })

    # Passing the column list keeps the schema stable even when nothing was
    # extracted, so the sort below cannot fail on a missing 'value_size'.
    result_df = pd.DataFrame(extracted_data, columns=['value_size', efficiency_col, speedup_col])
    return result_df.sort_values(by='value_size').reset_index(drop=True)

In [51]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_workload_performance(df_metrics: pd.DataFrame, threads: int = 2):
    """
    Draws speedup and efficiency against value size for one thread count.

    Builds a two-panel interactive Plotly figure (speedup on the left,
    efficiency on the right, each with dashed/dotted reference lines) and
    displays it with ``fig.show()``. Prints an error and returns without
    plotting when the frame is empty or a required column is missing.

    Args:
        df_metrics (pd.DataFrame): Frame with 'value_size',
                                   'efficiency_at_{threads}_threads' and
                                   'speedup_at_{threads}_threads' columns, such as
                                   the one produced by
                                   extract_efficiency_and_speedup_at_threads.
        threads (int): Thread count used to select the metric columns and to
                       label the figure.
    """
    if df_metrics.empty:
        print("Error: Input DataFrame is empty. Cannot generate plot.")
        return

    required_efficiency_col = f'efficiency_at_{threads}_threads'
    required_speedup_col = f'speedup_at_{threads}_threads'
    required_cols = ['value_size', required_efficiency_col, required_speedup_col]
    if any(col not in df_metrics.columns for col in required_cols):
        print(f"Error: DataFrame must contain '{required_cols}' columns for plotting.")
        return

    # Fixed y-ranges keep figures for different datasets visually comparable.
    # Efficiency goes up to 1.5 to leave room for superlinear results.
    y_axes = {
        1: dict(title_text='Speedup (X times faster)', range=[0.0, 5.0]),
        2: dict(title_text='Efficiency (Fraction of Ideal)', range=[0.0, 1.5]),
    }

    sizes = df_metrics['value_size']
    point_count = len(df_metrics)
    gray_dashed = dict(dash='dash', color='gray')

    fig = make_subplots(
        rows=1, cols=2,
        subplot_titles=(f'Speedup at {threads} Threads vs. Value Size',
                        f'Efficiency at {threads} Threads vs. Value Size'),
    )

    # (subplot column, Scatter keyword arguments); the order is significant
    # because it determines legend order and default trace colors.
    trace_specs = [
        (1, dict(x=sizes, y=df_metrics[required_speedup_col], mode='lines+markers',
                 name=f'Speedup at {threads} Threads',
                 marker=dict(symbol='circle', size=8), line=dict(width=2))),
        (1, dict(x=sizes, y=[1] * point_count, mode='lines',
                 name='Ideal (1x Speedup)', line=gray_dashed, showlegend=True)),
        (2, dict(x=sizes, y=df_metrics[required_efficiency_col], mode='lines+markers',
                 name=f'Efficiency at {threads} Threads',
                 marker=dict(symbol='diamond', size=8), line=dict(width=2))),
        (2, dict(x=sizes, y=[1] * point_count, mode='lines',
                 name='Ideal Efficiency (100%)', line=gray_dashed, showlegend=True)),
        (2, dict(x=sizes, y=[0.5] * point_count, mode='lines',
                 name='50% Efficiency', line=dict(dash='dot', color='orange'),
                 showlegend=True)),
    ]
    for subplot_col, scatter_kwargs in trace_specs:
        fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=subplot_col)

    fig.update_layout(
        title_text=f'Performance Scaling with Workload (at {threads} Threads)',
        height=600, width=1200,
        showlegend=True,
        hovermode="x unified",    # one hover box for all traces at the same x
        template="plotly_white",
        title_x=0.5,              # centered main title
    )

    # Both panels share the same log-scaled x-axis with one tick per measured size.
    tick_values = sizes.tolist()
    tick_labels = [f'{s}B' for s in tick_values]
    for subplot_col in (1, 2):
        fig.update_xaxes(
            title_text='Value Size (Bytes)',
            type='log',
            tickmode='array',
            tickvals=tick_values,
            ticktext=tick_labels,
            row=1, col=subplot_col,
        )

    for subplot_col, axis_kwargs in y_axes.items():
        fig.update_yaxes(row=1, col=subplot_col, **axis_kwargs)

    fig.show()
    # To keep an interactive copy, the figure can also be written to HTML:
    # fig.write_html(f"performance_scaling_at_{threads}_threads.html")

In [52]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots

def plot_single_df_performance(df: pd.DataFrame, workload_description: str = "Performance Metrics"):
    """
    Draws speedup and efficiency against thread count for a single workload.

    Builds a two-panel interactive Plotly figure (speedup on the left,
    efficiency on the right, each with reference lines), using fixed y-axis
    ranges, and displays it with ``fig.show()``. Prints an error and returns
    without plotting when the frame is empty or a required column is missing.

    Args:
        df (pd.DataFrame): Frame containing 'num_threads', 'speedup' and
                           'efficiency' columns.
        workload_description (str): Text used as the leading part of the figure
                                    title, e.g. "Value Size 50 Bytes, No Compression".
    """
    needed_cols = ('num_threads', 'speedup', 'efficiency')
    if df.empty or any(col not in df.columns for col in needed_cols):
        print("Error: DataFrame must contain 'num_threads', 'speedup', and 'efficiency' columns for plotting.")
        return

    # Fixed y-ranges keep figures for different datasets visually comparable.
    # Efficiency goes up to 1.5 to leave room for superlinear results.
    y_axes = {
        1: dict(title_text='Speedup (X times faster)', range=[0.0, 5.0]),
        2: dict(title_text='Efficiency (Fraction of Ideal)', range=[0.0, 1.5]),
    }

    thread_counts = df['num_threads']
    row_count = len(df)
    gray_dashed = dict(dash='dash', color='gray')

    fig = make_subplots(rows=1, cols=2, subplot_titles=('Speedup', 'Efficiency'))

    # (subplot column, Scatter keyword arguments); the order is significant
    # because it determines the legend order.
    trace_specs = [
        (1, dict(x=thread_counts, y=df['speedup'], mode='lines+markers',
                 name='Speedup', marker=dict(symbol='circle', size=8),
                 line=dict(width=2, color='royalblue'))),
        (1, dict(x=thread_counts, y=[1] * row_count, mode='lines',
                 name='Ideal Speedup', line=gray_dashed, showlegend=True)),
        (2, dict(x=thread_counts, y=df['efficiency'], mode='lines+markers',
                 name='Efficiency', marker=dict(symbol='diamond', size=8),
                 line=dict(width=2, color='firebrick'))),
        (2, dict(x=thread_counts, y=[1] * row_count, mode='lines',
                 name='Ideal Efficiency (100%)', line=gray_dashed, showlegend=True)),
        (2, dict(x=thread_counts, y=[0.5] * row_count, mode='lines',
                 name='50% Efficiency', line=dict(dash='dot', color='orange'),
                 showlegend=True)),
    ]
    for subplot_col, scatter_kwargs in trace_specs:
        fig.add_trace(go.Scatter(**scatter_kwargs), row=1, col=subplot_col)

    fig.update_layout(
        title_text=f'{workload_description} vs. Number of Threads',
        height=600, width=1200,
        showlegend=True,
        hovermode="x unified",    # one hover box for all traces at the same x
        template="plotly_white",
        title_x=0.5,              # centered main title
    )

    # One tick per distinct thread count on both panels.
    for subplot_col in (1, 2):
        fig.update_xaxes(
            title_text='Number of Threads',
            tickmode='array',
            tickvals=df['num_threads'].unique(),
            row=1, col=subplot_col,
        )

    for subplot_col, axis_kwargs in y_axes.items():
        fig.update_yaxes(row=1, col=subplot_col, **axis_kwargs)

    fig.show()
    # To keep an interactive copy, the figure can also be written to HTML:
    # fig.write_html(f"performance_metrics_{workload_description.replace(' ', '_').replace('.', '')}.html")

In [53]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px # Import plotly.express for its color sequences

def format_size(size: int) -> str:
    """
    Renders a byte count as a short human-readable label.

    Values of at least 1,000,000 are shown in MB and values of at least 1,000
    in KB (decimal units, rounded to a whole number); anything smaller is
    shown unchanged with a 'B' suffix.
    """
    # Checked from the largest unit down; the first threshold reached wins.
    for threshold, suffix in ((1000000, 'MB'), (1000, 'KB')):
        if size >= threshold:
            return f'{size / threshold:.0f}{suffix}'
    return f'{size}B'
    
    
def plot_combined_workload_performance(dfs: list[pd.DataFrame], workload_type_description: str = "Overall Performance"):
    """
    Generates an interactive Plotly graph showing speedup and efficiency
    vs. value_size, with one line per thread count in a distinct color/marker.

    All frames are concatenated; rows whose 'speedup' or 'efficiency' cannot
    be converted to a number are dropped. Each per-thread line is drawn in
    ascending 'value_size' order regardless of the order of ``dfs``. The
    figure is displayed with ``fig.show()``; nothing is returned.

    Args:
        dfs (list[pd.DataFrame]): A list of DataFrames, where each DataFrame
                                   contains 'num_threads', 'value_size', 'speedup',
                                   and 'efficiency' columns for a specific workload.
                                   It's assumed 'speedup' and 'efficiency' have
                                   already been calculated for each df.
        workload_type_description (str): Text used as the figure title,
                                         e.g. "with Compression ON".
    """
    if not dfs:
        print("Error: Input list of DataFrames is empty. Cannot generate plot.")
        return

    # Combine all DataFrames into a single one for easier processing
    combined_df = pd.concat(dfs, ignore_index=True)

    required_cols = ['num_threads', 'value_size', 'speedup', 'efficiency']
    if not all(col in combined_df.columns for col in required_cols):
        print(f"Error: Combined DataFrame must contain '{required_cols}' columns for plotting.")
        return

    # Coerce the metrics to numbers and discard rows that cannot be plotted.
    combined_df = combined_df.assign(
        speedup=pd.to_numeric(combined_df['speedup'], errors='coerce'),
        efficiency=pd.to_numeric(combined_df['efficiency'], errors='coerce'),
    ).dropna(subset=['speedup', 'efficiency'])

    if combined_df.empty:
        print("Error: No rows with numeric 'speedup' and 'efficiency' values. Cannot generate plot.")
        return

    # y-ranges: at least [0, 5] / [0, 1.1], widened by 10% headroom when the
    # data exceeds those bounds.
    speedup_y_range = [0.0, max(5.0, combined_df['speedup'].max() * 1.1)]
    efficiency_y_range = [0.0, max(1.1, combined_df['efficiency'].max() * 1.1)]

    unique_threads = sorted(combined_df['num_threads'].unique())
    unique_value_sizes = sorted(combined_df['value_size'].unique())

    # One color and one marker per thread count, cycling if there are more
    # thread counts than palette entries.
    colors = px.colors.qualitative.Plotly
    markers = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up', 'triangle-down', 'pentagon', 'hexagram']
    color_map = {thread: colors[i % len(colors)] for i, thread in enumerate(unique_threads)}
    marker_map = {thread: markers[i % len(markers)] for i, thread in enumerate(unique_threads)}

    # Sorting by value_size makes every line run left to right instead of
    # zigzagging when the input frames are not ordered by size.
    subsets = {
        thread: combined_df[combined_df['num_threads'] == thread].sort_values(by='value_size')
        for thread in unique_threads
    }

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Speedup vs. Value Size',
                                        'Efficiency vs. Value Size'))

    # Speedup panel: one trace per thread count, shown in the legend.
    for threads_count in unique_threads:
        df_subset = subsets[threads_count]
        fig.add_trace(
            go.Scatter(
                x=df_subset['value_size'],
                y=df_subset['speedup'],
                mode='lines+markers',
                name=f'{threads_count} Threads',
                legendgroup=str(threads_count),
                showlegend=True,
                marker=dict(symbol=marker_map[threads_count], size=8),
                line=dict(width=2, color=color_map[threads_count])
            ),
            row=1, col=1
        )
    fig.add_trace(
        go.Scatter(
            x=unique_value_sizes,
            y=[1] * len(unique_value_sizes),
            mode='lines',
            name='1x Speedup',
            line=dict(dash='dash', color='gray'),
            showlegend=True,
            legendgroup='ideal_speedup'
        ),
        row=1, col=1
    )

    # Efficiency panel: same legend groups as the speedup traces, hidden from
    # the legend to avoid duplicate entries.
    for threads_count in unique_threads:
        df_subset = subsets[threads_count]
        fig.add_trace(
            go.Scatter(
                x=df_subset['value_size'],
                y=df_subset['efficiency'],
                mode='lines+markers',
                name=f'{threads_count} Threads',
                legendgroup=str(threads_count),
                showlegend=False,
                marker=dict(symbol=marker_map[threads_count], size=8),
                line=dict(width=2, color=color_map[threads_count])
            ),
            row=1, col=2
        )
    fig.add_trace(
        go.Scatter(
            x=unique_value_sizes,
            y=[1] * len(unique_value_sizes),
            mode='lines',
            name='Ideal Efficiency (100%)',
            line=dict(dash='dash', color='gray'),
            showlegend=True,
            legendgroup='ideal_efficiency_1'
        ),
        row=1, col=2
    )
    fig.add_trace(
        go.Scatter(
            x=unique_value_sizes,
            y=[0.5] * len(unique_value_sizes),
            mode='lines',
            name='50% Efficiency',
            line=dict(dash='dot', color='orange'),
            showlegend=True,
            legendgroup='ideal_efficiency_05'
        ),
        row=1, col=2
    )

    fig.update_layout(
        title_text=workload_type_description,
        height=600, width=1200,
        showlegend=True,
        hovermode="x unified",
        template="plotly_white",
        title_x=0.5,
    )

    # Both panels use the same log-scaled x-axis and the same human-readable
    # tick labels.
    tick_labels = [format_size(s) for s in unique_value_sizes]
    for subplot_col in (1, 2):
        fig.update_xaxes(
            title_text='Value Size (Bytes)',
            type='log',
            tickmode='array',
            tickvals=unique_value_sizes,
            ticktext=tick_labels,
            row=1, col=subplot_col
        )

    fig.update_yaxes(title_text='Speedup (X times faster)', range=speedup_y_range, row=1, col=1)
    fig.update_yaxes(title_text='Efficiency (Fraction of Ideal)', range=efficiency_y_range, row=1, col=2)

    fig.show()
    # fig.write_html(f"combined_performance_scaling_{workload_type_description.replace(' ', '_').replace('.', '')}.html")

In [54]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px # For distinct colors and markers

def plot_performance_by_workload_on_threads(dfs: list[pd.DataFrame], workload_names: list[str], title: str):
    """
    Generates an interactive Plotly graph showing speedup and efficiency
    vs. number of threads, with each line representing a different workload.

    DataFrames that are empty or lack a required column (for example the empty
    frame calculate_speedup_and_efficiency returns when it cannot compute the
    metrics) are skipped with a warning. If nothing plottable remains, an
    error is printed and no figure is shown. The figure is displayed with
    ``fig.show()``; nothing is returned.

    Args:
        dfs (list[pd.DataFrame]): A list of DataFrames, where each DataFrame
                                   contains 'num_threads', 'speedup' and
                                   'efficiency' columns for a specific workload.
                                   It's assumed 'speedup' and 'efficiency' have
                                   already been calculated for each df.
        workload_names (list[str]): A list of descriptive names for each workload,
                                     used for legend labels (e.g., ["50 Bytes", "500 Bytes"]).
                                     Length must match dfs.
        title (str): The main title of the figure.
    """
    if not dfs or not workload_names or len(dfs) != len(workload_names):
        print("Error: Input lists 'dfs' and 'workload_names' must not be empty and must have matching lengths.")
        return

    # Keep only frames that can actually be plotted. The original position is
    # remembered so a workload keeps the same color/marker whether or not
    # other workloads were skipped.
    required_cols = ('num_threads', 'speedup', 'efficiency')
    plottable = []
    for position, (df, workload_label) in enumerate(zip(dfs, workload_names)):
        if df is None or df.empty or any(col not in df.columns for col in required_cols):
            print(f"Warning: Skipping workload '{workload_label}': DataFrame is empty or missing one of {list(required_cols)}.")
            continue
        plottable.append((position, workload_label, df))

    if not plottable:
        print("Error: None of the DataFrames contain plottable data. Cannot generate plot.")
        return

    # Fixed y-axis ranges for consistent comparison across plots.
    speedup_y_range = [0.0, 25]   # 0x to 25x speedup
    efficiency_y_range = [0.0, 2]  # 0% to 200% efficiency (room for superlinear results)

    # All thread counts seen in any workload; used for ticks and reference lines.
    all_num_threads = sorted(pd.concat([df['num_threads'] for _, _, df in plottable]).unique())

    colors = px.colors.qualitative.Plotly
    markers = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up', 'triangle-down', 'pentagon', 'hexagram']

    fig = make_subplots(rows=1, cols=2,
                        subplot_titles=('Speedup vs. Number of Threads',
                                        'Efficiency vs. Number of Threads'))

    # Speedup panel: one trace per workload, shown in the legend.
    for position, workload_label, df in plottable:
        fig.add_trace(
            go.Scatter(
                x=df['num_threads'],
                y=df['speedup'],
                mode='lines+markers',
                name=f'Speedup: {workload_label}',
                marker=dict(symbol=markers[position % len(markers)], size=8),
                line=dict(width=2, color=colors[position % len(colors)]),
                legendgroup=workload_label,
                showlegend=True
            ),
            row=1, col=1
        )

    fig.add_trace(
        go.Scatter(
            x=all_num_threads,
            y=[1] * len(all_num_threads),
            mode='lines',
            name='(1x Speedup)',
            line=dict(dash='dash', color='gray'),
            showlegend=True,
            legendgroup='ideal_speedup'
        ),
        row=1, col=1
    )

    # Efficiency panel: shares legend groups with the speedup traces and is
    # hidden from the legend to avoid duplicate entries.
    for position, workload_label, df in plottable:
        fig.add_trace(
            go.Scatter(
                x=df['num_threads'],
                y=df['efficiency'],
                mode='lines+markers',
                name=f'Efficiency: {workload_label}',
                marker=dict(symbol=markers[position % len(markers)], size=8),
                line=dict(width=2, color=colors[position % len(colors)]),
                legendgroup=workload_label,
                showlegend=False
            ),
            row=1, col=2
        )

    fig.add_trace(
        go.Scatter(
            x=all_num_threads,
            y=[1] * len(all_num_threads),
            mode='lines',
            name='Ideal Efficiency (100%)',
            line=dict(dash='dash', color='gray'),
            showlegend=True,
            legendgroup='ideal_efficiency_1'
        ),
        row=1, col=2
    )
    fig.add_trace(
        go.Scatter(
            x=all_num_threads,
            y=[0.5] * len(all_num_threads),
            mode='lines',
            name='50% Efficiency',
            line=dict(dash='dot', color='orange'),
            showlegend=True,
            legendgroup='ideal_efficiency_05'
        ),
        row=1, col=2
    )

    fig.update_layout(
        title_text=title,
        height=600, width=1200,
        showlegend=True,
        hovermode="x unified",
        template="plotly_white",
        title_x=0.5,
    )

    for subplot_col in (1, 2):
        fig.update_xaxes(
            title_text='Number of Threads',
            tickmode='array',
            tickvals=all_num_threads,
            row=1, col=subplot_col
        )

    fig.update_yaxes(title_text='Speedup (X times faster)', range=speedup_y_range, row=1, col=1)
    fig.update_yaxes(title_text='Efficiency (Fraction of Ideal)', range=efficiency_y_range, row=1, col=2)

    fig.show()
    # To keep an interactive copy, the figure can also be written to HTML:
    # fig.write_html(f"performance_scaling_across_workloads.html")

In [55]:
import os
# List the current working directory; the benchmark CSV files read by
# load_df below are opened relative to this directory.
os.listdir('./')

['save_summary_workload_heavy-product_2.0keys_100B_tempfs_yes_comp-no_csum-yes.csv',
 'performance_improvement_vs_value_size.ipynb',
 'save_summary_workload_heavy-product_1.0keys_100B_tempfs_no_comp-no_csum-yes.csv']

## Loading and Plotting the Speedup and Efficiency of the Program for Different Thread Configs and Workloads

In [56]:
# Benchmark runs to load. load_df interpolates every field of an entry into
# the CSV file name:
#   num_keys   -> "<num_keys>keys"
#   value_size -> "<value_size>B"
#   tempfs     -> "tempfs_yes" / "tempfs_no"
#   workload   -> "workload_<workload>"
# NOTE(review): num_keys looks like it is expressed in millions of keys
# (the 2.0 run reports 2000000 keys) — confirm against the benchmark script.
# The commented-out entries lack the "tempfs" and "workload" fields that
# load_df requires, so they cannot be re-enabled as they are.
test_configs = [
    # {"num_keys" : 20.0, "value_size": 100},
    {"num_keys" : 2.0, "value_size": 100, "tempfs": True, "workload": "heavy-product"},
    {"num_keys" : 1.0, "value_size": 100, "tempfs": False, "workload": "heavy-product"},

    

    # {"num_keys": 2000.0, "value_size": 50},
    # {"num_keys": 1000.0, "value_size": 300},
    # {"num_keys": 500.0, "value_size": 500},
    # {"num_keys": 50.0, "value_size": 5000},
    # {"num_keys": 10.0, "value_size": 20000},
    # {"num_keys": 5.0, "value_size": 64000},
    # {"num_keys": 5.0, "value_size": 66000},
    # {"num_keys": 1.0, "value_size": 100000},
    # {"num_keys": 0.5, "value_size": 262000},
    # {"num_keys": 0.5, "value_size": 272000},
    # {"num_keys": 0.1, "value_size": 1000000},
    ]


def load_df(test_config: dict, compression_on: bool) -> pd.DataFrame:
    """
    Loads the benchmark summary CSV for one test configuration and keeps only
    the rows for the requested compression setting.

    The file is read from the current working directory. Its name is built
    from the configuration; the "comp-no" and "csum-yes" parts of the name
    are fixed, and the compression setting is selected through the
    'rdbcompression' column of the file instead.

    Args:
        test_config (dict): Must contain the keys "num_keys", "value_size",
                            "tempfs" (truthy -> "yes", falsy -> "no") and
                            "workload".
        compression_on (bool): True keeps rows with rdbcompression == "yes",
                               False keeps rows with rdbcompression == "no".

    Returns:
        pd.DataFrame: An independent copy of the matching rows.
    """
    num_keys = test_config["num_keys"]
    value_size = test_config["value_size"]
    workload = test_config["workload"]
    tempfs_str = "yes" if test_config["tempfs"] else "no"
    compression_str = "yes" if compression_on else "no"

    # The lookups are done above rather than inside the f-string: reusing
    # double quotes inside a double-quoted f-string is a SyntaxError before
    # Python 3.12.
    csv_path = f"save_summary_workload_{workload}_{num_keys}keys_{value_size}B_tempfs_{tempfs_str}_comp-no_csum-yes.csv"
    df = pd.read_csv(csv_path)

    # .copy() so later column assignments do not act on a view of `df`.
    return df[df["rdbcompression"] == compression_str].copy()


# Load every configured workload once per compression setting, then add the
# 'speedup' and 'efficiency' columns to each frame.
dfs_compression_on = [load_df(config, compression_on=True) for config in test_configs]
dfs_compression_on = [calculate_speedup_and_efficiency(df) for df in dfs_compression_on]
dfs_compression_off = [load_df(config, compression_on=False) for config in test_configs]
dfs_compression_off = [calculate_speedup_and_efficiency(df) for df in dfs_compression_off]

print("\n" + "="*50) # Top border
print("                 COMPRESSION ON")
print("="*50 + "\n") # Bottom border

# Legend labels, one per entry of test_configs. Single quotes are used inside
# the f-string because reusing double quotes there is a SyntaxError before
# Python 3.12.
all_workload_names = [f"Value Size {format_size(config['value_size'])}" for config in test_configs]
plot_performance_by_workload_on_threads(dfs_compression_on, all_workload_names, "Speedup / Efficiency Vs. Num Threads for each workload Compression ON")
# plot_combined_workload_performance(dfs_compression_on )

print("\n" + "="*50) # Top border
print("                 COMPRESSION OFF")
print("="*50 + "\n") # Bottom border

plot_performance_by_workload_on_threads(dfs_compression_off, all_workload_names, "Speedup / Efficiency Vs. Num Threads for each workload Compression OFF")
# plot_combined_workload_performance(dfs_compression_off)
        


                 COMPRESSION ON




                 COMPRESSION OFF



In [57]:
import pandas as pd
import plotly.express as px
import numpy as np

def plot_raw_times_bar_chart(df: pd.DataFrame, workload_name: str):
    """
    Generates a grouped bar chart showing raw save duration vs. number of
    threads, with one bar per 'rdbcompression' value for each thread count,
    and displays the plot directly.

    The input DataFrame is not modified: the numeric conversion of the
    duration and the string conversion of the thread count (so the x-axis is
    treated as categorical) are applied to a copy.

    Args:
        df (pd.DataFrame): A DataFrame containing 'num_threads', 'save_duration_seconds',
                           and 'rdbcompression' columns for a single workload.
        workload_name (str): The name of the workload for the plot title.
    """
    if df is None or df.empty:
        print("Error: Input DataFrame is empty or invalid. Cannot generate plot.")
        return

    required_cols = ['num_threads', 'save_duration_seconds', 'rdbcompression']
    if not all(col in df.columns for col in required_cols):
        print(f"Error: DataFrame must contain '{required_cols}' columns for plotting.")
        return

    # assign() returns a new frame, so the caller keeps numeric thread counts
    # and does not receive extra helper columns.
    plot_df = df.assign(
        save_duration_seconds=pd.to_numeric(df['save_duration_seconds'], errors='coerce'),
        num_threads=df['num_threads'].astype(str),
    )

    fig = px.bar(
        plot_df,
        x='num_threads',
        y='save_duration_seconds',
        color='rdbcompression',
        barmode='group',
        title=f'Raw Save Times for Workload: {workload_name}',
        labels={'num_threads': 'Number of Threads', 'save_duration_seconds': 'Save Duration (Seconds)', 'rdbcompression': 'Compression'},
        template="plotly_white",
    )

    fig.update_layout(
        title_x=0.5,
        xaxis_title="Number of Threads",
        yaxis_title="Save Duration (seconds)",
        legend_title="RDB Compression",
    )

    fig.show()


## Plotting the Raw Save Time for each workload for each thread configuration

In [58]:
# For every workload, stack the compression-on rows on top of the
# compression-off rows so both settings appear in one bar chart.
combined_dfs = []
for i, df_on in enumerate(dfs_compression_on):
    combined_df = pd.concat([df_on, dfs_compression_off[i]], ignore_index=True)
    combined_dfs.append(combined_df)

# One chart per workload; workloads without any data are skipped.
for df in combined_dfs:
    if df.empty:
        continue

    first_key_count = df['keys'].iloc[0]
    num_keys = f"{round(first_key_count * 1e-6, 1)}"  # key count in millions
    value_size = format_size(df['value_size'].iloc[0])
    workload_name = f"Num Keys: {num_keys} Million, Value: Complex JSON"

    plot_raw_times_bar_chart(df, workload_name)
        

In [59]:
def print_raw_data(df):
    """
    Print a one-line summary of a benchmark frame and hand the frame back.

    The summary shows the first-row values of the 'keys' and 'value_size'
    columns.

    Args:
        df (pd.DataFrame): Benchmark results containing 'keys' and 'value_size'
                           columns and at least one row.

    Returns:
        pd.DataFrame: The very same object that was passed in, so the call can
                      be wrapped directly in display().
    """
    key_count = df['keys'].iloc[0]
    value_bytes = df['value_size'].iloc[0]
    print(f"Num Keys: {key_count}, Value Size: {value_bytes} Bytes ")
    return df

In [67]:
# List the column labels of the first frame in dfs_compression_on.
dfs_compression_on[0].columns

Index(['keys', 'value_size', 'num_threads', 'rdbcompression', 'rdbchecksum',
       'valkey_data_throughput_mb_s', 'actual_throughput_mb_s',
       'rdb_file_size_bytes', 'status', 'port', 'save_duration_seconds',
       'cpu_utilization_percent', 'cpu_total_time_seconds', 'io_read_bytes',
       'io_write_bytes', 'memory_rss_bytes', 'context_switches_voluntary',
       'context_switches_involuntary', 'iowait_time_seconds',
       'iowait_percentage', 'speedup', 'efficiency'],
      dtype='object')

In [77]:
# Compare the first workload with and without RDB compression.
first_on = dfs_compression_on[0]
first_off = dfs_compression_off[0]

# Size ratio: RDB file size without compression over the size with compression,
# each taken from the first row of its frame.
comp_ratio = first_off["rdb_file_size_bytes"].iloc[0] / first_on["rdb_file_size_bytes"].iloc[0]
print(f"Compression Ratio: {comp_ratio}")
print()

# Time ratio: fastest save without compression over fastest save with compression.
# Series.min() skips NaN, whereas the builtin min() gives position-dependent
# results when a NaN is present in the column.
best_time_ratio = first_off["save_duration_seconds"].min() / first_on["save_duration_seconds"].min()
print(f"Best Time ratio {best_time_ratio}")

Compression Ratio: 1.5580819310023364

Best Time ratio 1.4813158600800544


In [73]:
# Show the raw results workload by workload: the compression-on frame first,
# then the matching compression-off frame.
for frame_pair in zip(dfs_compression_on, dfs_compression_off):
    for frame in frame_pair:
        display(print_raw_data(frame))


Num Keys: 2000000, Value Size: 100 Bytes 


Unnamed: 0,keys,value_size,num_threads,rdbcompression,rdbchecksum,valkey_data_throughput_mb_s,actual_throughput_mb_s,rdb_file_size_bytes,status,port,...,cpu_total_time_seconds,io_read_bytes,io_write_bytes,memory_rss_bytes,context_switches_voluntary,context_switches_involuntary,iowait_time_seconds,iowait_percentage,speedup,efficiency
0,2000000,100,1,yes,yes,0,88.325058,18456215143,ok,7000,...,210.86,0,0,42293137408,1,847,0.09,0.043071,1.0,1.0
1,2000000,100,2,yes,yes,0,200.795436,18456215143,ok,7000,...,184.8,0,0,42295848960,41,305,0.11,0.119675,2.273369,1.136684
2,2000000,100,3,yes,yes,0,283.426525,18456215143,ok,7000,...,194.4,0,0,42297171968,2094,189,0.02,0.030713,3.208903,1.069634
3,2000000,100,4,yes,yes,0,365.158194,18456215143,ok,7000,...,201.82,0,0,42298531840,361,139,0.02,0.03957,4.134254,1.033563
4,2000000,100,5,yes,yes,0,462.883556,18456215143,ok,7000,...,198.04,0,0,42299850752,599,55,0.02,0.05016,5.240682,1.048136
5,2000000,100,6,yes,yes,0,579.841538,18456215143,ok,7000,...,188.71,0,0,42301341696,1698,69,0.01,0.031417,6.564859,1.094143
6,2000000,100,7,yes,yes,0,699.85101,18456215143,ok,7000,...,182.86,0,0,42294837248,528,33,0.01,0.03792,7.923584,1.131941
7,2000000,100,8,yes,yes,0,777.272348,18456215143,ok,7000,...,187.77,0,0,42296307712,380,31,0.0,0.0,8.800134,1.100017
8,2000000,100,9,yes,yes,0,912.616507,18456215143,ok,7000,...,180.26,0,0,42296549376,84,45,0.01,0.049448,10.332476,1.148053
9,2000000,100,10,yes,yes,0,1029.467496,18456215143,ok,7000,...,177.57,0,0,42299437056,1479,27,0.01,0.055779,11.655441,1.165544


Num Keys: 2000000, Value Size: 100 Bytes 


Unnamed: 0,keys,value_size,num_threads,rdbcompression,rdbchecksum,valkey_data_throughput_mb_s,actual_throughput_mb_s,rdb_file_size_bytes,status,port,...,cpu_total_time_seconds,io_read_bytes,io_write_bytes,memory_rss_bytes,context_switches_voluntary,context_switches_involuntary,iowait_time_seconds,iowait_percentage,speedup,efficiency
16,2000000,100,1,no,yes,0,459.69908,28756295329,ok,7000,...,63.15,0,0,42297700352,1,176,0.02,0.031972,1.0,1.0
17,2000000,100,2,no,yes,0,1118.616498,28756295329,ok,7000,...,50.87,0,0,42299846656,2486,67,0.01,0.0389,2.433367,1.216683
18,2000000,100,3,no,yes,0,1754.988376,28756295329,ok,7000,...,45.27,0,0,42300919808,4458,29,0.01,0.06103,3.81769,1.272563
19,2000000,100,4,no,yes,0,1733.15811,28756295329,ok,7000,...,50.09,0,0,42301992960,7325,20,0.01,0.060271,3.770201,0.94255
20,2000000,100,5,no,yes,0,1762.997717,28756295329,ok,7000,...,46.91,0,0,42303066112,4810,15,0.01,0.061308,3.835113,0.767023
21,2000000,100,6,no,yes,0,1750.95317,28756295329,ok,7000,...,49.85,0,0,42304126976,4309,34,0.0,0.0,3.808912,0.634819
22,2000000,100,7,no,yes,0,1766.389674,28756295329,ok,7000,...,47.72,0,0,42297675776,3944,20,0.01,0.061426,3.842491,0.548927
23,2000000,100,8,no,yes,0,1775.444085,28756295329,ok,7000,...,44.7,0,0,42298671104,3116,30,0.01,0.061741,3.862188,0.482773
24,2000000,100,9,no,yes,0,1768.783298,28756295329,ok,7000,...,46.69,0,0,42298658816,3006,31,0.0,0.0,3.847698,0.427522
25,2000000,100,10,no,yes,0,1759.928022,28756295329,ok,7000,...,49.97,0,0,42300805120,2821,19,0.01,0.061201,3.828435,0.382843


Num Keys: 1000000, Value Size: 100 Bytes 


Unnamed: 0,keys,value_size,num_threads,rdbcompression,rdbchecksum,valkey_data_throughput_mb_s,actual_throughput_mb_s,rdb_file_size_bytes,status,port,...,cpu_total_time_seconds,io_read_bytes,io_write_bytes,memory_rss_bytes,context_switches_voluntary,context_switches_involuntary,iowait_time_seconds,iowait_percentage,speedup,efficiency
0,1000000,100,1,yes,yes,0,81.939417,9223976972,ok,7000,...,121.96,0,9223983104,21155422208,36,448,0.15,0.13325,1.0,1.0
1,1000000,100,2,yes,yes,0,175.843859,9223976972,ok,7000,...,106.7,0,9223983104,21158129664,1915,133,0.08,0.15251,2.146023,1.073011
2,1000000,100,3,yes,yes,0,252.690771,9223976972,ok,7000,...,110.07,0,9223983104,21159456768,1108,92,0.14,0.38353,3.083873,1.027958
3,1000000,100,4,yes,yes,0,319.73655,9223976972,ok,7000,...,112.02,0,9223979008,21160837120,1999,54,0.06,0.207982,3.902109,0.975527
4,1000000,100,5,yes,yes,0,446.619139,9223976972,ok,7000,...,104.51,0,9223983104,21161082880,930,49,0.07,0.338936,5.450602,1.09012
5,1000000,100,6,yes,yes,0,521.169306,9223976972,ok,7000,...,103.65,0,9223979008,21159284736,1242,772,0.12,0.678019,6.360422,1.06007
6,1000000,100,7,yes,yes,0,611.012887,9223976972,ok,7000,...,102.83,0,9223983104,21163700224,1343,949,0.28,1.854771,7.456886,1.065269
7,1000000,100,8,yes,yes,0,658.545541,9223976972,ok,7000,...,105.1,0,9223979008,21157556224,2173,909,0.24,1.713479,8.036981,1.004623
8,1000000,100,9,yes,yes,0,750.425837,9223976972,ok,7000,...,103.07,0,9223979008,21160030208,1930,977,0.59,4.800004,9.158301,1.017589
9,1000000,100,10,yes,yes,0,751.205782,9223976972,ok,7000,...,101.74,0,9223983104,21161410560,3489,836,0.81,6.596685,9.167819,0.916782


Num Keys: 1000000, Value Size: 100 Bytes 


Unnamed: 0,keys,value_size,num_threads,rdbcompression,rdbchecksum,valkey_data_throughput_mb_s,actual_throughput_mb_s,rdb_file_size_bytes,status,port,...,cpu_total_time_seconds,io_read_bytes,io_write_bytes,memory_rss_bytes,context_switches_voluntary,context_switches_involuntary,iowait_time_seconds,iowait_percentage,speedup,efficiency
12,1000000,100,1,no,yes,0,340.756315,14372114288,ok,7000,...,46.31,0,14372118528,21157687296,110,155,0.14,0.331934,1.0,1.0
13,1000000,100,2,no,yes,0,690.718719,14372114288,ok,7000,...,39.07,0,14372114432,21159833600,8677,68,0.42,2.018505,2.027017,1.013508
14,1000000,100,3,no,yes,0,756.164816,14372114288,ok,7000,...,35.85,0,14372118528,21160906752,20124,197,2.07,10.89096,2.219078,0.739693
15,1000000,100,4,no,yes,0,755.075226,14372114288,ok,7000,...,35.77,0,14372114432,21161979904,18747,31,1.9,9.982129,2.21588,0.55397
16,1000000,100,5,no,yes,0,754.965381,14372114288,ok,7000,...,35.1,0,14372118528,21163053056,9692,99,1.91,10.033206,2.215558,0.443112
17,1000000,100,6,no,yes,0,754.611569,14372114288,ok,7000,...,36.72,0,14372114432,21164113920,7626,317,1.42,7.455747,2.21452,0.369087
18,1000000,100,7,no,yes,0,755.256855,14372114288,ok,7000,...,36.96,0,14372118528,21165109248,5010,300,1.01,5.307566,2.216413,0.31663
19,1000000,100,8,no,yes,0,754.566354,14372114288,ok,7000,...,37.37,0,14372114432,21166170112,4049,285,0.73,3.832654,2.214387,0.276798
20,1000000,100,9,no,yes,0,755.05015,14372114288,ok,7000,...,35.76,0,14372118528,21167251456,4846,213,1.51,7.932902,2.215807,0.246201
21,1000000,100,10,no,yes,0,755.11538,14372114288,ok,7000,...,36.27,0,14372114432,21168336896,4005,207,1.0,5.254031,2.215998,0.2216


In [63]:
import pandas as pd
import plotly.graph_objects as go
from plotly.subplots import make_subplots
import plotly.express as px
from typing import List, Tuple, Any

def plot_throughput_by_workload(dfs: list[pd.DataFrame], workload_names: list[str], title: str):
    """
    Draw an interactive Plotly line chart of throughput against thread count.

    One 'lines+markers' trace is added per workload. Colors and marker symbols
    are picked by position and wrap around when there are more workloads than
    palette entries.

    Args:
        dfs (List[pd.DataFrame]): One DataFrame per workload; each is read through
                                   its 'num_threads' and 'actual_throughput_mb_s'
                                   columns.
        workload_names (List[str]): Legend label for each workload, in the same
                                     order as dfs. Must have the same length as dfs.
        title (str): Prefix of the figure title.

    Returns:
        None. The figure is displayed with fig.show(). If either list is empty or
        the lengths differ, an error message is printed and nothing is drawn.
    """
    inputs_ok = bool(dfs) and bool(workload_names) and len(dfs) == len(workload_names)
    if not inputs_ok:
        print("Error: Input lists 'dfs' and 'workload_names' must not be empty and must have matching lengths.")
        return

    # Every distinct thread count seen in any workload becomes an x-axis tick.
    thread_columns = [frame['num_threads'] for frame in dfs]
    tick_values = sorted(pd.concat(thread_columns).unique())

    # Per-trace styling pools, indexed modulo their length.
    palette = px.colors.qualitative.Plotly
    symbols = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up', 'triangle-down', 'pentagon', 'hexagram']

    fig = go.Figure()

    # One throughput curve per workload.
    for idx, (frame, label) in enumerate(zip(dfs, workload_names)):
        trace = go.Scatter(
            x=frame['num_threads'],
            y=frame['actual_throughput_mb_s'],
            mode='lines+markers',
            name=f'{label}',
            marker={'symbol': symbols[idx % len(symbols)], 'size': 8},
            line={'width': 2, 'color': palette[idx % len(palette)]},
            showlegend=True,
        )
        fig.add_trace(trace)

    # Figure-level layout: centered title, fixed size, one tick per thread count.
    x_axis = {
        'title_text': 'Number of Threads',
        'tickmode': 'array',
        'tickvals': tick_values,
    }
    y_axis = {'title_text': 'Throughput (MB/s)'}
    fig.update_layout(
        title_text=f'{title} - Throughput vs. Number of Threads',
        title_x=0.5,
        height=600,
        width=800,
        showlegend=True,
        hovermode="x unified",
        template="plotly_white",
        xaxis=x_axis,
        yaxis=y_axis,
    )

    fig.show()

In [64]:
# Legend labels shared by the per-workload plots, in the same order as the
# frames in dfs_compression_off / dfs_compression_on.
# NOTE(review): "M/S" presumably means MB/s — confirm before changing the labels.
workload_names = [
    "Large JSON, 1750 M/S Max IO throughput",
    "Large JSON, 750 M/S Max IO throughput",
]

# Throughput chart for compression off first, then for compression on.
for frames, heading in (
    (dfs_compression_off, "Compression OFF"),
    (dfs_compression_on, "Compression ON"),
):
    plot_throughput_by_workload(frames, workload_names, heading)

In [65]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
from typing import List, Tuple, Any

def plot_cpu_utilization_by_workload(dfs: List[pd.DataFrame], workload_names: List[str], title: str):
    """
    Draw an interactive Plotly line chart of CPU utilization against thread count.

    One 'lines+markers' trace is added per workload, followed by a dashed gray
    reference line at 100%. Colors and marker symbols are picked by position and
    wrap around when there are more workloads than palette entries.

    Args:
        dfs (List[pd.DataFrame]): One DataFrame per workload; each is read through
                                   its 'num_threads' and 'cpu_utilization_percent'
                                   columns.
        workload_names (List[str]): Legend label for each workload, in the same
                                     order as dfs. Must have the same length as dfs.
        title (str): Prefix of the figure title.

    Returns:
        None. The figure is displayed with fig.show(). If either list is empty or
        the lengths differ, an error message is printed and nothing is drawn.
    """
    inputs_ok = bool(dfs) and bool(workload_names) and len(dfs) == len(workload_names)
    if not inputs_ok:
        print("Error: Input lists 'dfs' and 'workload_names' must not be empty and must have matching lengths.")
        return

    # Every distinct thread count seen in any workload becomes an x-axis tick.
    thread_columns = [frame['num_threads'] for frame in dfs]
    tick_values = sorted(pd.concat(thread_columns).unique())

    # Per-trace styling pools, indexed modulo their length.
    palette = px.colors.qualitative.Plotly
    symbols = ['circle', 'square', 'diamond', 'cross', 'x', 'triangle-up', 'triangle-down', 'pentagon', 'hexagram']

    fig = go.Figure()

    # One CPU-utilization curve per workload.
    for idx, (frame, label) in enumerate(zip(dfs, workload_names)):
        trace = go.Scatter(
            x=frame['num_threads'],
            y=frame['cpu_utilization_percent'],
            mode='lines+markers',
            name=f'{label}',
            marker={'symbol': symbols[idx % len(symbols)], 'size': 8},
            line={'width': 2, 'color': palette[idx % len(palette)]},
            showlegend=True,
        )
        fig.add_trace(trace)

    # Flat reference line at 100% spanning every thread count.
    reference_line = go.Scatter(
        x=tick_values,
        y=[100] * len(tick_values),
        mode='lines',
        name='Ideal (100%) CPU',
        line={'dash': 'dash', 'color': 'gray'},
        showlegend=True,
    )
    fig.add_trace(reference_line)

    # Figure-level layout: centered title, fixed size, one tick per thread count.
    x_axis = {
        'title_text': 'Number of Threads',
        'tickmode': 'array',
        'tickvals': tick_values,
    }
    y_axis = {
        'title_text': 'CPU Utilization (%)',
        # NOTE(review): fixed 0-2000 range, presumably sized for multi-thread
        # utilization above 100% — confirm against the data.
        'range': [0, 2000],
    }
    fig.update_layout(
        title_text=f'{title} - CPU Utilization vs. Number of Threads',
        title_x=0.5,
        height=600,
        width=800,
        showlegend=True,
        hovermode="x unified",
        template="plotly_white",
        xaxis=x_axis,
        yaxis=y_axis,
    )

    fig.show()

In [66]:
# CPU-utilization chart for compression off first, then for compression on.
for frames, heading in (
    (dfs_compression_off, "Compression OFF"),
    (dfs_compression_on, "Compression ON"),
):
    plot_cpu_utilization_by_workload(frames, workload_names, heading)