# Pre-Processing Steps to extract all needed data as CSV
Running the entire file might take multiple minutes depending on experiment duration.

In [1]:
ENERGY_INTENSITY_STORAGE_KWH_PER_GB = 0.0046 # kWh/GB


In [3]:
!pip install numpy pandas seaborn matplotlib ipython scipy --quiet
# Install prerequisites for evaluation
import numpy as np
import pandas as pd
import seaborn as sns
import importlib
from matplotlib import pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.dates as mdates
from os import listdir, path
from IPython.display import display, Markdown
import math
from scipy.interpolate import interp1d

import os
import sys

# if current working directory includes "evaluation" change to parent directory
if "evaluation" in os.getcwd():
    os.chdir("..")
    # Add parent directory to sys.path
    sys.path.append(os.getcwd())



# Verify
print(f"Current working directory: {os.getcwd()}")
print(f"sys.path: {sys.path}")

Current working directory: /root/oxn
sys.path: ['/usr/lib/python311.zip', '/usr/lib/python3.11', '/usr/lib/python3.11/lib-dynload', '', '/root/oxn/venv/lib/python3.11/site-packages', '/root/oxn']


# Short Setup Test
This has to be exeuted twice, first time it seems to fail always. Dont ask me why ...

In [5]:
# Import report
from gevent import monkey
monkey.patch_all()
import evaluation.evaluation as evaluation


importlib.reload(evaluation)

from evaluation.evaluation import Report

assert os.path.exists("reports"), "No reports directory found. Please run the experiments first."


yaml_files = sorted(
    (f for f in os.listdir("reports") if f.lower().endswith(".yaml")),
    key=lambda fn: os.path.getmtime(path.join("reports", fn))
)

if not yaml_files:
    raise RuntimeError("No .yaml reports found in reports/")

# sort them by date
yaml_files.sort(key=lambda x: path.getctime(path.join("reports",x)))
yaml_files = yaml_files[::-1]

# Load the last experiment
last_experiment = yaml_files[0]

report_path = path.join("reports", last_experiment)
report = Report.from_file(report_path)
print(f"Loaded report {last_experiment}")

assert report is not [], "No report found. Please run the experiments first."
assert isinstance(report, Report), "Report is not of type Report"


assert report.interactions is not [], "No interactions found."
assert report.interactions[0] is not [], "No interactions found. "
assert report.interactions[0].response_data is not [], "No data in interaction found"


Loaded report report_2025-07-21_00-53-25.yaml


In [6]:
# Patch all and reload evaluation module
monkey.patch_all()
importlib.reload(evaluation)

# cache the dataframes
cached_dataframes_storage_pods = []
cached_dataframes_storage_namespace = []
cached_dataframes_kepler_pods = []
cached_dataframes_kepler_namespace = []

In [7]:
# invert the files_in_direcotry list 
#files_in_direcotory = files_in_direcotory[::-1]

experiments_map = {
    "recommendation_k8_base_1m_otel_persistence": None,
    "recommendation_k8_base_5_percent_persistence": None,
    "recommendation_k8_base_10_percent_persistence": None,
    "recommendation_k8_base_50_percent_persistence": None,
    "recommendation_k8_base_1m_otel_persistence_istio": None,
    "recommendation_k8_base_1m_otel_persistence_scrape_5s": None,
    "recommendation_k8_base_1m_otel_persistence_scrape_30s": None,
}


# iterate over all reports and find a string in them
print("Searching for most recent experiments in list of reports " + len(yaml_files).__str__())
for file in yaml_files:
    report_path = path.join("reports", file)
    # read the file as a string
    with open(report_path, "r") as f:
        content = f.read()
        # iterate over all experiments and check if the string contains the experiment name
        
        for experiment in experiments_map.keys():
            if f"{experiment}.yml" in content:
                if experiments_map[experiment]:
                    print(f"Found {experiment} in {file} but already found in {experiments_map[experiment]}. Did the recent run not included all experiments?")
                    continue
                experiments_map[experiment] = file
                print(f"Found {experiment} in {file}")
                break
    
    if all(experiments_map.values()):
        print("All experiments found")
        break
        

Searching for most recent experiments in list of reports 14
Found recommendation_k8_base_1m_otel_persistence_istio in report_2025-07-21_00-53-25.yaml
Found recommendation_k8_base_50_percent_persistence in report_2025-07-20_23-37-45.yaml
Found recommendation_k8_base_10_percent_persistence in report_2025-07-20_22-23-02.yaml
Found recommendation_k8_base_5_percent_persistence in report_2025-07-20_21-08-22.yaml
Found recommendation_k8_base_1m_otel_persistence_scrape_30s in report_2025-07-20_19-53-44.yaml
Found recommendation_k8_base_1m_otel_persistence_scrape_5s in report_2025-07-20_18-39-04.yaml
Found recommendation_k8_base_1m_otel_persistence in report_2025-07-20_17-24-24.yaml
All experiments found


Loading some basic functions used later on

In [8]:

# Function to load a report and extract specific interactions
def load_report_interactions(report_directory, report_name, response_name, treatment_name):
    report_path = os.path.join(report_directory, report_name)
    if os.path.exists(report_path):
        report = Report.from_file(report_path)
        print(report.interactions)
        selected_interactions = [
            interaction for interaction in report.interactions
            if interaction.response_name == response_name and interaction.treatment_name == treatment_name
        ]
        print(f"Found {len(selected_interactions)} interactions for {report_name} and {treatment_name}.")
        return selected_interactions[0] if selected_interactions else None
    else:
        print(f"Report {report_name} not found.")
        return None      
        
def load_reports_and_normalize_time(cached_dataframes, report_directory, response_name, treatment_names, report_names):
    response_dataframes = cached_dataframes or []
    # Load the interactions if not already loaded
    if not response_dataframes or len(response_dataframes) == 0:
        print(f"Loading dataframes from reports because cached dataframes are empty.")
        for report_name in report_names:
            for treatment_name in treatment_names:
                interaction = load_report_interactions(report_directory, report_name, response_name, treatment_name)
                if interaction:
                    print(interaction)
                    print(interaction.response_data)
                    # Add columns to identify the treatment and report
                    interaction.response_data["treatment"] = f"{report_name}_{treatment_name}"
                    interaction.response_data["treatment_start"] = interaction.treatment_start
                    interaction.response_data["treatment_end"] = interaction.treatment_end
                    interaction.response_data["label"] = f"{REPORT_LABELS[report_name]}"

                    response_dataframes.append(interaction.response_data.copy())  # Make an explicit copy
    else:
        print("Using cached dataframes.")
    cached_dataframes = response_dataframes

    # Normalize time and concatenate all dataframes
    if response_dataframes:
        normalized_dataframes = []
        treatment_start_time = None
        treatment_end_time = None

        assert isinstance(response_dataframes, list)

        for df in response_dataframes:
            assert isinstance(df, pd.DataFrame)
            # Convert index to datetime if needed and ensure both index and treatment_start have the same timezone
            if not pd.api.types.is_datetime64_any_dtype(df.index):
                df.index = pd.to_datetime(df.index)

            # Convert treatment_start and treatment_end to datetime if not already
            current_treatment_start = pd.to_datetime(df["treatment_start"].iloc[0])
            current_treatment_end = pd.to_datetime(df["treatment_end"].iloc[0])

            # Ensure both are timezone-naive or the same timezone-aware if applicable
            df.index = df.index.tz_localize(None)
            current_treatment_start = current_treatment_start.tz_localize(None)
            current_treatment_end = current_treatment_end.tz_localize(None)

            # Calculate the interval between data points in seconds
            time_interval = (df.index[1] - df.index[0]).total_seconds()

            # Add the experiment_seconds column
            df = df.copy()  # Work with a copy to avoid modifying a view
            df.loc[:, "experiment_seconds"] = (df.index - current_treatment_start).total_seconds()
            df.loc[:, "time_normalized"] = df["experiment_seconds"]

            # Set or verify consistent treatment start and end times for plotting
            if treatment_start_time is None:
                treatment_start_time = current_treatment_start
                treatment_end_time = current_treatment_end
            else:
                if treatment_start_time != current_treatment_start or treatment_end_time != current_treatment_end:
                    print(f"Warning: Inconsistent treatment start or end times for report {df['treatment'].iloc[0]}.")

            normalized_dataframes.append(df)
    else:
        print("No valid interactions found for the specified treatment names.")


    # Combine all the normalized dataframes
    combined_df = pd.concat(normalized_dataframes)
    return cached_dataframes, combined_df, treatment_start_time, treatment_end_time


def plot_storage_metrics_per_pod_seaborn(combined_df, response_name, treatment_end_time, treatment_start_time):
    # Normalize t0 and t1 to seconds relative to treatment start time
    treatment_start_seconds = 0  # t0 is always 0 after normalization
    treatment_end_seconds = (treatment_end_time - treatment_start_time).total_seconds()

    print(combined_df)
    # Get unique container names
    container_names = combined_df["label_app_kubernetes_io_name"].unique()

    # Determine the grid size for subplots
    num_containers = len(container_names)
    num_cols = 2  # Define the number of columns for the grid
    num_rows = math.ceil(num_containers / num_cols)

    # Set the style for seaborn
    sns.set(style="whitegrid")
    sns.set_palette("pastel")

    # Create subplots
    fig, axes = plt.subplots(num_rows, num_cols, figsize=(15, num_rows * 5))
    axes = axes.flatten()  # Flatten in case of a 2D array to make iteration easier

    # hue is the list of all unique treatments
    hue = combined_df["treatment"].unique()
    
    # use a lowpass filter to smooth the data
    combined_df[response_name] = combined_df[response_name].rolling(window=6, center=True, min_periods=2).mean()
   
        

    # Plot the data for each container_name in a subplot
    for idx, container_name in enumerate(container_names):
        ax = axes[idx]
        container_df = combined_df[combined_df["label_app_kubernetes_io_name"] == container_name]
        # Interpolate the data for smoothing
        for treatment, df in container_df.groupby("treatment"):
            if len(df) > 1:
                # if there is an  error in interp1d then just skip this one
                try:
                    f = interp1d(df["experiment_seconds"], df[response_name], kind='cubic')
                except:
                    continue 
                experiment_seconds_new = np.linspace(df["experiment_seconds"].min(), df["experiment_seconds"].max(), num=500)
                response_interpolated = f(experiment_seconds_new)
                sns.lineplot(
                    x=experiment_seconds_new,
                    y=response_interpolated,
                    ax=ax,
                    label=f"{df['label'].iloc[0]}",
                    linewidth=2
                )
            else:
                print(f"Only one data point for {container_name} in treatment {treatment}.")
                """ sns.lineplot(
                    data=df,
                    x="experiment_seconds",
                    y=response_name,
                    ax=ax,
                    label=f"{treatment}",
                    palette="tab10",
                    linewidth=2
                ) """

        # Add vertical lines for treatment start and end times (normalized)
        ax.axvline(x=treatment_start_seconds, color="r", linewidth=1, linestyle="--", label="t0")
        ax.axvline(x=treatment_end_seconds, color="r", linewidth=1, linestyle="--", label="t1")

        ax.set_title(f"Container: {container_name}")
        ax.set_xlabel("Time (seconds from experiment start)")
        ax.set_ylabel("Usage (Bytes)")
        ax.set_xlim(-500, treatment_end_seconds + 500)
        ax.legend()

    # Hide any unused subplots
    for idx in range(num_containers, len(axes)):
        fig.delaxes(axes[idx])

    # Adjust layout
    plt.tight_layout()
    plt.show()



# Pre-Process Storage Data

In [9]:
# uncomment to reset cache
cached_dataframes_kepler_namespace = []
# Directory containing reports
report_directory = "reports"

# Directory containing reports
report_directory = "reports"
treatment_names = ["delay_treatment", "empty_treatment"]
#                       baseline                                    5% base                                                10% base                                                      50% base                                            1s
report_names = [ experiments_map["recommendation_k8_base_1m_otel_persistence"], experiments_map["recommendation_k8_base_5_percent_persistence"], experiments_map["recommendation_k8_base_10_percent_persistence"], experiments_map["recommendation_k8_base_50_percent_persistence"], experiments_map["recommendation_k8_base_1m_otel_persistence_istio"], experiments_map["recommendation_k8_base_1m_otel_persistence_scrape_5s"], experiments_map["recommendation_k8_base_1m_otel_persistence_scrape_30s"]]
REPORT_LABELS = {report_names[0]: "Base Persistence", report_names[1]: "5% Persistence", report_names[2]: "10% Persistence", report_names[3]: "50% Persistence", report_names[4]: "Persistence Istio",  report_names[5]: "Scrape 5s Persistence", report_names[6]: "Scrape 30s Persistence"}


response_name_storage_namespace_combined_absolute = "cadvisor_storage_usage_combined_absolute"


In [10]:
def load_report_interactions(report_directory, report_name, response_name, treatment_name):
    report_path = os.path.join(report_directory, report_name)
    if os.path.exists(report_path):
        report = Report.from_file(report_path)
        #print(report.interactions)
        selected_interactions = [
            interaction for interaction in report.interactions
            if interaction.response_name == response_name and interaction.treatment_name == treatment_name
        ]
        print(f"Found {len(selected_interactions)} interactions for {report_name} and {treatment_name}.")
        return selected_interactions[0] if selected_interactions else None
    else:
        print(f"Report {report_name} not found.")
        return None     
    
    
def load_reports_and_normalize_time(cached_dataframes, report_directory, response_name, treatment_names, report_names):
    response_dataframes = cached_dataframes or []
    # Load the interactions if not already loaded
    if not response_dataframes or len(response_dataframes) == 0:
        print(f"Loading dataframes from reports because cached dataframes are empty.")
        for report_name in report_names:
            for treatment_name in treatment_names:
                interaction = load_report_interactions(report_directory, report_name, response_name, treatment_name)
                if interaction:
                    #print(interaction)
                    #print(interaction.response_data)
                    # Add columns to identify the treatment and report
                    interaction.response_data["treatment"] = f"{report_name}_{treatment_name}"
                    interaction.response_data["treatment_start"] = interaction.treatment_start
                    interaction.response_data["treatment_end"] = interaction.treatment_end
                    interaction.response_data["report"] = report_name
                    interaction.response_data["label"] = f"{REPORT_LABELS[report_name]}"
                    response_dataframes.append(interaction.response_data.copy())  # Make an explicit copy
    else:
        print("Using cached dataframes.")
    cached_dataframes = response_dataframes

    # Normalize time and concatenate all dataframes
    if response_dataframes:
        normalized_dataframes = []
        treatment_start_time = None
        treatment_end_time = None

        assert isinstance(response_dataframes, list)

        for df in response_dataframes:
            assert isinstance(df, pd.DataFrame)
            # Convert index to datetime if needed and ensure both index and treatment_start have the same timezone
            if not pd.api.types.is_datetime64_any_dtype(df.index):
                df.index = pd.to_datetime(df.index)

            # Convert treatment_start and treatment_end to datetime if not already
            current_treatment_start = pd.to_datetime(df["treatment_start"].iloc[0])
            current_treatment_end = pd.to_datetime(df["treatment_end"].iloc[0])

            # Ensure both are timezone-naive or the same timezone-aware if applicable
            df.index = df.index.tz_localize(None)
            current_treatment_start = current_treatment_start.tz_localize(None)
            current_treatment_end = current_treatment_end.tz_localize(None)

            # Calculate the interval between data points in seconds
            time_interval = (df.index[1] - df.index[0]).total_seconds()

            # Add the experiment_seconds column
            df = df.copy()  # Work with a copy to avoid modifying a view
            df.loc[:, "experiment_seconds"] = (df.index - current_treatment_start).total_seconds()
            df.loc[:, "time_normalized"] = df["experiment_seconds"]

            # Set or verify consistent treatment start and end times for plotting
            if treatment_start_time is None:
                treatment_start_time = current_treatment_start
                treatment_end_time = current_treatment_end
            else:
                if treatment_start_time != current_treatment_start or treatment_end_time != current_treatment_end:
                    print(f"Warning: Inconsistent treatment start or end times for report {df['treatment'].iloc[0]}.")

            normalized_dataframes.append(df)
    else:
        print("No valid interactions found for the specified treatment names.")


    # Combine all the normalized dataframes
    combined_df = pd.concat(normalized_dataframes)
    return cached_dataframes, combined_df, treatment_start_time, treatment_end_time


cached_baseline_50_percent_dataframes_kepler_pods = []


cached_baseline_50_percent_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_baseline_50_percent_dataframes_kepler_pods, report_directory, "cadvisor_storage_usage_writes_all_absolute", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "label_app_kubernetes_io_name", "namespace", "cadvisor_storage_usage_writes_all_absolute"
]
df_clean = df[columns_to_keep]

# Pivot: rows = report + label + time, columns = container/namespace
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["label_app_kubernetes_io_name", "namespace"],
    values="cadvisor_storage_usage_writes_all_absolute",
    aggfunc="mean"
)

# Flatten column headers
pivot_df.columns = [f"{name}_{namespace}" for name, namespace in pivot_df.columns]

# Sort for readability
pivot_df = pivot_df.sort_index()

# Drop rows where time < 0 or > 3600
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Subtract the first value (at time = 0) for each (report, label)
def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)


# Save to CSV
pivot_df.to_csv("cadvisor_storage_usage_writes_all_absolute_bytes.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                              label_app_kubernetes_io_name  \
timestamp                                                    
2025-07-20 15:24:01.762000084                   accounting   
2025-07-20 15:24:31.762000084                   accounting   
2025-07-20 15:25:01.762000084                   accounting   
2025-07-20 15:25:31.762000084                   accounting   
2025-07-20 15

# Pre-Processing Network Data

In [11]:
report_directory = "reports/"
treatment_names = ["delay_treatment", "empty_treatment"]

In [12]:
report_names = [ experiments_map["recommendation_k8_base_1m_otel_persistence"], experiments_map["recommendation_k8_base_5_percent_persistence"], experiments_map["recommendation_k8_base_10_percent_persistence"], experiments_map["recommendation_k8_base_50_percent_persistence"], experiments_map["recommendation_k8_base_1m_otel_persistence_istio"], experiments_map["recommendation_k8_base_1m_otel_persistence_scrape_5s"], experiments_map["recommendation_k8_base_1m_otel_persistence_scrape_30s"]]
REPORT_LABELS = {report_names[0]: "Base Persistence", report_names[1]: "5% Persistence", report_names[2]: "10% Persistence", report_names[3]: "50% Persistence", report_names[4]: "Persistence Istio",  report_names[5]: "Scrape 5s Persistence", report_names[6]: "Scrape 30s Persistence"}

response_name = "cadvisor_network_bytes_received_all"


In [13]:
def load_report_interactions(report_directory, report_name, response_name, treatment_name):
    report_path = os.path.join(report_directory, report_name)
    if os.path.exists(report_path):
        report = Report.from_file(report_path)
        #print(report.interactions)
        selected_interactions = [
            interaction for interaction in report.interactions
            if interaction.response_name == response_name and interaction.treatment_name == treatment_name
        ]
        print(f"Found {len(selected_interactions)} interactions for {report_name} and {treatment_name}.")
        return selected_interactions[0] if selected_interactions else None
    else:
        print(f"Report {report_name} not found.")
        return None     
    
    
def load_reports_and_normalize_time(cached_dataframes, report_directory, response_name, treatment_names, report_names):
    response_dataframes = cached_dataframes or []
    # Load the interactions if not already loaded
    if not response_dataframes or len(response_dataframes) == 0:
        print(f"Loading dataframes from reports because cached dataframes are empty.")
        for report_name in report_names:
            for treatment_name in treatment_names:
                interaction = load_report_interactions(report_directory, report_name, response_name, treatment_name)
                if interaction:
                    #print(interaction)
                    #print(interaction.response_data)
                    # Add columns to identify the treatment and report
                    interaction.response_data["treatment"] = f"{report_name}_{treatment_name}"
                    interaction.response_data["treatment_start"] = interaction.treatment_start
                    interaction.response_data["treatment_end"] = interaction.treatment_end
                    interaction.response_data["report"] = report_name
                    interaction.response_data["label"] = f"{REPORT_LABELS[report_name]}"
                    response_dataframes.append(interaction.response_data.copy())  # Make an explicit copy
    else:
        print("Using cached dataframes.")
    cached_dataframes = response_dataframes

    # Normalize time and concatenate all dataframes
    if response_dataframes:
        normalized_dataframes = []
        treatment_start_time = None
        treatment_end_time = None

        assert isinstance(response_dataframes, list)

        for df in response_dataframes:
            assert isinstance(df, pd.DataFrame)
            # Convert index to datetime if needed and ensure both index and treatment_start have the same timezone
            if not pd.api.types.is_datetime64_any_dtype(df.index):
                df.index = pd.to_datetime(df.index)

            # Convert treatment_start and treatment_end to datetime if not already
            current_treatment_start = pd.to_datetime(df["treatment_start"].iloc[0])
            current_treatment_end = pd.to_datetime(df["treatment_end"].iloc[0])

            # Ensure both are timezone-naive or the same timezone-aware if applicable
            df.index = df.index.tz_localize(None)
            current_treatment_start = current_treatment_start.tz_localize(None)
            current_treatment_end = current_treatment_end.tz_localize(None)

            # Calculate the interval between data points in seconds
            time_interval = (df.index[1] - df.index[0]).total_seconds()

            # Add the experiment_seconds column
            df = df.copy()  # Work with a copy to avoid modifying a view
            df.loc[:, "experiment_seconds"] = (df.index - current_treatment_start).total_seconds()
            df.loc[:, "time_normalized"] = df["experiment_seconds"]

            # Set or verify consistent treatment start and end times for plotting
            if treatment_start_time is None:
                treatment_start_time = current_treatment_start
                treatment_end_time = current_treatment_end
            else:
                if treatment_start_time != current_treatment_start or treatment_end_time != current_treatment_end:
                    print(f"Warning: Inconsistent treatment start or end times for report {df['treatment'].iloc[0]}.")

            normalized_dataframes.append(df)
    else:
        print("No valid interactions found for the specified treatment names.")


    # Combine all the normalized dataframes
    combined_df = pd.concat(normalized_dataframes)
    return cached_dataframes, combined_df, treatment_start_time, treatment_end_time


cached_baseline_50_percent_dataframes_kepler_pods = []


cached_baseline_50_percent_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_baseline_50_percent_dataframes_kepler_pods, report_directory, "cadvisor_network_bytes_received_all_absolute", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "label_app_kubernetes_io_name", "namespace", "cadvisor_network_bytes_received_all_absolute", "component"
]
df_clean = df[columns_to_keep]

# Pivot: rows = report + label + time, columns = container/namespace
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["label_app_kubernetes_io_name", "namespace", "component"],
    values="cadvisor_network_bytes_received_all_absolute",
    aggfunc="mean"
)

# Flatten column headers
pivot_df.columns = [f"{components}_{namespace}" for name, namespace, components in pivot_df.columns]

# Sort for readability
pivot_df = pivot_df.sort_index()

# Drop rows where time < 0 or > 3600
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Subtract the first value (at time = 0) for each (report, label)
def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)


# Save to CSV
pivot_df.to_csv("cadvisor_network_bytes_received_all_absolute_bytes.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                                component label_app_kubernetes_io_name  \
timestamp                                                                
2025-07-20 15:23:31.762000084  accounting                   accounting   
2025-07-20 15:24:01.762000084  accounting                   accounting   
2025-07-20 15:24:31.762000084  accounting                   accounting   
2025-07-20 15:2

In [14]:
def load_report_interactions(report_directory, report_name, response_name, treatment_name):
    report_path = os.path.join(report_directory, report_name)
    if os.path.exists(report_path):
        report = Report.from_file(report_path)
        #print(report.interactions)
        selected_interactions = [
            interaction for interaction in report.interactions
            if interaction.response_name == response_name and interaction.treatment_name == treatment_name
        ]
        print(f"Found {len(selected_interactions)} interactions for {report_name} and {treatment_name}.")
        return selected_interactions[0] if selected_interactions else None
    else:
        print(f"Report {report_name} not found.")
        return None     
    
    
def load_reports_and_normalize_time(cached_dataframes, report_directory, response_name, treatment_names, report_names):
    response_dataframes = cached_dataframes or []
    # Load the interactions if not already loaded
    if not response_dataframes or len(response_dataframes) == 0:
        print(f"Loading dataframes from reports because cached dataframes are empty.")
        for report_name in report_names:
            for treatment_name in treatment_names:
                interaction = load_report_interactions(report_directory, report_name, response_name, treatment_name)
                if interaction:
                    #print(interaction)
                    #print(interaction.response_data)
                    # Add columns to identify the treatment and report
                    interaction.response_data["treatment"] = f"{report_name}_{treatment_name}"
                    interaction.response_data["treatment_start"] = interaction.treatment_start
                    interaction.response_data["treatment_end"] = interaction.treatment_end
                    interaction.response_data["report"] = report_name
                    interaction.response_data["label"] = f"{REPORT_LABELS[report_name]}"
                    response_dataframes.append(interaction.response_data.copy())  # Make an explicit copy
    else:
        print("Using cached dataframes.")
    cached_dataframes = response_dataframes

    # Normalize time and concatenate all dataframes
    if response_dataframes:
        normalized_dataframes = []
        treatment_start_time = None
        treatment_end_time = None

        assert isinstance(response_dataframes, list)

        for df in response_dataframes:
            assert isinstance(df, pd.DataFrame)
            # Convert index to datetime if needed and ensure both index and treatment_start have the same timezone
            if not pd.api.types.is_datetime64_any_dtype(df.index):
                df.index = pd.to_datetime(df.index)

            # Convert treatment_start and treatment_end to datetime if not already
            current_treatment_start = pd.to_datetime(df["treatment_start"].iloc[0])
            current_treatment_end = pd.to_datetime(df["treatment_end"].iloc[0])

            # Ensure both are timezone-naive or the same timezone-aware if applicable
            df.index = df.index.tz_localize(None)
            current_treatment_start = current_treatment_start.tz_localize(None)
            current_treatment_end = current_treatment_end.tz_localize(None)

            # Calculate the interval between data points in seconds
            time_interval = (df.index[1] - df.index[0]).total_seconds()

            # Add the experiment_seconds column
            df = df.copy()  # Work with a copy to avoid modifying a view
            df.loc[:, "experiment_seconds"] = (df.index - current_treatment_start).total_seconds()
            df.loc[:, "time_normalized"] = df["experiment_seconds"]

            # Set or verify consistent treatment start and end times for plotting
            if treatment_start_time is None:
                treatment_start_time = current_treatment_start
                treatment_end_time = current_treatment_end
            else:
                if treatment_start_time != current_treatment_start or treatment_end_time != current_treatment_end:
                    print(f"Warning: Inconsistent treatment start or end times for report {df['treatment'].iloc[0]}.")

            normalized_dataframes.append(df)
    else:
        print("No valid interactions found for the specified treatment names.")


    # Combine all the normalized dataframes
    combined_df = pd.concat(normalized_dataframes)
    return cached_dataframes, combined_df, treatment_start_time, treatment_end_time


cached_baseline_50_percent_dataframes_kepler_pods = []


cached_baseline_50_percent_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_baseline_50_percent_dataframes_kepler_pods, report_directory, "cadvisor_network_bytes_transmitted_all_absolute", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "label_app_kubernetes_io_name", "namespace", "cadvisor_network_bytes_transmitted_all_absolute", "component"
]
df_clean = df[columns_to_keep]

# Pivot: rows = report + label + time, columns = container/namespace
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["label_app_kubernetes_io_name", "namespace", "component"],
    values="cadvisor_network_bytes_transmitted_all_absolute",
    aggfunc="mean"
)

# Flatten column headers
pivot_df.columns = [f"{components}_{namespace}" for name, namespace, components in pivot_df.columns]
#pivot_df.columns = [f"{name}_{namespace}" for name, namespace in pivot_df.columns]

# Sort for readability
pivot_df = pivot_df.sort_index()

# Drop rows where time < 0 or > 3600
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Subtract the first value (at time = 0) for each (report, label)
def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)


# Save to CSV
pivot_df.to_csv("cadvisor_network_bytes_transmitted_all_absolute_bytes.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                                component label_app_kubernetes_io_name  \
timestamp                                                                
2025-07-20 15:23:31.762000084  accounting                   accounting   
2025-07-20 15:24:01.762000084  accounting                   accounting   
2025-07-20 15:24:31.762000084  accounting                   accounting   
2025-07-20 15:2

# Pre-Processing Kepler Data

In [15]:

# Directory containing reports
report_directory = "reports/"
treatment_names = ["delay_treatment", "empty_treatment"]
#                       base persistence
report_names = [ experiments_map["recommendation_k8_base_1m_otel_persistence"], experiments_map["recommendation_k8_base_5_percent_persistence"], experiments_map["recommendation_k8_base_10_percent_persistence"], experiments_map["recommendation_k8_base_50_percent_persistence"], experiments_map["recommendation_k8_base_1m_otel_persistence_istio"], experiments_map["recommendation_k8_base_1m_otel_persistence_scrape_5s"], experiments_map["recommendation_k8_base_1m_otel_persistence_scrape_30s"]]
REPORT_LABELS = {report_names[0]: "Base Persistence", report_names[1]: "5% Persistence", report_names[2]: "10% Persistence", report_names[3]: "50% Persistence", report_names[4]: "Persistence Istio",  report_names[5]: "Scrape 5s Persistence", report_names[6]: "Scrape 30s Persistence"}

response_name = "pods_kepler_joules_all"

cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, response_name, treatment_names, report_names)



Loading dataframes from reports because cached dataframes are empty.
Found 0 interactions for report_2025-07-20_17-24-24.yaml and delay_treatment.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 0 interactions for report_2025-07-20_21-08-22.yaml and delay_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 0 interactions for report_2025-07-20_22-23-02.yaml and delay_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 0 interactions for report_2025-07-20_23-37-45.yaml and delay_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 0 interactions for report_2025-07-21_00-53-25.yaml and delay_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 0 interactions for report_2025-07-20_18-39-04.yaml and delay_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatm

In [16]:
cached_dataframes_kepler_pods = []


cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, "pods_kepler_joules_all_absolute", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "container_name", "container_namespace", "pods_kepler_joules_all_absolute"
]
df_clean = df[columns_to_keep]

# Pivot: rows = treatment + label + time, columns = containers
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["container_name", "container_namespace"],
    values="pods_kepler_joules_all_absolute",
    aggfunc="mean"
)
# Filter for time between 0 and 3600 seconds
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Flatten the multi-index column headers
pivot_df.columns = [f"{name}_{namespace}" for name, namespace in pivot_df.columns]

def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)

# Sort for readability
pivot_df = pivot_df.sort_index()


# Save to CSV
pivot_df.to_csv("pods_kepler_joules_all_absolute_joules.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                              container_name      container_namespace  \
timestamp                                                               
2025-07-20 15:24:31.762000084     accounting  system-under-evaluation   
2025-07-20 15:25:01.762000084     accounting  system-under-evaluation   
2025-07-20 15:25:31.762000084     accounting  system-under-evaluation   
2025-07-20 15:26:01.

In [17]:
cached_dataframes_kepler_pods = []


cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, "pods_kepler_joules_all_container_level_all_namespaces", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "container_name", "container_namespace", "pods_kepler_joules_all_container_level_all_namespaces",
    "label_app_kubernetes_io_name"
]
df_clean = df[columns_to_keep]

# Pivot: rows = treatment + label + time, columns = containers
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["container_name", "container_namespace", "label_app_kubernetes_io_name"],
    values="pods_kepler_joules_all_container_level_all_namespaces",
    aggfunc="mean"
)
# Filter for time between 0 and 3600 seconds
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Flatten the multi-index column headers
pivot_df.columns = [f"{pod_name}_{name}_{namespace}" for pod_name, namespace, name in pivot_df.columns]

def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)

# Sort for readability
pivot_df = pivot_df.sort_index()


# Save to CSV
pivot_df.to_csv("pods_kepler_joules_all_container_level_all_namespaces_joules.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                                     container_name      container_namespace  \
timestamp                                                                      
2025-07-20 15:24:31.762000084            accounting  system-under-evaluation   
2025-07-20 15:25:01.762000084            accounting  system-under-evaluation   
2025-07-20 15:25:31.762000084            accounting  system-under

In [18]:
cached_dataframes_kepler_pods = []


cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, "pods_kepler_dram_joules_all", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "container_name", "container_namespace", "pods_kepler_dram_joules_all"
]
df_clean = df[columns_to_keep]

# Pivot: rows = treatment + label + time, columns = containers
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["container_name", "container_namespace"],
    values="pods_kepler_dram_joules_all",
    aggfunc="mean"
)
# Filter for time between 0 and 3600 seconds
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Flatten the multi-index column headers
pivot_df.columns = [f"{name}_{namespace}" for name, namespace in pivot_df.columns]

def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)

# Sort for readability
pivot_df = pivot_df.sort_index()


# Save to CSV
pivot_df.to_csv("pods_kepler_dram_joules_all_joules.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                              container_name      container_namespace  \
timestamp                                                               
2025-07-20 15:25:01.762000084     accounting  system-under-evaluation   
2025-07-20 15:25:31.762000084     accounting  system-under-evaluation   
2025-07-20 15:26:01.762000084     accounting  system-under-evaluation   
2025-07-20 15:26:31.

In [19]:
cached_dataframes_kepler_pods = []


cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, "pods_kepler_package_joules_all", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.reset_index(drop=True)

# Round the normalized time (e.g., to nearest second)
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Select necessary columns
columns_to_keep = [
    "time_normalized_rounded", "report", "label",
    "container_name", "container_namespace", "pods_kepler_package_joules_all"
]
df_clean = df[columns_to_keep]

# Pivot: rows = treatment + label + time, columns = containers
pivot_df = df_clean.pivot_table(
    index=["report", "label", "time_normalized_rounded"],
    columns=["container_name", "container_namespace"],
    values="pods_kepler_package_joules_all",
    aggfunc="mean"
)
# Filter for time between 0 and 3600 seconds
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") >= 0]
pivot_df = pivot_df[pivot_df.index.get_level_values("time_normalized_rounded") <= 3600]

# Flatten the multi-index column headers
pivot_df.columns = [f"{name}_{namespace}" for name, namespace in pivot_df.columns]

def subtract_first(df_group):
    first_row = df_group.loc[df_group.index.get_level_values("time_normalized_rounded") == 0]
    if not first_row.empty:
        return df_group - first_row.values[0]
    else:
        return df_group  # nothing to subtract if no t=0

pivot_df = pivot_df.groupby(["report", "label"], group_keys=False).apply(subtract_first)

# Sort for readability
pivot_df = pivot_df.sort_index()


# Save to CSV
pivot_df.to_csv("pods_kepler_package_joules_all_joules.csv")



Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                              container_name      container_namespace  \
timestamp                                                               
2025-07-20 15:25:01.762000084     accounting  system-under-evaluation   
2025-07-20 15:25:31.762000084     accounting  system-under-evaluation   
2025-07-20 15:26:01.762000084     accounting  system-under-evaluation   
2025-07-20 15:26:31.

In [20]:
cached_dataframes_kepler_pods = []


cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, "prometheus_scrape_amount", ["empty_treatment"], report_names)

print(combined_df)



# Drop timestamp index if needed
df = combined_df.copy()

# Round normalized time
df["time_normalized_rounded"] = df["time_normalized"].round(0)

# Only keep relevant columns for aggregation
columns = [
    "report", "label", "time_normalized_rounded", "prometheus_scrape_amount"
]
df_clean = df[columns]

# Filter time window
df_clean = df_clean[
    (df_clean["time_normalized_rounded"] >= 0) &
    (df_clean["time_normalized_rounded"] <= 3600)
]

# Group and deduplicate using mean
df_clean = df_clean.groupby(
    ["report", "label", "time_normalized_rounded"],
    as_index=False
).agg({"prometheus_scrape_amount": "max"})

# subtract the first value in each group
def subtract_first(df_group):
    first_value = df_group["prometheus_scrape_amount"].iloc[0]
    df_group["prometheus_scrape_amount"] -= first_value
    return df_group

df_clean = df_clean.groupby(["report", "label"], group_keys=False).apply(subtract_first)

# Export both
df_clean.to_csv("prometheus_scrape_amount_simple.csv", index=False)


Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 1 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                                                                  __name__  \
timestamp                                                                    
2025-07-20 15:24:01.762000084  prometheus_tsdb_head_samples_appended_total   
2025-07-20 15:24:31.762000084  prometheus_tsdb_head_samples_appended_total   
2025-07-20 15:25:01.762000084  prometheus_tsdb_head_samples_appended_tota

  df_clean = df_clean.groupby(["report", "label"], group_keys=False).apply(subtract_first)


In [21]:
cached_dataframes_kepler_pods = []


cached_dataframes_kepler_pods, combined_df, treatment_start_time, treatment_end_time = load_reports_and_normalize_time(cached_dataframes_kepler_pods, report_directory, "sampling_rates", ["empty_treatment"], report_names)


# Clone the original dataframe to be safe
df = combined_df.copy()

# Optional: restrict to first hour (0–3600s) if "time_normalized" exists
if "time_normalized" in df.columns:
    df = df[(df["time_normalized"] >= 0) & (df["time_normalized"] <= 3600)]
    
print(df.head())
print(df.columns)
# only keep columns with sampled = true
df = df[df["sampled"] == "true"]

# Group by report and label, and calculate total sampled traces
sampled_total = df.groupby(["report", "label"])["sampling_rates"].sum()
sampled_total.name = "sampled_traces_total"

# Count how many data points (1 per minute interval) per experiment
data_points = df.groupby(["report", "label"]).size().rename("data_points")

# Combine into a single DataFrame
summary = pd.concat([sampled_total, data_points], axis=1)

# Calculate average number of sampled traces per minute
summary["avg_sampled_per_min"] = summary["sampled_traces_total"] / summary["data_points"]

# Optional: sort for readability
summary = summary.sort_values(by="sampled_traces_total", ascending=False)

# Save to CSV
summary.to_csv("trace_sampling_summary.csv")

# Optionally print to console
print(summary)


Loading dataframes from reports because cached dataframes are empty.
Found 1 interactions for report_2025-07-20_17-24-24.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_21-08-22.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_22-23-02.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_23-37-45.yaml and empty_treatment.
Found 0 interactions for report_2025-07-21_00-53-25.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_18-39-04.yaml and empty_treatment.
Found 1 interactions for report_2025-07-20_19-53-44.yaml and empty_treatment.
                              app_kubernetes_io_instance  \
timestamp                                                  
2025-07-20 15:33:01.762000084             astronomy-shop   
2025-07-20 15:34:01.762000084             astronomy-shop   
2025-07-20 15:35:01.762000084             astronomy-shop   
2025-07-20 15:36:01.762000084             astronomy-shop   
2025-07-20 15:37:01.76200

# Pre-Processing Energy Data

In [40]:
TRACE_RETENTION_DAYS = 30 # 30 days

# energy intensity kWh/GB
ENERGY_INTENSITY_NETWORK_KWH_PER_GB = 0.001875
ENERGY_INTENSITY_STORAGE_KWH_PER_GB_YEAR = 0.0046
ENERGY_INTENSITY_STORAGE_KWH_PER_GB = (ENERGY_INTENSITY_STORAGE_KWH_PER_GB_YEAR / 365) * TRACE_RETENTION_DAYS


## Load pre-processed data
This data can be pre-processed with the help of kepler-evaluation.ipynb (CPU+RAM), network-evaluation.ipynb and storage-evaluation.ipynb. If you want to do new experiments, you have to create these files yourself in the meantioned jupyter notebooks.

In [41]:
import pandas as pd
import tabulate as tabulate
network_data = pd.read_csv("cadvisor_network_bytes_received_all_absolute_bytes.csv", index_col=0)
network_data_transmitted = pd.read_csv("cadvisor_network_bytes_transmitted_all_absolute_bytes.csv", index_col=0)
storage_data = pd.read_csv("cadvisor_storage_usage_writes_all_absolute_bytes.csv", index_col=0)
kepler_data_joules = pd.read_csv("pods_kepler_joules_all_absolute_joules.csv", index_col=0)
kepler_dram = pd.read_csv("pods_kepler_dram_joules_all_joules.csv", index_col=0)


In [42]:
print("Total measurement points: ",  network_data.size + network_data_transmitted.size + storage_data.size + kepler_data_joules.size + kepler_dram.size)


Total measurement points:  130438


## Network
Further process the network data to transform total bytes received to kWh using the energy intensity

In [43]:
# get the last value for each label based on the time_normalized_rounded as this is the total amount of data received
network_data_processed = network_data.groupby("label").last().reset_index()


# remove the column time_normalized_rounded as it is not needed anymore
network_data_processed = network_data_processed.drop(columns=["time_normalized_rounded"])

# remove _system-under-evaluation from the column names
network_data_processed.columns = network_data_processed.columns.str.replace("_system-under-evaluation", "", regex=False)

network_data_processed = network_data_processed.set_index("label")

network_data_processed = network_data_processed.reindex([
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
    "Base Persistence",
    "Scrape 30s Persistence",
    "Scrape 5s Persistence",
    "5% Persistence",
    "10% Persistence",
    "50% Persistence",
    "Persistence Istio",
])

# drop indexes the experiments that are not needed e.g. Baseline, 0%, 5%, 10%, 50%, 1s, 30s
network_data_processed = network_data_processed.drop(index=[
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
])

# rename the columns to be more readable
network_data_processed = network_data_processed.rename(index={
    "Base Persistence": "Baseline",
    "Scrape 30s Persistence": "Scrape 30s",
    "Scrape 5s Persistence": "Scrape 5s",
    "5% Persistence": "5%",
    "10% Persistence": "10%",
    "50% Persistence": "50%",
    "Persistence Istio": "Istio",
})



# this values are in bytes, convert them to GB
network_data_processed = network_data_processed / (1000 * 1000 * 1000)


print("Total network data received per Experiment (label) in GB:")
print(network_data_processed.round(2).to_markdown())


# calculate the energy consumption in kWh
network_data_processed = network_data_processed * ENERGY_INTENSITY_NETWORK_KWH_PER_GB


# print as table
print("Total network data received per Experiment (label) and energy consumption in kWh:")
print(network_data_processed.to_markdown())
print("Total energy consumption for storage in kWh per Experiment (label) and percentage difference from label 'Baseline':")

for label in network_data_processed.index:
    if label == "Baseline":
        print(f"{label}: {network_data_processed.loc[label].sum()} kWh (100%)")
    else:
        percentage_difference = (network_data_processed.loc[label].sum() / network_data_processed.loc["Baseline"].sum()) * 100
        print(f"{label}: {network_data_processed.loc[label].sum()} kWh ({percentage_difference:.2f}%)")

Total network data received per Experiment (label) in GB:
| label      |   accounting |   ad |   cart |   checkout |   currency |   elasticsearch |   email |   flagd |   fraud-detection |   frontend |   frontend-proxy |   grafana |   image-provider |   jaeger-collector |   jaeger-query |   kafka |   otel-collector |   payment |   product-catalog |   prometheus |   quote |   recommendation |   shipping |   valkey-cart |
|:-----------|-------------:|-----:|-------:|-----------:|-----------:|----------------:|--------:|--------:|------------------:|-----------:|-----------------:|----------:|-----------------:|-------------------:|---------------:|--------:|-----------------:|----------:|------------------:|-------------:|--------:|-----------------:|-----------:|--------------:|
| Baseline   |            0 | 0.04 |   0.14 |          0 |          0 |            0.32 |       0 |    0.03 |                 0 |       1.41 |             1.59 |         0 |                0 |               0.04 

In [48]:
# get the last value for each label based on the time_normalized_rounded as this is the total amount of data received
network_data_transmitted_processed = network_data_transmitted.groupby("label").last().reset_index()


# remove the column time_normalized_rounded as it is not needed anymore
network_data_transmitted_processed = network_data_transmitted_processed.drop(columns=["time_normalized_rounded"])

# remove _system-under-evaluation from the column names
network_data_transmitted_processed.columns = network_data_transmitted_processed.columns.str.replace("_system-under-evaluation", "", regex=False)

network_data_transmitted_processed = network_data_transmitted_processed.set_index("label")

network_data_transmitted_processed = network_data_transmitted_processed.reindex([
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
    "Base Persistence",
    "Scrape 30s Persistence",
    "Scrape 5s Persistence",
    "5% Persistence",
    "10% Persistence",
    "50% Persistence",
    "Persistence Istio"
])

# drop indexes the experiments that are not needed e.g. Baseline, 0%, 5%, 10%, 50%, 1s, 30s
network_data_transmitted_processed = network_data_transmitted_processed.drop(index=[
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
])

# rename the columns to be more readable
network_data_transmitted_processed = network_data_transmitted_processed.rename(index={
    "Base Persistence": "Baseline",
    "Scrape 30s Persistence": "Scrape 30s",
    "Scrape 5s Persistence": "Scrape 5s",
    "5% Persistence": "5%",
    "10% Persistence": "10%",
    "50% Persistence": "50%",
    "Persistence Istio": "Istio",
})


# this values are in bytes, convert them to GB
network_data_transmitted_processed = network_data_transmitted_processed / (1000 * 1000 * 1000)


print("Total network data transmitted per Experiment (label) in GB:")
print(network_data_transmitted_processed.round(2).to_markdown())


# calculate the energy consumption in kWh
network_data_transmitted_processed = network_data_transmitted_processed * ENERGY_INTENSITY_NETWORK_KWH_PER_GB


# print as table
print("Total network data transmitted per Experiment (label) and energy consumption in kWh:")
print(network_data_transmitted_processed.to_markdown())
print("Total energy consumption for storage in kWh per Experiment (label) and percentage difference from label 'Baseline':")

for label in network_data_transmitted_processed.index:
    if label == "Baseline":
        print(f"{label}: {network_data_transmitted_processed.loc[label].sum()} kWh (100%)")
    else:
        percentage_difference = (network_data_transmitted_processed.loc[label].sum() / network_data_transmitted_processed.loc["Baseline"].sum()) * 100
        print(f"{label}: {network_data_transmitted_processed.loc[label].sum()} kWh ({percentage_difference:.2f}%)")

Total network data transmitted per Experiment (label) in GB:
| label      |   accounting |   ad |   cart |   checkout |   currency |   elasticsearch |   email |   flagd |   fraud-detection |   frontend |   frontend-proxy |   grafana |   image-provider |   jaeger-collector |   jaeger-query |   kafka |   otel-collector |   payment |   product-catalog |   prometheus |   quote |   recommendation |   shipping |   valkey-cart |
|:-----------|-------------:|-----:|-------:|-----------:|-----------:|----------------:|--------:|--------:|------------------:|-----------:|-----------------:|----------:|-----------------:|-------------------:|---------------:|--------:|-----------------:|----------:|------------------:|-------------:|--------:|-----------------:|-----------:|--------------:|
| Baseline   |         0    | 0.26 |   0.65 |       0    |       0    |            0.01 |    0    |    0.09 |              0    |       3.34 |             3.73 |      0    |             0    |               0.

## Storage
Further process the storage data to transform total bytes used to kWh using the energy intensity

First we have to load the sizes of the persistent volume claims on the disk.

In [49]:
import pandas as pd
from pathlib import Path
import re

# ---------------------------------------------------------------------
# 1. Locate the raw CSVs
# ---------------------------------------------------------------------
DATA_DIR = Path("storage_snapshots")            # adapt if your files live elsewhere
csv_files = sorted(DATA_DIR.glob("recommendation_k8_*_storage_*.csv"))

print(f"Found {len(csv_files)} CSV files:")
for f in csv_files:
    print(f" - {f.name}")

# ---------------------------------------------------------------------
# 2. Map file-names → human-readable experiment label
#    (this reproduces the labels you used later in the notebook)
# ---------------------------------------------------------------------
def label_from_fname(fname: str) -> str:
    fname = fname.lower()
    if "50_percent_persistence"      in fname: return "50% Persistence"
    if "10_percent_persistence"      in fname: return "10% Persistence"
    if "5_percent_persistence"       in fname: return "5% Persistence"
    if "scrape_30s"                  in fname: return "Scrape 30s Persistence"
    if "scrape_5s"                   in fname: return "Scrape 5s Persistence"
    if "istio"                       in fname: return "Persistence Istio"
    # falls back to the “plain” persistence baseline
    return "Base Persistence"

# ---------------------------------------------------------------------
# 3. Read every CSV and aggregate Prometheus / Elasticsearch volumes
# ---------------------------------------------------------------------
records = []
for fp in csv_files:
    df = pd.read_csv(fp)
    rec = {
        "label":          label_from_fname(fp.name),
        "prometheus":     df[df["PVC"].str.contains("prometheus",     case=False)]["USAGE_BYTES"].sum(),
        "elasticsearch":  df[df["PVC"].str.contains("elasticsearch",  case=False)]["USAGE_BYTES"].sum(),
    }
    records.append(rec)

storage_data_pvc = pd.DataFrame.from_records(records)

# ---------------------------------------------------------------------
# 4. Bring labels & order in line
# ---------------------------------------------------------------------
storage_data_pvc["label"] = storage_data_pvc["label"].replace({
    "Base Persistence":           "Baseline",
    "Scrape 30s Persistence":     "Scrape 30s",
    "Scrape 5s Persistence":      "Scrape 5s",
    "5% Persistence":             "5%",
    "10% Persistence":            "10%",
    "50% Persistence":            "50%",
    "Persistence Istio":          "Istio",
})

storage_data_pvc = (
    storage_data_pvc
        .groupby("label", as_index=False)
        .mean(numeric_only=True)
        .set_index("label")
        .reindex(["Baseline", "5%", "10%", "50%", "Scrape 30s",
                  "Scrape 5s", "Istio"])
)

# ---------------------------------------------------------------------
# 5. Bytes → GB (decimal)  and print a quick GB table for sanity-check
# ---------------------------------------------------------------------
storage_data_pvc_tb = storage_data_pvc / 1_000_000_000      # TB
print("Storage used per experiment [GB]:")
print((storage_data_pvc_tb).to_markdown())


Found 14 CSV files:
 - recommendation_k8_base_10_percent_persistence_storage_2025-07-20_05-37-34.csv
 - recommendation_k8_base_10_percent_persistence_storage_2025-07-20_23-32-49.csv
 - recommendation_k8_base_1m_otel_persistence_istio_storage_2025-07-20_08-07-44.csv
 - recommendation_k8_base_1m_otel_persistence_istio_storage_2025-07-21_02-03-26.csv
 - recommendation_k8_base_1m_otel_persistence_scrape_30s_storage_2025-07-20_03-08-11.csv
 - recommendation_k8_base_1m_otel_persistence_scrape_30s_storage_2025-07-20_21-03-29.csv
 - recommendation_k8_base_1m_otel_persistence_scrape_5s_storage_2025-07-20_01-53-32.csv
 - recommendation_k8_base_1m_otel_persistence_scrape_5s_storage_2025-07-20_19-48-49.csv
 - recommendation_k8_base_1m_otel_persistence_storage_2025-07-20_00-38-54.csv
 - recommendation_k8_base_1m_otel_persistence_storage_2025-07-20_18-34-09.csv
 - recommendation_k8_base_50_percent_persistence_storage_2025-07-20_06-52-16.csv
 - recommendation_k8_base_50_percent_persistence_storage_20

In [50]:
storage_data_processed = storage_data.groupby("label").last().reset_index()

# remove the column time_normalized_rounded as it is not needed anymore
storage_data_processed = storage_data_processed.drop(columns=["time_normalized_rounded"])

# remove _system-under-evaluation from the column names
storage_data_processed.columns = storage_data_processed.columns.str.replace("_system-under-evaluation", "", regex=False)

storage_data_processed = storage_data_processed.set_index("label")

# sort the index for readability in this order: Baseline, 5%, 10%, 50%, Baseline Persistence, 5% Persistence, 10% Persistence, 50% Persistence
storage_data_processed = storage_data_processed.reindex([
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
    "Base Persistence",
    "Scrape 30s Persistence",
    "Scrape 5s Persistence",
    "5% Persistence",
    "10% Persistence",
    "50% Persistence",
    "Persistence Istio",
])

# drop indexes the experiments that are not needed e.g. Baseline, 0%, 5%, 10%, 50%, 1s, 30s
storage_data_processed = storage_data_processed.drop(index=[
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
])

# rename the columns to be more readable
storage_data_processed = storage_data_processed.rename(index={
    "Base Persistence": "Baseline",
    "Scrape 30s Persistence": "Scrape 30s",
    "Scrape 5s Persistence": "Scrape 5s",
    "5% Persistence": "5%",
    "10% Persistence": "10%",
    "50% Persistence": "50%",
    "Persistence Istio": "Istio",
})

# rename opentelemetry-collector to otel-collector
storage_data_processed = storage_data_processed.rename(index={"opentelemetry-collector": "otel-collector"})


# this values are in bytes, convert them to GB
storage_data_processed = storage_data_processed / (1000 * 1000 * 1000)  # GB



cols_to_replace = ["prometheus", "elasticsearch"]          # narrow the scope
storage_data_processed.loc[storage_data_pvc_tb.index, cols_to_replace] = (
    storage_data_pvc_tb[cols_to_replace]
)


print("Storage data used per Experiment (label) in GB:")
print(storage_data_processed.to_markdown())

# calculate the energy consumption in kWh
storage_data_processed = (storage_data_processed * ENERGY_INTENSITY_STORAGE_KWH_PER_GB)




# print as table
print("Total energy consumption for storage in kWh per Experiment (label):")
print(storage_data_processed.to_markdown())
print("Total energy consumption for storage in kWh per Experiment (label) and percentage difference from label 'Baseline':")

for label in storage_data_processed.index:
    if label == "Baseline":
        print(f"{label}: {storage_data_processed.loc[label].sum()} kWh (100%)")
    else:
        percentage_difference = (storage_data_processed.loc[label].sum() / storage_data_processed.loc["Baseline"].sum()) * 100
        print(f"{label}: {storage_data_processed.loc[label].sum()} kWh ({percentage_difference:.2f}%)")


Storage data used per Experiment (label) in GB:
| label      |   accounting |        ad |   cart |   checkout |   currency |   elasticsearch |   email |   flagd |   fraud-detection |   frontend |   frontend-proxy |     grafana |   image-provider |   jaeger |     kafka |   opensearch |   opentelemetry-collector |   payment |   product-catalog |   prometheus |   quote |   recommendation |   shipping |   valkey-cart |
|:-----------|-------------:|----------:|-------:|-----------:|-----------:|----------------:|--------:|--------:|------------------:|-----------:|-----------------:|------------:|-----------------:|---------:|----------:|-------------:|--------------------------:|----------:|------------------:|-------------:|--------:|-----------------:|-----------:|--------------:|
| Baseline   |            0 | 0.0212541 |      0 |          0 |          0 |        0.202086 |       0 |       0 |         0.0194929 |          0 |                0 | 0.000581632 |                0 |      nan |

Further process the CPU energy data. We use the average over the experiment as the kWh

In [51]:
""" kepler_data_processed = kepler_data.groupby("label").mean()

# remove the column time_normalized_rounded as it is not needed anymore
kepler_data_processed = kepler_data_processed.drop(columns=["time_normalized_rounded"])

# remove _system-under-evaluation from the column names
kepler_data_processed.columns = kepler_data_processed.columns.str.replace("_Auxiliary-under-evaluation", "", regex=False)

# sort the index for readability in this order: Baseline, 5%, 10%, 50%, Baseline Persistence, 5% Persistence, 10% Persistence, 50% Persistence
kepler_data_processed = kepler_data_processed.reindex([
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "Base Persistence",
    "5% Persistence",
    "10% Persistence",
    "50% Persistence",
    "Base Persistence Istio"
])

# print as table
print("Total energy consumption in kWh per Experiment (label):")
print(kepler_data_processed.to_markdown())
print("Total energy consumption in kWh per Experiment (label) and percentage difference from label 'Baseline':")

for label in kepler_data_processed.index:
    if label == "Baseline":
        print(f"{label}: {kepler_data_processed.loc[label].sum()} kWh (100%)")
    else:
        percentage_difference = (kepler_data_processed.loc[label].sum() / kepler_data_processed.loc["Baseline"].sum()) * 100
        print(f"{label}: {kepler_data_processed.loc[label].sum()} kWh ({percentage_difference:.2f}%)") """

' kepler_data_processed = kepler_data.groupby("label").mean()\n\n# remove the column time_normalized_rounded as it is not needed anymore\nkepler_data_processed = kepler_data_processed.drop(columns=["time_normalized_rounded"])\n\n# remove _system-under-evaluation from the column names\nkepler_data_processed.columns = kepler_data_processed.columns.str.replace("_Auxiliary-under-evaluation", "", regex=False)\n\n# sort the index for readability in this order: Baseline, 5%, 10%, 50%, Baseline Persistence, 5% Persistence, 10% Persistence, 50% Persistence\nkepler_data_processed = kepler_data_processed.reindex([\n    "Baseline",\n    "0%",\n    "5%",\n    "10%",\n    "50%",\n    "1s",\n    "Base Persistence",\n    "5% Persistence",\n    "10% Persistence",\n    "50% Persistence",\n    "Base Persistence Istio"\n])\n\n# print as table\nprint("Total energy consumption in kWh per Experiment (label):")\nprint(kepler_data_processed.to_markdown())\nprint("Total energy consumption in kWh per Experiment 

In [52]:
kepler_data_joules_processed = kepler_data_joules.groupby("label").last()

# remove the column time_normalized_rounded as it is not needed anymore
kepler_data_joules_processed = kepler_data_joules_processed.drop(columns=["time_normalized_rounded"])

# remove _system-under-evaluation from the column names
kepler_data_joules_processed.columns = kepler_data_joules_processed.columns.str.replace("_system-under-evaluation", "", regex=False)

# drop the columns that are not needed
kepler_data_joules_processed = kepler_data_joules_processed.drop(columns=["master", "worker"])


# sort the index for readability in this order: Baseline, 5%, 10%, 50%, Baseline Persistence, 5% Persistence, 10% Persistence, 50% Persistence
kepler_data_joules_processed = kepler_data_joules_processed.reindex([
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
    "Base Persistence",
    "Scrape 30s Persistence",
    "Scrape 5s Persistence",
    "5% Persistence",
    "10% Persistence",
    "50% Persistence",
    "Persistence Istio",
])  

# drop indexes the experiments that are not needed e.g. Baseline, 0%, 5%, 10%, 50%, 1s, 30s
kepler_data_joules_processed = kepler_data_joules_processed.drop(index=[
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
])

# rename the columns to be more readable
kepler_data_joules_processed = kepler_data_joules_processed.rename(index={
    "Base Persistence": "Baseline",
    "Scrape 30s Persistence": "Scrape 30s",
    "Scrape 5s Persistence": "Scrape 5s",
    "5% Persistence": "5%",
    "10% Persistence": "10%",
    "50% Persistence": "50%",
    "Persistence Istio": "Istio",
})

kepler_data_joules_processed_total = kepler_data_joules_processed.copy()
kepler_data_joules_processed_total["Total"] = kepler_data_joules_processed.sum(axis=1)

# print as table
print("Total energy consumption in Joules per Experiment (label):")
print(kepler_data_joules_processed_total.to_markdown())

# convert Joules to kWh
kepler_data_joules_processed = kepler_data_joules_processed / 3600000  # 1 kWh = 3.6 million Joules
print("Total energy consumption in kWh per Experiment (label) and percentage difference from label 'Baseline':")

for label in kepler_data_joules_processed.index:
    if label == "Baseline":
        print(f"{label}: {kepler_data_joules_processed.loc[label].sum()} kWh (100%)")
    else:
        percentage_difference = (kepler_data_joules_processed.loc[label].sum() / kepler_data_joules_processed.loc["Baseline"].sum()) * 100
        print(f"{label}: {kepler_data_joules_processed.loc[label].sum()} kWh ({percentage_difference:.2f}%)")


Total energy consumption in Joules per Experiment (label):
| label      |   accounting |      ad |    cart |   checkout |   configfile |   currency |   elasticsearch |   email |   exporter |    flagd |   flagd-ui |   fraud-detection |   frontend |   frontend-proxy |   grafana |   image-provider |   init-config |   istio-init |   istio-proxy |   jaeger-agent-sidecar |   jaeger-collector |   jaeger-query |   kafka |   opensearch |   opentelemetry-collector |   payment |   product-catalog |   prometheus-server |   quote |   recommendation |   shipping |   valkey-cart |   wait-for-kafka |   wait-for-valkey-cart |   Total |
|:-----------|-------------:|--------:|--------:|-----------:|-------------:|-----------:|----------------:|--------:|-----------:|---------:|-----------:|------------------:|-----------:|-----------------:|----------:|-----------------:|--------------:|-------------:|--------------:|-----------------------:|-------------------:|---------------:|--------:|-------------:|

# DRAM
Does not work as expected currently. Values are strange and mostly negativ.


In [53]:
kepler_ram_joules_processed = kepler_dram.groupby("label").last()

# remove the column time_normalized_rounded as it is not needed anymore
kepler_ram_joules_processed = kepler_ram_joules_processed.drop(columns=["time_normalized_rounded"])

# remove _system-under-evaluation from the column names
kepler_ram_joules_processed.columns = kepler_ram_joules_processed.columns.str.replace("_system-under-evaluation", "", regex=False)

# sort the index for readability in this order: Baseline, 5%, 10%, 50%, Baseline Persistence, 5% Persistence, 10% Persistence, 50% Persistence
kepler_ram_joules_processed = kepler_ram_joules_processed.reindex([
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
    "Base Persistence",
    "Scrape 5s Persistence",
    "5% Persistence",
    "10% Persistence",
    "50% Persistence",
    "Persistence Istio",
])  


# drop indexes the experiments that are not needed e.g. Baseline, 0%, 5%, 10%, 50%, 1s, 30s
kepler_ram_joules_processed = kepler_ram_joules_processed.drop(index=[
    "Baseline",
    "0%",
    "5%",
    "10%",
    "50%",
    "1s",
    "30s",
])

# rename the columns to be more readable
kepler_ram_joules_processed = kepler_ram_joules_processed.rename(index={
    "Base Persistence": "Baseline",
    "Scrape 5s Persistence": "Scrape 5s",
    "5% Persistence": "5%",
    "10% Persistence": "10%",
    "50% Persistence": "50%",
    "Persistence Istio": "Istio",
})

kepler_ram_joules_processed_total = kepler_ram_joules_processed.copy()
kepler_ram_joules_processed_total["Total"] = kepler_ram_joules_processed.sum(axis=1)

# print as table
print("DRAM energy consumption in Joules per Experiment (label):")
print(kepler_ram_joules_processed_total.to_markdown())

# convert Joules to kWh
kepler_ram_joules_processed = kepler_ram_joules_processed / 3600000  # 1 kWh = 3.6 million Joules
print("Total energy consumption in kWh per Experiment (label) and percentage difference from label 'Baseline':")

for label in kepler_ram_joules_processed.index:
    if label == "Baseline":
        print(f"{label}: {kepler_ram_joules_processed.loc[label].sum()} kWh (100%)")
    else:
        percentage_difference = (kepler_ram_joules_processed.loc[label].sum() / kepler_ram_joules_processed.loc["Baseline"].sum()) * 100
        print(f"{label}: {kepler_ram_joules_processed.loc[label].sum()} kWh ({percentage_difference:.2f}%)")


DRAM energy consumption in Joules per Experiment (label):
| label     |   accounting |       ad |     cart |   checkout |   configfile |   currency |   elasticsearch |   email |     exporter |    flagd |   flagd-ui |   fraud-detection |   frontend |   frontend-proxy |   grafana |   image-provider |   init-config |   istio-init |   istio-proxy |   jaeger-agent-sidecar |   jaeger-collector |   jaeger-query |    kafka |   master |   opensearch |   opentelemetry-collector |   payment |   product-catalog |   prometheus-server |   quote |   recommendation |   shipping |   valkey-cart |   wait-for-kafka |   wait-for-valkey-cart |   worker |    Total |
|:----------|-------------:|---------:|---------:|-----------:|-------------:|-----------:|----------------:|--------:|-------------:|---------:|-----------:|------------------:|-----------:|-----------------:|----------:|-----------------:|--------------:|-------------:|--------------:|-----------------------:|-------------------:|-------------

Categorize the energy consumption into "App", "Auxiliary" and "System"

In [54]:
# | label            |   accounting |      ad |    cart |   checkout |   configfile |   copy-default-plugins |   currency |   elasticsearch |   elasticsearch-checker |   email |   flagd |   flagd-ui |   fraud-detection |   frontend |   frontend-proxy |   grafana |   image-provider |   init-config |   jaeger |   jaeger-agent-sidecar |   jaeger-collector |   jaeger-query |   kafka |   master |   opensearch |   opentelemetry-collector |   payment |   product-catalog |   prometheus-server |   quote |   recommendation |   shipping |   sysctl |   valkey-cart |   wait-for-kafka |   wait-for-valkey-cart |   worker |

kepler_mapping = {
    "accounting": "Primary",
    "ad": "Primary",
    "cart": "Primary",
    "checkout": "Primary",
    "configfile": "Primary",
    "copy-default-plugins": "Primary",
    "currency": "Primary",
    "elasticsearch": "Auxiliary",
    "elasticsearch-checker": "Auxiliary",
    "email": "Primary",
    "exporter": "Auxiliary",
    "flagd": "Auxiliary",
    "flagd-ui": "Auxiliary",
    "fraud-detection": "Primary",
    "frontend": "Primary",
    "frontend-proxy": "Primary",
    "grafana": "Auxiliary",
    "image-provider": "Primary",
    "init-config": "Primary",
    "istio-init": "Auxiliary",
    "istio-proxy": "Auxiliary",
    "jaeger": "Auxiliary",
    "jaeger-agent-sidecar": "Auxiliary",
    "jaeger-collector": "Auxiliary",
    "jaeger-query": "Auxiliary",
    "kafka": "Primary",
    "master": "Primary",
    "opensearch": "Auxiliary",
    "opentelemetry-collector": "Auxiliary",
    "otel-collector": "Auxiliary",  # alias for opentelemetry-collector
    "payment": "Primary",
    "product-catalog": "Primary",
    "prometheus-server": "Auxiliary",
    "prometheus": "Auxiliary",  
    "quote": "Primary",
    "recommendation": "Primary",
    "shipping": "Primary",
    "sysctl": "Auxiliary",
    "valkey-cart": "Primary",
    "wait-for-kafka": "Primary",
    "wait-for-valkey-cart": "Primary",
    "worker": "Primary",
}

#| label            |   accounting |          ad |   elasticsearch |   email |   flagd |   fraud-detection |     frontend |   frontend-proxy |     grafana |   image-provider |       kafka |   opensearch |   payment |   prometheus |   recommendation |   shipping |   valkey-cart |

storage_data_mapping = {
    "accounting": "Primary",
    "ad": "Primary",
    "elasticsearch": "Auxiliary",
    "email": "Primary",
    "flagd": "Auxiliary",
    "fraud-detection": "Primary",
    "frontend": "Primary",
    "frontend-proxy": "Primary",
    "grafana": "Auxiliary",
    "image-provider": "Primary",
    "kafka": "Primary",
    "opensearch": "Auxiliary",
    "payment": "Primary",
    "prometheus": "Auxiliary",
    "recommendation": "Primary",
    "shipping": "Primary",
    "valkey-cart": "Primary",
    "quote": "Primary",
    "product-catalog": "Primary",
    "opentelemetry-collector": "Auxiliary",
    "otel-collector": "Auxiliary",  # alias for opentelemetry-collector
    "checkout": "Primary",
    "cart": "Primary",
    "jaeger": "Auxiliary",
    "jaeger-agent-sidecar": "Auxiliary",
    "jaeger-collector": "Auxiliary",
    "jaeger-query": "Auxiliary",
    "currency": "Primary",
}

# | label            |   accounting |          ad |        cart |    checkout |    currency |   elasticsearch |   email |       flagd |   fraud-detection |   frontend |   frontend-proxy |     grafana |   image-provider |      jaeger |       kafka |   opensearch |   opentelemetry-collector |     payment |   product-catalog |   prometheus |   quote |   recommendation |   shipping |   valkey-cart |

network_data_mapping = {
    "accounting": "Primary",
    "ad": "Primary",
    "cart": "Primary",
    "checkout": "Primary",
    "currency": "Primary",
    "elasticsearch": "Auxiliary",
    "email": "Primary",
    "flagd": "Auxiliary",
    "fraud-detection": "Primary",
    "frontend": "Primary",
    "frontend-proxy": "Primary",
    "grafana": "Auxiliary",
    "image-provider": "Primary",
    "jaeger": "Auxiliary",
    "jaeger-collector": "Auxiliary", 
    "jaeger-query": "Auxiliary", 
    "otel-collector": "Auxiliary", 
    "kafka": "Primary",
    "opensearch": "Auxiliary",
    "opentelemetry-collector": "Auxiliary",
    "otel-collector": "Auxiliary",  # alias for opentelemetry-collector
    "payment": "Primary",
    "product-catalog": "Primary",
    "prometheus": "Auxiliary",
    "quote": "Primary",
    "recommendation": "Primary",
    "shipping": "Primary",
    "valkey-cart": "Primary",
}


import pandas as pd

def categorize_energy_consumption(data: pd.DataFrame, mapping: dict) -> pd.DataFrame:
    """
    Categorize and sum energy consumption based on component categories.

    Parameters:
    - data (pd.DataFrame): A DataFrame where rows represent measurements and columns represent component labels.
    - mapping (dict): A mapping from component label (column name) to a category (e.g., "Primary", "Auxiliary").

    Returns:
    - pd.DataFrame: A DataFrame with the same row indices as `data` and columns being the summed energy per category,
                    including an 'Uncategorized' column if needed.
    """
    # Initialize a DataFrame to store summed results
    categorized = pd.DataFrame(index=data.index)

    # Identify columns that are in the mapping
    mapped_cols = set(mapping.keys())
    data_cols = set(data.columns)
    unmapped_cols = list(data_cols - mapped_cols)

    # Print warning for unmapped columns
    if unmapped_cols:
        print(f"[WARNING] Uncategorized columns: {unmapped_cols}")

    # Iterate through unique categories
    for category in set(mapping.values()):
        cols = [col for col in data.columns if mapping.get(col) == category]
        categorized[category] = data[cols].sum(axis=1) if cols else 0

    # Add uncategorized columns
    if unmapped_cols:
        categorized["Uncategorized"] = data[unmapped_cols].sum(axis=1)
    
    return categorized


# Categorize the energy consumption for each dataset
network_categorized = categorize_energy_consumption(network_data_processed, network_data_mapping)
storage_categorized = categorize_energy_consumption(storage_data_processed, storage_data_mapping)
kepler_categorized = categorize_energy_consumption(kepler_data_joules_processed, kepler_mapping)

# Print the categorized energy consumption
print("\nNetwork Energy Consumption Categorized:")
print(network_categorized.to_markdown())
print("\nStorage Energy Consumption Categorized:")
print(storage_categorized.to_markdown())
print("\nKepler Energy Consumption Categorized:")
print(kepler_categorized.to_markdown())

# add a suffix to each set of columns
net = network_categorized.add_suffix(" (Network)")
cpu = kepler_categorized.add_suffix(" (CPU+Mem)")
sto = storage_categorized.add_suffix(" (Storage)")

categorized_combined_suffix = pd.concat([net, cpu, sto], axis=1) * 1000
categorized_combined_suffix = categorized_combined_suffix.sort_index(axis=1)

# Compute total energy consumption per row
categorized_combined_suffix["Total"] = categorized_combined_suffix.sum(axis=1)

# Calculate percentage change from the "Baseline" row
baseline_total = categorized_combined_suffix.loc["Baseline", "Total"]
categorized_combined_suffix["Total Change (%)"] = (
    (categorized_combined_suffix["Total"] - baseline_total) / baseline_total * 100
).round(2)

print("\nCombined Categorized Energy Consumption by origin in Wh:")
print(categorized_combined_suffix.round(2).to_markdown())

# merge the categorized dataframes and sum on same column name
categorized_combined = pd.concat([network_categorized, storage_categorized, kepler_categorized], axis=1)
categorized_combined = categorized_combined.T.groupby(categorized_combined.columns).sum().T


# Print the combined categorized energy consumption
#print("\nCombined Categorized Energy Consumption:")
#print(categorized_combined.to_markdown())

# add total summing up the categories for each experiment (label)
categorized_combined["Total"] = categorized_combined.sum(axis=1)

#categorized_combined_transpose = categorized_combined.transpose()
#print(categorized_combined_transpose.to_markdown())

categorized_combined_transpose_wh = categorized_combined.T.copy() * 1000  # convert kWh to Wh

print("\nCombined Categorized Energy Consumption in Wh:")
print(categorized_combined_transpose_wh.round(2).to_markdown())

# Combine all three sources on service/component label level
combined_per_label = pd.concat([
    network_data_processed,
    storage_data_processed,
    kepler_data_joules_processed
], axis=1)

# Group by column name (i.e., per service/component), and sum if duplicate columns exist
combined_per_label = combined_per_label.groupby(combined_per_label.columns, axis=1).sum()

combined_per_label["Total"] = combined_per_label.sum(axis=1)

combined_per_label_wh = combined_per_label * 1000  # convert kWh to Wh

print("\nCombined Total Energy Consumption per Service (Wh):")
print(combined_per_label_wh.round(2).to_markdown())


combined_per_label_wh_T = combined_per_label_wh.T  # transposed: services as rows
combined_per_label_wh_T["Total"] = combined_per_label_wh_T.sum(axis=1)

print("\nTransposed: Total Energy per Service across all Experiments (Wh):")
print(combined_per_label_wh_T.round(2).to_markdown())


# Add suffixes to source-specific per-service dataframes
network_labeled = network_data_processed.add_suffix(" (Network)")
storage_labeled = storage_data_processed.add_suffix(" (Storage)")
kepler_labeled = kepler_data_joules_processed.add_suffix(" (CPU+Mem)")

# Combine all energy sources with explicit labels
per_service_detailed = pd.concat([kepler_labeled, network_labeled, storage_labeled], axis=1) * 1000  # convert to Wh

# Sort the columns for better readability
per_service_detailed = per_service_detailed.sort_index(axis=1)

# Print table: Energy consumption per service and experiment, broken down by origin
print("\nDetailed Per-Service Energy Consumption (Wh) by Origin:")
print(per_service_detailed.round(2).to_markdown())


categorized_combined_suffix.to_csv("energy_totals_by_origin.csv")
categorized_combined.to_csv("energy_totals_by_category.csv")
combined_per_label_wh.to_csv("energy_totals_per_service.csv")




Network Energy Consumption Categorized:
| label      |   Auxiliary |    Primary |
|:-----------|------------:|-----------:|
| Baseline   |   0.0116207 | 0.00844343 |
| Scrape 30s |   0.0118632 | 0.00850298 |
| Scrape 5s  |   0.0137879 | 0.00846928 |
| 5%         |   0.0141758 | 0.00846218 |
| 10%        |   0.0170402 | 0.00845491 |
| 50%        |   0.0410012 | 0.00848282 |
| Istio      |   0.0117679 | 0.00888175 |

Storage Energy Consumption Categorized:
| label      |   Auxiliary |     Primary |
|:-----------|------------:|------------:|
| Baseline   |  0.00415246 | 5.01925e-05 |
| Scrape 30s |  0.00419052 | 4.9937e-05  |
| Scrape 5s  |  0.00437733 | 5.02327e-05 |
| 5%         |  0.00432163 | 5.13508e-05 |
| 10%        |  0.00458251 | 5.22723e-05 |
| 50%        |  0.00650131 | 5.37543e-05 |
| Istio      |  0.00344078 | 5.09776e-05 |

Kepler Energy Consumption Categorized:
| label      |   Auxiliary |   Primary |
|:-----------|------------:|----------:|
| Baseline   |   0.0107761 | 0.

  combined_per_label = combined_per_label.groupby(combined_per_label.columns, axis=1).sum()
