In [1]:
import pandas as pd
import glob
import os

In [6]:
def _load_resource_csv(file, raw_header):
    """Read one quoted-header resource CSV into a frame indexed by UTC time.

    Args:
        file: Path of the CSV file.
        raw_header: The file's first line, already stripped of whitespace.

    Returns:
        DataFrame indexed by a tz-aware UTC ``timestamp`` index. Every value
        column is prefixed with the file's base name (without extension).
        Rows whose timestamp could not be parsed are dropped, and only the
        first row is kept for a repeated timestamp.
    """
    # Rename only the column that is exactly "Time"; a substring replace on
    # the whole header would also mangle names such as "Idle Time".
    columns = [
        "timestamp" if name == "Time" else name
        for name in raw_header.replace('"', "").split(",")
    ]

    # A header may list "GPU 0" more than once. Give the first occurrence a
    # distinct name so that the repeated column can be dropped after loading.
    if "GPU 0" in columns:
        columns[columns.index("GPU 0")] = "gpu_0"

    # Prefix non-timestamp columns with the file name (without .csv)
    file_prefix = os.path.splitext(os.path.basename(file))[0]
    columns = [
        c if c == "timestamp" else f"{file_prefix}_{c}"
        for c in columns
    ]

    df = pd.read_csv(file, skiprows=1, names=columns)
    df = df.drop(columns=[f"{file_prefix}_GPU 0"], errors="ignore")

    # utc=True localizes naive timestamps as UTC and converts tz-aware ones,
    # so both flavours of input are accepted.
    df["timestamp"] = pd.to_datetime(df["timestamp"], errors="coerce", utc=True)
    df = df.set_index("timestamp")

    keep = df.index.notna() & ~df.index.duplicated(keep="first")
    return df[keep]


def _export_response_times(file, base_folder, run_folder, experiment_start, grace_seconds):
    """Write the total-time and response-time outputs for one run.

    Reads ``file`` (a CSV with a ``timestamp`` column) and writes two files
    into ``base_folder``:

    * ``<run_folder>_total-time.txt`` -- seconds between ``experiment_start``
      and the timestamp of the last row plus ``grace_seconds``, rounded to
      whole seconds.
    * ``<run_folder>_response_times.csv`` -- the remaining columns, with the
      value ``-1.0`` replaced by a missing value.
    """
    df = pd.read_csv(file)

    # take response time of last row
    last_timestamp = pd.to_datetime(df["timestamp"].iloc[-1], utc=True)
    time_difference = (
        last_timestamp + pd.Timedelta(seconds=grace_seconds) - experiment_start
    )
    with open(os.path.join(base_folder, f"{run_folder}_total-time.txt"), "w") as f:
        f.write(f"Total Time (s),{time_difference.round('1s').total_seconds()}\n")

    df = df.drop(columns=["timestamp"]).replace(-1.0, pd.NA)
    df.to_csv(os.path.join(base_folder, f"{run_folder}_response_times.csv"))


def prepare(base_folder: str, run_start_times: dict, grace_seconds: float = 30) -> None:
    """Align the raw CSV exports of every run onto a one-second step axis.

    For each ``run_folder -> start_time`` entry, every ``*.csv`` file in
    ``<base_folder>/raw/<run_folder>/`` is processed in sorted path order:

    * Files whose header line contains a double quote are treated as resource
      metrics and merged into ``<base_folder>/<run_folder>_resources.csv``.
      The rows cover ``start_time`` up to the latest resource sample at a
      one-second frequency; the index is a ``step`` counter starting at 0.
    * Any other file is treated as the response-time log and produces
      ``<run_folder>_total-time.txt`` and ``<run_folder>_response_times.csv``.

    Args:
        base_folder: Folder that contains ``raw/`` and receives the outputs.
        run_start_times: Mapping of run folder name to the run's start time,
            interpreted as UTC.
        grace_seconds: Seconds added to the last response timestamp when
            computing the total time (knative pod reconciliation can take
            some time). Defaults to 30.

    Raises:
        ValueError: If a run has no resource file with a parsable timestamp.
    """
    for run_folder, start_time in run_start_times.items():
        pattern = os.path.join(base_folder, "raw", run_folder, "*.csv")
        # glob returns files in arbitrary order; sort so that the column
        # order of the merged output is reproducible.
        csv_files = sorted(glob.glob(pattern))

        experiment_start = pd.Timestamp(start_time, tz="UTC")

        resource_dfs = []
        for file in csv_files:
            # utf-8-sig transparently skips a byte-order mark, which would
            # otherwise stick to the first column name.
            with open(file, "r", encoding="utf-8-sig") as f:
                raw_header = f.readline().strip()

            if '"' in raw_header:
                resource_dfs.append(_load_resource_csv(file, raw_header))
            else:
                _export_response_times(
                    file, base_folder, run_folder, experiment_start, grace_seconds
                )

        # Global timeline for this run
        end_times = [df.index.max() for df in resource_dfs if len(df.index)]
        if not end_times:
            raise ValueError(
                f"No resource samples found for run {run_folder!r} "
                f"(pattern: {pattern})"
            )
        all_end = max(end_times)
        time_index = pd.date_range(experiment_start, all_end, freq="1s", tz="UTC")

        aligned_resource_dfs = [df.reindex(time_index) for df in resource_dfs]
        final_resource_df = pd.concat(aligned_resource_dfs, axis=1)

        # Replace absolute timestamps with a step counter in seconds
        final_resource_df = final_resource_df.reset_index(drop=True)
        final_resource_df.index.name = "step"  # step in seconds from start

        # Save aligned CSV per run
        final_resource_df.to_csv(os.path.join(base_folder, f"{run_folder}_resources.csv"))

In [7]:
# Start time of every run, keyed by the run's folder name ("run-1", ...).
# The strings carry no timezone; prepare() interprets them as UTC.
run_start_times_gpu = {
    f"run-{number}": started_at
    for number, started_at in enumerate(
        [
            "2025-09-13T10:27:24",
            "2025-09-13T10:36:13",
            "2025-09-13T10:44:58",
            "2025-09-13T10:53:44",
            "2025-09-13T11:02:29",
        ],
        start=1,
    )
}
run_start_times_cpu = {
    f"run-{number}": started_at
    for number, started_at in enumerate(
        [
            "2025-09-13T09:33:50",
            "2025-09-13T09:42:18",
            "2025-09-13T09:50:57",
            "2025-09-13T09:59:36",
            "2025-09-13T10:08:13",
        ],
        start=1,
    )
}
run_start_times_auto = {
    f"run-{number}": started_at
    for number, started_at in enumerate(
        [
            "2025-09-13T12:56:19",
            "2025-09-13T17:53:50",
            "2025-09-13T18:08:09",
            "2025-09-13T18:17:09",
            "2025-09-13T18:31:00",
        ],
        start=1,
    )
}

# Process each experiment variant; outputs are written into its data folder.
for _data_folder, _start_times in (
    ("./data/gpu", run_start_times_gpu),
    ("./data/cpu", run_start_times_cpu),
    ("./data/auto", run_start_times_auto),
):
    prepare(_data_folder, _start_times)
