In [2]:
import pandas as pd
import glob
import os

In [3]:
def _load_resource_csv(file, raw_header):
    """Parse one quoted-header resource CSV into a frame indexed by UTC time.

    Parameters
    ----------
    file : str
        Path of the CSV file to read.
    raw_header : str
        The file's first line (already stripped), containing quoted column names.

    Returns
    -------
    pandas.DataFrame
        Data columns prefixed with the file's base name, indexed by a
        tz-aware UTC ``timestamp`` index with duplicate index entries removed
        (first occurrence kept).
    """
    # --- Clean header ---
    # NOTE(review): this replaces every "Time" substring, not only a column
    # that is exactly "Time" — confirm no other column name contains "Time".
    header = raw_header.replace('"', "").replace("Time", "timestamp")
    columns = header.split(",")

    # Only the first "GPU 0" column is renamed; any later "GPU 0" keeps its
    # name and is dropped below when the file prefix is "gpu".
    if "GPU 0" in columns:
        columns[columns.index("GPU 0")] = "gpu_0"

    # Prefix non-timestamp columns with the file name (without .csv) so that
    # columns coming from different files stay distinguishable after concat.
    file_prefix = os.path.splitext(os.path.basename(file))[0]
    columns = [c if c == "timestamp" else f"{file_prefix}_{c}" for c in columns]

    # The original header row is skipped and replaced by the cleaned names.
    df = pd.read_csv(file, skiprows=1, names=columns)
    df = df.drop(columns="gpu_GPU 0", errors="ignore")

    # Unparseable timestamps become NaT; the naive values are interpreted as
    # Europe/Berlin local time and then converted to UTC.
    timestamps = pd.to_datetime(df["timestamp"], errors="coerce")
    df["timestamp"] = timestamps.dt.tz_localize("Europe/Berlin").dt.tz_convert("UTC")

    df = df.set_index("timestamp")
    # reindex() requires a unique index, so keep only the first row per timestamp.
    return df[~df.index.duplicated(keep="first")]


def prepare(base_folder, run_start_times):
    """Align the raw per-run CSV exports onto a 1-second step grid.

    For every run, all ``<base_folder>/raw/<run_folder>/*.csv`` files are read
    in sorted order (so the output column order is reproducible):

    * files whose first line contains a double quote are treated as resource
      exports and combined into ``<base_folder>/<run_folder>_resources.csv``,
      one row per second from the run's start time to the latest resource
      timestamp, indexed by an integer ``step``;
    * any other file is read as-is, its ``timestamp`` column is dropped, and
      it is written to ``<base_folder>/<run_folder>_response_times.csv``.

    Parameters
    ----------
    base_folder : str
        Folder that contains the ``raw`` sub-folder and receives the outputs.
    run_start_times : dict[str, str]
        Maps run folder name to the run's start time; the value is passed to
        ``pd.Timestamp(..., tz="UTC")``.

    Raises
    ------
    ValueError
        If a run has no resource CSV files to align.
    """
    for run_folder, start_time in run_start_times.items():
        pattern = os.path.join(base_folder, "raw", run_folder, "*.csv")
        # glob order depends on the file system; sort for a stable column order.
        csv_files = sorted(glob.glob(pattern))

        experiment_start = pd.Timestamp(start_time, tz="UTC")

        resource_dfs = []
        for file in csv_files:
            with open(file, "r", encoding="utf-8") as f:
                raw_header = f.readline().strip()

            if '"' in raw_header:
                resource_dfs.append(_load_resource_csv(file, raw_header))
            else:
                response_df = pd.read_csv(file).drop(columns=["timestamp"])
                response_df.to_csv(
                    os.path.join(base_folder, f"{run_folder}_response_times.csv")
                )

        if not resource_dfs:
            # Without this guard max() below fails with an unhelpful message.
            raise ValueError(
                f"No resource CSV files (quoted header) found for run "
                f"{run_folder!r} using pattern {pattern!r}"
            )

        # Global timeline for this run
        all_end = max(frame.index.max() for frame in resource_dfs)
        time_index = pd.date_range(experiment_start, all_end, freq="1s", tz="UTC")

        aligned_resource_dfs = [frame.reindex(time_index) for frame in resource_dfs]
        final_resource_df = pd.concat(aligned_resource_dfs, axis=1)

        # Replace absolute timestamps with a step counter in seconds
        final_resource_df = final_resource_df.reset_index(drop=True)
        final_resource_df.index.name = "step"  # step in seconds from start

        # Save aligned CSV per run
        final_resource_df.to_csv(
            os.path.join(base_folder, f"{run_folder}_resources.csv")
        )

In [6]:
# Start time of each run, keyed by run folder name, for the CPU experiment.
# The strings are handed unchanged to prepare() as its run_start_times values.
run_start_times_cpu = {
    "run-1": "2025-09-03T13:23:15",
    "run-2": "2025-09-03T13:38:37",
    "run-3": "2025-09-03T13:56:21",
    "run-4": "2025-09-03T14:11:11",
    "run-5": "2025-09-03T14:26:26",
}

# Same mapping for the GPU experiment.
run_start_times_gpu = {
    "run-1": "2025-09-03T11:31:04",
    "run-2": "2025-09-03T11:35:57",
    "run-3": "2025-09-03T11:40:41",
    "run-4": "2025-09-03T12:01:06",
    "run-5": "2025-09-03T12:04:36",
}

# Same mapping for the "auto" experiment.
run_start_times_auto = {
    "run-1": "2025-09-03T09:25:19",
    "run-2": "2025-09-03T09:32:46",
    "run-3": "2025-09-03T09:38:55",
    "run-4": "2025-09-03T09:45:04",
    "run-5": "2025-09-03T09:51:55",
}

# Process each experiment's data folder in turn: cpu, then gpu, then auto.
for experiment_folder, experiment_start_times in (
    ("./data/cpu", run_start_times_cpu),
    ("./data/gpu", run_start_times_gpu),
    ("./data/auto", run_start_times_auto),
):
    prepare(experiment_folder, experiment_start_times)
