In [1]:
# Change the working directory to the repository checkout.
# Presumably needed so that the `src` package imported further down resolves — confirm.
# The explicit `%cd` magic is used instead of a bare `cd`, because the bare form
# relies on automagic, which can be disabled and is only applied to single-line cells.
%cd /Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/

# Script to check costs with known data

In [19]:
import glob
import pandas as pd

from src.utils.cost_handling import return_cost_per_run

In [3]:
# Reporting window; both dates are embedded in the folder and file names below.
from_date = "2024-01-01"
to_date = "2025-06-23"
timestamp = "2025-06-23 13:23:56.520433"

# Shared path pieces, so the absolute prefix is written only once.
data_root = "/Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/data"
period_dir = f"{data_root}/{from_date}_to_{to_date}"
period_suffix = f"{from_date}_{to_date}"

# Input files for the selected window.
path_to_remaining_data = f"{period_dir}/targeting_and_retraining/map_nodes_to_prefect_runs_final_{period_suffix}.csv"
path_to_targeting_data = f"{period_dir}/costs/final_targeting_costs_{period_suffix}.csv"
path_to_retraining_data = f"{period_dir}/costs/final_retraining_costs_{period_suffix}.csv"
# Folder holding the expected StackIT cost exports (trailing slash kept on purpose).
path_to_stackit_expected_data = f"{data_root}/infrastructure_insights/data_for_testing_stackit/"

# Check StackIT Costs

In [61]:
# Services for which a deviation between computed and expected costs is tolerated.
# Keyed by service name; the per-service value is currently an empty dict.
discrepancies_allowed = {"General Purpose Server-g1.3-EU01":{}} # discrepancies are allowed for this service because of the agentpool

In [70]:
# Load every expected-cost export (';'-separated CSV) from the StackIT test-data
# folder and stack them into one frame with numeric cost and quantity columns.
# Sorted, because glob returns files in arbitrary order and the row order of the
# result (and of everything derived from it) should be reproducible.
stackit_files = sorted(glob.glob(f"{path_to_stackit_expected_data}/*.csv"))
if not stackit_files:
    # Fail here with a clear message instead of with a KeyError in a later cell.
    raise FileNotFoundError(f"No StackIT export (*.csv) found in {path_to_stackit_expected_data}")

expected_frames = []
for file in stackit_files:
    temp = pd.read_csv(file, sep=";")
    # The date is the last "_"-separated token of the file name (text before the first ".").
    date = file.split("/")[-1].split(".")[0].split("_")[-1]
    temp["date"] = date
    # Drop the "Unnamed..." columns; .copy() makes the result an independent frame
    # so the column assignments below cannot trigger a SettingWithCopyWarning.
    temp = temp[[col for col in temp.columns if "Unnamed" not in col]].copy()

    # "Gesamtkosten": strip the currency sign and blanks, then turn the decimal comma into a dot.
    # Both the real euro sign and its mojibake form (UTF-8 bytes read as cp1252) are removed.
    total_costs = temp["Gesamtkosten"]
    for token in ("â‚¬", "€", " "):
        total_costs = total_costs.str.replace(token, "", regex=False)
    temp["Gesamtkosten"] = total_costs.str.replace(",", ".", regex=False).astype(float)

    # "Gesamtmenge": strip the unit label and blanks, then turn the decimal comma into a dot.
    total_quantity = temp["Gesamtmenge"]
    for token in ("Hours", "Gigabyte", " "):
        total_quantity = total_quantity.str.replace(token, "", regex=False)
    temp["Gesamtmenge_values"] = total_quantity.str.replace(",", ".", regex=False).astype(float)

    expected_frames.append(temp)

# Concatenate once at the end instead of growing the frame inside the loop.
stackit_expected_data = pd.concat(expected_frames, axis=0)
stackit_expected_data

In [63]:
# Read the final targeting and retraining cost CSVs.
df_targeting, df_retraining = (
    pd.read_csv(csv_path)
    for csv_path in (path_to_targeting_data, path_to_retraining_data)
)

In [64]:
# Load the remaining runs and drop every row whose deployment already appears in
# the targeting or the retraining cost table; then compute the cost per run.
df_remaining = pd.read_csv(path_to_remaining_data)
targeting_deployments = df_targeting["Prefect_Deployments"].unique().tolist()
retraining_deployments = df_retraining["Prefect_Deployments"].unique().tolist()
already_covered = (
    df_remaining["Prefect_Deployments"].isin(targeting_deployments)
    | df_remaining["Prefect_Deployments"].isin(retraining_deployments)
)
df_remaining = df_remaining[~already_covered]
df_remaining = return_cost_per_run(df_remaining)
# Overview of which deployments are left (NaN included).
df_remaining["Prefect_Deployments"].value_counts(dropna=False)

In [65]:
# Stack targeting, retraining and remaining runs into one StackIT cost frame.
cost_frames = [df_targeting, df_retraining, df_remaining]
df_stackit = pd.concat(cost_frames)
df_stackit

In [84]:
# Compare the computed StackIT costs with the expected costs, per date and service.
# One row per (date, expected service) is collected in `res`:
#   expected_costs / expected_hours : values from the expected export
#   orignal_data_costs              : computed costs before any allowed adjustment
#   data_costs                      : computed costs including the allowed adjustment
#   diff / diff_in_per              : expected_costs vs. data_costs (absolute / percent)
result_columns = ["date","serviceName","expected_costs","data_costs","diff","diff_in_per","orignal_data_costs","expected Service","expected_hours","data_hours"]
results_per_date = []
for date in stackit_expected_data["date"].unique():
    print(f"Compare data for date {date}")
    temp_stackit_expected = stackit_expected_data[stackit_expected_data["date"]==date]
    # .copy(): the filtered frame is modified right below; without it pandas may
    # emit a SettingWithCopyWarning.
    temp_stackit = df_stackit[df_stackit["date"]==pd.to_datetime(date).strftime("%Y-%m-%d")].copy()
    temp_stackit["duration"] = temp_stackit["duration"].astype(float)

    # Total cost and total duration per service in a single aggregation.
    temp_stackit_grouped = (
        temp_stackit.groupby(by=["serviceName"])[["cost_per_run", "duration"]].sum().reset_index()
    )
    temp_stackit_grouped["cost_per_run"] = temp_stackit_grouped["cost_per_run"].round(2)
    temp_stackit_grouped["data_hours"] = temp_stackit_grouped["duration"]/60/60 # from seconds to hours

    # Left merge: every expected service is kept; services without computed data get NaN.
    merge_expected_and_data_costs = pd.merge(
        temp_stackit_expected[["date","Service","Gesamtkosten","Gesamtmenge_values"]],
        temp_stackit_grouped,
        left_on="Service",
        right_on="serviceName",
        how="left",
    )
    merge_expected_and_data_costs = merge_expected_and_data_costs.rename(
        columns={
            "Service": "expected Service",
            "Gesamtkosten": "expected_costs",
            "cost_per_run": "data_costs",
            "Gesamtmenge_values":"expected_hours"
        }
    )
    # Keep the unadjusted value for inspection before data_costs is changed.
    merge_expected_and_data_costs["orignal_data_costs"] = merge_expected_and_data_costs["data_costs"]

    for service in discrepancies_allowed.keys():
        if service == "General Purpose Server-g1.3-EU01":
            expected_costs = temp_stackit_expected[temp_stackit_expected["Service"]==service]
            if len(expected_costs) != 1:
                raise Exception(f"Expected exactly one cost point for service {service} on date {date}, found {len(expected_costs)}")
            # Hourly price derived from the expected export: total costs / total quantity.
            cost_per_hour = float(expected_costs["Gesamtkosten"].iloc[0] / expected_costs["Gesamtmenge_values"].iloc[0])
            # 2 * 24 hours are added per date — presumably two agentpool nodes
            # running the whole day; TODO confirm.
            add_costs = 2 * 24 * cost_per_hour
            merge_expected_and_data_costs.loc[merge_expected_and_data_costs["expected Service"] == service, "data_costs"] += add_costs

    # Differences are computed AFTER the allowed adjustment, so that diff/diff_in_per
    # describe the final data_costs and the adjustment is reflected in the check.
    merge_expected_and_data_costs["diff"] = merge_expected_and_data_costs["expected_costs"] - merge_expected_and_data_costs["data_costs"]
    merge_expected_and_data_costs["diff_in_per"] = (1-(merge_expected_and_data_costs["data_costs"]/merge_expected_and_data_costs["expected_costs"]))*100

    results_per_date.append(merge_expected_and_data_costs[result_columns])

# Concatenate once; fall back to an empty, correctly-shaped frame if there were no dates.
res = pd.concat(results_per_date, axis=0) if results_per_date else pd.DataFrame(columns=result_columns)
res

In [85]:
# Abort when any row deviates by more than 0.01 (diff_in_per is expressed in percent).
# NOTE(review): rows whose diff_in_per is NaN (e.g. an expected service without
# computed data after the left merge) compare as False and are not flagged — confirm
# that this is intended.
# A previously used, now disabled variant additionally excluded the services
# listed in discrepancies_allowed from this check.
deviates_too_much = res["diff_in_per"].abs() > 0.01
res_wrong = res[deviates_too_much]
if len(res_wrong) > 0:
    raise Exception(f"There are significant cost differences: {res_wrong}")