In [None]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/

In [None]:
import os
import re
import json
import logging
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from datetime import datetime, timedelta

from src.utils.accounts import sanitize_account_name
from src.utils.innkeepr_api import call_api_with_service_token, send_to_innkeepr_api_paginated
from src.utils.constants import return_api_url_innkeepr

# Define Variables

In [None]:
# Reporting window; both values are embedded in the input and output paths below.
from_date = "2024-01-01"
to_date="2025-06-05"
# Name of the sub-directory (inside the window directory) that holds the extracted cost files.
timestamp = "2025-06-05 19:48:58.031710"
# NOTE(review): absolute, user-specific paths; the notebook only runs on this machine as written.
path_to_dir = f"/Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/data/{from_date}_to_{to_date}/{timestamp}/"
path_to_save = f"/Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/data/{from_date}_to_{to_date}/targeting_and_retraining/"
path_to_data=f"{path_to_dir}final_costs_with_azure_and_aws_and_db_{from_date}_{to_date}.csv"
url = return_api_url_innkeepr()
# Date handling for the cost window used further down:
#   "start"              - rows with an earlier date are dropped
#   "exlude_date_ranges" - inclusive start/end ranges that are removed as well
# NOTE(review): the key is misspelled ("exlude"); it is read with the same spelling
# later in the notebook, so both places must be changed together.
stackit_cost_handling = {
    "start": "2024-11-11",
    "exlude_date_ranges": [
        {
            "start": "2024-12-22",
            "end" : "2025-01-03",
        }
    ]
}

In [None]:
os.makedirs(path_to_save, exist_ok=True)

# Load Data

In [None]:
df = pd.read_csv(path_to_data)
df = df[[col for col in df.columns if "Unnamed" not in col]]
df

In [None]:
df.info()

In [None]:
df["Prefect_Deployments"] = df["Prefect_Deployments"].replace("retrainng", "retraining")

In [None]:
df["Prefect_Deployments"].value_counts()

In [None]:
df["Deployments"].value_counts()

# Filter Data
Filter data for targeting and retraining runs using Deployments and Prefect_Deployments

In [None]:
# get stackIT costs via Prefect_Deployments:
# keep every row that is flagged as a targeting or retraining run in either deployment column.
# (The cost extractor used to write "retrainng" instead of "retraining"; this has been fixed.)
run_types = ["targeting", "retraining"]
targeting_and_retraining_runs = df[
    df["Deployments"].isin(run_types) | df["Prefect_Deployments"].isin(run_types)
].copy()  # explicit copy: a column of this frame is overwritten later in the notebook
targeting_and_retraining_runs

In [None]:
targeting_and_retraining_runs["Prefect_Deployments"].value_counts()

# Historical count of targeting runs and prefect runs

In [None]:
# Split the runs by Prefect deployment type.
is_targeting = targeting_and_retraining_runs["Prefect_Deployments"]=="targeting"
is_retraining = targeting_and_retraining_runs["Prefect_Deployments"]=="retraining"
targeting_runs = targeting_and_retraining_runs[is_targeting]
retraining_runs = targeting_and_retraining_runs[is_retraining]

# Number of runs per day for each deployment type.
runs_vc_by_date = (
    pd.DataFrame(targeting_runs.groupby("date")["Prefect_Deployments"].value_counts())
    .reset_index()
    .rename(columns={"count":"count targeting runs"})
)
runs_vc_by_date_retraining = (
    pd.DataFrame(retraining_runs.groupby("date")["Prefect_Deployments"].value_counts())
    .reset_index()
    .rename(columns={"count":"count retraining runs"})
)

# Number of distinct audiences that were targeted per day.
audiences_unique_by_date = pd.DataFrame(
    targeting_runs.groupby("date")["audience_id"].nunique()
).reset_index()
concat = pd.merge(runs_vc_by_date, audiences_unique_by_date, on="date")

# One line per series: (frame, y column, line style, legend label).
fig = plt.figure(figsize=(20,8))
daily_series = [
    (runs_vc_by_date, "count targeting runs", "--", "# targeting runs"),
    (runs_vc_by_date_retraining, "count retraining runs", "--", "# retraining runs"),
    (audiences_unique_by_date, "audience_id", ":", "unique audiences"),
]
for daily_frame, y_column, line_style, legend_label in daily_series:
    sns.lineplot(
        x="date",
        y=y_column,
        # only dates after 2025-01-01 are plotted
        data=daily_frame[daily_frame["date"] > "2025-01-01"],
        marker="o",
        linestyle=line_style,
        label=legend_label
    )
plt.title("Count Daily targeting runs")
plt.xticks(rotation=90)
plt.grid(True)

# Check Data Completion for node types, date and costs

In [None]:
print(len(targeting_and_retraining_runs))
null_values = targeting_and_retraining_runs[["node_name","date","charge","machine.type","audience_id","duration","total_charge_of_serviceName","audience"]].isnull().sum()
null_values = pd.DataFrame(null_values).rename(columns={0:"isnull"})
null_values["percentage_of_isnull"] = null_values["isnull"]/len(targeting_and_retraining_runs) * 100
null_values.sort_values(by="isnull")

## Handling Missing Data
- fehlende node_names mit vorhandenen node_names anreichern (über die targeting audience)
- fehlende Kosten über node_names und runtime anreichern
- charges per targeting run erst ab dem 26.05.2025


In [None]:
targeting_and_retraining_runs[["Prefect_Deployments","audience_id","date","charge","node_name","machine.type"]]

In [None]:
# One row per distinct (deployment, audience, node, machine type) combination found in the runs.
node_size_mapping = targeting_and_retraining_runs.drop_duplicates(
    subset=["Prefect_Deployments","audience_id","node_name","machine.type"]
)[["Prefect_Deployments","tenant","audience_id","node_name","machine.type"]]
# check for audiences with several nodes
vc_audience_nodes = node_size_mapping.groupby("audience_id")["node_name"].nunique()
vc_audience_nodes_more_than_one = vc_audience_nodes[vc_audience_nodes>1]
if not vc_audience_nodes_more_than_one.empty:
    # Report how many audiences are affected, not how many audiences exist in total.
    print(f"Several audiences have more than one node type: {len(vc_audience_nodes_more_than_one)}")
node_size_mapping

## Query models for all active accounts

In [None]:
# query models to map audience node_name with targetingOutlook (is one of the main parameter to choose the node size)
# query models to map audience node_name with targetingOutlook (is one of the main parameter to choose the node size)
# The result is cached in path_to_save; the API is only queried when a cache file is missing.
try:
    models = pd.read_csv(f"{path_to_save}all_models.csv")
    with open(f"{path_to_save}ignore_tenants.json", "r") as f:
        ignore_tenants = json.load(f)
except FileNotFoundError:
    print("query data")
    model_frames = []   # one frame per tenant, concatenated once after the loop
    ignore_tenants = [] # tenants without a matching account or without any model
    accounts = call_api_with_service_token(f"{url}/core/accounts/query", {}, logging)
    for tenant in targeting_and_retraining_runs["tenant"].unique():
        print(tenant)
        account_id = [acc["id"] for acc in accounts if sanitize_account_name(acc["name"])==tenant]
        if len(account_id) > 1:
            raise Exception(f"More than one account with name {tenant}")
        if len(account_id) == 0:
            print(f"Tenant {tenant} not found in accounts")
            ignore_tenants.append(tenant)
            continue
        account_id = account_id[0]
        temp_models = send_to_innkeepr_api_paginated(
            f"{url}/models/query",
            account_id,
            {},
            logging
        )
        temp_models = pd.json_normalize(temp_models)
        if len(temp_models) == 0:
            print(f"No models found for tenant {tenant}")
            ignore_tenants.append(tenant)
            continue
        # No creation-date filter is applied; every model returned by the API is kept.
        model_frames.append(temp_models)
    # Concatenate once instead of growing a frame inside the loop.
    models = pd.concat(model_frames, ignore_index=True) if model_frames else pd.DataFrame()
    print(models.shape)
    # Targeting runs of non-ignored tenants whose audience has no model at all.
    missing_audiences = targeting_runs[~targeting_runs["tenant"].isin(ignore_tenants)]
    missing_audiences = missing_audiences[~missing_audiences["audience_id"].isin(models["audience"].unique())]
    if not missing_audiences.empty:
        print(f"Missing {len(missing_audiences)} models")
        print(missing_audiences[["tenant","audience_id"]].drop_duplicates())
    # index=False keeps the cached file free of an extra "Unnamed: 0" column on reload.
    models.to_csv(f"{path_to_save}all_models.csv", index=False)
    with open(f"{path_to_save}ignore_tenants.json", "w") as f:
        json.dump(list(ignore_tenants), f)
models

In [None]:
# Audience ids that occur in targeting runs of tenants that were not ignored.
targeted_audience_ids = targeting_runs[~targeting_runs["tenant"].isin(ignore_tenants)]["audience_id"].unique()
first_targeting_date = targeting_runs["date"].min()

# Models created on/after the first targeting date whose audience never shows up in those runs.
audiences_in_models_but_not_in_targeting_runs = models[
    ~models["audience"].isin(targeted_audience_ids)
    & (models["created"] >= first_targeting_date)
]
if not audiences_in_models_but_not_in_targeting_runs.empty:
    print(f"Found {len(audiences_in_models_but_not_in_targeting_runs)} models that are not in the targeting runs")
    raise Exception(audiences_in_models_but_not_in_targeting_runs[["audience","path","created"]])

## Merge Models and Targeting Runs by considering date and targetingOutlookDays

In [None]:
models = models[["audience","created","targetingOutlookDays"]]
models.head()

In [None]:
# Where the tenant is missing, fall back to the account name with spaces removed and lower-cased.
tenant_from_account = targeting_and_retraining_runs["account"].str.replace(" ","").str.lower()
targeting_and_retraining_runs["tenant"] = targeting_and_retraining_runs["tenant"].mask(
    targeting_and_retraining_runs["tenant"].isnull(),
    tenant_from_account,
)

In [None]:
# Attach to every run the model that was current when the run happened.
# The result is assembled from three disjoint-by-intent parts (a, b, c below) and
# its length is checked against the input at the end.
models = models.rename(columns={"audience":"audience_id","created":"created_data_model_for_testing"})
# Filter data_model_for_testing to get the most recent model for each audience before the date in data_for_testing
print(f"targeting_and_retraining_runs = {len(targeting_and_retraining_runs)}")
# Left join: one row per (run, model of the same audience); runs without any model keep NaN model columns.
merged_targeting_runs_with_models = pd.merge(targeting_and_retraining_runs, models, on=['audience_id'], how="left")#suffixes=('_data_for_testing', '_data_model_for_testing'))
# (a) match rows where model.created <= run timestamp (both parsed as UTC)
merged_targeting_runs_with_models_with_previous_models = merged_targeting_runs_with_models[pd.to_datetime(merged_targeting_runs_with_models['created_data_model_for_testing'],utc=True) <= pd.to_datetime(merged_targeting_runs_with_models['timestamp'],utc=True)]
# Keep only the newest such model per run: sort descending by creation value, keep the first row per run key.
# NOTE(review): the sort uses the raw column values, not the parsed datetimes - assumes they order chronologically; confirm.
merged_targeting_runs_with_models_with_previous_models = merged_targeting_runs_with_models_with_previous_models.sort_values(by='created_data_model_for_testing', ascending=False).drop_duplicates(subset=['Prefect_Deployments','tenant','audience_id','timestamp','node_name'], keep='first')
print(f"merged_targeting_runs_with_models_with_previous_models = {len(merged_targeting_runs_with_models_with_previous_models)}")
# (b) get runs whose timestamp is not later than the earliest model of their audience
# Only audience_id and the earliest creation value are merged here, so these rows carry
# no targetingOutlookDays column and end up with NaN for it after the concat below.
merged_targeting_runs_with_models_without_previous_models = models.groupby("audience_id")["created_data_model_for_testing"].min()
merged_targeting_runs_with_models_without_previous_models = pd.DataFrame(merged_targeting_runs_with_models_without_previous_models).reset_index()
merged_targeting_runs_with_models_without_previous_models = pd.merge(targeting_and_retraining_runs, merged_targeting_runs_with_models_without_previous_models, on="audience_id")
# NOTE(review): (a) and (b) both use "<=", so a run whose timestamp equals the earliest
# model creation time satisfies both filters and would be counted twice - confirm whether that can occur.
merged_targeting_runs_with_models_without_previous_models = merged_targeting_runs_with_models_without_previous_models[pd.to_datetime(merged_targeting_runs_with_models_without_previous_models['timestamp'],utc=True) <= pd.to_datetime(merged_targeting_runs_with_models_without_previous_models['created_data_model_for_testing'],utc=True)]
print(f"merged_targeting_runs_with_models_without_previous_models = {len(merged_targeting_runs_with_models_without_previous_models)}")
# (c) get runs of audiences that appear in neither (a) nor (b), restricted to tenants that are not ignored
considered_audiences = merged_targeting_runs_with_models_with_previous_models["audience_id"].unique().tolist() + merged_targeting_runs_with_models_without_previous_models["audience_id"].unique().tolist()
merged_targeting_runs_with_models_null_models = merged_targeting_runs_with_models[
    (merged_targeting_runs_with_models["audience_id"].isin(considered_audiences) == False)]
merged_targeting_runs_with_models_null_models = merged_targeting_runs_with_models_null_models[merged_targeting_runs_with_models_null_models["tenant"].isin(ignore_tenants)==False]
print(f"merged_targeting_runs_with_models_null_models = {len(merged_targeting_runs_with_models_null_models)}")
# concatenate the three parts
merged_targeting_runs_with_models = pd.concat([merged_targeting_runs_with_models_with_previous_models, merged_targeting_runs_with_models_without_previous_models,merged_targeting_runs_with_models_null_models])
merged_targeting_runs_with_models = merged_targeting_runs_with_models.reset_index(drop=True)
print(f"merged_targeting_runs_with_models = {len(merged_targeting_runs_with_models)}")
# Sanity check: the merge must neither lose nor duplicate runs of non-ignored tenants.
targeting_runs_to_compare = targeting_and_retraining_runs[targeting_and_retraining_runs["tenant"].isin(ignore_tenants)==False]
if len(merged_targeting_runs_with_models) != len(targeting_runs_to_compare):
    raise Exception(f"Unequal length targeting_runs_to_compare {len(targeting_runs_to_compare)} vs. merged {len(merged_targeting_runs_with_models)}")
merged_targeting_runs_with_models[["tenant","audience_id","date","node_name","timestamp","created_data_model_for_testing","targetingOutlookDays"]]

In [None]:
# if dataframe does not match lenght - check why
vc_before = pd.DataFrame(targeting_and_retraining_runs[targeting_and_retraining_runs["tenant"].isin(ignore_tenants)==False].groupby(by=["Prefect_Deployments","audience_id"])["date"].value_counts()).reset_index().sort_values(by=["audience_id","date"], ascending=False).reset_index(drop=True)
vc_after = pd.DataFrame(merged_targeting_runs_with_models.groupby(by=["Prefect_Deployments","audience_id"])["date"].value_counts()).reset_index().sort_values(by=["audience_id","date"], ascending=False).reset_index(drop=True)
if vc_after.equals(vc_before):
    print("fine")
else:
    print("not fine")

# Extract node gb size and cpu

In [None]:
# Known short node names and their sizes. The order matters: the first entry whose
# key is contained in the node name wins. A value of None means "not known".
_NAMED_NODE_SIZES = (
    ("medium32g", {"cpu": None, "gb": 32}),
    ("x2large", {"cpu": 29, "gb": 350}),
    ("small", {"cpu": 3, "gb": 4}),
    ("xlarge", {"cpu": 7, "gb": 110}),
    ("medium64g", {"cpu": 3, "gb": 55}),
)


def extract_node_parameters(string,node_spec=None):
    """Return the CPU count or memory size (GB) encoded in a node name.

    Args:
        string: Node name. Anything that is not a ``str`` (e.g. NaN for a
            missing node name) yields ``None``.
        node_spec: ``"cpu"`` or ``"gb"`` - which value to extract.

    Returns:
        The requested value as ``int``, or ``None`` when the input is not a
        string, when the value is not known for a named node size, or when the
        name matches neither a known short name nor the ``<n>cpu<m>gib`` pattern.

    Raises:
        ValueError: If ``string`` is a str and ``node_spec`` is not ``"cpu"`` or ``"gb"``.
    """
    if not isinstance(string, str):
        return None
    if node_spec not in ("cpu", "gb"):
        raise ValueError(f"Node spec needs to be cpu or gb")

    # Known short names first, in the fixed order of the table.
    for name_part, sizes in _NAMED_NODE_SIZES:
        if name_part in string:
            return sizes[node_spec]

    # Otherwise the sizes are expected inside the name itself, e.g. "...8cpu64gib...".
    match = re.search(r"(\d+)cpu(\d+)gib", string)
    if match is None:
        # re.search returns None (not False) when nothing matches.
        print(f"No match found for {string}")
        return None
    return int(match.group(1 if node_spec == "cpu" else 2))


In [None]:
merged_targeting_runs_with_models["node_gb"] = merged_targeting_runs_with_models["node_name"].apply(lambda x: extract_node_parameters(x,node_spec="gb"))
merged_targeting_runs_with_models["node_cpu"] = merged_targeting_runs_with_models["node_name"].apply(lambda x: extract_node_parameters(x,node_spec="cpu"))
merged_targeting_runs_with_models[["tenant","audience_id","node_name","timestamp","targetingOutlookDays","node_gb","node_cpu"]]

## Add missing node sizes via current node size mapping
- needs a long time (to speed it up - get audiences with one specification and just iterate over audiences with several specifications)

In [None]:
# Columns that identify one node specification of an audience.
node_spec_keys = ["audience_id","node_name","targetingOutlookDays","node_gb","node_cpu"]

# Distinct node specifications of all runs that have a node name.
node_size_mapping_with_size = (
    merged_targeting_runs_with_models
    .dropna(subset=["node_name"])
    .drop_duplicates(subset=node_spec_keys)
    [["tenant","audience_id","node_name","node_gb","node_cpu","targetingOutlookDays"]]
)

# Number of different node names per audience.
vc = pd.DataFrame(
    node_size_mapping_with_size.groupby("audience_id")["node_name"].nunique()
).rename(columns={"node_name":"node_count"})
node_size_mapping_with_size = pd.merge(node_size_mapping_with_size, vc, on="audience_id", how="left")

# Latest timestamp at which each node specification occurred.
vc_max = merged_targeting_runs_with_models.groupby(by=node_spec_keys)["timestamp"].max().reset_index()
node_size_mapping_with_size = pd.merge(node_size_mapping_with_size, vc_max, on=node_spec_keys, how="left")

# Mark every mapping column with "_nm" so it cannot clash with the run columns in a later merge.
node_size_mapping_with_size = node_size_mapping_with_size.add_suffix("_nm")
node_size_mapping_with_size

In [None]:
merged_targeting_runs_with_models_with_node_size = pd.merge(
    merged_targeting_runs_with_models,
    node_size_mapping_with_size,
    left_on=["tenant","targetingOutlookDays"],
    right_on = ["tenant_nm","targetingOutlookDays_nm"],
    how="left"
)
merged_targeting_runs_with_models_with_node_size[["tenant","timestamp","audience_id","targetingOutlookDays","node_name","node_gb","node_cpu"]+node_size_mapping_with_size.columns.tolist()]

In [None]:
# Reduce the candidates to one row per (tenant, audience_id, timestamp): after sorting by
# node_gb_nm ascending, keep="last" picks the candidate that sorts last.
# NOTE(review): missing node_gb_nm values sort last by default, so a candidate without a
# size would win over sized ones - confirm this is intended.
# NOTE(review): the key has no Prefect_Deployments/node_name, unlike the dedupe used when
# the models were merged; runs sharing tenant, audience and timestamp collapse into one row.
merged_targeting_runs_with_models_with_node_size = (
    merged_targeting_runs_with_models_with_node_size
    .sort_values(by=["tenant","audience_id","timestamp","node_gb_nm"])
    .drop_duplicates(subset=["tenant","audience_id","timestamp"],keep="last")
)
# Use the run's own node name when present, otherwise the one from the mapping.
merged_targeting_runs_with_models_with_node_size["merged_node_name"] = np.where(
    merged_targeting_runs_with_models_with_node_size["node_name"].isnull(),
    merged_targeting_runs_with_models_with_node_size["node_name_nm"],
    merged_targeting_runs_with_models_with_node_size["node_name"]
)
# "timestamp" is listed once only; listing it twice displayed the column twice.
merged_targeting_runs_with_models_with_node_size[["tenant","timestamp","audience_id","targetingOutlookDays","node_name","node_gb_nm","merged_node_name"]]

## Check Data Quality of Node Size Mapping

In [None]:
null_node_size_mapping = merged_targeting_runs_with_models_with_node_size[merged_targeting_runs_with_models_with_node_size["merged_node_name"].isnull()].drop_duplicates(subset=["tenant","targetingOutlookDays"])
null_node_size_mapping[["tenant","targetingOutlookDays","date"]]
print(f"Found {len(null_node_size_mapping)} null node size mappings")


In [None]:
null_node_size_mapping[["tenant","targetingOutlookDays","node_name","audience_id","date"]]#.drop_duplicates()

In [None]:
merged_targeting_runs_with_models_with_node_size.to_csv(f"{path_to_save}merged_targeting_runs_with_models_with_node_size.csv", index=False)

In [None]:
merged_targeting_runs_with_models_with_node_size[["tenant","timestamp","audience_id","merged_node_name","duration","charge","part_of_costs","total_charge_of_serviceName"]]

# Extract Valid Time Window for Calculating Costs
- erstmal nur StackIT-Kosten berücksichtigen
- StackIT 03.11 - 10.11: sehr viele Testruns
- stackIT start date: 11.11.2024 (Umzug)
- Zeitraum, in dem azure lief (erstmal außen vor lassen): 2024-12-22 bis 2025-01-03


In [None]:
stackit_cost_handling

In [None]:
# Keep only rows on or after the configured start date.
on_or_after_start = merged_targeting_runs_with_models_with_node_size["date"] >= stackit_cost_handling["start"]
df_cleaned = merged_targeting_runs_with_models_with_node_size[on_or_after_start]
print(f"Min to max date in df_cleaned: {df_cleaned['date'].min()} to {df_cleaned['date'].max()}")

# Remove every configured exclusion range; both bounds are inclusive.
for excluded_range in stackit_cost_handling["exlude_date_ranges"]:
    print(f"Excluding date range: {excluded_range['start']} to {excluded_range['end']}")
    inside_excluded_range = df_cleaned["date"].between(excluded_range["start"], excluded_range["end"])
    df_cleaned = df_cleaned[~inside_excluded_range]

df_cleaned = df_cleaned.reset_index(drop=True)
df_cleaned["date"].value_counts().sort_index()

In [None]:
df_cleaned.columns

In [None]:
df_cleaned[["Prefect_Deployments","tenant","audience_id","date","node_name","merged_node_name","charge","part_of_costs","total_charge_of_serviceName"]]

In [None]:
df_cleaned["merged_node_name"].sort_values().unique()

In [None]:
# Rows whose merged node name is one of the short size names or is missing.
# The literal "nan" only matches that exact string, never a real missing value, so
# missing names are selected explicitly as well.
# NOTE(review): assumes "nan" in the original list was meant to catch missing node names - confirm.
short_node_names = ["medium32g", "medium64g", "small", "x2large", "xlarge", "nan"]
azure_node_names = df_cleaned[
    df_cleaned["merged_node_name"].isin(short_node_names)
    | df_cleaned["merged_node_name"].isnull()
]
azure_node_names[["Prefect_Deployments","tenant","audience_id","date","node_name","merged_node_name","machine.type","charge","part_of_costs","total_charge_of_serviceName"]]

In [None]:
# Ad-hoc inspection of the raw cost data.
# NOTE(review): the tenant filter on the next line has no effect - `temp` is overwritten
# immediately by the retraining filter, which starts from `df` again. Chain the filters
# (filter `temp` instead of `df`) if both conditions are meant to apply.
temp = df[df["tenant"]=="ective"]
temp = df[df["Prefect_Deployments"]=="retraining"]
#temp = df[df["audience_id"]=="654199d5c55281d53441fdf2"]
# Suspected source of the wrong node size matching:
#/Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/data/2024-01-01_to_2025-06-05/2025-06-05 19:48:58.031710/prefect_data_2024-01-01_2025-06-05_databricks_prefect_logs.csv -> somewhere here wrong node size matching
temp[["Prefect_Deployments","tenant","audience_id","date","node_name","machine.type","charge","part_of_costs","total_charge_of_serviceName"]]

In [None]:
count_nodes_by_customer = pd.DataFrame(df_cleaned.groupby(by=["date","tenant","Prefect_Deployments"])["merged_node_name"].unique()).reset_index()
count_nodes_by_customer.sort_values(by=["tenant","date"], ascending=True)

In [None]:
charge_null = df_cleaned[df_cleaned["total_charge_of_serviceName"].isnull()]
print(charge_null["date"].min(), charge_null["date"].max(), len(charge_null))
vc_charge_null = pd.DataFrame(charge_null["date"].value_counts().sort_index()).rename(columns={"count":"count_charge_null"})
vc_charge_null

In [None]:
charge_not_null = df_cleaned[df_cleaned["total_charge_of_serviceName"].notnull()]
charge_not_null["date"].min(), charge_not_null["date"].max(), len(charge_not_null)
vc_charge_not_null = pd.DataFrame(charge_not_null["date"].value_counts().sort_index()).rename(columns={"count":"count_charge_not_null"})
vc_charge_not_null

In [None]:
vc_charge_counts = pd.merge(vc_charge_null, vc_charge_not_null, on="date", how="outer").sort_values(by="date")
vc_charge_counts

# Get targeting costs

# Playaround

In [None]:
data_for_testing = targeting_runs[targeting_runs["tenant"]=="pendix"]
data_for_testing = data_for_testing[data_for_testing["date"]>="2025-04-01"].sort_values(by=["audience_id","date"], ascending=False)
data_for_testing[["tenant","audience_id","date","node_name","timestamp"]]

In [None]:
# `models` was reduced and renamed earlier in the notebook ("audience" -> "audience_id",
# "created" -> "created_data_model_for_testing"); the old column names no longer exist
# and would raise a KeyError when the notebook is run top to bottom.
data_model_for_testing = models[models["audience_id"].isin(data_for_testing["audience_id"].unique())]
# Only models created on or after 2025-03-01, newest first within each audience.
data_model_for_testing = data_model_for_testing[
    data_model_for_testing["created_data_model_for_testing"]>="2025-03-01"
].sort_values(by=["audience_id","created_data_model_for_testing"], ascending=False)
data_model_for_testing

In [None]:
data_model_for_testing = data_model_for_testing.rename(columns={"audience":"audience_id","created":"created_data_model_for_testing"})
# Filter data_model_for_testing to get the most recent model for each audience before the date in data_for_testing
merged_data = pd.merge(data_for_testing, data_model_for_testing, on='audience_id', suffixes=('_data_for_testing', '_data_model_for_testing'))
merged_data = merged_data[merged_data['created_data_model_for_testing'] < merged_data['date']]
merged_data = merged_data.sort_values(by='created_data_model_for_testing', ascending=False).drop_duplicates(subset=['tenant','audience_id','timestamp','node_name'], keep='first')
merged_data[["tenant","audience_id","date","node_name","timestamp","created_data_model_for_testing","targetingOutlookDays"]]

In [None]:
vc_before = pd.DataFrame(data_for_testing.groupby("audience_id")["date"].value_counts()).reset_index().sort_values(by=["audience_id","date"], ascending=False)
vc_after = pd.DataFrame(merged_data.groupby("audience_id")["date"].value_counts()).reset_index().sort_values(by=["audience_id","date"], ascending=False)
if vc_after.equals(vc_before):
    print("fine")
else:
    print("not fine")

In [None]:
if len(data_for_testing) != len(merged_data):
    raise Exception("len(data_for_testing) != len(merged_data)")