In [1]:
# Switch the working directory to the project checkout. Presumably this is what makes
# the `src.*` imports and the relative `data/...` path used later resolve — TODO confirm.
# NOTE(review): machine-specific absolute path; adjust when running elsewhere.
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/

In [2]:
import os
import json
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt 
import seaborn as sns
from src.utils.cost_handling import return_cost_per_run
from src.utils.accounts import sanitize_account_name


# Categorize Costs
- Use grouping to categorize costs

In [3]:
# --- Configuration -----------------------------------------------------------
# Reporting window; both dates are embedded in the input/output locations below.
from_date = "2024-01-01"
to_date = "2025-06-23"
# Becomes a folder name under the figures directory.
timestamp = "2025-06-23 13:23:56.520433"

# Single place to adjust when the repository lives somewhere else
# (NOTE(review): machine-specific absolute path).
project_root = "/Users/karolinegriesbach/Documents/Innkeepr/Git/consumption-based-costs/"
# Folder name shared by all data/figure locations of this reporting window.
date_range_dir = f"{from_date}_to_{to_date}"

# Input: mapping of nodes to Prefect runs.
path_to_dir_targeting = f"{project_root}data/{date_range_dir}/targeting_and_retraining/"
path_to_targeting_date = f"{path_to_dir_targeting}map_nodes_to_prefect_runs_{from_date}_{to_date}.csv"

# Output: CSV exports.
path_to_save = f"{project_root}data/{date_range_dir}/costs/"
file_costs = f"final_targeting_costs_{from_date}_{to_date}.csv"

# Prefect deployments that are treated as targeting runs.
targeting_runs = ["targeting","googleConversionUpdate"]

# Output: figures.
path_to_figures = f"{project_root}analysis/figures/{date_range_dir}/{timestamp}/targeting_costs/"

In [4]:
# Make sure the CSV export folder and the figure folder exist before anything is written.
for output_dir in (path_to_save, path_to_figures):
    os.makedirs(output_dir, exist_ok=True)

# Load Data

In [5]:
# Read the targeting costs mapping (generated via analysis/3_cost_targeting.ipynb)
# and discard every column whose name contains "Unnamed".
df_targeting = pd.read_csv(path_to_targeting_date)
is_unnamed_column = df_targeting.columns.str.contains("Unnamed", regex=False)
df_targeting = df_targeting.loc[:, ~is_unnamed_column]
df_targeting.shape

In [6]:
# Get the cost per run (computed by the project helper), order the rows by
# date and service, and keep only the columns the analysis below works with.
columns_for_analysis = [
    "date",
    "cloud",
    "Prefect_Deployments",
    "serviceName",
    "tenant",
    "audience",
    "audience_id",
    "duration",
    "sum_duration_serviceName",
    "total_charge_of_serviceName",
    "part_of_costs_per_run",
    "cost_per_run",
    "targetingOutlookDays",
]
df_targeting = (
    return_cost_per_run(df_targeting)
    .sort_values(by=["date", "serviceName"])
    .loc[:, columns_for_analysis]
)

In [7]:
# Sanity check: per (date, serviceName) the cost shares of all runs are expected
# to add up to 1 after rounding to the nearest integer.
vc_all = (
    df_targeting
    .groupby(by=["date", "serviceName"])["part_of_costs_per_run"]
    .sum()
    .reset_index()
)
share_is_off = vc_all["part_of_costs_per_run"].round() != 1
# This service is used for agentpools as well, so its share can be != 1.
is_shared_service = vc_all["serviceName"] == "General Purpose Server-g1.3-EU01"
vc = vc_all[share_is_off & ~is_shared_service]
if not vc.empty:
    raise ValueError(f"Some the costs does not match the sum: {vc}")

In [12]:
# Keep only the deployments configured in `targeting_runs` (defined in the config
# cell) instead of repeating the list here, so filter and configuration cannot drift apart.
df_targeting = df_targeting[df_targeting["Prefect_Deployments"].isin(targeting_runs)]
df_targeting = df_targeting.reset_index(drop=True)
# Calendar month of each run as a plain string; it is used as grouping key
# and as categorical x axis in the monthly plots below.
df_targeting["month"] = (
    pd.to_datetime(df_targeting["date"], format="mixed")
    .dt.to_period("M")
    .astype("str")
)
df_targeting

In [9]:
# Number of rows per deployment type; missing values get their own bucket.
deployment_column = df_targeting["Prefect_Deployments"]
deployment_column.value_counts(dropna=False)

In [14]:
# Tag every row with the customer bucket its tenant belongs to.
# The path is relative, so the working directory must be the project root.
with open("data/infrastructure_insights/customer_buckets.json", encoding="utf-8") as f:
    customer_buckets = json.load(f)

# Tenants that appear in no bucket keep None; pandas' groupby drops such rows
# by default, so they are excluded from the per-bucket aggregations further down.
df_targeting["customer_bucket"] = None
for customer_bucket, bucket_config in customer_buckets.items():
    # Account names are normalised with the project helper before comparing to `tenant`.
    consider_accounts = [sanitize_account_name(account) for account in bucket_config["accounts"]]
    in_bucket = df_targeting["tenant"].isin(consider_accounts)
    # If an account is listed in several buckets, the bucket that comes last wins.
    df_targeting.loc[in_bucket, "customer_bucket"] = customer_bucket
df_targeting.groupby("customer_bucket")["tenant"].unique()

In [10]:
# Persist the filtered and bucket-tagged per-run cost table (without the row index).
targeting_costs_file = path_to_save + file_costs
df_targeting.to_csv(targeting_costs_file, index=False)

# Cost per Customer and Day

In [26]:
# Sub-folder that collects all per-customer figures.
path_to_figures_customer = path_to_figures + "costs_per_customer/"
os.makedirs(path_to_figures_customer, exist_ok=True)

In [83]:
# Per day and tenant: summed run costs and number of distinct audiences.
daily_keys = ["date", "tenant"]
runs_by_day_and_tenant = df_targeting.groupby(by=daily_keys)
costs_per_customer = (
    runs_by_day_and_tenant["cost_per_run"]
    .sum()
    .reset_index()
    .rename(columns={"cost_per_run": "sum_of_costs"})
)
count_audiences = (
    runs_by_day_and_tenant["audience_id"]
    .nunique()
    .reset_index()
    .rename(columns={"audience_id": "count_audiences"})
)
costs_per_customer = costs_per_customer.merge(count_audiences, how="left", on=daily_keys)
# Display only: the sorted copy is shown, `costs_per_customer` itself keeps its order.
costs_per_customer.sort_values(by=["tenant", "date"])

In [84]:
# All tenants in one line chart of daily targeting costs (upper half of the figure).
fig = plt.figure(figsize=(15, 9))
ax1 = fig.add_subplot(2, 1, 1)
sns.lineplot(
    data=costs_per_customer,
    x="date",
    y="sum_of_costs",
    hue="tenant",
    marker="o",
    ax=ax1,
)
ax1.set_title("Daily targeting costs per customer")
ax1.grid(True)
# The date ticks are hidden on purpose in this overview.
ax1.set_xticks([])
# Legend is placed to the right, outside of the axes.
ax1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
fig.savefig(f"{path_to_figures_customer}cost_per_customer_and_day.png", bbox_inches='tight', dpi=300)

In [85]:
# One two-panel figure per tenant: daily costs (top) and daily number of
# audiences (bottom). Every tenant gets its own PNG; the previous trailing
# `break` stopped after the first tenant.
for customer in costs_per_customer["tenant"].unique():
    costs_per_customer_plot = costs_per_customer[costs_per_customer["tenant"] == customer]
    fig = plt.figure(figsize=(15,5))
    fig.suptitle(customer)

    ax1 = fig.add_subplot(2,1,1)
    ax1.set_title("Daily targeting costs per customer")
    sns.lineplot(data=costs_per_customer_plot,x="date",y="sum_of_costs",hue="tenant", marker="o", ax=ax1)
    ax1.grid(True)
    # Dates are only labelled on the lower panel.
    ax1.set_xticks([])
    ax1.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)

    ax2 = fig.add_subplot(2,1,2)
    ax2.set_title("Daily # audiences per customer")
    # The legend would repeat the one of the upper panel, so it is not drawn at all.
    sns.lineplot(data=costs_per_customer_plot,x="date",y="count_audiences",hue="tenant", marker="o", legend=False, ax=ax2)
    ax2.grid(True)
    ax2.tick_params(axis="x", labelrotation=90)

    fig.tight_layout()
    fig.savefig(f"{path_to_figures_customer}cost_per_customer_and_day_{customer}.png",bbox_inches='tight',dpi=300)
    # Release the figure; clearing it alone keeps it registered with pyplot.
    plt.close(fig)

In [86]:
# Descriptive statistics of the daily costs per tenant, highest mean first.
costs_per_customer_stats = (
    costs_per_customer
    .groupby(by=["tenant"])["sum_of_costs"]
    .describe()
    .reset_index()
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)
costs_per_customer_stats.to_csv(f"{path_to_save}cost_per_customer_date_stats.csv", index=False)
costs_per_customer_stats

In [87]:
# Compact view: mean and standard deviation of the daily costs per tenant, two decimals.
summary_columns = ["tenant", "mean", "std"]
costs_per_customer_stats[summary_columns].round(2)

In [88]:
# Distribution of the daily targeting costs, one box per tenant.
fig = plt.figure(figsize=(15, 5))
fig.suptitle("Average targeting costs per day")
ax = fig.add_subplot(1, 1, 1)
sns.boxplot(data=costs_per_customer, x="tenant", y="sum_of_costs", ax=ax)
# Tenant names are long, so the tick labels are turned upright.
plt.xticks(rotation=90)
ax.grid(True)
plt.show()
fig.savefig(f"{path_to_figures_customer}cost_per_customer_boxplot.png", bbox_inches='tight', dpi=300)

# Cost per Customer and Month

In [None]:
# Display the current targeting frame before it is aggregated per month below.
df_targeting

In [90]:
# Summed run costs per tenant and month, rounded to two decimals.
costs_per_customer_month = (
    df_targeting
    .groupby(by=["tenant", "month"])["cost_per_run"]
    .sum()
    .reset_index()
    .rename(columns={"cost_per_run": "sum_of_costs"})
)
costs_per_customer_month = costs_per_customer_month.assign(
    sum_of_costs=costs_per_customer_month["sum_of_costs"].round(2),
    # Month is kept as a string so it can serve as merge key and categorical axis.
    month=costs_per_customer_month["month"].astype("str"),
)
costs_per_customer_month

In [91]:
# Descriptive statistics of the monthly cost sums per tenant, highest mean first.
monthly_costs_by_tenant = costs_per_customer_month.groupby(by=["tenant"])["sum_of_costs"]
costs_per_customer_month_stats = (
    monthly_costs_by_tenant
    .describe()
    .reset_index()
    .sort_values(by="mean", ascending=False)
    .reset_index(drop=True)
)
costs_per_customer_month_stats

In [92]:
# Monthly cost sum per tenant; tenants are told apart by colour and by line/marker style.
fig = plt.figure(figsize=(15, 5))
fig.suptitle("Sum targeting costs per month")
ax = fig.add_subplot(1, 1, 1)
sns.lineplot(
    data=costs_per_customer_month,
    x="month",
    y="sum_of_costs",
    hue="tenant",
    style="tenant",
    markers=True,
    ax=ax,
)
plt.xticks(rotation=90)
# Legend is placed to the right, outside of the axes.
ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
ax.grid(True)
plt.show()
fig.savefig(f"{path_to_figures_customer}cost_per_customer_per_month.png", bbox_inches='tight', dpi=300)

# Cost per Customer and Month per Audience

In [93]:
# Distinct audiences per tenant and month, joined onto the monthly cost sums to
# derive the monthly cost per audience.
tenant_month_keys = ["tenant", "month"]
count_audiences_per_month = (
    df_targeting
    .groupby(by=tenant_month_keys)["audience_id"]
    .nunique()
    .reset_index()
    .rename(columns={"audience_id": "count_audiences"})
)
count_audiences_per_month["month"] = count_audiences_per_month["month"].astype("str")
count_audiences_per_month = costs_per_customer_month.merge(
    count_audiences_per_month, how="left", on=tenant_month_keys
)
monthly_cost_per_audience = (
    count_audiences_per_month["sum_of_costs"] / count_audiences_per_month["count_audiences"]
)
count_audiences_per_month["cost_per_audience (month)"] = monthly_cost_per_audience.round(2)
count_audiences_per_month

In [94]:
# 2x2 figure. Left column: tenant-months whose monthly cost per audience is above
# `max_y_threshold`; right column: the remaining ones. Top row shows the cost per
# audience, bottom row the number of audiences of the same rows. Splitting at the
# threshold keeps the cheap tenants readable next to the expensive ones.
min_y_threshold = 5  # NOTE(review): not used in this cell
max_y_threshold = 9

cost_column = "cost_per_audience (month)"
legend_kwargs = dict(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
panels = [
    # (rows to plot, subplot slot of cost panel, subplot slot of audience panel, title suffix)
    (count_audiences_per_month[count_audiences_per_month[cost_column] > max_y_threshold], 1, 3, f"> {max_y_threshold}"),
    (count_audiences_per_month[count_audiences_per_month[cost_column] <= max_y_threshold], 2, 4, f"<= {max_y_threshold}"),
]

fig = plt.figure(figsize=(15, 10))
fig.suptitle("Costs per Audience")
for panel_data, cost_slot, audience_slot, threshold_label in panels:
    ax_cost = fig.add_subplot(2, 2, cost_slot)
    ax_cost.set_title(f"Cost per audience {threshold_label}")
    sns.lineplot(data=panel_data, x="month", y=cost_column, hue="tenant", style="tenant", markers=True, ax=ax_cost)
    # Months are only labelled on the lower panel of each column.
    ax_cost.set_xticks([])
    ax_cost.grid(True)
    ax_cost.legend(**legend_kwargs)

    ax_audiences = fig.add_subplot(2, 2, audience_slot)
    ax_audiences.set_title("# Audiences")
    sns.lineplot(data=panel_data, x="month", y="count_audiences", hue="tenant", style="tenant", markers=True, ax=ax_audiences)
    ax_audiences.legend(**legend_kwargs)
    ax_audiences.grid(True)
    ax_audiences.tick_params(axis="x", labelrotation=90)

# Lay out and save BEFORE showing: once the figure has been shown, pyplot-level
# calls such as plt.tight_layout() may no longer act on this figure.
fig.tight_layout()
fig.savefig(f"{path_to_figures_customer}count_audiences_per_month.png",bbox_inches='tight',dpi=300)
plt.show()

In [95]:
# Across all tenants: monthly cost sum, number of distinct audiences and the
# resulting cost per audience.
runs_by_month = df_targeting.groupby(by="month")
total_cost_per_audience_per_month = (
    runs_by_month["cost_per_run"]
    .sum()
    .reset_index()
    .rename(columns={"cost_per_run": "sum_of_costs"})
)
total_audiences_per_month = (
    runs_by_month["audience_id"]
    .nunique()
    .reset_index()
    .rename(columns={"audience_id": "count_audiences"})
)
# Both sides of the join use the month as string.
for monthly_frame in (total_cost_per_audience_per_month, total_audiences_per_month):
    monthly_frame["month"] = monthly_frame["month"].astype("str")
total_cost_per_audience_per_month = total_cost_per_audience_per_month.merge(
    total_audiences_per_month, how="left", on=["month"]
)
total_cost_share = (
    total_cost_per_audience_per_month["sum_of_costs"]
    / total_cost_per_audience_per_month["count_audiences"]
)
total_cost_per_audience_per_month["cost_per_audience (month)"] = total_cost_share.round(2)

total_cost_per_audience_per_month

# Cost per Customer Categorization

In [98]:
# Monthly cost sum, number of distinct audiences and cost per audience for each
# customer bucket. Rows whose `customer_bucket` is missing are dropped by groupby.
cost_per_audience_month_customer_size = df_targeting.groupby(by=["month","customer_bucket"])["cost_per_run"].sum().reset_index().rename(columns={"cost_per_run":"sum_of_costs"})
count_audiences_per_month_customer_size = df_targeting.groupby(by=["month","customer_bucket"])["audience_id"].nunique().reset_index().rename(columns={"audience_id":"count_audiences"})
cost_per_audience_month_customer_size = pd.merge(cost_per_audience_month_customer_size,count_audiences_per_month_customer_size,how="left",on=["month","customer_bucket"])
cost_per_audience_month_customer_size["cost_per_audience (month)"] = (cost_per_audience_month_customer_size["sum_of_costs"]/cost_per_audience_month_customer_size["count_audiences"]).round(2)
# sort_values returns a new frame; the result has to be assigned, otherwise the
# sort has no effect on what is displayed below.
cost_per_audience_month_customer_size = cost_per_audience_month_customer_size.sort_values(by=["customer_bucket","month"]).reset_index(drop=True)
cost_per_audience_month_customer_size

In [108]:
# Wide table: one row per month, one column per customer bucket, values are the
# monthly cost per audience. The month index is written to the CSV as well.
pivot_costs_per_audience = cost_per_audience_month_customer_size.pivot(
    index='month',
    columns='customer_bucket',
    values='cost_per_audience (month)',
)
pivot_costs_per_audience.to_csv(f"{path_to_save}cost_per_audience_by_customer_category.csv")
pivot_costs_per_audience

# Cost per Customer Categorization & Outlook

In [None]:
# Same aggregation as per customer bucket, additionally split by the targeting
# outlook (`targetingOutlookDays`). Rows with a missing grouping key are dropped by groupby.
outlook_keys = ["month","customer_bucket","targetingOutlookDays"]
cost_per_audience_month_customer_size_outlook = df_targeting.groupby(by=outlook_keys)["cost_per_run"].sum().reset_index().rename(columns={"cost_per_run":"sum_of_costs"})
count_audiences_per_month_customer_size_outlook = df_targeting.groupby(by=outlook_keys)["audience_id"].nunique().reset_index().rename(columns={"audience_id":"count_audiences"})
cost_per_audience_month_customer_size_outlook = pd.merge(cost_per_audience_month_customer_size_outlook,count_audiences_per_month_customer_size_outlook,how="left",on=outlook_keys)
cost_per_audience_month_customer_size_outlook["cost_per_audience (month)"] = (cost_per_audience_month_customer_size_outlook["sum_of_costs"]/cost_per_audience_month_customer_size_outlook["count_audiences"]).round(2)
cost_per_audience_month_customer_size_outlook = cost_per_audience_month_customer_size_outlook.sort_values(by=["customer_bucket","month","cost_per_audience (month)"])
# index=False, like the other flat exports of this notebook: after sorting, the
# row index is only a shuffled counter and would end up as an unnamed CSV column.
cost_per_audience_month_customer_size_outlook.to_csv(f"{path_to_save}cost_per_audience_by_customer_category_outlook.csv",index=False)

In [35]:
# Small multiples: one panel per customer bucket showing the monthly cost per
# audience, with one line per targeting outlook.
outlook_buckets = cost_per_audience_month_customer_size_outlook["customer_bucket"].unique()
nr_plots = len(outlook_buckets)
# Fixed two panels per row; the number of rows follows from the number of buckets.
# (The former int(nr_plots/2) columns became 0 for a single bucket and crashed.)
nr_subplots = 2
nr_rows = int(np.ceil(nr_plots / nr_subplots))
fig = plt.figure(figsize=(15,10))
for iplot, customer_bucket in enumerate(outlook_buckets):
    temp_plot = cost_per_audience_month_customer_size_outlook[cost_per_audience_month_customer_size_outlook["customer_bucket"] == customer_bucket]
    ax = fig.add_subplot(nr_rows, nr_subplots, iplot+1)
    ax.set_title(f"customer categorization: {customer_bucket}")
    sns.lineplot(data=temp_plot, x="month", y="cost_per_audience (month)", hue="targetingOutlookDays", style="targetingOutlookDays", markers=True, ax=ax)
    ax.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    ax.grid(True)
    # Only the lowest panel of each column gets month labels: a panel is the
    # lowest one when no further panel is placed directly below it.
    if iplot + nr_subplots >= nr_plots:
        ax.tick_params(axis="x", labelrotation=90)
    else:
        ax.set_xticks([])
fig.tight_layout()
# The figure contains all buckets, so the file name must not depend on the
# loop variable (which would be whatever bucket was plotted last).
fig.savefig(f"{path_to_figures}cost_per_audience_month_outlook_by_customer_bucket.png",bbox_inches='tight',dpi=300)
plt.show()
