In [1]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

# Check CATE Values for Tchibo Resource Switch

In [2]:
import logging
import pandas as pd
import awswrangler as wr
import matplotlib.pyplot as plt
import seaborn as sns
from general_functions.return_account_ids import return_account_ids
from general_functions.constants import return_api_url
from general_functions.call_api_with_account_id import call_api_with_accountId

In [None]:
# Run configuration: customer (account name), audience, and the targeting-history
# snapshots to compare. Dates are YYYYMMDD strings, so min()/max() on them
# sort chronologically.
customer = "LILLYDOO"
audience_id = "6811db64879ec17410925ee0"
dates = ["20251107","20251110"]

url = return_api_url()
print(f"url = {url}")

# Resolve the customer name to its account id. Fail with an explicit message
# instead of a bare IndexError when no account carries that name.
matching_account_ids = [acc["id"] for acc in return_account_ids() if acc["name"] == customer]
if not matching_account_ids:
    raise ValueError(f"No account found for customer {customer!r}")
# As before, the first match wins if several accounts share the name.
account_id = matching_account_ids[0]

# S3 prefix of the targeting history; the account id is used as the bucket name.
path_to_hist = f"s3://{account_id}/targeting.history/"

In [10]:
# Query the audience by id; anything other than exactly one hit is an error.
audiences = call_api_with_accountId(
    endpoint_url=f"{url}/audiences/query",
    accountID=account_id,
    content={"id": audience_id},
    logger=logging,
)
if len(audiences) != 1:
    raise ValueError(f"Expected 1 audience, got {len(audiences)}: {audiences}")

audience_name, source = audiences[0]["name"], audiences[0]["source"]

# Display only: the whitespace-free name is shown but not stored.
(audience_name.replace(" ", ""), source)

In [11]:
# Fetch all models of the audience and inspect those created after the
# earliest comparison date.
models = call_api_with_accountId(
    endpoint_url=f"{url}/models/query", accountID=account_id, content={"audienceId": audience_id}, logger=logging
)

# Compare on calendar-day strings (YYYY-MM-DD), which sort chronologically.
# Computed once instead of once per model.
min_date_day = pd.to_datetime(min(dates)).strftime("%Y-%m-%d")

check_models = [
    entry
    for entry in models
    if pd.to_datetime(entry["created"]).strftime("%Y-%m-%d") > min_date_day
]
if len(check_models) > 1:
    raise ValueError(f"Expected 1 model, got {len(check_models)}: {check_models}")
elif len(check_models) == 0:
    max_model_date = max([entry["created"] for entry in models], default=None)
    print(f"Latest model date: {max_model_date}")
    # `models` can be empty, in which case there is no date to compare;
    # previously this path crashed when calling .strftime on the missing value.
    if max_model_date is not None and pd.to_datetime(max_model_date).strftime("%Y-%m-%d") > min_date_day:
        raise ValueError(f"Model created after {min(dates)}")
else:
    # NOTE(review): this branch is reached when exactly one model was created
    # after min(dates), yet the message above says "Expected 1 model" —
    # confirm which case is actually the intended one.
    raise Exception(f"Something wrong with models: {check_models}")


In [35]:
# Load the targeting history of the audience for every date in `dates`.
# From the second file onwards only sessions already seen in earlier files are
# kept. Per-date frames are collected and concatenated once at the end, rather
# than growing a DataFrame inside the loop.
sessions = []
frames = []
for date in dates:
    path_to_file = f"{path_to_hist}{date}/{audience_id}.parquet"
    temp = wr.s3.read_parquet(path_to_file)[["session","anonymousId","treatment","cate","counterfactual"]]
    # assign() returns a new frame, so no SettingWithCopyWarning on the column slice.
    temp = temp.assign(**{"targeting.date": date})
    if len(sessions) != 0:
        print(f"Filter for users: {len(sessions)}")
        temp = temp[temp["session"].isin(sessions)]
    frames.append(temp)
    new_sessions = temp["session"].unique().tolist()
    sessions = list(set(sessions).union(new_sessions))
    print("updated sessions: ", len(sessions))

# pd.concat raises on an empty list, so fall back to an empty frame when
# `dates` is empty (matches the previous result in that case).
tar_hist = pd.concat(frames) if frames else pd.DataFrame()

tar_hist.head()

In [36]:
# Keep only the sessions that occur on both the earliest and the latest date.
sessions_min = tar_hist.loc[tar_hist["targeting.date"] == min(dates), "session"].unique().tolist()
sessions_max = tar_hist.loc[tar_hist["targeting.date"] == max(dates), "session"].unique().tolist()
sessions_both = list(set(sessions_min) & set(sessions_max))
print(f"Sessions in both dates: {len(sessions_both)}")

# The per-date lists are no longer needed once the intersection exists.
del sessions_min, sessions_max

tar_hist = tar_hist.loc[tar_hist["session"].isin(sessions_both)]

## Check Diff in CATE By AnonymousId

In [40]:
# Wide table of the counterfactual: one row per anonymousId, one column per date.
# DataFrame.pivot raises if an (anonymousId, date) pair occurs more than once.
tar_hist_pivot_cf = tar_hist.pivot(
    values="counterfactual",
    columns="targeting.date",
    index="anonymousId",
)

# "diff" is the ratio latest / earliest; "diff_perc" is that ratio as a percentage change.
tar_hist_pivot_cf["diff"] = tar_hist_pivot_cf[max(dates)] / tar_hist_pivot_cf[min(dates)]
tar_hist_pivot_cf["diff_perc"] = (tar_hist_pivot_cf["diff"] - 1) * 100

(tar_hist_pivot_cf.head(), tar_hist_pivot_cf["diff_perc"].describe())

In [37]:
# Wide table of the CATE: one row per anonymousId, one column per targeting date.
tar_hist_pivot = tar_hist.pivot(
    values="cate",
    columns="targeting.date",
    index="anonymousId",
)
tar_hist_pivot.head()

In [15]:
# "diff" is the CATE ratio latest / earliest date; "diff_perc" expresses the
# same ratio as a percentage change relative to the earliest date.
tar_hist_pivot["diff"] = tar_hist_pivot[max(dates)] / tar_hist_pivot[min(dates)]
tar_hist_pivot["diff_perc"] = (tar_hist_pivot["diff"] - 1) * 100
tar_hist_pivot.head()

In [16]:
# Disabled: histogram (with KDE) of `diff_perc` from `tar_hist_pivot`.
# The whole cell is commented out and has no effect when run.
# fig = plt.figure(figsize=(12, 6))
# ax = fig.add_subplot(111)
# ax.set_title(f"Difference in %")
# sns.histplot(data=tar_hist_pivot, y="diff_perc", ax=ax, kde=True)
# plt.xticks(rotation=90)
# plt.legend(bbox_to_anchor=(1.05, 1))
# plt.ylim([0,1000])
# plt.grid(True)
# plt.tight_layout()

In [27]:
# Box plots of the percentage CATE change per anonymousId:
# left panel shows the full range, right panel is zoomed to 0-100 %.
# Uses the explicit Axes interface throughout. The former plt.legend() calls
# were dropped: the plots have no labelled artists, so they only produced a
# warning and an empty legend box.
fig, (ax, ax2) = plt.subplots(1, 2, figsize=(5, 5))

ax.set_title("Difference in %")
sns.boxplot(data=tar_hist_pivot, y="diff_perc", ax=ax)
ax.tick_params(axis="x", rotation=90)
ax.grid(True)

ax2.set_title("Difference in % (Zoom)")
sns.boxplot(data=tar_hist_pivot, y="diff_perc", ax=ax2)
ax2.tick_params(axis="x", rotation=90)
ax2.set_ylim([0, 100])
ax2.grid(True)

fig.tight_layout()

In [28]:
# Summary statistics of the percentage CATE change.
tar_hist_pivot.loc[:, "diff_perc"].describe()

In [29]:
# Share of anonymousIds whose `diff_perc` is greater than 1.
# The filter is one-sided (`diff_perc > 1`); negative changes are not counted.
vc_change_gt_1 = tar_hist_pivot.loc[tar_hist_pivot["diff_perc"] > 1]
print(f"total number of changes: {len(vc_change_gt_1)} from total history {len(tar_hist_pivot)}")
perc_change_gt1 = len(vc_change_gt_1) / len(tar_hist_pivot) * 100
print(f"percentage of changes: {perc_change_gt1}")

In [30]:
# Share of anonymousIds whose `diff_perc` is greater than 20.
# NOTE(review): the variable names say "gt_5" but the threshold applied is 20;
# the names are kept because a later cell reads `vc_change_gt_5`.
# The filter is one-sided (`diff_perc > 20`); negative changes are not counted.
vc_change_gt_5 = tar_hist_pivot.loc[tar_hist_pivot["diff_perc"] > 20]
print(f"total number of changes: {len(vc_change_gt_5)} from total history {len(tar_hist_pivot)}")
perc_change_gt5 = len(vc_change_gt_5) / len(tar_hist_pivot) * 100
print(f"percentage of changes: {perc_change_gt5}")

In [31]:
# First 50 rows of the filtered subset, largest `diff_perc` first.
vc_change_gt_5.sort_values("diff_perc", ascending=False).iloc[:50]

# Check Change in Group
Out of interest – new users play a role as well!

In [32]:
# Wide table of the treatment value: one row per anonymousId, one column per targeting date.
tar_hist_group = tar_hist.pivot(
    values="treatment",
    columns="targeting.date",
    index="anonymousId",
)
tar_hist_group.head()

In [34]:
# Flag per anonymousId whether the treatment value is identical on the
# earliest and the latest date. A missing value on either side compares as
# "not equal" and is therefore counted as a change.
tar_hist_group["same_group"] = tar_hist_group[max(dates)] == tar_hist_group[min(dates)]
vc_group_change = tar_hist_group["same_group"].value_counts(dropna=False)

# Count directly on the boolean column rather than indexing the value counts
# with True/False: value_counts only lists values that occur, so the lookup
# raised KeyError when every user stayed (or every user changed).
n_total = int(vc_group_change.sum())
n_changed = int((~tar_hist_group["same_group"]).sum())
# Guard against an empty table instead of dividing by zero.
percentage = n_changed / n_total * 100 if n_total else float("nan")

print(f"Change in %: {percentage}")
print(f"Total anonymousId: {n_total}")
vc_group_change