In [2]:
# Switch the working directory to the repository root so the `general_functions` imports below resolve.
# NOTE(review): absolute, machine-specific path — adjust locally (or make it configurable) before running.
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

# Check CATE Values for Tchibo Ressourcen Switch

In [3]:
import logging
import re
import subprocess
from datetime import datetime

import awswrangler as wr
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

from general_functions.call_api_with_account_id import call_api_with_accountId
from general_functions.constants import return_api_url
from general_functions.return_account_ids import return_account_ids

In [4]:
# Configuration: customer, audience and the targeting dates to compare.
customer = "Junglueck"
audience_id = "66d179bfc98dbc05719f7799"
# Targeting dates as YYYYMMDD strings; they are used as folder names below the
# history path, and min(dates)/max(dates) pick the two snapshots to compare.
dates = ["20251104","20251110"]
url = return_api_url()
print(f"url = {url}")

# Resolve the account id by customer name and fail with a readable message
# (instead of a bare IndexError) when the name matches no account.
matching_account_ids = [acc["id"] for acc in return_account_ids() if acc["name"] == customer]
if not matching_account_ids:
    raise ValueError(f"No account found for customer {customer!r}")
account_id = matching_account_ids[0]
path_to_hist  = f"s3://{account_id}/targeting.history/"

In [5]:
# Query the audience by id; anything other than exactly one hit aborts the notebook.
audiences = call_api_with_accountId(
    endpoint_url=f"{url}/audiences/query",
    accountID=account_id,
    content={"id": audience_id},
    logger=logging,
)
if len(audiences) != 1:
    raise ValueError(f"Expected 1 audience, got {len(audiences)}: {audiences}")

# Keep the name and source of the single match for later cells.
audience_name, source = audiences[0]["name"], audiences[0]["source"]
(audience_name.replace(" ", ""), source)

In [6]:
# Fetch all models of the audience and check which ones were created after the
# earliest targeting date.
models = call_api_with_accountId(
    endpoint_url=f"{url}/models/query", accountID=account_id, content={"audienceId": audience_id}, logger=logging
)
# ISO day string of the earliest targeting date; ISO strings order correctly
# under plain string comparison. Computed once instead of once per model.
first_targeting_day = pd.to_datetime(min(dates)).strftime("%Y-%m-%d")
check_models = [
    entry for entry in models
    if pd.to_datetime(entry["created"]).strftime("%Y-%m-%d") > first_targeting_day
]
if len(check_models) > 1:
    raise ValueError(f"Expected 1 model, got {len(check_models)}: {check_models}")
elif len(check_models) == 0:
    # No model is newer than the first targeting date. The former nested
    # "created after" check was removed: it could never be true in this branch
    # and crashed on `.strftime` when the API returned no models at all.
    max_model_date = max([entry["created"] for entry in models], default=None)
    print(f"Latest model date: {max_model_date}")
else:
    # NOTE(review): exactly one model newer than the first targeting date also
    # aborts, which contradicts the "Expected 1 model" message above — confirm
    # which of the two behaviours is intended.
    raise Exception(f"Something wrong with models: {check_models}")


In [7]:
# Load the targeting history of every date. The first date is kept in full;
# every later date is restricted to sessions already seen on an earlier date.
sessions = set()
history_frames = []
for date in dates:
    path_to_file = f"{path_to_hist}{date}/{audience_id}.parquet"
    temp = wr.s3.read_parquet(path_to_file)[["session","anonymousId","treatment","cate","counterfactual","session.date"]]
    temp["targeting.date"] = date
    if sessions:
        print(f"Filter for users: {len(sessions)}")
        temp = temp[temp["session"].isin(sessions)]
    history_frames.append(temp)
    sessions.update(temp["session"].unique())
    print("updated sessions: ", len(sessions))

# Concatenate once at the end instead of growing the frame inside the loop.
tar_hist_orig = pd.concat(history_frames) if history_frames else pd.DataFrame()
tar_hist_orig.head()

In [8]:
# Keep the (session, anonymousId) pairs that are present on both the earliest
# and the latest targeting date.
print(f"Users and sessions original: {tar_hist_orig.groupby(by=['targeting.date'])['session'].nunique()}")
id_columns = ["session","anonymousId","targeting.date","session.date"]
data_min = tar_hist_orig[tar_hist_orig["targeting.date"] == min(dates)][id_columns].drop_duplicates()
print(len(data_min))
data_max = tar_hist_orig[tar_hist_orig["targeting.date"] == max(dates)][id_columns].drop_duplicates()
print(len(data_max))
# An inner join keeps exactly the pairs found on both dates; this replaces the
# former left join followed by a "both targeting dates not null" filter.
min_and_max = pd.merge(data_min, data_max, on=["session","anonymousId"], how="inner", suffixes=('_min', '_max'))
print(f"Users and sessions ocurring in both: {len(min_and_max)}")
del data_min
del data_max
min_and_max.head()

In [9]:
# Number of distinct anonymousIds attached to each session.
min_and_max.groupby("session")["anonymousId"].agg("nunique")


In [10]:
# Restrict the full history to the pairs present on both targeting dates.
# `min_and_max` was de-duplicated including the session.date columns, so one
# (session, anonymousId) pair can appear in several rows; de-duplicate the keys
# so the merge does not multiply history rows (which would break the pivots below).
keys_in_both = min_and_max[["session","anonymousId"]].drop_duplicates()
tar_hist = pd.merge(
    tar_hist_orig,
    keys_in_both,
    on=["session","anonymousId"],
    how="right"
)
# Combined key used as the pivot index in the following cells.
tar_hist["test_index"] = tar_hist["anonymousId"] + "$" + tar_hist["session"]
print(tar_hist["anonymousId"].nunique(), tar_hist["session"].nunique())
tar_hist.head()

## Check Diff in CATE By AnonymousId

In [21]:
# Store session dates as strings, then spread them into one column per
# targeting date, each labelled "session.date_<targeting date>".
tar_hist["session.date"] = tar_hist["session.date"].astype("string")
tar_hist_pivot_sess_date = (
    tar_hist
    .pivot(index="test_index", columns=["targeting.date"], values="session.date")
    .add_prefix("session.date_")
)
tar_hist_pivot_sess_date.head()

In [23]:
# One row per test_index key, one CATE column per targeting date.
tar_hist_pivot = tar_hist.pivot(
    index="test_index",
    columns=["targeting.date"],
    values="cate",
)
tar_hist_pivot.head()

In [24]:
# Attach the per-date session.date columns to the CATE pivot (index-aligned).
# Any session.date columns left over from a previous run of this cell are
# dropped first, so re-running does not create `_x`/`_y` suffixed duplicates
# that the following cell cannot find.
tar_hist_pivot = pd.merge(
    tar_hist_pivot.drop(columns=tar_hist_pivot_sess_date.columns, errors="ignore"),
    tar_hist_pivot_sess_date,
    left_index=True,
    right_index=True,
    how="left"
)
tar_hist_pivot.head()

In [25]:
# Compare the CATE of the latest targeting date with the earliest one.
# "diff" is the ratio latest / earliest; "diff_perc" is that ratio as a
# percentage change. NOTE(review): a CATE of 0 on the earliest date gives
# inf/NaN here — confirm whether such rows need special handling.
earliest_date, latest_date = min(dates), max(dates)
cate_ratio = tar_hist_pivot[latest_date] / tar_hist_pivot[earliest_date]
tar_hist_pivot["diff"] = cate_ratio
tar_hist_pivot["diff_perc"] = (cate_ratio - 1) * 100
# Flag keys whose session date is the same on both targeting dates.
tar_hist_pivot["session_date_equal"] = tar_hist_pivot[f"session.date_{earliest_date}"].eq(
    tar_hist_pivot[f"session.date_{latest_date}"]
)
tar_hist_pivot.head()

In [41]:
# Box plots of the percentage CATE change, split by whether the session date
# stayed the same: full range on the left, zoomed y-range on the right.
fig = plt.figure(figsize=(5, 5))
# (subplot position, title, y-limits); None keeps the automatic y-range.
panels = [
    (121, "Difference in %", None),
    (122, "Difference in % (Zoom)", (-10, 20)),
]
for position, title, y_limits in panels:
    panel_ax = fig.add_subplot(position)
    panel_ax.set_title(title)
    sns.boxplot(data=tar_hist_pivot, y="diff_perc", hue="session_date_equal", ax=panel_ax)
    panel_ax.tick_params(axis="x", labelrotation=90)
    panel_ax.legend(bbox_to_anchor=(1.05, 1))
    if y_limits is not None:
        panel_ax.set_ylim(y_limits)
    panel_ax.grid(True)
# Lay out both panels once, after everything has been drawn.
fig.tight_layout()

In [28]:
# Summary statistics of the percentage change, per session_date_equal group.
tar_hist_pivot["diff_perc"].groupby(tar_hist_pivot["session_date_equal"]).describe()

## Equal Session Dates

In [48]:
# Show the column labels available once the test_index index is turned into a regular column.
tar_hist_pivot.reset_index().columns

In [51]:
# Spot check: show the rows whose test_index contains one specific id.
# regex=False matches the id literally; na=False treats missing keys as
# "no match" so the result is always usable as a boolean mask.
test = tar_hist_pivot.reset_index()
test[test["test_index"].str.contains("7bf2d394-3e4f-4cf8-824f-c1ca7754824a", regex=False, na=False)]

In [45]:
# Keys whose session date is identical on both targeting dates, largest
# percentage change first; print the range, then show the top 30.
temp_equal = (
    tar_hist_pivot
    .loc[tar_hist_pivot["session_date_equal"].eq(True)]
    .sort_values(by="diff_perc", ascending=False)
)
print(temp_equal["diff_perc"].min(), temp_equal["diff_perc"].max())
temp_equal.head(30)

## Unequal Session Dates

In [44]:
# Keys whose session date differs between the two targeting dates, largest
# percentage change first; print the range, then show the top 30.
temp = (
    tar_hist_pivot
    .loc[tar_hist_pivot["session_date_equal"].eq(False)]
    .sort_values(by="diff_perc", ascending=False)
)
print(temp["diff_perc"].min(), temp["diff_perc"].max())
temp.head(30)

# Check Change in Group
Out of interest: new users play a role as well!

In [34]:
# One row per test_index key, one treatment column per targeting date.
tar_hist_group = tar_hist.pivot(
    index="test_index",
    columns="targeting.date",
    values="treatment",
)
tar_hist_group.head()

In [35]:
# Share of keys whose treatment differs between the earliest and latest targeting date.
tar_hist_group["same_group"] = tar_hist_group[max(dates)] == tar_hist_group[min(dates)]
vc_group_change = tar_hist_group["same_group"].value_counts(dropna=False)
# Use .get with a default: when nobody (or everybody) changed group, one of the
# two labels is absent from the counts and direct indexing raises a KeyError.
changed_count = vc_group_change.get(False, 0)
unchanged_count = vc_group_change.get(True, 0)
compared_count = changed_count + unchanged_count
# Nothing to compare means there is no meaningful percentage.
percentage = changed_count / compared_count * 100 if compared_count else float("nan")
print(f"Change in %: {percentage}")
print(f"Total anonymousId: { vc_group_change.sum()}")
vc_group_change

In [42]:
# Export this notebook to a timestamped HTML file via `jupyter nbconvert`.
# `re`, `subprocess` and `datetime` come from the import cell at the top.

# Define the export name with timestamp. Every character that is not a letter,
# digit, underscore, dot or hyphen is replaced by "_" so that an audience name
# containing e.g. "/" cannot produce an invalid or nested file path.
safe_audience_name = re.sub(r"[^\w.-]", "_", audience_name)
export_name = f"check-cate-values_{customer}_{safe_audience_name}_{datetime.now().strftime('%Y%m%d_%H%M%S')}.html"
export_path = f"./{export_name}"

# Path of this notebook, relative to the working directory set in the first cell.
current_notebook = "SprintStories/Ressourcenverkleinerung/check-cate-values.ipynb"

# Convert notebook to HTML using jupyter nbconvert.
# NOTE(review): nbconvert may resolve a relative --output against the notebook's
# folder rather than the working directory — confirm where the file lands.
result = subprocess.run(
    ["jupyter", "nbconvert", "--to", "html", current_notebook, "--output", export_path],
    capture_output=True,
    text=True
)

if result.returncode == 0:
    print(f"Notebook successfully exported to: {export_path}")
else:
    print(f"Error exporting notebook: {result.stderr}")