In [1]:
# NOTE(review): machine-specific absolute path — this only resolves on the original author's machine.
# Presumably it makes the repository root the working directory so that the `general_functions`
# imports and the relative "DataChecks/..." cache path used below resolve; adjust to your checkout.
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [2]:
import logging
from pathlib import Path

import awswrangler as wr
import numpy as np
import pandas as pd
import seaborn as sns
from matplotlib import pyplot as plt

from general_functions.call_api_with_account_id import call_api_with_accountId, send_to_innkeepr_api_paginated
from general_functions.constants import return_api_url
from general_functions.return_account_ids import return_account_ids

In [3]:
# Analysis configuration: which customer / conversion action / bucket dates to inspect.
# Known ids (customer - id):
#   Lillydoo   - 68c2d9007bd2ec4485bb98ed
#   Asambeauty - 682ed8362fc068cde38c3dff
#   Nikin      - 6834787b13526dc3d1017e06
conversion_action_id = "682ed8362fc068cde38c3dff"
customer = "Asambeauty"
start_date = "20251019"
end_date = "20251019"
# One "YYYYMMDD" string per day, both ends inclusive.
date_range = pd.date_range(start=start_date, end=end_date, freq="D").strftime("%Y%m%d").tolist()
url = return_api_url()
print(f"url = {url}")
# Resolve the account id by customer name; fail with a readable message instead of a bare
# IndexError when the name does not match any account.
matching_account_ids = [acc["id"] for acc in return_account_ids() if acc["name"] == customer]
if not matching_account_ids:
    raise ValueError(f"No account found with name {customer!r}")
account_id = matching_account_ids[0]

In [4]:
# Load the targeting-history conversion updates for the configured date range.
# A local CSV acts as cache: if it exists it is read, otherwise the daily parquet files are
# fetched from S3 and the combined frame is written to the cache path.
# NOTE(review): a CSV round trip does not preserve dtypes, so a cached load may differ in
# dtypes from a fresh S3 load — the later cells cast the columns they rely on.
data_file_path = f"DataChecks/targeting_history_ga_conversion_update/data/targeting_history_{customer}_{conversion_action_id}_{start_date}_{end_date}.csv"
try:
    df = pd.read_csv(data_file_path)
except FileNotFoundError:
    print("File not found, creating new DataFrame.")
    # Collect the daily frames and concatenate once instead of growing a frame inside the loop.
    daily_frames = []
    for date in date_range:
        try:
            print(f"Reading data for {date}")
            temp = wr.s3.read_parquet(f"s3://{account_id}/targeting.history/{date}/ga_conversion_update_{conversion_action_id}.parquet")
        except wr.exceptions.NoFilesFound:
            print(f". No data for {date}")
            continue
        temp["bucket_date"] = date
        daily_frames.append(temp)
    df = pd.concat(daily_frames) if daily_frames else pd.DataFrame()
    if df.empty:
        # Do not cache an empty result: pd.read_csv raises EmptyDataError on an empty file,
        # which would break every following run of this cell.
        print("No data found for the requested date range, nothing cached.")
    else:
        # Make sure the cache directory exists before writing.
        Path(data_file_path).parent.mkdir(parents=True, exist_ok=True)
        df.to_csv(data_file_path, index=False)
df

In [5]:
# Cast "created" to pandas' string dtype (it is used as grouping key and plot axis below).
df["created"] = df["created"].astype("string")

In [6]:
# Total properties.revenue for every (created, session.date) pair, ordered by session date.
df["properties.revenue"] = df["properties.revenue"].astype("float")
df.groupby(by=["created","session.date"])["properties.revenue"].sum().reset_index().sort_values(by=["session.date","created"])

In [8]:
# Same aggregation for value_based_conversion_value.
# NOTE(review): the execution counts of this cell and the next (8, 7) are out of order —
# verify the notebook with Restart Kernel -> Run All.
df["value_based_conversion_value"] = df["value_based_conversion_value"].astype("float")
df.groupby(by=["created","session.date"])["value_based_conversion_value"].sum().reset_index().sort_values(by=["session.date","created"])

In [7]:
# Number of rows per session.date within each "created" value.
df.groupby("created")["session.date"].value_counts().reset_index().sort_values(by=["session.date","created"])

In [None]:
# NOTE(review): no other cell in this notebook references a "value" column — confirm it exists.
df[["value"]]

In [None]:
# Per-row cutoff: the calendar day 30 days before "created", stored as a "YYYY-MM-DD" string
# so it can be compared with "session.date" in the next cell.
# NOTE(review): assumes "session.date" is also an ISO "YYYY-MM-DD" string — confirm.
df["create_30_days"] = (
    (pd.to_datetime(df["created"]) - pd.Timedelta(days=30)).dt.date.astype("string")
)
# Rows per session date. Kept as the last expression so the notebook actually renders it
# (as the first statement of the cell its result was silently discarded).
df["session.date"].value_counts().sort_index()

In [None]:
# Rows whose session date is not older than 30 days before "created".
# Boolean indexing returns a new frame: later changes to df (dtype casts, merged columns)
# are not reflected in data_30_days.
data_30_days = df[df["session.date"] >= df["create_30_days"]]
# Printed explicitly — a bare expression in the middle of a cell is not displayed.
print(data_30_days.shape)
data_30_days[["conv_prob", "value_based_conversion_value"]]

In [None]:
# Two stacked panels over "created":
#   top    - conversion probability for all rows vs. the 30-day subset
#   bottom - adapted vs. original conversion value for the 30-day subset
fig = plt.figure(figsize=(12, 6))

ax1 = fig.add_subplot(211)
ax1.set_title(f"Conversionprob per date for {customer} - {conversion_action_id}")
sns.lineplot(x="created", y="conv_prob", data=df, label="probs", ax=ax1)
sns.lineplot(x="created", y="conv_prob", data=data_30_days, label="probs (30 days)", linestyle="--", ax=ax1)
# plt.xticks acts on the current axes, which is the subplot added last (ax1 here).
plt.xticks(rotation=90)
ax1.legend(bbox_to_anchor=(1.05, 1))
ax1.set_ylim([0,1])
ax1.grid(True)

ax2 = fig.add_subplot(212)
ax2.set_title(f"Value per date for {customer} - {conversion_action_id}")
sns.lineplot(x="created", y="value_based_conversion_value", data=data_30_days, label="adapted conversions", ax=ax2)
sns.lineplot(x="created", y="properties.revenue", data=data_30_days, label="original conversions", linestyle="--", color="red", ax=ax2)
ax2.legend(bbox_to_anchor=(1.05, 1))
ax2.grid(True)
plt.xticks(rotation=90)
fig.tight_layout()


In [None]:
# Distribution of the conversion probability: all rows vs. the 30-day subset.
# Both histograms carry a label so the legend has entries and the two can be told apart
# (a legend without labelled artists only produces a matplotlib warning).
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(111)
ax.set_title(f"Distribution of conversion probability for {customer} - {conversion_action_id}")
sns.histplot(data=df, y="conv_prob", ax=ax, label="all rows")
sns.histplot(data=data_30_days, y="conv_prob", ax=ax, label="last 30 days")
plt.xticks(rotation=90)
ax.legend(bbox_to_anchor=(1.05, 1))
ax.grid(True)
fig.tight_layout()

In [None]:
# Number of rows per session.date, one line per "created" value.
# The y column "count" is the name pandas >= 2.0 gives to the value_counts() result.
date_count = (
    df.groupby("created")["session.date"]
    .value_counts()
    .reset_index()
    .sort_values(by="session.date")
)
fig = plt.figure(figsize=(12, 6))
ax = fig.add_subplot(111)
ax.set_title(f"Number of sessions per date for {customer} - {conversion_action_id}")
sns.lineplot(x="session.date", y="count", hue="created", data=date_count, ax=ax)
plt.xticks(rotation=90)
ax.legend(bbox_to_anchor=(1.05, 1))
ax.grid(True)

In [None]:
# Top: number of rows per "created" (update date).
# Bottom: value_based_conversion_value per "created" (seaborn aggregates repeated x values).
date_count_total = df.groupby("created")["session.date"].count().reset_index().sort_values(by="created")
fig = plt.figure(figsize=(12, 6))

ax = fig.add_subplot(211)
ax.set_title(f"Number of sessions per update date for {customer} - {conversion_action_id}")
# label= gives the legend an entry; without it plt.legend() only emitted a
# "no artists with labels" warning.
sns.lineplot(data=date_count_total, x="created", y="session.date", ax=ax, label="sessions")
# The plotted column holds counts, not dates — label the axis accordingly.
ax.set_ylabel("number of sessions")
plt.xticks(rotation=90)
ax.legend(bbox_to_anchor=(1.05, 1))
# Leave some headroom above the highest count.
ax.set_ylim(0, date_count_total["session.date"].max() + 1000)
ax.grid(True)

ax2 = fig.add_subplot(212)
ax2.set_title(f"Average adapted conversion value per update date for {customer} - {conversion_action_id}")
sns.lineplot(data=df, x="created", y="value_based_conversion_value", ax=ax2, label="adapted conversion value")
ax2.set_ylabel("adjusted conversion value")
plt.xticks(rotation=90)
ax2.legend(bbox_to_anchor=(1.05, 1))
ax2.grid(True)
# Lay out once, after both axes exist.
fig.tight_layout()

# Merge with treatments

In [None]:
# Query the API's treatments endpoint for every treatment id referenced by the history rows.
treatment_ids = df["signal.relates_to.treatment"].dropna().unique().tolist()
treatments = send_to_innkeepr_api_paginated(
    f"{url}/treatments/query",
    account_id,
    {"id": treatment_ids},
    logger=logging,
)

In [None]:
# Flatten the nested API records and keep only the columns used for the merge below.
treatment_columns = ["name","id","relates_to.campaign.name"]
treatments_df = pd.json_normalize(treatments).loc[:, treatment_columns]
treatments_df

In [None]:
# Attach treatment name and campaign name to every history row.
# - The generic "name"/"id" columns are renamed so they cannot collide with columns df may
#   already have (a collision would yield "_x"/"_y" suffixed columns).
# - Duplicate treatment ids are dropped and the merge is validated as many-to-one, so the
#   join can never multiply history rows.
treatment_lookup = (
    treatments_df
    .drop_duplicates(subset="id")
    .rename(columns={"name": "treatment_name", "id": "treatment_id"})
)
# Remove columns left over from a previous run of this cell so re-running it is idempotent.
df = df.drop(columns=[col for col in treatment_lookup.columns if col in df.columns])
df = pd.merge(
    df,
    treatment_lookup,
    left_on="signal.relates_to.treatment",
    right_on="treatment_id",
    how="left",
    validate="many_to_one",
)
df

In [None]:
# Rows per (created, campaign name), largest groups first.
campaign_groups = df.groupby(by=["created","relates_to.campaign.name"])
df_count_campaigns = (
    campaign_groups["session.date"]
    .count()
    .reset_index()
    .sort_values(by="session.date", ascending=False)
)
df_count_campaigns

In [None]:
# Label rows by campaign: campaigns whose name contains "it_brand" (case-insensitive) keep
# their campaign name, everything else (including rows without a campaign) becomes "other".
# NOTE(review): this assignment used to appear twice, verbatim; the repetition was a no-op and
# has been removed. If a second name pattern was intended, extend the mask here.
is_test_campaign = df["relates_to.campaign.name"].str.contains("it_brand", case=False, na=False)
df["test_campaigns"] = df["relates_to.campaign.name"].where(is_test_campaign, "other")

df["test_campaigns"].value_counts(dropna=False)

In [None]:
# Count of non-null properties.revenue per campaign label, restricted to sessions dated
# within 30 days before the latest "created" value.
cutoff_date = (pd.to_datetime(df["created"].max()) - pd.Timedelta(days=30)).strftime("%Y-%m-%d")
recent_rows = df[df["session.date"] >= cutoff_date]
conversions_last_30_days = (
    recent_rows.groupby(by=["test_campaigns"])["properties.revenue"].count().reset_index()
)
conversions_last_30_days

In [None]:
# describe() statistics of the original and the value-based revenue per (created, campaign
# label), side by side: "revenue_*" columns vs. "vb_revenue_*" columns.
group_keys = ["created","test_campaigns"]
grouped = df.groupby(by=group_keys)
revenue_stats = grouped["properties.revenue"].describe().add_prefix("revenue_").reset_index()
temp = grouped["value_based_conversion_value"].describe().add_prefix("vb_revenue_").reset_index()
plot_data = pd.merge(revenue_stats, temp, on=group_keys, how="left").sort_values(by=group_keys)
plot_data

In [None]:
# Top: number of revenue values per "created" and campaign label.
# Bottom: mean original revenue (solid) vs. mean value-based revenue (dashed).
# The y limits (6000 and 2) are hard-coded for the data set at hand.
fig = plt.figure(figsize=(12, 6))

ax = fig.add_subplot(211)
ax.set_title(f"Count revenue per date")
sns.lineplot(x="created", y="revenue_count", hue="test_campaigns", data=plot_data, ax=ax)
ax.set_ylim(0,6000)
ax.grid(True)
# Hide the x ticks of the upper panel; the lower panel shows the dates.
plt.xticks([])
ax.legend(bbox_to_anchor=(1.05, 1))

ax = fig.add_subplot(212)
sns.lineplot(x="created", y="revenue_mean", hue="test_campaigns", data=plot_data, ax=ax)
sns.lineplot(x="created", y="vb_revenue_mean", hue="test_campaigns", data=plot_data, linestyle="--", ax=ax)
ax.set_ylim([0,2])
ax.grid(True)
plt.xticks(rotation=90)
ax.legend(bbox_to_anchor=(1.05, 1))

In [None]:
# Original revenue against the value-based conversion value, one point per row.
scatter_ax = sns.scatterplot(x="properties.revenue", y="value_based_conversion_value", data=df)
scatter_ax.set_xlabel("properties.revenue")
scatter_ax.set_ylabel("value_based_conversion_value")
scatter_ax.set_title("Scatterplot of value_based_conversions and properties.revenue")
plt.show()

In [None]:
# Number of distinct sessions per anonymousId.
# NOTE(review): count_anonymousIds is not used by any later cell in this notebook.
count_anonymousIds = df.groupby(by=["anonymousId"])["session"].nunique().reset_index()

In [None]:
# Column subset of df; only its "created" range is used below, as the time window for the
# conversions query in the next cell.
test_conv = df[['created', 'profile', 'anonymousId', 'treatment', 'conv_prob',
       'session.date', 'session', 'conv_created',
       'value_based_conversion_value', 'properties.revenue']]
min_date = test_conv["created"].min()
max_date = test_conv["created"].max()
print(min_date, max_date)

In [None]:
from general_functions.datetime_helper import transform_date_to_timestamp_milliseconds

# Fetch all conversions whose "created" lies between the smallest and largest "created" of
# the history rows. No filter on the conversion name is applied (e.g. "checkout_completed").
# NOTE(review): if the helper maps a date to the start of that day, "$lte" excludes
# conversions created later on max_date — confirm the helper's semantics.
created_window = {
    "$gte": transform_date_to_timestamp_milliseconds(min_date),
    "$lte": transform_date_to_timestamp_milliseconds(max_date),
}
conversions_resp = send_to_innkeepr_api_paginated(
    f"{url}/conversions/query",
    account_id,
    {"created": created_window},
    logger=logging,
)
len(conversions_resp)

In [None]:
def extrac_data(x, key):
    """Return ``x[key]`` from a mapping, or ``None`` when it is not available.

    Args:
        x: A dict-like object (anything exposing ``keys()``), or a missing value such as
            ``None`` or the float NaN pandas uses for absent cells.
        key: The key to look up.

    Returns:
        The value stored under ``key``; ``None`` if ``x`` is not dict-like or does not
        contain ``key``.
    """
    # Missing cells arrive as None or NaN; neither has keys(), so treat both as "no data"
    # instead of raising AttributeError.
    if not hasattr(x, "keys"):
        return None
    return x[key] if key in x.keys() else None

In [None]:
# Conversions as a frame, restricted to the anonymousIds that occur in the history data.
conversion = pd.DataFrame(conversions_resp)
conversion = conversion[conversion["anonymousId"].isin(df["anonymousId"].unique())]
print(len(conversion))
# .copy(): new columns are added below; assigning to a filtered/sub-selected frame can
# raise pandas' SettingWithCopyWarning.
conversion = conversion[["created","anonymousId","sessionId","properties","name"]].copy()
# Pull revenue / total out of the nested "properties" dict (None when absent).
conversion["revenue_conv"] = conversion["properties"].apply(lambda x: extrac_data(x,"revenue"))
conversion["total_conv"] = conversion["properties"].apply(lambda x: extrac_data(x,"total"))
conversion

In [None]:
# Number of conversions per conversion name.
conversion["name"].value_counts()

In [None]:
# Same count, restricted to anonymousIds present in df.
# NOTE(review): `conversion` was already filtered with this exact condition when it was
# built, so this should reproduce the previous cell's output.
conversion[conversion["anonymousId"].isin(df["anonymousId"].unique())]["name"].value_counts()

In [None]:
# Join the 30-day history rows with the conversions recorded for the same session id.
# Both join keys must share a dtype. data_30_days was sliced from df in an earlier cell, so
# casting df["session"] alone does not reach it — its own column is cast as well.
df["session"] = df["session"].astype("string")
data_30_days = data_30_days.assign(session=data_30_days["session"].astype("string"))
conversion["sessionId"] = conversion["sessionId"].astype("string")
df_with_conv = pd.merge(data_30_days, conversion, how="left", left_on="session", right_on="sessionId", suffixes=("", "_conv"))
# Keep only conversions created on or after the session date (both compared as strings).
# Rows without a matching conversion have no "created_conv" and are expected to drop out here.
# NOTE(review): pd.to_datetime reads plain integers as nanoseconds — if the API delivers
# "created" as epoch milliseconds, unit="ms" is required; confirm the payload format.
conversion_day = pd.to_datetime(df_with_conv["created_conv"]).dt.date.astype("string")
df_with_conv = df_with_conv[df_with_conv["session.date"].astype("string") <= conversion_day]
# One row per (session, conversion name).
df_with_conv = df_with_conv.drop_duplicates(subset=["session","name_conv"])
df_with_conv

In [None]:
# Matched conversions per conversion name.
df_with_conv["name_conv"].value_counts(dropna=False)

In [None]:
# Bring the matched conversion name back onto every 30-day row; sessions without a matched
# conversion get the literal string "None".
# A session that matched several conversion names appears once per name after this merge.
df_conv_name = pd.merge(data_30_days, df_with_conv[["session","name_conv"]], how="left", on=["session"])
df_conv_name["name_conv"] = df_conv_name["name_conv"].fillna("None")
df_conv_name

In [None]:
# Rows per conversion name, including the "None" placeholder.
df_conv_name["name_conv"].value_counts(dropna=False)

In [None]:
# anonymousIds with at least one matched "checkout_completed" conversion.
unique_anonymousIds_with_checkout = df_with_conv[df_with_conv["name_conv"]=="checkout_completed"]["anonymousId"].unique()
len(unique_anonymousIds_with_checkout)

In [None]:
# Flag rows of users that have at least one "checkout_completed" conversion, then compare
# the value-based conversion value across (purchased?, conversion name) groups.
df_conv_name["users_purchased"] = df_conv_name["anonymousId"].isin(unique_anonymousIds_with_checkout)
purchase_groups = df_conv_name.groupby(by=["users_purchased","name_conv"])
purchase_groups["value_based_conversion_value"].describe().reset_index()

In [None]:
# Same grouping as the previous cell, for the original properties.revenue.
df_conv_name.groupby(by=["users_purchased","name_conv"])["properties.revenue"].describe().reset_index()

# Analyze single date

In [None]:
# Restrict the analysis to two specific "created" values.
# NOTE(review): these hard-coded dates lie outside the start_date/end_date configured at the
# top of the notebook, so df_date is presumably empty for the current configuration — adjust.
df_date = df[df["created"].isin(["20250723","20250713"])]

In [None]:
df_date

In [None]:
# Distinct sessions per "created".
df_date.groupby("created")["session"].nunique()

In [None]:
# Distinct users per "created".
df_date.groupby("created")["anonymousId"].nunique()

In [None]:
# anonymousIds that occur more than once within the same "created".
vc_anonymousid = df_date.groupby("created")["anonymousId"].value_counts()
vc_anonymousid = vc_anonymousid[vc_anonymousid>1]
vc_anonymousid.reset_index()

In [None]:
# Rows per session.date for the selected "created" values; only session dates after
# 2025-06-15 are drawn.
date_count = (
    df_date.groupby("created")["session.date"]
    .value_counts()
    .reset_index()
    .sort_values(by="session.date")
)
recent_counts = date_count[date_count["session.date"]>"2025-06-15"]
fig = plt.figure(figsize=(20, 6))
ax = fig.add_subplot(111)
ax.set_title(f"Number of sessions per session.date for {customer} - {conversion_action_id}")
sns.lineplot(x="session.date", y="count", hue="created", data=recent_counts, ax=ax)
plt.xticks(rotation=90)
ax.legend(bbox_to_anchor=(1.05, 1))
ax.grid(True)

# Count by Month

In [None]:
df

In [None]:
# Derive the calendar month ("YYYY-MM") of "created" and count the rows per month.
df["month"] = pd.to_datetime(df["created"]).dt.strftime("%Y-%m")
count_conversion_by_month = df.groupby("month")["created"].count().reset_index()
count_conversion_by_month

In [None]:
# Monthly totals of the original and the value-based conversion value.
total_sum_conversion_value = df.groupby("month")[["properties.revenue","value_based_conversion_value"]].sum().reset_index()
total_sum_conversion_value

In [None]:
# Distinct sessions per month.
count_unique_sessions_by_month = df.groupby("month")["session"].nunique().reset_index()
count_unique_sessions_by_month

In [None]:
# describe() statistics of the original revenue per month.
stats_conversion_value = df.groupby("month")["properties.revenue"].describe()
stats_conversion_value

In [None]:
# describe() statistics of the value-based conversion value per month.
stats_conversion_value_modified = df.groupby("month")["value_based_conversion_value"].describe()
stats_conversion_value_modified