In [None]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [None]:
import os
import ast
import logging
import numpy as np
import pandas as pd
import awswrangler as wr
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import ttest_ind
from general_functions.return_account_ids import return_account_ids
from general_functions.sanitize_accout_name import sanitize_account_name
from general_functions.constants import return_api_url
from general_functions.call_api_with_account_id import call_api_with_accountId, send_to_innkeepr_api_paginated

In [None]:
# --- Run configuration -------------------------------------------------------
# Customer to analyse; must be a key of `customer_dict` and match an account name.
customer = "Asambeauty"
# Per-customer settings read by later cells:
#   conversionActionId - used to build the targeting-history file names
#   adAccountId        - signals are kept when the campaign resourceName contains it
#   brand_string       - signals are kept when the campaign name matches this pattern
#   generic_string     - signals are dropped when the campaign name matches this pattern
customer_dict = {
    "to teach":{
        "brand_string":"Brand Bidding", 
        "conversionActionId":"68cbfa5577b55cfe740fbc8b"
        },
    "Kfzteile24": {
        "adAccountId":"2252905905",
        "conversionActionId":"68820a2deafeb4bf3b501beb"
    },
    "Asambeauty": {
        "brand_string":"01|02",
        "conversionActionId":"682ed8362fc068cde38c3dff"
    },
    "LILLYDOO": {
        "generic_string":"generic",
        "conversionActionId":"68c2d9007bd2ec4485bb98ed"
    }
}
# Fail here, with a clear message, instead of deep inside a later cell.
if customer not in customer_dict:
    raise KeyError(f"No entry for customer '{customer}' in customer_dict")
customer_sanitized = sanitize_account_name(customer)
# NOTE(review): machine-specific absolute path — adjust when running elsewhere.
path_save = f"/Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/SprintStories/EN-2874-brand-vs-generic/data/{customer_sanitized}/"
start_date = "20251025"
end_date = "20251101"
# One "YYYYMMDD" string per day, both endpoints included.
date_range = pd.date_range(start=start_date, end=end_date, freq="D").strftime("%Y%m%d").tolist()
# exist_ok avoids the check-then-create race and makes the cell safe to re-run.
os.makedirs(path_save, exist_ok=True)
url = return_api_url()
print(f"url = {url}")
# Resolve the account id by exact name match; an unknown name previously surfaced
# as a bare IndexError on `[0]`.
matching_account_ids = [acc["id"] for acc in return_account_ids() if acc["name"] == customer]
if not matching_account_ids:
    raise ValueError(f"No account named '{customer}' returned by return_account_ids()")
account_id = matching_account_ids[0]

# Get Data

## Load Targeting History of Conversion Actions

In [30]:
# Load the targeting history for the configured conversion action.
# A local CSV acts as cache; on a cache miss the daily parquet files are read
# from S3, combined and written to that CSV.
conversion_action_id = customer_dict[customer]["conversionActionId"]
data_file_path = f"{path_save}targeting_history_{customer}_{conversion_action_id}_{start_date}_{end_date}.csv"
try:
    df = pd.read_csv(data_file_path)
except FileNotFoundError:
    print("File not found, creating new DataFrame.")
    # Collect the daily frames and concatenate once (concat inside the loop is quadratic).
    daily_frames = []
    for date in date_range:
        try:
            print(f"Reading data for {date}")
            temp = wr.s3.read_parquet(f"s3://{account_id}/targeting.history/{date}/ga_conversion_update_{conversion_action_id}.parquet")
        except wr.exceptions.NoFilesFound:
            print(f". No data for {date}")
            continue
        temp["bucket_date"] = date
        daily_frames.append(temp)
    if not daily_frames:
        # Do not cache an empty file: a later run would then fail in read_csv
        # instead of retrying the download.
        raise RuntimeError(
            f"No targeting history found for {customer} ({conversion_action_id}) "
            f"between {start_date} and {end_date}"
        )
    pd.concat(daily_frames).to_csv(data_file_path, index=False)
    # Re-read the cache so a first run yields exactly the same dtypes and index
    # as every later (cached) run.
    df = pd.read_csv(data_file_path)
df = df[["created","profile","anonymousId","treatment","conv_prob","session.date","session","gclid_externalIds","properties.revenue","value_based_conversion_value"]]
# NOTE(review): values loaded from CSV are plain strings, for which explode leaves
# the rows unchanged — confirm whether one row per external id is expected here.
df = df.explode("gclid_externalIds")
df

In [29]:
# Show the columns kept from the targeting history.
df.columns

In [None]:
def return_gclid_id(x):
    """Return the ``"id"`` of the first external-id entry in ``x``, or ``None``.

    Parameters
    ----------
    x : str | list | tuple | dict | Any
        A string holding a Python literal (parsed with ``ast.literal_eval``),
        an already parsed sequence of dicts, or a single dict.

    Returns
    -------
    The value stored under ``"id"`` of the first entry, or ``None`` when ``x``
    is missing (NaN/None), cannot be parsed, is empty, or has no ``"id"`` key.
    """
    if isinstance(x, str):
        try:
            x = ast.literal_eval(x)
        except (ValueError, SyntaxError):
            # Not a valid Python literal -> treat as "no id available".
            return None
    if isinstance(x, dict):
        return x.get("id")
    if isinstance(x, (list, tuple)) and len(x) > 0 and isinstance(x[0], dict):
        return x[0].get("id")
    # NaN, None, empty sequences and any other type carry no id.
    return None

In [None]:
# Derive one external id per row and store it with pandas' nullable string dtype.
df["externalId"] = df["gclid_externalIds"].apply(return_gclid_id).astype("string")

## Load Signals

In [None]:
# Query the signals endpoint for every distinct, non-null external id and
# flatten the response into a DataFrame.
# The `logging` module itself is passed as the fourth argument.
external_ids = df["externalId"].dropna().unique().tolist()
signals = pd.json_normalize(
    send_to_innkeepr_api_paginated(
        f"{url}/signals/query",
        account_id,
        {"externalId": external_ids},
        logging,
    )
)
signals

In [None]:
# Show the per-customer configuration used by the filter below.
customer_dict

In [None]:
# Filter the signals according to the customer's configuration. Exactly one
# rule applies, checked in this order: adAccountId, brand_string, generic_string.
customer_config = customer_dict[customer]
print(f"Found {len(signals)} signals")
if "adAccountId" in customer_config:
    ad_account_id = customer_config["adAccountId"]
    # The account id is matched literally (regex=False); na=False treats missing
    # resource names as "no match" and yields a real boolean mask.
    signals = signals[signals["relates_to.campaign.resourceName"].str.contains(ad_account_id, regex=False, na=False)]
    print(f"Found {len(signals)} signals with adAccountId {ad_account_id}")
elif "brand_string" in customer_config:
    brand_string = customer_config["brand_string"]
    # brand_string is a regular expression (e.g. "01|02" matches either token).
    signals = signals[signals["relates_to.campaign.name"].str.contains(brand_string, na=False)]
    print(f"Found {len(signals)} signals with brand_string {brand_string}")
elif "generic_string" in customer_config:
    generic_string = customer_config["generic_string"]
    # Keep everything that does NOT match; rows without a campaign name are kept.
    signals = signals[~signals["relates_to.campaign.name"].str.contains(generic_string, na=False)]
    print(f"Found {len(signals)} signals without generic_string {generic_string}")
else:
    raise ValueError("adAccountId, brand_string or generic_string not found in customer_dict")


In [None]:
# Sorted distinct campaign names remaining after the filter, for a visual sanity check.
signals["relates_to.campaign.name"].sort_values().unique()

## Merge Targeting History and Signal Id

In [None]:
# Flag every targeting-history row whose external id appears in the filtered signals.
signals["externalId"] = signals["externalId"].astype("string")
signals["brand traffic"] = True
signal_columns = ["externalId", "relates_to.campaign.name", "relates_to.adGroup.name", "brand traffic"]
# One lookup row per external id:
#   - null keys are removed because pandas matches null keys with each other,
#     which would flag history rows that have no external id at all;
#   - duplicate ids are collapsed so the left merge cannot multiply history rows.
signal_lookup = (
    signals[signal_columns]
    .dropna(subset=["externalId"])
    .drop_duplicates(subset="externalId")
)
df = pd.merge(
    # Dropping earlier merge results makes the cell safe to re-run (no _x/_y columns).
    df.drop(columns=signal_columns[1:], errors="ignore"),
    signal_lookup,
    on="externalId",
    how="left",
    validate="many_to_one",
)
# Unmatched rows carry NaN after the merge; anything that is not True becomes False.
df["brand traffic"] = df["brand traffic"].eq(True)
df.head()

In [None]:
# Restrict the data to `created` values on which at least one brand-traffic row exists.
dates_with_brand_traffic = df.loc[df["brand traffic"] == True, "created"].unique()
print(f"dates with brand traffic: {dates_with_brand_traffic}")
keep_created = df["created"].isin(dates_with_brand_traffic)
df = df[keep_created]
print(f"Dates in data: {df['created'].unique()}")
df.shape

# Get Statistics

## t-test

In [None]:
# Two-sample t-test on conv_prob: brand traffic (group1) vs. everything else (group2).
# conv_prob is cast to float here so the test does not depend on a later cell, and
# missing values are dropped: a single NaN would make the p-value NaN and silently
# yield "not significant".
conv_prob_numeric = df["conv_prob"].astype("float")
group1 = conv_prob_numeric[df["brand traffic"] == True].dropna()
group2 = conv_prob_numeric[df["brand traffic"] == False].dropna()
# NOTE(review): this is the equal-variance (Student) test; with very different
# group sizes/variances consider Welch's test via equal_var=False.
t_statistic, p_value = ttest_ind(group1, group2)
print(f"t-statistic: {t_statistic}, p-value: {p_value}")
if np.isnan(p_value):
    print(f"t-test returned NaN (group sizes: {len(group1)} / {len(group2)})")
# The name `signficant` (sic) is kept because later cells reference it.
if p_value < 0.05:
    print("Is significant different")
    signficant = True
else:
    print("Is not significant different")
    signficant = False

In [None]:
# Cast conv_prob to float for the summaries and plots that follow.
# NOTE(review): this cell sits after the t-test cell — confirm the order is intended.
df["conv_prob"] = df["conv_prob"].astype("float")

In [None]:
# conv_prob summary statistics, one column per brand-traffic flag.
df.groupby("brand traffic")["conv_prob"].describe().T

In [None]:
# Horizontal histogram with KDE of conv_prob, coloured by the brand-traffic flag,
# saved as PNG next to the cached data.
fig, ax1 = plt.subplots()
ax1.set_title(f"{customer}: brand vs. generic (signifcant = {signficant})")
sns.histplot(data=df, y="conv_prob", hue="brand traffic", kde=True, ax=ax1)
ax1.grid(True)
fig.tight_layout()
fig.savefig(f"{path_save}brand_vs_generic_histogram.png")

In [None]:
# Box plot of conv_prob per brand-traffic flag, saved as PNG next to the cached data.
fig, ax1 = plt.subplots()
ax1.set_title(f"{customer}: brand vs. generic (signifcant = {signficant})")
sns.boxplot(data=df, y="conv_prob", hue="brand traffic", ax=ax1)
ax1.grid(True)
fig.savefig(f"{path_save}brand_vs_generic_boxplot.png")

In [None]:
# Split violin plot of conv_prob by brand-traffic flag. catplot creates its own
# figure, so title, grid, layout and saving go through the pyplot state machine.
violin_options = dict(kind="violin", bw_adjust=.5, cut=0, split=True)
sns.catplot(data=df, y="conv_prob", hue="brand traffic", **violin_options)
plt.title(f"{customer}: brand vs. generic (signifcant = {signficant})")
plt.grid(True)
plt.tight_layout()
plt.savefig(f"{path_save}brand_vs_generic_catplot.png")


# Apply Boost on Generic traffic
https://miro.com/app/board/uXjVJMhQHk8=/?moveToWidget=3458764646648291672&cot=14

## Exclude Brand Traffic

In [None]:
# Remove a fixed share of the brand-traffic rows of one test day, per session date.
test_date = "20251030"
exclude_fraction = 0.7  # share of brand rows removed per session.date
df_filtered = df[df["created"].astype("string") == test_date]
# Sorted by conv_prob descending, so head(n) below selects the rows with the
# HIGHEST conv_prob of each session date.
# NOTE(review): the name `worst_n_percent` suggests the lowest-probability rows
# were meant — confirm the intended sort direction.
brand_traffic = df_filtered[df_filtered["brand traffic"] == True].sort_values(by="conv_prob", ascending=False)
# Number of rows to drop per session date (fraction of the group size, truncated to int).
worst_n_percent = (brand_traffic.groupby("session.date").size() * exclude_fraction).astype("int")
index_to_exclude = []
for date in worst_n_percent.index:
    temp_index = brand_traffic[brand_traffic["session.date"] == date].head(worst_n_percent[date]).index
    print(f"date: {date} - temp_index: {len(temp_index)}")
    index_to_exclude.extend(temp_index)
print(f"index_to_exclude: {len(index_to_exclude)}")
# Sanity check: exactly the planned number of rows was collected.
if len(index_to_exclude) != worst_n_percent.sum():
    raise Exception("Index does not match")
# Report the frame that is actually being filtered (before / after).
print(f"df_filtered shape before exclusion: {df_filtered.shape}")
# Rows are removed by index label, which assumes unique labels — TODO confirm.
# .copy() gives an independent frame, so later column assignments do not hit a slice.
df_filtered = df_filtered[~df_filtered.index.isin(index_to_exclude)].copy()
df_filtered.shape

In [None]:
# Box plot of conv_prob per brand-traffic flag after the brand exclusion, saved as PNG.
fig, ax1 = plt.subplots()
ax1.set_title(f"{customer}: brand (excluded 70%) vs. generic (signifcant = {signficant})")
sns.boxplot(data=df_filtered, y="conv_prob", hue="brand traffic", ax=ax1)
ax1.grid(True)
fig.savefig(f"{path_save}brand_vs_generic_boxplot_filtered.png")

In [None]:
# Reference: conv_prob summary per brand-traffic flag on the unfiltered frame.
df.groupby("brand traffic")["conv_prob"].describe().T

In [None]:
# Same summary on the frame with the brand rows excluded.
df_filtered.groupby("brand traffic")["conv_prob"].describe().T

## Adjust conversion value

In [None]:
def adjust_revenue(revenue, conversion_prob, top75_threshold, top50_threshold):
    """Scale ``revenue`` by a tier factor chosen from ``conversion_prob``.

    Tiers are checked from highest to lowest:

    * ``conversion_prob >= top75_threshold`` -> ``revenue * 1.05`` (boost)
    * ``conversion_prob >= top50_threshold`` -> ``revenue * 1.00`` (unchanged)
    * otherwise                              -> ``revenue * 0.9``  (reduce)

    A value that compares false against both thresholds (e.g. NaN) falls into
    the last tier. Returns the adjusted revenue.
    """
    tiers = (
        (top75_threshold, 1.05),  # top 25%
        (top50_threshold, 1.00),  # middle 25%
    )
    for threshold, factor in tiers:
        if conversion_prob >= threshold:
            return revenue * factor
    return revenue * 0.9  # bottom 50%

def adjust_brand_revenue(revenue, brand_weighting):
    """Scale ``revenue`` by ``brand_weighting`` and return the result.

    NOTE(review): the previous body was a copy of ``adjust_revenue`` that read
    ``conversion_prob``, ``top75_threshold`` and ``top50_threshold`` — names
    that are not parameters of this function — and never used
    ``brand_weighting``. A plain multiplicative weight is assumed here;
    confirm the intended formula.
    """
    return revenue * brand_weighting

In [33]:
# Start the counterfactual column as a copy of conv_prob. assign() returns a new
# frame, so nothing is written into a filtered slice (no SettingWithCopyWarning).
df_filtered = df_filtered.assign(counterfactual=df_filtered["conv_prob"])
prob_stats = df_filtered["counterfactual"].describe()
# Quartile values of the counterfactual distribution.
top75 = prob_stats["75%"]
top50 = prob_stats["50%"]
# Summary of the counterfactual values for brand traffic only.
stats_brand_traffic = df_filtered.loc[df_filtered["brand traffic"] == True, "counterfactual"].describe()
stats_brand_traffic