In [None]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [None]:
import logging
import os
from datetime import datetime, timedelta
from itertools import combinations

import awswrangler as wr
import delta_sharing
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from scipy.stats import ttest_ind

import general_functions.databricks_client as db_client
from Analysis.cross_account_comparison.models.lstm import NNLSTM
from general_functions.call_api_with_account_id import call_api_with_accountId, send_to_innkeepr_api_paginated
from general_functions.conncet_s3 import S3Connection
from general_functions.constants import return_api_url
from general_functions.datetime_helper import transform_date_to_timestamp_milliseconds
from general_functions.return_account_ids import return_account_ids
from general_functions.return_account_ids import return_account_ids
from general_functions.sanitize_accout_name import sanitize_account_name

In [None]:
# --- Run configuration ---
# The two accounts to compare; both names are used to build the figure directory.
accounts_to_consider = ["Nikin","Asambeauty"]
model_type = "conversion"
url = return_api_url()
print(f"url = {url}")
accounts = return_account_ids()
# Lower bound for the model query and for the date ranges built in later cells.
start_date = "2025-08-01"
# Keeps its trailing slash: later cells append file names directly to it.
path_to_save = "Analysis/cross_account_comparison/"
path_to_save_figures = f"Analysis/cross_account_comparison/figures/{accounts_to_consider[0]}-{accounts_to_consider[1]}"
print(path_to_save_figures)
# Create the output directories up front so that saving figures and downloading
# model files in later cells does not fail on a fresh checkout.
os.makedirs(path_to_save_figures, exist_ok=True)
os.makedirs(f"{path_to_save}models", exist_ok=True)
s3 = S3Connection()

# Model Comparison

In [None]:
def load_model_and_get_stats(path_to_model, path_to_weights):
    """Load a stored LSTM model and summarise its layers and weight tensors.

    The account name and the model date are parsed from the model file name
    (text before the first "-" and the last "*"-separated token before
    "_best_models"). For every weight tensor a histogram and a boxplot are
    drawn; the figure is saved as ``Weights_<account>_<date>.png`` inside the
    ``path_to_save_figures`` directory.

    Parameters
    ----------
    path_to_model : str
        Local path of the model file.
    path_to_weights : str
        Local path of the weights file loaded into the model.

    Returns
    -------
    pandas.DataFrame
        One row per layer (``name``, ``type``, ``params``) followed by the
        ``describe()`` statistics of every weight tensor, tagged with the
        tensor name (``layer``) and its shape (``shape``).
    """
    file_name = path_to_model.split("/")[-1]
    account = file_name.split("-")[0]
    date = file_name.split("_best_models")[0].split("*")[-1]
    model = NNLSTM({}, path_to_model, None)
    model = model.load_lstm_model(path_to_model, compile=True)
    model.load_weights(path_to_weights)

    layer_info = pd.DataFrame(
        [
            {
                "name": layer.name,
                "type": layer.__class__.__name__,
                "params": layer.count_params(),
            }
            for layer in model.layers
        ]
    )

    nr_rows = len(model.weights)
    weight_stats = []
    if nr_rows > 0:  # plt.subplots cannot build a grid with zero rows
        # squeeze=False keeps `axes` two-dimensional even for a single weight tensor.
        fig, axes = plt.subplots(nr_rows, 2, figsize=(10, 9), squeeze=False)
        for iplot, weight in enumerate(model.weights):
            layer_name = weight.name
            values = weight.value.read_value().numpy()
            stats = pd.DataFrame(values).transpose().describe()
            stats["layer"] = layer_name
            stats["shape"] = str(weight.shape)
            weight_stats.append(stats)
            print(f"layer_name = {layer_name}, len values = {len(values)}")
            flat_values = values.flatten()
            hist_ax, box_ax = axes[iplot]
            hist_ax.hist(flat_values, bins=50, alpha=0.7)
            hist_ax.set_title(f"{account} - {date}: Histogram of {layer_name}")
            hist_ax.grid(True)
            box_ax.boxplot(flat_values)
            box_ax.set_title(f"{account} - {date}: Boxplot of {layer_name}")
            box_ax.grid(True)
        fig.tight_layout()
        # The "/" keeps the file inside the figure directory, like every other figure of this notebook.
        fig.savefig(f"{path_to_save_figures}/Weights_{account}_{date}.png")
        plt.show()
        plt.close(fig)

    # Collect the per-tensor statistics once instead of growing a frame inside the loop.
    weights_info = pd.concat(weight_stats) if weight_stats else pd.DataFrame()
    weights_info = pd.concat([layer_info, weights_info], axis=0)
    return weights_info


In [None]:
# Build one table with the model metadata returned by the API and the layer / weight
# statistics of the corresponding model files stored in S3, for every considered account.
account_frames = []
for account in accounts:
    if account["name"] not in accounts_to_consider:
        continue
    print(f"account = {account['name']}")
    sanitized_account_name = sanitize_account_name(account["name"])
    account_id = account["id"]
    # The bucket depends on the account only, so it is built once per account.
    bucket = f"innkeepr-targeting-{sanitized_account_name}"
    df = call_api_with_accountId(f"{url}/models/query", account_id, {"type": model_type, "created": {"$gte": start_date}}, logging)
    df = pd.json_normalize(df)
    if df.empty or "path" not in df.columns:
        print(f"No models found for account {account['name']}")
        continue
    df["account"] = account["name"]
    # Add model properties
    path_frames = []
    for path in df["path"].unique():
        files = s3.list_files_with_pagination(bucket, path)
        print(f"files = {files}")
        model_files = [f for f in files if "conversion_probability_model.h5" in f]
        weight_files = [f for f in files if "weights.h5" in f]
        if not model_files or not weight_files:
            # Without both artefacts the model cannot be loaded; skip instead of failing with IndexError.
            print(f"Skip path {path}: model or weights file not found")
            continue
        model_key = model_files[0]
        weights_key = weight_files[0]
        # Classifier name and neuron count are encoded in the third key segment after "_classifier_".
        model_tokens = model_key.split("/")[2].split("_classifier_")[1].split("_")
        classifier = model_tokens[0]
        neurons = model_tokens[1]
        local_path = f"{path_to_save}models/{model_key.replace('/','*')}"
        s3.download_file(bucket, model_key, local_path)
        local_path_weights = f"{path_to_save}models/{weights_key.replace('/','*')}"
        s3.download_file(bucket, weights_key, local_path_weights)
        model_infos = load_model_and_get_stats(local_path, local_path_weights)
        model_infos["path"] = path
        model_infos["classifier"] = classifier
        model_infos["neurons"] = neurons
        path_frames.append(model_infos)
    if not path_frames:
        continue
    model_params = pd.concat(path_frames)
    account_frames.append(pd.merge(df, model_params, on="path"))
model_comparison = pd.concat(account_frames) if account_frames else pd.DataFrame()
model_comparison.to_csv(f"{path_to_save}/model_comparison.csv", index=False)
model_comparison

## Significance Test for Weights - between dates

In [None]:
def unique_pairs(lst):
    """Return every unordered pair of elements of ``lst`` exactly once.

    Each pair is returned as an ascending-sorted tuple. Pairs appear in the
    order in which ``itertools.combinations`` first produces them.
    """
    sorted_pairs = (tuple(sorted(pair)) for pair in combinations(lst, 2))
    # dict keys keep first-seen order, giving an order-stable de-duplication
    return list(dict.fromkeys(sorted_pairs))

def flatten_values_no_nan(df):
    """Flatten all cells of ``df`` (row by row) into a 1-D array without missing values."""
    cells = pd.Series(df.to_numpy().ravel())
    return cells[cells.notna()].values

def run_ttest(group1, group2, equal_variance=True, p_stat=0.001):
    """Run an independent two-sample t-test and flag significance.

    Parameters
    ----------
    group1, group2 : array-like
        The two samples to compare.
    equal_variance : bool
        Forwarded to ``scipy.stats.ttest_ind`` as ``equal_var``.
    p_stat : float
        Threshold the p-value has to fall below to count as significant.

    Returns
    -------
    dict
        Keys ``t_stat``, ``p_value``, ``p_stat`` and ``significant``.
    """
    outcome = ttest_ind(group1, group2, equal_var=equal_variance)
    return dict(
        t_stat=outcome.statistic,
        p_value=outcome.pvalue,
        p_stat=p_stat,
        significant=outcome.pvalue < p_stat,
    )


In [None]:
# Pairwise t-tests between all model paths, separately for every weight tensor ("layer").
# NOTE(review): the integer-labelled columns hold the describe() statistics written by
# load_model_and_get_stats (count, mean, std, quantiles, ...), not the raw weights —
# confirm that testing on these summary values is intended.
fixed_columns = ["account","path","layer"]
weight_values_columns = [c for c in model_comparison.columns if isinstance(c, (int, np.integer))]
columns_to_use = fixed_columns + weight_values_columns
ttest_result_columns = ["group1","group2","layer","p_value","significant","p_stat","t_stat"]
layers = model_comparison["layer"].dropna().unique()
paths = model_comparison["path"].unique().tolist()
path_combinations = unique_pairs(paths)
print(f"len path_combinations = {len(path_combinations)}")
ttest_records = []
for layer in layers:
    layer_rows = model_comparison[model_comparison["layer"] == layer]
    # Flatten the values of every path once per layer instead of once per pair.
    values_by_path = {
        path: flatten_values_no_nan(layer_rows.loc[layer_rows["path"] == path, weight_values_columns])
        for path in paths
    }
    for comb in path_combinations:
        group1 = values_by_path[comb[0]]
        group2 = values_by_path[comb[1]]
        print(f"layer = {layer}, comb = {comb}, group1 = {np.shape(group1)}, group2 = {np.shape(group2)}")
        result_dict = run_ttest(group1, group2)
        result_dict["group1"] = comb[0]
        result_dict["group2"] = comb[1]
        result_dict["layer"] = layer
        ttest_records.append(result_dict)
# Build the frame once: avoids concatenating onto an empty frame in every iteration
# and gives every result row its own index label.
ttest_results = pd.DataFrame(ttest_records, columns=ttest_result_columns)


In [None]:
# Comparisons ordered by p-value, smallest first.
ttest_results.sort_values(by="p_value", ascending=True)

In [None]:
# Number of comparisons per value of the `significant` flag.
ttest_results["significant"].value_counts()

## Model Score: Fine

In [None]:
# Summary statistics of the f1Score column per account.
model_comparison.groupby(by=["account"])["f1Score"].describe()

## Model Params: Fine

In [None]:
# Summary statistics of the `params` column, grouped by account, type_y and name.
# NOTE(review): type_y is presumably the layer type, suffixed by the merge with the API frame — confirm.
model_comparison.groupby(by=["account","type_y","name"])["params"].describe()

## Model Neurons & Classifier & Layers: fine

In [None]:
# Summary of the `neurons` value parsed from the model file name, per account.
model_comparison.groupby(by=["account"])["neurons"].describe()

In [None]:
# Summary of the `classifier` value parsed from the model file name, per account.
model_comparison.groupby(by=["account"])["classifier"].describe()

In [None]:
# Frequency of each weight-tensor shape per account and weight tensor name.
model_comparison.groupby(by=["account","layer"])["shape"].value_counts()

# Targeting History

In [None]:
# Load the cached targeting history (if any) and append only those
# (account, conversion action, date) exports from S3 that are not cached yet.
try:
    conversion_action_history = pd.read_csv(f"{path_to_save}/conversion_action_history.csv")
except FileNotFoundError:
    print(f"Create empty dataframe")
    conversion_action_history = pd.DataFrame()

# Keys that are already cached. On the first run the cache is an empty frame without
# columns, so the key columns must not be indexed unconditionally (KeyError).
# NOTE(review): ids read back from CSV may have a different type than the ids returned
# by the API, in which case cached entries are never recognised — confirm.
cache_key_columns = ["account", "conversion_action_id", "created"]
if set(cache_key_columns).issubset(conversion_action_history.columns):
    loaded_keys = set(zip(
        conversion_action_history["account"],
        conversion_action_history["conversion_action_id"],
        conversion_action_history["created"].astype("string"),
    ))
else:
    loaded_keys = set()

# The date range is identical for every account, so it is built once.
date_range = pd.date_range(start_date, end="today", freq="D").strftime("%Y%m%d").tolist()
print(f"date_range = {date_range}")

new_frames = []
for account in accounts:
    account_id = account["id"]
    account_name = account["name"]
    if account_name not in accounts_to_consider:
        continue
    print(f"account = {account_name}")
    conversion_actions = call_api_with_accountId(f"{url}/sources/googleAdwords/conversionActions/query", account_id, {}, logging)
    for conversion_action in conversion_actions:
        conversion_actions_id = conversion_action["id"]
        conversion_action_name = conversion_action["name"]
        print(f"Get results for conversion action {conversion_actions_id}")
        for date in date_range:
            cache_key = (account_name, conversion_actions_id, date)
            if cache_key in loaded_keys:
                print(f"Date {date} already exists for conversion action {conversion_actions_id} and account {account_name}")
                continue
            try:
                temp = wr.s3.read_parquet(f"s3://{account_id}/targeting.history/{date}/ga_conversion_update_{conversion_actions_id}.parquet")
            except wr.exceptions.NoFilesFound:
                print(f"No files found for conversion action {conversion_actions_id} on date {date}")
                continue
            temp["account"] = account_name
            temp["conversion_action_id"] = conversion_actions_id
            temp["conversion_action_name"] = conversion_action_name
            new_frames.append(temp)
            loaded_keys.add(cache_key)

# Concatenate once; empty frames are left out so they cannot influence the resulting dtypes.
frames_to_concat = [frame for frame in [conversion_action_history, *new_frames] if not frame.empty]
if frames_to_concat:
    conversion_action_history = pd.concat(frames_to_concat, ignore_index=True)
conversion_action_history = conversion_action_history.drop_duplicates().reset_index(drop=True)
conversion_action_history.to_csv(f"{path_to_save}/conversion_action_history.csv", index=False)


In [None]:
# Restrict the history to the accounts under comparison. The explicit copy makes the
# result an independent frame, so the column assignments in later cells do not act on
# a slice of the unfiltered frame (pandas SettingWithCopyWarning).
conversion_action_history = conversion_action_history[conversion_action_history["account"].isin(accounts_to_consider)].copy()
conversion_action_history["account"].value_counts()

In [None]:
# Distinct, non-missing values of the `strategy` column.
strategies = pd.unique(conversion_action_history["strategy"].dropna()).tolist()
strategies

In [None]:
# Plot label: "<account>-<conversion action name>-<strategy>", built from the string form of each column.
label_series = conversion_action_history["account"].astype("str")
for label_column in ("conversion_action_name", "strategy"):
    label_series = label_series + "-" + conversion_action_history[label_column].astype("str")
conversion_action_history["label"] = label_series
conversion_action_history

## Conversion Probabilities

In [None]:
# Cast `created` to pandas' string dtype; later cells plot it on the x-axis and use its maximum as a date string.
conversion_action_history["created"] = conversion_action_history["created"].astype("string")

In [None]:
# `conv_prob` over `created`, one line per label; y-axis fixed to [0, 1].
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=conversion_action_history, x="created", y="conv_prob", hue="label", style="label", marker="o", ax=ax)
ax.tick_params(axis="x", labelrotation=90)
ax.set_ylim(0, 1)
ax.grid(True)
fig.savefig(f"{path_to_save_figures}/conversion_action_history_conv_prob.png")

## Conversion Value
No obvious differences in the trend. 

In [None]:
# `properties.revenue` (circle markers) and `value_based_conversion_value` (diamond markers)
# over `created`, drawn on the same axes with one line per label.
fig, ax = plt.subplots(figsize=(10, 5))
for value_column, marker_symbol in (("properties.revenue", "o"), ("value_based_conversion_value", "D")):
    sns.lineplot(data=conversion_action_history, x="created", y=value_column, hue="label", style="label", marker=marker_symbol, ax=ax)
ax.tick_params(axis="x", labelrotation=90)
ax.grid(True)
fig.savefig(f"{path_to_save_figures}/conversion_action_history_conv_value.png")

# Count in Changes
- For Nikin the count increases
- For JunoAndMe the count decreases
    - Check: conversions

In [None]:
# Number of non-missing `session` entries per label and `created` value, ordered by `created`.
tar_hist_count = (
    conversion_action_history
    .groupby(["label", "created"])["session"]
    .count()
    .reset_index()
    .sort_values(by="created")
    .reset_index(drop=True)
)
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=tar_hist_count, x="created", y="session", hue="label", style="label", marker="o", ax=ax)
ax.tick_params(axis="x", labelrotation=90)
ax.grid(True)
fig.savefig(f"{path_to_save_figures}/conversion_action_history_count.png")

## compare with datashift

In [None]:
# Session counts from the datashift check for the considered accounts, starting
# 30 days before the first `created` value of the targeting history.
datashift = pd.read_csv("DataChecks/datashift/check_datashift_data.csv")
# name -> id of the considered accounts, plus the reverse lookup used for labelling
account_dict = {
    account["name"]: account["id"]
    for account in return_account_ids()
    if account["name"] in accounts_to_consider
}
account_names_by_id = {account_id: account_name for account_name, account_id in account_dict.items()}
account_ids = list(account_dict.values())
# .copy(): the filtered frame gets new columns below; without it pandas warns about
# assigning to a slice of the original frame.
datashift = datashift[datashift["account_id"].isin(account_ids)].copy()
datashift["account"] = datashift["account_id"].map(account_names_by_id)
min_date = (pd.to_datetime(tar_hist_count["created"].min())-timedelta(days=30)).strftime("%Y-%m-%d")
# String comparison — presumably `max_date` is stored as YYYY-MM-DD; TODO confirm.
datashift = datashift[datashift["max_date"]>=min_date]
datashift = datashift.sort_values(by="max_date").reset_index(drop=True)
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=datashift, x="max_date", y="count_sessions_max_date", hue="account", style="account", marker="o", ax=ax)
sns.lineplot(data=datashift, x="max_date", y="mean_count_sessions", hue="account", style="account", marker="x", ax=ax)

ax.tick_params(axis="x", labelrotation=90)
ax.grid(True)
fig.savefig(f"{path_to_save_figures}/datashift_count.png")

## compare with session dates

In [None]:
# Number of rows per label, `created` value and `session.date`, ordered by session date.
session_date = conversion_action_history.groupby(by=["label","created"])["session.date"].value_counts()
session_date = session_date.reset_index().sort_values(by=["session.date"]).reset_index(drop=True)
# The removed suffix contains parentheses; regex=False guarantees a literal match
# independent of the pandas version's default for `regex`.
session_date["new_label"] = session_date["label"].astype("string").str.replace("-Adjusted Conversion (purchase) - Innkeepr", "", regex=False)
session_date["new_label_date"] = session_date["new_label"].astype("string")+"-"+session_date["created"].astype("string")
print(session_date["new_label"].value_counts())
session_date

In [None]:
# One bar chart per label: row count per `session.date`, coloured by `created`.
new_labels = session_date["new_label"].unique()
# One subplot row per label instead of a fixed 3-row grid (which fails for more than
# three labels); at least one row so an empty selection still yields a figure.
fig, axes = plt.subplots(max(len(new_labels), 1), 1, figsize=(15, 15), squeeze=False)
for ax, new_labe in zip(axes[:, 0], new_labels):
    # The bars show counts, not conversion probabilities.
    ax.set_title(f"Count per session date for {new_labe}")
    sns.barplot(data=session_date[session_date["new_label"] == new_labe], x="session.date", y="count", hue="created", ax=ax)
    ax.tick_params(axis="x", labelrotation=90)
    ax.grid(True)
    ax.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0), ncol=2)
fig.tight_layout()
fig.savefig(f"{path_to_save_figures}/conversion_action_history_count_session_date.png")

## Percentage of changed vs. unchanged conversions

In [None]:
# Flag rows whose value-based conversion value differs from the original revenue.
# NaN != NaN evaluates to True, so rows where BOTH values are missing are explicitly
# excluded from the "modified" flag.
conversion_action_history["conversion_value_modified"] = (
    conversion_action_history["properties.revenue"].ne(conversion_action_history["value_based_conversion_value"])
    & ~(
        conversion_action_history["properties.revenue"].isna()
        & conversion_action_history["value_based_conversion_value"].isna()
    )
)
# Number of modified / unmodified rows per label and `created` value.
count_modifcations = conversion_action_history.groupby(by=["label","created"])["conversion_value_modified"].value_counts()
count_modifcations = count_modifcations.reset_index()
count_modifcations

In [None]:
# One bar chart per label: number of modified vs. unmodified rows per `created` value.
modification_labels = count_modifcations["label"].unique()
# One subplot row per label instead of a fixed 3-row grid (which fails for more than
# three labels); at least one row so an empty selection still yields a figure.
fig, axes = plt.subplots(max(len(modification_labels), 1), 1, figsize=(15, 15), squeeze=False)
for ax, new_labe in zip(axes[:, 0], modification_labels):
    ax.set_title(f"Conversion Modifications per date for {new_labe}")
    sns.barplot(data=count_modifcations[count_modifcations["label"] == new_labe], x="created", y="count", hue="conversion_value_modified", ax=ax)
    ax.tick_params(axis="x", labelrotation=90)
    ax.grid(True)
    ax.legend(loc="upper left", bbox_to_anchor=(1.0, 1.0), ncol=2)
fig.tight_layout()
fig.savefig(f"{path_to_save_figures}/conversion_action_history_modfication_count.png")

In [None]:
# Share of modified rows per label and `created` value, in percent.
count_modifcations_pivot = (
    count_modifcations
    .pivot(index=["label","created"], columns="conversion_value_modified", values="count")
    # Guarantee both columns and treat a missing group as zero rows: otherwise a date
    # with only modified (or only unmodified) rows yields NaN, and a column that is
    # absent altogether raises KeyError below.
    .reindex(columns=[False, True])
    .fillna(0)
    .reset_index()
)
count_modifcations_pivot["change_in_percentage"] = (count_modifcations_pivot[True] / (count_modifcations_pivot[True] + count_modifcations_pivot[False]) * 100).round(2)
count_modifcations_pivot = count_modifcations_pivot.sort_values(by="created").reset_index(drop=True)
count_modifcations_pivot

In [None]:
# Percentage of modified rows over `created`, one line per label; y-axis fixed to [0, 100].
fig, ax = plt.subplots(figsize=(8, 6))
ax.set_title("Conversion Modifications %")
# `marker` (matplotlib keyword, as in the other line plots of this notebook) instead of
# `markers`, which is seaborn's per-style-level mapping and not meant for a single symbol.
sns.lineplot(data=count_modifcations_pivot, x="created", y="change_in_percentage", hue="label", style="label", marker="o", ax=ax)
ax.tick_params(axis="x", labelrotation=90)
ax.grid(True)
ax.set_ylim(0, 100)
ax.legend(loc="upper left", bbox_to_anchor=(-0.0, -0.6), ncol=1)
fig.tight_layout()
fig.savefig(f"{path_to_save_figures}/conversion_action_history_modfication_perc.png")

## compare with conversions

In [None]:
# Query "checkout_completed" conversions per account in 10-day windows between
# start_date and the latest `created` value of the targeting history, using
# conversions.csv as a cache that is re-written after every fetched window.
to_date = conversion_action_history["created"].max()
date_range = pd.date_range(start_date, end=to_date, freq="10D").strftime("%Y%m%d").tolist()
# Make sure the last window ends at to_date.
date_range.append(to_date)
print(date_range)
print(f"Query conversions from {start_date} to {to_date}")
try:
    conversions = pd.read_csv(f"{path_to_save}/conversions.csv")
except FileNotFoundError:
    print(f"Create empty dataframe")
    conversions = pd.DataFrame(columns=["created","account"])
# Calendar date (as string) of the cached rows; used below to decide whether a window is already covered.
# NOTE(review): rows appended later in this cell do not get a "date" value until the next cell recomputes it.
conversions["date"] = pd.to_datetime(conversions["created"]).dt.date
conversions["date"] = conversions["date"].astype("string")
for account in accounts:
    account_id = account["id"]
    account_name = account["name"]
    if account_name not in accounts_to_consider:
        continue
    print(f"Account: {account_name}")
    for idate, date in enumerate(date_range):
        # NOTE(review): for the last entry end_date equals date, i.e. the final window has zero width — confirm this is intended.
        if idate == len(date_range)-1:
            end_date = date_range[-1]
        else:
            end_date = date_range[idate+1]
        print(f"{idate}:gte: {date}, lt: {end_date}")
        check_dates = pd.date_range(date, end_date, freq="1D").strftime("%Y-%m-%d")
        check_dates = check_dates.tolist()
        # A window is skipped only if every single day in it already has cached rows for this account.
        ignore_date_range = True
        for check_date in check_dates:
            temp = conversions[(conversions["date"] == check_date) & (conversions["account"] == account_name)]
            if len(temp) == 0:
                ignore_date_range = False
                break
        if ignore_date_range:
            print(f"Ignore date range")
            continue
        # NOTE(review): the message above says "lt" but the filter uses "$lte"; consecutive windows share
        # their boundary and re-fetched windows are appended without de-duplication, so rows may be
        # duplicated in conversions.csv — confirm.
        content={
        "created": {
                        "$gte": transform_date_to_timestamp_milliseconds(date),
                        "$lte": transform_date_to_timestamp_milliseconds(end_date),
                },
        }
        df = send_to_innkeepr_api_paginated(f"{url}/conversions/query", account_id, content, logging)
        df = pd.json_normalize(df)
        if df.empty:
            print(f"No conversions found for account {account_name} on date {date}")
            continue
        df["account"] = account_name
        # Abort if the window returned conversions but none named "checkout_completed".
        if "checkout_completed" not in df["name"].unique().tolist():
            raise Exception(f"checkout_completed not in conversions: {df['name'].unique().tolist()}")
        df = df[df["name"]=="checkout_completed"]
        conversions = pd.concat([conversions, df], ignore_index=True)
        # Checkpoint after every window so an interrupted run keeps what was fetched so far.
        conversions.to_csv(f"{path_to_save}/conversions.csv", index=False)
conversions.to_csv(f"{path_to_save}/conversions.csv", index=False)

In [None]:
# (Re)compute the calendar date of every conversion as a string, then show the rows per conversion name.
conversions["date"] = pd.to_datetime(conversions["created"]).dt.date.astype("string")
conversions["name"].value_counts()

In [None]:
# Count of `sessionId` entries per date and account (rows with a revenue value only),
# plus a rolling sum over 30 consecutive rows of each account.
conv_hist_count = (
    conversions
    .dropna(subset="properties.revenue")
    .groupby(["date", "account"])["sessionId"]
    .count()
    .reset_index()
    .sort_values(by="date")
    .reset_index(drop=True)
)
# transform keeps the row index of conv_hist_count, so the result can be joined back directly.
rolling_sum_conv_hist_count = (
    conv_hist_count
    .groupby("account")["sessionId"]
    .transform(lambda daily_counts: daily_counts.rolling(window=30).sum())
    .rename("rolling_sum")
    .to_frame()
)
conv_hist_count = conv_hist_count.join(rolling_sum_conv_hist_count)
conv_hist_count

In [None]:
# Top: `sessionId` count per date and account. Bottom: its 30-row rolling sum.
fig, (ax, ax2) = plt.subplots(2, 1, figsize=(10, 5))
sns.lineplot(data=conv_hist_count, x="date", y="sessionId", hue="account", style="account", marker="o", ax=ax)
sns.lineplot(data=conv_hist_count, x="date", y="rolling_sum", hue="account", style="account", marker="D", ax=ax2)
for panel in (ax, ax2):
    panel.tick_params(axis="x", labelrotation=90)
    panel.grid(True)
fig.tight_layout()
fig.savefig(f"{path_to_save_figures}/conversion_count.png")

# Conversion Probs from Table

In [None]:
# Load the shared `conversion_probabilities_view_30_outlook` table of every considered
# account and stack the results, tagging each row with the account name.
probability_frames = [pd.DataFrame()]
for account in accounts:
    account_id = account["id"]
    account_name = account["name"]
    if account_name in accounts_to_consider:
        print(f"Account: {account_name}")
        profile_path = db_client.return_databricks_client()
        table_path = f"{profile_path}#delta_share_events.{account_id}.conversion_probabilities_view_30_outlook" 
        tmp_probs = delta_sharing.load_as_pandas(table_path)
        tmp_probs["account"] = account_name
        probability_frames.append(tmp_probs)
converison_probs = pd.concat(probability_frames, ignore_index=True)
converison_probs

In [None]:
# Rows per `date` value (missing dates included), ordered by date.
converison_probs["date"].value_counts(dropna=False).sort_index()

In [None]:
# Rows per `goal` value within each account (missing goals included).
converison_probs.groupby("account")["goal"].value_counts(dropna=False)

In [None]:
# Keep one row per (date, sessionId): after sorting by `created`, the last row of each
# pair is retained. The result is ordered by date.
converison_probs["date"] = converison_probs["date"].astype("string")
converison_probs = (
    converison_probs
    .sort_values(by="created")
    .reset_index(drop=True)
    .drop_duplicates(subset=["date", "sessionId"], keep="last")
    .sort_values(by="date")
    .reset_index(drop=True)
)
converison_probs

In [None]:
# Sanity check: (date, sessionId) pairs that still have more than one distinct `created` value.
vc = (
    converison_probs
    .groupby(by=["date", "sessionId"])["created"]
    .nunique()
    .reset_index()
)
vc.loc[vc["created"].gt(1)]

In [None]:
# Summary statistics of the `probability` column per account.
converison_probs.groupby("account")["probability"].describe()

In [None]:
# `probability` over `date`, one line per account.
fig, ax = plt.subplots(figsize=(10, 5))
sns.lineplot(data=converison_probs, x="date", y="probability", hue="account", style="account", marker="o", ax=ax)
ax.tick_params(axis="x", labelrotation=90)
ax.grid(True)
fig.savefig(f"{path_to_save_figures}/conversion_prob_table.png")

# Conclusion

Model Properties:
l

Predictions:

Labels:
- *differences, but not performance relevant
- **differences, performance relevant


# Recommendations
