In [1]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [2]:
import logging
import delta_sharing
import pandas as pd

import general_functions.databricks_client as db_client
from general_functions.call_api_with_account_id import send_to_innkeepr_api_paginated
from general_functions.return_account_ids import return_account_ids
from general_functions.constants import return_api_url

In [3]:
# Customer under investigation and the API base URL used by every query below.
customer = "Nikin"
url = return_api_url()
print(f"url = {url}")

# Resolve the customer's account id by name. Fail with an explicit message instead
# of a bare IndexError when no returned account carries that name.
matching_account_ids = [acc["id"] for acc in return_account_ids() if acc["name"] == customer]
if not matching_account_ids:
    raise ValueError(f"No account named {customer!r} was returned by return_account_ids()")
# As before, the first match wins when several accounts share the name.
account_id = matching_account_ids[0]

# Load test cate data

In [14]:
# Read the exported cate test file (first CSV column becomes the index) and keep
# only the rows whose `created` value equals "2025-09-03".
cate_all_dates = pd.read_csv(
    "SprintStories/EN_2730_Nikin/cate_20250905_test.csv",
    index_col=0,
)
created_on_target_day = cate_all_dates["created"] == "2025-09-03"
cate = cate_all_dates[created_on_target_day]

In [15]:
# Rows of `cate` that carry no profile_id.
has_no_profile_id = cate["profile_id"].isna()
cate_with_nan_profiles = cate.loc[has_no_profile_id]
cate_with_nan_profiles

# Load profiles view: check if cate_with_nan_profiles.anonymousIds are in profiles_view

In [16]:
# Use the local CSV copy of the profiles view when it exists; otherwise load the
# table via Delta Sharing and write the CSV so that later runs can read it locally.
profiles_view_csv = "SprintStories/EN_2730_Nikin/profiles_view_30_outlook.csv"
try:
    profiles_view = pd.read_csv(profiles_view_csv, index_col=False)
except (FileNotFoundError, pd.errors.EmptyDataError):
    # Only a missing or empty cache file triggers the remote load. Any other error
    # (including KeyboardInterrupt) now surfaces instead of being swallowed by a
    # bare `except:`.
    profile_path = db_client.return_databricks_client()
    table_path = f"{profile_path}#delta_share_events.{account_id}.profiles_view_30_outlook"
    profiles_view = delta_sharing.load_as_pandas(table_path)
    profiles_view.to_csv(profiles_view_csv, index=False)
profiles_view

In [17]:
# Compare the anonymousIds of the profile-less cate rows against profiles_view.
n_ids_without_profile = cate_with_nan_profiles["anonymousId"].nunique()
n_ids_total = cate["anonymousId"].nunique()
print("missing anonymousIds:", n_ids_without_profile)
print("total anonymousIds: ", n_ids_total)

# Keep the profile-less rows whose anonymousId does not occur in profiles_view.
found_in_profiles_view = cate_with_nan_profiles["anonymousId"].isin(profiles_view["anonymousId"])
missing_anonymousIds_from_cate = cate_with_nan_profiles[~found_in_profiles_view]
missing_anonymousIds_from_cate

# Check anonymousIds in db

In [20]:
# Query the profiles endpoint for the anonymousIds that were not found in profiles_view.
profiles_db_records = send_to_innkeepr_api_paginated(
    f"{url}/profiles/query",
    account_id,
    {"anonymousId": missing_anonymousIds_from_cate["anonymousId"].tolist()},
    logging,
)
# A bare expression in the middle of a cell is never displayed, so print the
# number of returned records explicitly.
print(f"profiles returned: {len(profiles_db_records)}")
# Flatten the JSON records into a DataFrame (nested keys become dotted columns).
profiles_db = pd.json_normalize(profiles_db_records)

In [21]:
# Display the normalized profile records returned by the profiles query.
profiles_db

## check name distribution

In [22]:
# Field / column names used when flattening the profile records.
cookieid = "id"
col_name = "name"
data_col_extIds = "externalIds"
profiles_id = "_id"
col_last_modified = "lastModified"
col_user_id = "anonymousId"

# Flatten only while the frame is still in its raw shape. Once flattened it holds
# both "_id" (the profile id) and "id" (the external id); running the rename again
# would then create duplicate "_id" columns, so a re-run of this cell is a no-op.
already_flattened = {profiles_id, cookieid}.issubset(profiles_db.columns)
if not already_flattened:
    profiles_db = (
        profiles_db
        .rename(columns={"id": profiles_id})
        .loc[:, [profiles_id, col_user_id, data_col_extIds, col_last_modified]]
        # one row per entry of the externalIds list
        .explode(data_col_extIds)
    )
    # explode() yields NaN for profiles whose externalIds list is empty, so look the
    # fields up defensively instead of subscripting a float.
    for field in (cookieid, col_name):
        profiles_db[field] = profiles_db[data_col_extIds].map(
            lambda ext_id, field=field: ext_id.get(field) if isinstance(ext_id, dict) else None
        )
# Distribution of external-id names (NaN / None names are not counted).
profiles_db["name"].value_counts()

# Query by date and check if all ids are returned

In [34]:
# Query the profiles endpoint with a `lastModified` `$gte` "2025-09-03" filter,
# flatten the returned JSON records and show the resulting frame's shape.
last_modified_filter = {"lastModified": {"$gte": "2025-09-03"}}
profiles_by_date_records = send_to_innkeepr_api_paginated(
    f"{url}/profiles/query", account_id, last_modified_filter, logging
)
profiles_by_date = pd.json_normalize(profiles_by_date_records)
profiles_by_date.shape

In [35]:
# cate rows whose anonymousId appears in the date-based profile query result.
found_by_date = cate["anonymousId"].isin(profiles_by_date["anonymousId"])
existing_anonymousIds = cate.loc[found_by_date]
n_existing_ids = existing_anonymousIds["anonymousId"].nunique()
n_cate_ids = cate["anonymousId"].nunique()
print(f"Existing anonymousIds: {n_existing_ids} from {n_cate_ids}")
existing_anonymousIds

In [36]:
# cate rows whose anonymousId does NOT appear in the date-based profile query result.
not_found_by_date = ~cate["anonymousId"].isin(profiles_by_date["anonymousId"])
missing_anonymousIds = cate.loc[not_found_by_date]
n_missing_ids = missing_anonymousIds["anonymousId"].nunique()
n_cate_ids_total = cate["anonymousId"].nunique()
print(f"Existing missing_anonymousIds: {n_missing_ids} from {n_cate_ids_total}")
missing_anonymousIds  # Open question: the id exists, so why is it missing here?

# Simulate Profiles Transformation

In [37]:
# Field / column names used when flattening the profile records.
cookieid = "id"
col_name = "name"
data_col_extIds = "externalIds"
profiles_id = "_id"
col_last_modified = "lastModified"
col_user_id = "anonymousId"

# Flatten only while the frame is still in its raw shape. Once flattened it holds
# both "_id" (the profile id) and "id" (the external id); running the rename again
# would then create duplicate "_id" columns, so a re-run of this cell is a no-op.
already_flattened = {profiles_id, cookieid}.issubset(profiles_by_date.columns)
if not already_flattened:
    profiles_by_date = (
        profiles_by_date
        .rename(columns={"id": profiles_id})
        .loc[:, [profiles_id, col_user_id, data_col_extIds, col_last_modified]]
        # one row per entry of the externalIds list
        .explode(data_col_extIds)
    )
    # explode() yields NaN for profiles whose externalIds list is empty, so look the
    # fields up defensively instead of subscripting a float.
    for field in (cookieid, col_name):
        profiles_by_date[field] = profiles_by_date[data_col_extIds].map(
            lambda ext_id, field=field: ext_id.get(field) if isinstance(ext_id, dict) else None
        )
# Distribution of external-id names (NaN / None names are not counted).
profiles_by_date["name"].value_counts()

In [38]:
# Number of distinct anonymousIds in profiles_by_date.
profiles_by_date["anonymousId"].nunique()

In [39]:
# Restrict profiles_by_date to the anonymousIds that occur in cate, then print the
# unique-id counts of both frames for comparison.
list_unique_anonymousids = cate["anonymousId"].unique()
occurs_in_cate = profiles_by_date[col_user_id].isin(list_unique_anonymousids)
profiles_by_date = profiles_by_date.loc[occurs_in_cate]
n_cate_unique = cate["anonymousId"].nunique()
n_profiles_unique = profiles_by_date["anonymousId"].nunique()
print("Unique orignal anonymousIds: ", n_cate_unique)
print("Count of unique anonymousIds in profiles_by_date ", n_profiles_unique)

In [45]:
# Keep the rows whose external-id name contains at least one of these patterns.
# Each pattern is passed to str.contains unchanged and the masks are OR-ed together.
kept_id_name_patterns = [
    "email_sha256",
    "email",
    "_ga",
    "_fbp",
    # "_ttp",
    "criteo_gum_id",
]
external_id_names = profiles_by_date[col_name]
keep_row = external_id_names.str.contains(kept_id_name_patterns[0])
for pattern in kept_id_name_patterns[1:]:
    keep_row = keep_row | external_id_names.str.contains(pattern)
profiles_filtered = profiles_by_date[keep_row]
print("Count of unique anonymousIds in profiles_filtered ", profiles_filtered["anonymousId"].nunique())

In [47]:
# Rows of profiles_by_date whose anonymousId has no row at all in profiles_filtered.
survived_filter = profiles_by_date["anonymousId"].isin(profiles_filtered["anonymousId"])
missing_profiles = profiles_by_date[~survived_filter]
n_missing_profiles = missing_profiles["anonymousId"].nunique()
print("Count of unique anonymousIds in missing_profiles ", n_missing_profiles)
print(missing_profiles["name"].value_counts())
missing_profiles

# Check if these profiles have gclids

In [51]:
# Unique `session` values of the cate rows that belong to the missing profiles.
belongs_to_missing_profile = cate["anonymousId"].isin(missing_profiles["anonymousId"])
session_ids = cate.loc[belongs_to_missing_profile, "session"].unique().tolist()
len(session_ids)

In [52]:
# Query the sessions endpoint for the collected session ids, flatten the JSON
# records and show the resulting frame's shape.
session_filter = {"sessionId": session_ids}
session_records = send_to_innkeepr_api_paginated(
    f"{url}/sessions/query", account_id, session_filter, logging
)
sessions = pd.json_normalize(session_records)
sessions.shape

In [56]:
# Count the non-null campaign.gclid values after de-duplicating on
# (sessionId, campaign.gclid).
unique_session_gclids = sessions.drop_duplicates(subset=["sessionId", "campaign.gclid"])
unique_session_gclids["campaign.gclid"].count()

In [57]:
# Display the normalized session records for manual inspection.
sessions