In [1]:
cd /Users/karolinegriesbach/Documents/Innkeepr/Git/evaluation-and-execution-scripts/

In [2]:
import ast
import logging
import pandas as pd
import numpy as np
from datetime import timedelta, datetime

from general_functions.call_api_with_account_id import send_to_innkeepr_api_paginated
from general_functions.constants import return_api_url
from general_functions.return_account_ids import return_account_ids


In [3]:
# Customer under analysis; it also selects the per-customer parquet files below,
# so changing it switches the profiles and the features together.
customer = "LILLYDOO"
# BigQuery export for this story (columns used later: _id, created, traits)
bq = pd.read_csv("SprintStories/EN-3061-Lillydo-Abohistorie/bigquery_data.csv")
profiles = pd.read_parquet(f"DataChecks/profiles/{customer}_profiles.parquet")
features = pd.read_parquet(f"DataChecks/views/{customer}.parquet")


In [4]:
# Report the time span covered by each source, using its own timestamp column
for label, frame, date_col in (
    ("bq", bq, "created"),
    ("profiles", profiles, "lastModified"),
    ("features", features, "created"),
):
    print(f"{label} date range: {frame[date_col].min()} - {frame[date_col].max()}")
print(f"Total anonymousIds in features = {features['anonymousId'].nunique()}")

# Transform Data

In [5]:
def return_email_sha_values(entry):
    """Extract the ``email_sha256`` value from one ``traits`` entry.

    Args:
        entry: A traits mapping, or its Python-literal string representation
            (parsed with ``ast.literal_eval``). Missing values such as
            ``None`` or NaN are accepted.

    Returns:
        The value stored under ``"email_sha256"``, or ``None`` when the key is
        absent or the entry is not subscriptable by key (e.g. NaN for a
        missing cell).

    Raises:
        ValueError, SyntaxError: If ``entry`` is a string that
            ``ast.literal_eval`` cannot parse. Malformed data is deliberately
            not swallowed.
    """
    if isinstance(entry, str):
        entry = ast.literal_eval(entry)
    try:
        return entry["email_sha256"]
    except (KeyError, TypeError, IndexError):
        # KeyError: key absent. TypeError/IndexError: entry is None, NaN or
        # another non-mapping value, i.e. there are no traits to read.
        return None

In [6]:
# Pull the hashed email out of every `traits` entry, then keep only the
# columns the matching steps below rely on.
bq["email_sha256"] = bq["traits"].apply(return_email_sha_values)
bq = bq.loc[:, ["_id", "created", "email_sha256"]]
bq.head()

In [7]:
def return_profiles_email_sha256(entry):
    """Collect the ``email_sha256`` ids from one profile's external ids.

    Args:
        entry: An iterable of external-id records (each read via its
            ``"name"`` and ``"id"`` keys), or the Python-literal string
            representation of such an iterable. Missing values such as
            ``None`` or NaN are accepted.

    Returns:
        A list with the ``"id"`` of every record whose ``"name"`` is
        ``"email_sha256"``, or ``None`` when there is no such record or the
        entry is missing.
    """
    if isinstance(entry, str):
        entry = ast.literal_eval(entry)
    try:
        records = iter(entry)
    except TypeError:
        # None / NaN / other non-iterable cell: the profile has no external ids
        return None
    emails = [record["id"] for record in records if record["name"] == "email_sha256"]
    return emails or None

In [8]:
# Build one row per (profile, email hash). Written as a single chain so the new
# column is added with `.assign` on a fresh frame instead of being set on a
# filtered slice (which can trigger SettingWithCopyWarning).
profiles = (
    profiles[["profile_id", "anonymousId", "lastModified", "email_sha256_externalIds"]]
    # rows without external ids cannot contribute an email hash
    .dropna(subset=["email_sha256_externalIds"])
    .assign(
        email_sha256=lambda frame: frame["email_sha256_externalIds"].apply(
            return_profiles_email_sha256
        )
    )
    # a profile may carry several hashes -> one row per hash
    .explode("email_sha256")
    .drop_duplicates()
)
profiles.head()

# Get Matching Rate between BigQuery and Profiles (from Databricks)

## Filtered by Dates

In [9]:
# Restrict both sources to the date window they have in common:
# the later of the two start dates up to the earlier of the two end dates.
min_date = max(bq["created"].min(), profiles["lastModified"].min())
max_date = min(bq["created"].max(), profiles["lastModified"].max())
print(f"min date: {min_date}, max date: {max_date}")
# `between` is inclusive on both ends, i.e. `>= min_date` and `<= max_date`
bq_filtered = bq.loc[bq["created"].between(min_date, max_date)].drop_duplicates()
profiles_filtered = profiles.loc[profiles["lastModified"].between(min_date, max_date)]
print(f"bq date range: {bq_filtered['created'].min()} - {bq_filtered['created'].max()}")
print(f"profiles date range: {profiles_filtered['lastModified'].min()} - {profiles_filtered['lastModified'].max()}")

In [10]:
# Left-join the conversions onto the profiles by hashed email.
# pandas matches null join keys with each other, so profile rows without an
# email hash are removed from the right side first; otherwise every conversion
# lacking `email_sha256` would pair with them and be counted as matched.
merge_bq_and_profiles = pd.merge(
    bq_filtered,
    profiles_filtered.dropna(subset=["email_sha256"]),
    how="left",
    on="email_sha256",
    suffixes=("_bq", "_profiles"),
)
merge_bq_and_profiles

In [11]:
# Matched conversions: bq `_id`s that joined to at least one profile row
matched_profiles = merge_bq_and_profiles.loc[
    merge_bq_and_profiles["profile_id"].notna(), "_id"
].unique()
# Unmatched conversions: every `_id` that is not in the matched set
unmatched_profiles = merge_bq_and_profiles.loc[
    ~merge_bq_and_profiles["_id"].isin(matched_profiles), "_id"
].unique()
total_conversion_ids = merge_bq_and_profiles["_id"].nunique()
print(f"matched profiles: {len(matched_profiles)}")
print(f"unmatched profiles: {len(unmatched_profiles)}")
print(f"Total conversion ids = {total_conversion_ids}")
matching_rate = len(matched_profiles) / total_conversion_ids * 100
print(f"matching rate: {matching_rate}%")
# Sanity check: the two groups together must account for every distinct `_id`
if len(matched_profiles) + len(unmatched_profiles) != total_conversion_ids:
    raise ValueError("Matched profiles + Unmatched profiles != total profiles")

## Unfiltered

In [12]:
# Same join as the date-filtered one, but over the full date range of both sources.
# pandas matches null join keys with each other, so profile rows without an
# email hash are removed from the right side first; otherwise every conversion
# lacking `email_sha256` would pair with them and be counted as matched.
merge_bq_and_profiles_all = pd.merge(
    bq,
    profiles.dropna(subset=["email_sha256"]),
    how="left",
    on="email_sha256",
    suffixes=("_bq", "_profiles"),
)
merge_bq_and_profiles_all

In [13]:
# Matched conversions: bq `_id`s that joined to at least one profile row
matched_profiles_all = merge_bq_and_profiles_all.loc[
    merge_bq_and_profiles_all["profile_id"].notna(), "_id"
].unique()
# Unmatched conversions: every `_id` that is not in the matched set
unmatched_profiles_all = merge_bq_and_profiles_all.loc[
    ~merge_bq_and_profiles_all["_id"].isin(matched_profiles_all), "_id"
].unique()
total_conversion_ids_all = merge_bq_and_profiles_all["_id"].nunique()
print(f"matched profiles: {len(matched_profiles_all)}")
print(f"unmatched profiles: {len(unmatched_profiles_all)}")
print(f"Total conversion ids = {total_conversion_ids_all}")
matching_rate_all = len(matched_profiles_all) / total_conversion_ids_all * 100
print(f"matching rate: {matching_rate_all}%")
# Sanity check: the two groups together must account for every distinct `_id`
if len(matched_profiles_all) + len(unmatched_profiles_all) != total_conversion_ids_all:
    raise ValueError("Matched profiles + Unmatched profiles != total profiles")

# Profile Matching Via Mongo Profiles

In [14]:
# Prepare the profile query: 2-day `lastModified` windows plus workspace/API settings.
# Start one day before the earliest BigQuery record.
min_date = pd.to_datetime(bq["created"].min()) - timedelta(days=1)
# NOTE: despite its name, `today` lies two days in the future so that the
# 2-day grid below reaches past the current date.
today = datetime.today() + timedelta(days=2)
# Window boundaries as strings; the time of day is always written as 00:00:00.000Z
date_range = pd.date_range(min_date, today, freq="2D").strftime("%Y-%m-%dT00:00:00.000Z").tolist()
print(date_range)
workspace_ids = return_account_ids()
matching_workspace_ids = [
    workspace["id"] for workspace in workspace_ids if workspace["name"] == customer
]
if not matching_workspace_ids:
    # Fail with a readable message instead of an IndexError on `[0]`
    raise ValueError(f"No workspace named {customer!r} found in return_account_ids()")
workspace_id = matching_workspace_ids[0]
api_url = return_api_url()
col_last_modified = "lastModified"

In [15]:
print(f"Query data from date: {min_date}")
try:
    # Reuse a previously completed download if there is one
    profiles_api = pd.read_csv(f"SprintStories/EN-3032-BigQuery/{customer}_profiles_api.csv")
except FileNotFoundError:
    # Download the profiles window by window; `date_range` holds the boundaries,
    # so consecutive pairs are the [start, end) windows. Iterating over pairs
    # also avoids a final request whose start equals its end and can match nothing.
    frames = []
    for iwindow, (start_date, end_date) in enumerate(zip(date_range[:-1], date_range[1:])):
        print(f"Query data for date: {start_date} to {end_date}")
        temp = send_to_innkeepr_api_paginated(
            f"{api_url}/profiles/query",
            workspace_id,
            {col_last_modified: {"$gte": start_date, "$lt": end_date}},
            logging.getLogger(),
        )
        # presumably a list of profile records; json_normalize flattens nested keys
        temp = pd.json_normalize(temp)
        if not temp.empty:
            frames.append(temp)
        if frames and iwindow % 5 == 0:
            # Checkpoints go to a separate file: writing them to the final cache
            # path would let an interrupted run be loaded as a complete download.
            pd.concat(frames).drop_duplicates(subset=["id"]).to_csv(
                f"SprintStories/EN-3032-BigQuery/{customer}_profiles_api_partial.csv"
            )
    if not frames:
        raise ValueError(f"No profiles returned for {customer} since {min_date}")
    # Keep the first occurrence of each profile id across all windows
    profiles_api = pd.concat(frames).drop_duplicates(subset=["id"])
    profiles_api.to_csv(f"SprintStories/EN-3032-BigQuery/{customer}_profiles_api.csv")

In [16]:
# Build one row per (profile, email hash) from the API profiles.
# A single chain with `.assign` avoids setting a column on a column-subset
# slice (SettingWithCopyWarning).
profiles_api = (
    profiles_api[["id", "anonymousId", "lastModified", "externalIds", "traits.email_sha256"]]
    # rows without external ids cannot contribute an email hash
    .dropna(subset=["externalIds"])
    .assign(
        email_sha256=lambda frame: frame["externalIds"].apply(return_profiles_email_sha256)
    )
    # a profile may carry several hashes -> one row per hash
    .explode("email_sha256")
    # keeps the cell idempotent: re-running it would otherwise multiply rows
    .drop_duplicates(subset=["id", "email_sha256"])
)
profiles_api

In [17]:
# Left-join the conversions onto the API profiles by hashed email.
# pandas matches null join keys with each other, so profile rows without an
# email hash are removed from the right side first; otherwise every conversion
# lacking `email_sha256` would pair with them and be counted as matched.
merge_bq_and_profiles_api = pd.merge(
    bq,
    profiles_api.dropna(subset=["email_sha256"]),
    how="left",
    on="email_sha256",
    suffixes=("_bq", "_profiles"),
)
merge_bq_and_profiles_api

In [18]:
# Matched conversions: bq `_id`s that joined to at least one API profile row
matched_profiles_api = merge_bq_and_profiles_api.loc[
    merge_bq_and_profiles_api["id"].notna(), "_id"
].unique()
# Unmatched conversions: every `_id` that is not in the matched set
unmatched_profiles_api = merge_bq_and_profiles_api.loc[
    ~merge_bq_and_profiles_api["_id"].isin(matched_profiles_api), "_id"
].unique()
total_conversion_ids_api = merge_bq_and_profiles_api["_id"].nunique()
print(f"matched profiles: {len(matched_profiles_api)}")
print(f"unmatched profiles: {len(unmatched_profiles_api)}")
print(f"Total conversion ids = {total_conversion_ids_api}")
matching_rate_api = len(matched_profiles_api) / total_conversion_ids_api * 100
print(f"matching rate: {matching_rate_api}%")
# Sanity check: the two groups together must account for every distinct `_id`
if len(matched_profiles_api) + len(unmatched_profiles_api) != total_conversion_ids_api:
    raise ValueError("Matched profiles + Unmatched profiles != total profiles")

In [19]:
# Show the bq `_id`s that did not join to any API profile
unmatched_profiles_api

In [20]:
# Preview the exploded API profiles (one row per profile and email hash)
profiles_api.head()

In [23]:
# For every unmatched value, look for it as a literal substring anywhere in the
# raw `externalIds` of the API profiles.
# NOTE(review): `unmatched_profiles_api` holds bq `_id` values, yet the loop
# calls them emails - confirm whether the `email_sha256` of the unmatched
# conversions should be searched instead.
print(len(unmatched_profiles_api))
# The string form of the column does not change between iterations
external_ids_text = profiles_api["externalIds"].astype(str)
for iemail, email in enumerate(unmatched_profiles_api):
    # regex=False: plain substring match, so special characters are not read as a pattern
    test = profiles_api[external_ids_text.str.contains(str(email), regex=False)]
    print(f"{iemail}: {test.shape}")
    if len(test) == 0:
        continue
    print(f"... {iemail}: Found email {email}")

# Compare API Profiles with session anonymousIds

In [None]:
# anonymousIds of the API profiles that joined to a BigQuery conversion
matched_profiles_api_anonyId = merge_bq_and_profiles_api.loc[
    merge_bq_and_profiles_api["id"].notna(), "anonymousId"
].unique()
# Split the session anonymousIds by whether they belong to such a profile
sessions_with_match = features.loc[
    features["anonymousId"].isin(matched_profiles_api_anonyId), "anonymousId"
].unique()
sessions_without_match = features.loc[
    ~features["anonymousId"].isin(matched_profiles_api_anonyId), "anonymousId"
].unique()
total_anonymousIds_sessions = features["anonymousId"].nunique()
print(f"sessions_with_match: {len(sessions_with_match)}")
print(f"sessions_without_match: {len(sessions_without_match)}")
print(f"Total anonymousIds = {total_anonymousIds_sessions}")
# NOTE: this reuses the name `matching_rate_api`, replacing the conversion-level
# rate computed from `matched_profiles_api` with the session-level rate.
matching_rate_api = len(sessions_with_match) / total_anonymousIds_sessions * 100
print(f"matching rate: {matching_rate_api}%")
# Sanity check: both groups together must cover every distinct anonymousId
if total_anonymousIds_sessions != len(sessions_with_match) + len(sessions_without_match):
    raise ValueError("sessions_with_match + sessions_without_match != total_anonymousIds_sessions")
