In [38]:
# Domains treated as Shopify storefronts when re-attributing shop.app checkout
# events to the store the user came from. Each domain is listed once; the list
# is only used for membership tests, so order is informational.
shopify_domains = [
    "klaedeskabet.dk",
    "fashionnova.com",
    "kyliecosmetics.com",
    "colourpop.com",
    "jeffreestarcosmetics.com",
    "gymshark.com",
    "allbirds.com",
    "brooklinen.com",
    "ruggable.com",
    "chubbiesshorts.com",
    "puravidabracelets.com",
    "nativecos.com",
    "hauslabs.com",
    "skknbykim.com",
    "harney.com",
    "redbullshopus.com",
    "tula.com",
    "tesla.com",
    "spiritualgangster.com",
    "taylorstitch.com",
    "american-giant.com",
    "drsquatch.com",
    "mejuri.com",
    "peets.com",
    "deathwishcoffee.com",
    "hellotushy.com",
    "bando.com",
    "moroccanoil.com",
    "negativeunderwear.com",
    "birdies.com",
    "naadam.co",
    "popflexactive.com",
    "moderncitizen.com",
    "greatjonesgoods.com",
    "pinklily.com",
    "misen.com",
    "materialkitchen.com",
    "hedleyandbennett.com",
    "rumpl.com",
    "mizzenandmain.com",
    "ohpolly.com",
    "tecovas.com",
    "stance.com",
    "spongelle.com",
    "trueclassictees.com",
    "meundies.com",
    "studs.com",
    "jackhenry.co",
    "luxyhair.com",
    "juicycouture.com",
    "everlast.com",
    "skims.com",
    "feals.com",
    "foursigmatic.com",
    "golde.co",
    "liquid-iv.com",
    "thesill.com",
    "wearlively.com",
    "andieswim.com",
    "yourparade.com",
    "brightland.co",
    "omsom.com",
    "jenis.com",
    "snowehome.com",
    "graza.co",
    "flybyjing.com",
    "getmaude.com",
    "ugmonk.com",
    "shop.app",
]

In [39]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from .env file
load_dotenv()

SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_KEY:
    # Fail early with a clear message instead of sending "None" as the key
    raise RuntimeError("SUPABASE_KEY is not set; add it to the environment or the .env file")

# The key travels only in the request headers. Keeping it out of the URL
# prevents it from ending up in logs, tracebacks or the notebook output.
url = "https://iukxcgvmzjfelwfrpkyi.supabase.co/rest/v1/analytics"
headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
}

# The timeout stops the cell from hanging forever; raise_for_status turns an
# HTTP error into an exception here rather than a malformed frame further down.
response = requests.get(url, headers=headers, params={"select": "*"}, timeout=30)
response.raise_for_status()

# Decoded JSON payload; the next cell builds the dataframe from it
data = response.json()


In [40]:
# Build the event frame from the decoded API payload
df = pd.DataFrame(data)
blacklisted_user_ids = ["390c0190-33a3-4029-ae09-687bff0b77d6", "42a0286f-f933-490a-89aa-0c0037dae11e"]

# Users who sent a "delete-data" event; collected before any filtering so the
# request is honoured even if the event itself is dropped below
should_delete = df[df["type"] == "delete-data"]

# Exclude all events where the url contains "music.apple".
# Raw string: "\." in a normal string literal is an invalid escape sequence.
df = df[~df["url"].str.contains(r"music\.apple", na=False)]

blacklisted_user_ids.extend(should_delete["user_id"].tolist())
# Filter out rows where user_id is in the blacklisted_user_ids array
df = df[~df["user_id"].isin(blacklisted_user_ids)]


def get_top_domain(domain: str) -> str:
    """Reduce a scheme-less URL such as ``www.shop.example.com/path`` to its top domain.

    Everything from the first ``/`` onwards is dropped and a leading ``www.`` or
    ``ww2.`` label is removed. The last two dot-separated labels are then
    returned, or the last three when the host contains ``.co.uk``. A host
    without any dot is returned unchanged.

    Args:
        domain: Host plus optional path, without a scheme.

    Returns:
        The shortened domain, e.g. ``example.com`` or ``example.co.uk``.
    """
    host = domain.split("/")[0]

    # Only strip the prefix when it is a complete label. Matching on the bare
    # three letters would mangle hosts that merely start with "www".
    for prefix in ("www.", "ww2."):
        if host.startswith(prefix):
            host = host[len(prefix):]
            break

    kept_labels = 3 if ".co.uk" in host else 2
    return ".".join(host.split(".")[-kept_labels:])

# Add a "domain" column with the top domain of each event URL.
# pd.notna covers both None and NaN, so a missing URL yields None instead of
# being passed to get_top_domain.
df["domain"] = df["url"].apply(lambda x: get_top_domain(x) if pd.notna(x) else None)


# NOTE(review): the window noted below does not match the active filter.
# AFTER:  2025-04-13 02:00
# BEFORE: 2025-04-26 10:00
# Compare parsed timestamps rather than raw strings: created_at uses a "T"
# separator, which sorts after the space in the cutoff string, so the string
# comparison ignored the time of day.
# The cutoff is interpreted as UTC - TODO confirm this matches the intent.
created_after = pd.Timestamp("2025-03-27 01:00", tz="UTC")
df = df[pd.to_datetime(df["created_at"], format="mixed", utc=True) >= created_after]

In [41]:
# Fix shop.app URLs
shop_df = df[df["domain"] == "shop.app"]

# Count shop.app rows whose url is a checkout/pay page, and those that are not
checkout_df = shop_df[shop_df["url"].str.contains("/checkout|/pay/", na=False)]
no_checkout_df = shop_df[~shop_df["url"].str.contains("/checkout|/pay/", na=False)]

before_checkout_df_len = len(checkout_df)
before_no_checkout_df_len = len(no_checkout_df)

print("Filter shop.app URLs")

user_df = df.groupby("user_id")

# Walk each user's events in frame order and re-attribute rows to the domain
# the user visited just before.
for user_id, rows in user_df:
    # Reset per user: without this the first row of a user would inherit the
    # last domain of the previous user (or hit a NameError on a fresh kernel).
    previous_domain = None

    for index, row in rows.iterrows():
        domain = row["domain"]
        if not isinstance(domain, str):
            # Rows without a url have no domain to fix or to remember
            continue

        url = row["url"]
        is_shop_app_checkout = domain == "shop.app" and ("/checkout" in url or "/pay/" in url)

        # A row is re-attributed when its domain has no dot at all, or when it
        # is a shop.app checkout/pay page.
        if "." not in domain or is_shop_app_checkout:
            if previous_domain is not None:
                if previous_domain in shopify_domains:
                    df.at[index, "domain"] = previous_domain
                else:
                    # NOTE(review): this also relabels dot-less domains as
                    # "shop.app" when the previous domain is not a known
                    # Shopify store - confirm that is intended.
                    df.at[index, "domain"] = "shop.app"
        else:
            previous_domain = domain

shop_df = df[df["domain"] == "shop.app"]

# Recount after the fix to report how many rows moved
checkout_df = shop_df[shop_df["url"].str.contains("/checkout|/pay/", na=False)]
no_checkout_df = shop_df[~shop_df["url"].str.contains("/checkout|/pay/", na=False)]

# print
print(f"Number of shop.app URLs with /checkout filtered from {before_checkout_df_len} to {len(checkout_df)}")
print(f"Number of shop.app URLs without /checkout filtered from {before_no_checkout_df_len} to {len(no_checkout_df)}")

Filter shop.app URLs
Number of shop.app URLs with /checkout filtered from 183 to 183
Number of shop.app URLs without /checkout filtered from 9 to 52


In [42]:
import uuid
from datetime import datetime, timedelta

print("Fix session ids")

# Temporary parsed-timestamp column; dropped again at the end of the cell
df['date'] = pd.to_datetime(df['created_at'], format='mixed')

# Order events chronologically within each user and renumber the index so the
# labels used with df.at below match the sorted frame
df = df.sort_values(by=['user_id', 'date']).reset_index(drop=True)

# A gap longer than this between two consecutive events of a user starts a new session
session_gap = timedelta(minutes=360)

current_session_id = None

for user_id, group in df.groupby('user_id'):
    # Placeholder id; replaced as soon as the user's first event is processed
    current_session_id = str(uuid.uuid4())
    previous_time = None

    for index, event_time in group['date'].items():
        starts_new_session = previous_time is None or (event_time - previous_time) > session_gap
        if starts_new_session:
            current_session_id = str(uuid.uuid4())
        df.at[index, 'session_id'] = current_session_id
        previous_time = event_time

df = df.drop(columns=['date'])

Fix session ids


In [43]:
# Add checkout events
# URL fragments that mark a cart or checkout page
checkout_keywords = ['/checkout', '/cart', '/shoppingcart', '/bag']


def contains_checkout(url):
    """Return True when ``url`` contains any of ``checkout_keywords``.

    Missing values (None / NaN) are treated as not matching.
    """
    if pd.isnull(url):
        return False
    for keyword in checkout_keywords:
        if keyword in url:
            return True
    return False

# Filter rows that match checkout keywords
checkout_hits = df[df['url'].apply(contains_checkout)]

# Get the first hit per (session_id, domain)
first_checkout = (
    checkout_hits
    .sort_values('created_at')
    .groupby(['session_id', 'domain'], as_index=False)
    .first()
)

# Synthetic "checkout" rows: a copy of each first hit with type and payload
# overwritten. Built column-wise so the frame keeps its dtypes; going through
# iterrows would turn every column into object.
new_rows = first_checkout.assign(type='checkout', payload=None)

# Append the synthetic rows (when there are any)
if not new_rows.empty:
    df = pd.concat([df, new_rows], ignore_index=True)

# Always leave df ordered by created_at with a fresh index, whether or not
# rows were added. The stable sort keeps a synthetic row after the event it
# was copied from, since both share the same timestamp.
df = df.sort_values('created_at', kind='mergesort').reset_index(drop=True)

In [44]:
# Write the processed events to a CSV file, without the index column
output_path = "analytics.csv"
df.to_csv(output_path, index=False)


In [45]:
# List all events with type = "checkout"
df[df["type"] == "checkout"]


Unnamed: 0,id,type,url,payload,user_id,session_id,received_at,created_at,domain
32,19647,checkout,www.ticketmaster.dk/checkout/Z698xZC4Z1744-y/5...,,0b7cd927-52ef-434b-8b2e-9c63281c6327,2d1873d0-c505-4e2f-aeea-f75f87e5de69,2025-03-27T15:27:56.313968+00:00,2025-03-27T15:27:55.78+00:00,ticketmaster.dk
174,19811,checkout,shop.app/checkout/55096148034/cn/Z2NwLXVzLWNlb...,,a8813899-eb7b-49f9-9e97-7f91f995e944,6995f96a-0dea-4fb2-ba60-5366bbabca32,2025-03-31T13:22:37.471846+00:00,2025-03-31T13:22:36.906+00:00,shop.app
293,19972,checkout,www.amazon.com/cart/smart-wagon,,a584c10c-d236-408a-b544-50b6219dee39,d1a54359-192c-4e70-9fbe-f4f301958049,2025-04-02T12:11:13.404999+00:00,2025-04-02T12:11:12.747+00:00,amazon.com
613,20292,checkout,greenmind.dk/checkout/personaldetails,,ddb693bf-df31-4881-a0de-5cca527b0504,7da46224-a50d-41c6-901e-f1f133a44b1b,2025-04-02T12:51:31.586633+00:00,2025-04-02T12:51:31.391+00:00,greenmind.dk
978,20663,checkout,www.amazon.co.uk/gp/remotepagelet/signin/check...,,10fb5e0c-f579-4234-ab46-b609a9cfb5d0,52a07cba-1696-41e9-93db-e012da1a85dd,2025-04-03T07:43:41.069063+00:00,2025-04-03T07:43:39.866+00:00,amazon.co.uk
1041,20725,checkout,www.asos.com/bag,,10fb5e0c-f579-4234-ab46-b609a9cfb5d0,52a07cba-1696-41e9-93db-e012da1a85dd,2025-04-03T07:46:30.122721+00:00,2025-04-03T07:46:30.053+00:00,asos.com
1154,20837,checkout,www.zalando.dk/cart,,0ada4805-fb70-490e-993a-072b09ae0229,cc8608fa-9066-4c6b-ba76-36137007f5b8,2025-04-03T10:38:26.059313+00:00,2025-04-03T10:38:38.066+00:00,zalando.dk
1405,21089,checkout,www.apple.com/shop/bag,,7e54f2ee-060a-467e-b319-9e99d8e58e27,cb631236-7a98-4c37-b82c-83b625345dfe,2025-04-04T02:26:37.767639+00:00,2025-04-04T02:26:36.965+00:00,apple.com
1559,21241,checkout,shop.app/checkout/14159740/cn/Z2NwLXVzLWVhc3Qx...,,a8813899-eb7b-49f9-9e97-7f91f995e944,9712708c-13de-46ce-91f9-14c1b5721b49,2025-04-04T14:36:14.813176+00:00,2025-04-04T14:36:14.246+00:00,shop.app
1937,22194,checkout,www.ticketmaster.dk/checkout/Z698xZC4Z1744-y/4...,,e9275da9-716d-4c8c-8168-2702a491abee,b1c50f05-a5d2-4d80-99d8-50253a3703a8,2025-04-07T12:20:35.560741+00:00,2025-04-07T12:20:34.822+00:00,ticketmaster.dk


In [46]:
# One sample row per event type: keep the first row in which each value of the
# "type" column appears and drop every later repeat.
is_repeat_type = df["type"].duplicated(keep="first")
df_unique = df[~is_repeat_type]
df_unique


Unnamed: 0,id,type,url,payload,user_id,session_id,received_at,created_at,domain
0,19500,from-directs,www.lessextension.com/,"""anticonsumption""",less-website,1994df91-38bf-459b-aa1c-14a59bb77df2,2025-03-27T01:06:50.771229+00:00,2025-03-27T01:06:50.072+00:00,lessextension.com
5,19519,on-onboarding,www.lessextension.com/onboarding,"""""",none,da81867b-8ccb-4375-af8e-a75281b811de,2025-03-27T07:24:55.70906+00:00,2025-03-27T07:24:56.412+00:00,lessextension.com
11,19626,page-view,www.us.rains.com/wpm@e6387955w1e097182p885e99f...,,1790dc76-e30c-4520-b12a-dbf62998c65c,fb6259f3-f2fd-41a4-8dc3-43655715ce2b,2025-03-27T12:10:38.738921+00:00,2025-03-27T12:10:35.809+00:00,rains.com
14,19629,time-spent,www.us.rains.com/wpm@e6387955w1e097182p885e99f...,"{""duration"":2770}",1790dc76-e30c-4520-b12a-dbf62998c65c,fb6259f3-f2fd-41a4-8dc3-43655715ce2b,2025-03-27T12:10:40.729803+00:00,2025-03-27T12:10:38.343+00:00,rains.com
27,19642,welcome-modal-seen,www.ticketmaster.dk/event/555879,,0b7cd927-52ef-434b-8b2e-9c63281c6327,2d1873d0-c505-4e2f-aeea-f75f87e5de69,2025-03-27T15:27:14.463082+00:00,2025-03-27T15:27:13.93+00:00,ticketmaster.dk
32,19647,checkout,www.ticketmaster.dk/checkout/Z698xZC4Z1744-y/5...,,0b7cd927-52ef-434b-8b2e-9c63281c6327,2d1873d0-c505-4e2f-aeea-f75f87e5de69,2025-03-27T15:27:56.313968+00:00,2025-03-27T15:27:55.78+00:00,ticketmaster.dk
87,19701,from-directs-cta,www.lessextension.com/,"""shoppingaddiction""",less-website,7cf70bdd-7a30-4e1e-beca-b2a12b35ab20,2025-03-27T21:32:22.421951+00:00,2025-03-27T21:32:22.176+00:00,lessextension.com
108,19722,uninstall,www.lessextension.com/goodbye,,1e42348b-77f0-4eb2-911b-93e0e27bd8a0,a21640f6-2498-4dd7-b282-3172ae68e8fa,2025-03-29T16:14:40.952129+00:00,2025-03-29T16:14:36.678+00:00,lessextension.com
163,19779,open-popup,kcgblchgejkpnemehaojecgbamdiacml/popup.html,,5c7c5d5f-ad28-401e-b6f0-6d6f167cc726,8debd9e1-5630-4a78-8a24-3201e77cfb89,2025-03-31T09:32:24.938499+00:00,2025-03-31T09:32:24.233+00:00,shop.app
289,19971,add-to-cart,www.amazon.com/gp/product/1108724264/ref=as_li_tl,,a584c10c-d236-408a-b544-50b6219dee39,d1a54359-192c-4e70-9fbe-f4f301958049,2025-04-02T12:11:11.798597+00:00,2025-04-02T12:11:11.089+00:00,amazon.com


In [47]:
# Count how many events there are of each type (value_counts lists the most frequent type first)
event_counts = df["type"].value_counts()
print(event_counts)

type
time-spent                    7322
page-view                      757
from-directs                   213
on-onboarding                  177
from-directs-cta                42
add-to-cart                     37
uninstall                       28
open-popup                      25
checkout                        22
welcome-modal-seen              17
active                          15
enforce_wait_modal_shown        14
enforce_wait_info_expanded       7
enforce_wait_canceled            6
questionary-popup                4
open-options                     3
Name: count, dtype: int64


In [48]:
import re

# Count unique user IDs
# Define a regex pattern for UUIDv4
uuidv4_pattern = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$', re.IGNORECASE)

# Filter user_id's that match the UUIDv4 pattern. The isinstance guard keeps a
# missing (None / NaN) user_id from raising inside the regex match.
uuidv4 = df[df["user_id"].apply(lambda x: isinstance(x, str) and bool(uuidv4_pattern.match(x)))]
uuid_user_ids = set(uuidv4["user_id"])
unique_users = len(uuid_user_ids)

# Count uninstalls only for users that are part of the UUID population above.
# Otherwise placeholder ids would be subtracted from a total they were never
# counted in.
uninstalled = df[df["type"] == "uninstall"]
uninstalled_user_ids = set(uninstalled["user_id"]) & uuid_user_ids
unique_uninstalled = len(uninstalled_user_ids)
installed_user_ids = uuid_user_ids - uninstalled_user_ids
unique_installed = len(installed_user_ids)

# look at the last active event for each unique user. If the last event has a payload of "false", this means they've deactivated it.
# I'd like to know how many of the current installed users have deactivated the extension.
# Sort explicitly so "last" means latest created_at regardless of the current row order.
last_active = df[df["type"] == "active"].sort_values("created_at", kind="mergesort")
last_active = last_active.drop_duplicates(subset=["user_id"], keep="last")
last_active = last_active[last_active["payload"] == "false"]
# Restrict to users that are still installed, so a user who deactivated and
# later uninstalled is not subtracted twice.
last_active = last_active[last_active["user_id"].isin(installed_user_ids)]
unique_deactivated = last_active["user_id"].nunique()
totally_active = unique_installed - unique_deactivated
on_onboarding = df[(df["type"] == "on-onboarding") & (df["payload"].apply(lambda x: x != '""'))]

print("Installations:", len(on_onboarding), "\nUser activity seen from", unique_users, "\nuninstalled:", unique_uninstalled, "\ndeactivated:", unique_deactivated, "\nTotal registered active users:", totally_active)

Installations: 8 
User activity seen from 54 
uninstalled: 28 
deactivated: 3 
Total registered active users: 23


In [49]:
def printEvents(df_events: pd.DataFrame):
    """Print one line per event in chronological order, skipping "time-spent" events.

    Each line shows characters 5-18 of ``created_at``, the event type, the
    first five characters of the user and session ids, and the url.
    """
    ordered = df_events.sort_values("created_at", ascending=True)
    # "time-spent" events are left out of the listing
    visible = ordered.loc[ordered['type'] != 'time-spent']

    for _, event in visible.iterrows():
        user_tag = "U" + event['user_id'][:5]
        session_tag = "S" + event['session_id'][:5]
        timestamp = event['created_at'][5:19]
        print(f"{timestamp}    {event['type']}   {user_tag}, {session_tag} - {event['url']}")

In [50]:

# For every enforce_wait_modal_shown event, show the 20 events that follow it in df

# Find all enforce_wait_modal_shown events
enforce_wait_modal_shown_events = df[df['type'] == 'enforce_wait_modal_shown']

# Row positions of those events. Positions (not index labels) are what iloc
# needs, so this stays correct even if df no longer has a default 0..n-1
# index, and it avoids scanning the id column once per event.
event_positions = (df['type'] == 'enforce_wait_modal_shown').to_numpy().nonzero()[0]

for event_position in event_positions:
    # Get the next 20 events (fewer when the event is near the end of df)
    next_20_events = df.iloc[event_position + 1:event_position + 21]
    # Print the events
    print(next_20_events)



         id                        type  \
1480  21163                  time-spent   
1481  21164                  time-spent   
1482  21165  enforce_wait_info_expanded   
1483  21166                  time-spent   
1484  21167                  time-spent   
1485  21168                  time-spent   
1486  21169                  time-spent   
1487  21170                   uninstall   
1488  21171                from-directs   
1489  21172                from-directs   
1490  21173                from-directs   
1491  21174                from-directs   
1492  21175                from-directs   
1493  21176            from-directs-cta   
1494  21177                from-directs   
1495  21178            from-directs-cta   
1496  21179               on-onboarding   
1497  21180                   uninstall   
1498  21181               on-onboarding   
1499  21182               on-onboarding   

                                        url            payload  \
1480  secure6.store.apple.com/