In [81]:
# Storefront domains that a shop.app checkout may be attributed back to.
# Each domain is listed once; the list is only used for membership tests.
shopify_domains = ["klaedeskabet.dk",
    "fashionnova.com",
    "kyliecosmetics.com",
    "colourpop.com",
    "jeffreestarcosmetics.com",
    "gymshark.com",
    "allbirds.com",
    "brooklinen.com",
    "ruggable.com",
    "chubbiesshorts.com",
    "puravidabracelets.com",
    "nativecos.com",
    "hauslabs.com",
    "skknbykim.com",
    "harney.com",
    "redbullshopus.com",
    "tula.com",
    "tesla.com",
    "spiritualgangster.com",
    "taylorstitch.com",
    "american-giant.com",
    "drsquatch.com",
    "mejuri.com",
    "peets.com",
    "deathwishcoffee.com",
    "hellotushy.com",
    "bando.com",
    "moroccanoil.com",
    "negativeunderwear.com",
    "birdies.com",
    "naadam.co",
    "popflexactive.com",
    "moderncitizen.com",
    "greatjonesgoods.com",
    "pinklily.com",
    "misen.com",
    "materialkitchen.com",
    "hedleyandbennett.com",
    "rumpl.com",
    "mizzenandmain.com",
    "ohpolly.com",
    "tecovas.com",
    "stance.com",
    "spongelle.com",
    "trueclassictees.com",
    "meundies.com",
    "studs.com",
    "jackhenry.co",
    "luxyhair.com",
    "juicycouture.com",
    "everlast.com",
    "skims.com",
    "feals.com",
    "foursigmatic.com",
    "golde.co",
    "liquid-iv.com",
    "thesill.com",
    "wearlively.com",
    "andieswim.com",
    "yourparade.com",
    "brightland.co",
    "omsom.com",
    "jenis.com",
    "snowehome.com",
    "graza.co",
    "flybyjing.com",
    "getmaude.com",
    "ugmonk.com",
    "shop.app"
]

In [82]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from .env file
load_dotenv()

SUPABASE_KEY = os.getenv("SUPABASE_KEY")
if not SUPABASE_KEY:
    # Fail here with a clear message instead of sending "None" as the key.
    raise RuntimeError("SUPABASE_KEY is not set; add it to the environment or the .env file")

# The key is sent in headers only, so it never appears in the request URL
# (URLs tend to end up in logs, tracebacks and saved notebook outputs).
url = "https://iukxcgvmzjfelwfrpkyi.supabase.co/rest/v1/analytics"
headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
}

# A timeout keeps the cell from hanging forever on a stalled connection.
response = requests.get(url, headers=headers, params={"select": "*"}, timeout=30)
# Surface HTTP errors now rather than building a dataframe from an error payload.
response.raise_for_status()

# Decode the JSON response payload; the next cell builds the dataframe from it
data = response.json()


In [None]:
# Build the events dataframe from the fetched JSON records.
df = pd.DataFrame(data)

# User ids that are always excluded from the analysis.
blacklisted_user_ids = ["390c0190-33a3-4029-ae09-687bff0b77d6", "42a0286f-f933-490a-89aa-0c0037dae11e"]

# "delete-data" events are collected before any filtering so that the users who
# sent them can be added to the blacklist below.
should_delete = df[df["type"] == "delete-data"]

# Exclude all events where the url contains "music.apple" or "tv.apple".
# The dots are escaped so they match a literal "." and not any character.
df = df[~df["url"].str.contains(r"music\.apple|tv\.apple", na=False)]

blacklisted_user_ids.extend(should_delete["user_id"].tolist())
# Filter out rows where user_id is in the blacklisted_user_ids array
df = df[~df["user_id"].isin(blacklisted_user_ids)]

def get_top_domain(domain: str) -> str:
    """Reduce a scheme-less URL to its top-level site domain.

    A leading "www." or "ww2." label is removed, everything from the first "/"
    onwards is discarded, and the last two dot-separated labels of the host
    are returned ("foto.foetex.dk/x" -> "foetex.dk"). Hosts ending in
    ".co.uk" keep three labels ("www.amazon.co.uk" -> "amazon.co.uk").
    A host without any dot is returned unchanged.

    Args:
        domain: URL without scheme, e.g. "www.amazon.com/dp/B084YSSJFN".

    Returns:
        The shortened domain string.
    """
    # Strip the prefix only when it is a complete label (including the dot);
    # a host such as "wwwhatever.com" must not lose its first four characters.
    for prefix in ("www.", "ww2."):
        if domain.startswith(prefix):
            domain = domain[len(prefix):]
            break

    host = domain.split("/")[0]
    labels = host.split(".")

    # ".co.uk" is a two-label public suffix, so one extra label is kept.
    # It is tested as a suffix, not as a substring anywhere in the host.
    keep = 3 if host.endswith(".co.uk") else 2
    return ".".join(labels[-keep:])

# Add a "domain" column derived from each event's URL; rows without a URL
# (None or NaN) get no domain.
df["domain"] = df["url"].apply(lambda x: get_top_domain(x) if pd.notna(x) else None)


# Alternative analysis window kept for reference:
# AFTER:  2025-04-13 02:00
# BEFORE: 2025-04-26 10:00
#
# Compare parsed timestamps, not raw strings: created_at is ISO formatted with
# a "T" separator, and "T" sorts after " ", so a plain string comparison
# against "2025-03-27 01:00" would ignore the time of day on the cutoff date.
# NOTE(review): the cutoff is interpreted as UTC, matching the +00:00 offset
# seen on created_at values - confirm this is the intended timezone.
created_at_utc = pd.to_datetime(df["created_at"], format="mixed", utc=True)
df = df[created_at_utc >= pd.Timestamp("2025-03-27 01:00", tz="UTC")]

In [84]:
# Fix shop.app URLs: attribute shop.app checkout/pay pages (and events whose
# "domain" has no dot) to the domain the same user was on just before.
shop_df = df[df["domain"] == "shop.app"]

# Count the shop.app rows with and without a checkout/pay URL before the fix
checkout_df = shop_df[shop_df["url"].str.contains("/checkout|/pay/", na=False)]
no_checkout_df = shop_df[~shop_df["url"].str.contains("/checkout|/pay/", na=False)]

before_checkout_df_len = len(checkout_df)
before_no_checkout_df_len = len(no_checkout_df)

print("Filter shop.app URLs")

user_df = df.groupby("user_id")

# Set lookup instead of scanning the list once per row.
shopify_domain_set = set(shopify_domains)

# Walk each user's events in dataframe order.
for user_id, rows in user_df:
    # Reset per user: the previous domain must never carry over from another
    # user, and must be defined before the first row is examined.
    previous_domain = None

    for index, row in rows.iterrows():
        domain = row["domain"]
        if not isinstance(domain, str):
            # No URL/domain on this event: nothing to rewrite or remember.
            continue

        # A domain without a dot is not a real host; in this data set such
        # values come from URLs like "<id>/popup.html".
        has_no_host = "." not in domain
        is_shop_checkout = domain == "shop.app" and (
            "/checkout" in row["url"] or "/pay/" in row["url"]
        )

        if has_no_host or is_shop_checkout:
            # Without an earlier domain for this user the row is left as is.
            if previous_domain is not None:
                if previous_domain in shopify_domain_set:
                    df.at[index, "domain"] = previous_domain
                else:
                    df.at[index, "domain"] = "shop.app"
        else:
            previous_domain = domain

shop_df = df[df["domain"] == "shop.app"]

# Same counts again after the fix
checkout_df = shop_df[shop_df["url"].str.contains("/checkout|/pay/", na=False)]
no_checkout_df = shop_df[~shop_df["url"].str.contains("/checkout|/pay/", na=False)]

# print
print(f"Number of shop.app URLs with /checkout filtered from {before_checkout_df_len} to {len(checkout_df)}")
print(f"Number of shop.app URLs without /checkout filtered from {before_no_checkout_df_len} to {len(no_checkout_df)}")

Filter shop.app URLs
Number of shop.app URLs with /checkout filtered from 183 to 183
Number of shop.app URLs without /checkout filtered from 27 to 75


In [85]:
import uuid
from datetime import datetime, timedelta

print("Fix session ids")

# Temporary parsed-timestamp column; it is dropped again at the end of the cell
df['date'] = pd.to_datetime(df['created_at'], format='mixed')

# Order events per user chronologically and renumber the index
df = df.sort_values(by=['user_id', 'date']).reset_index(drop=True)

# A gap longer than this between two consecutive events starts a new session
session_gap = timedelta(minutes=30)

current_session_id = None

for user_id, group in df.groupby('user_id'):
    current_session_id = str(uuid.uuid4())
    last_event_time = None

    # Events whose session_id is the literal "none" keep it and do not
    # take part in the gap calculation.
    tracked_times = group.loc[group['session_id'] != "none", 'date']

    for index, event_time in tracked_times.items():
        starts_new_session = (
            last_event_time is None
            or (event_time - last_event_time) > session_gap
        )
        if starts_new_session:
            current_session_id = str(uuid.uuid4())

        df.at[index, 'session_id'] = current_session_id
        last_event_time = event_time

df = df.drop(columns=['date'])

Fix session ids


In [86]:
# Add checkout events
# URL fragments that mark a page as checkout-related
checkout_keywords = ['/checkout', '/cart', '/shoppingcart', '/bag']


def contains_checkout(url):
    """Return True when `url` contains any of the `checkout_keywords`.

    Null values (None / NaN) are reported as False.
    """
    if pd.isnull(url):
        return False

    for keyword in checkout_keywords:
        if keyword in url:
            return True
    return False

# Filter rows that match checkout keywords
checkout_hits = df[df['url'].apply(contains_checkout)]

# Get the first hit per (session_id, domain).
# drop_duplicates keeps the whole earliest ROW of each pair. GroupBy.first()
# instead returns the first non-null value of every column independently and
# can therefore combine values from different rows. Rows with a missing key
# are dropped explicitly, which is what the groupby did implicitly.
first_checkout = (
    checkout_hits
    .dropna(subset=['session_id', 'domain'])
    .sort_values('created_at', kind='stable')
    .drop_duplicates(subset=['session_id', 'domain'], keep='first')
)

# Synthetic "checkout" events: a copy of each first hit with the type replaced
# and the payload cleared (all other columns, including id, are kept).
new_rows = first_checkout.assign(type='checkout', payload=None)

# Append new rows to df and re-sort by created_at. The stable sort keeps an
# injected row after the original event it was copied from.
if not new_rows.empty:
    df = pd.concat([df, new_rows], ignore_index=True)
    df = df.sort_values('created_at', kind='stable').reset_index(drop=True)

In [87]:
# Export the current events dataframe to analytics.csv (row index not written)
df.to_csv("analytics.csv", index=False)


In [88]:
# List, per domain, the unique URLs of all events whose session has no
# checkout hit (i.e. whose session_id does not appear in first_checkout).
# NOTE(review): the filter is per session, not per domain - a domain is listed
# here even if a different session reached checkout on it.
sessions_with_checkout = first_checkout['session_id']
events_without_checkout = df[~df['session_id'].isin(sessions_with_checkout)]
unique_urls = events_without_checkout.groupby('domain')['url'].unique().reset_index()

# Print each domain followed by its URLs
for domain, urls in zip(unique_urls['domain'], unique_urls['url']):
    print(f"\n{domain} URLs:")
    for url in urls:
        print(f"  - {url}")
    print("-" * 50)


aliexpress.com URLs:
  - www.aliexpress.com/item/1005007128414214.html
  - wp.aliexpress.com/wp.html
--------------------------------------------------

amazon.com URLs:
  - www.amazon.com/Urnex-Dezcal-Activated-Remover-Tablets/dp/B00ZOUS6DO
  - www.amazon.com/dp/B084YSSJFN
  - www.amazon.com/b/
  - www.amazon.com/kindle-dbs/hz/subscribe/ku/
  - www.amazon.com/s
  - www.amazon.com/Verity-Colleen-Hoover-ebook/dp/B09H6T8LTR/ref=sr_1_1
  - read.amazon.com/sample/B09H6T8LTR
  - www.amazon.com/gp/product/B08P4PMS3L/ref=as_li_tl
  - aws.amazon.com/
  - aws.amazon.com/solutions/
  - aws.amazon.com/startups
  - aws.amazon.com/startups/lp/aws-gen-ai-lofts
  - www.amazon.com/Human-Anatomy-Physiology-Elaine-Marieb/dp/0134580990
  - www.amazon.com/dp/0134580990/ref=olp-opf-redir
  - gaming.amazon.com/minecraft-legends-pc/dp/amzn1.pg.item.7ee3a5d2-b415-43e1-869d-a38fa8047a4c
  - www.amazon.com/Linkind-Matter-Certified-A19-Equivalent/dp/B0BHRZFJDN
  - www.amazon.com/Linkind-Google-Changing-Dimmable

In [89]:
# List all events with type = "checkout"
df[df["type"] == "checkout"]

Unnamed: 0,id,type,url,payload,user_id,session_id,received_at,created_at,domain
48,19547,checkout,foto.foetex.dk/checkout.html,,390c0190-33a3-4029-ae09-687bff0b77d6,9697c64b-95f7-4e89-9e8b-69e6af4c301d,2025-03-27T07:48:53.399341+00:00,2025-03-27T07:48:52.814+00:00,foetex.dk
148,19647,checkout,www.ticketmaster.dk/checkout/Z698xZC4Z1744-y/5...,,0b7cd927-52ef-434b-8b2e-9c63281c6327,da80964f-f8ac-464f-a121-8d935a191770,2025-03-27T15:27:56.313968+00:00,2025-03-27T15:27:55.78+00:00,ticketmaster.dk
313,19811,checkout,shop.app/checkout/55096148034/cn/Z2NwLXVzLWNlb...,,a8813899-eb7b-49f9-9e97-7f91f995e944,d933eb71-63ab-44ee-88d1-0bc587a69d3c,2025-03-31T13:22:37.471846+00:00,2025-03-31T13:22:36.906+00:00,shop.app
462,19972,checkout,www.amazon.com/cart/smart-wagon,,a584c10c-d236-408a-b544-50b6219dee39,cba2072d-2ecd-48c8-9971-b9e58e7c8bd1,2025-04-02T12:11:13.404999+00:00,2025-04-02T12:11:12.747+00:00,amazon.com
783,20292,checkout,greenmind.dk/checkout/personaldetails,,ddb693bf-df31-4881-a0de-5cca527b0504,e6e1b990-850e-42f3-a1fe-eedbcd543980,2025-04-02T12:51:31.586633+00:00,2025-04-02T12:51:31.391+00:00,greenmind.dk
1147,20663,checkout,www.amazon.co.uk/gp/remotepagelet/signin/check...,,10fb5e0c-f579-4234-ab46-b609a9cfb5d0,c6049eca-8699-423f-807b-a6713d7640f8,2025-04-03T07:43:41.069063+00:00,2025-04-03T07:43:39.866+00:00,amazon.co.uk
1209,20725,checkout,www.asos.com/bag,,10fb5e0c-f579-4234-ab46-b609a9cfb5d0,c6049eca-8699-423f-807b-a6713d7640f8,2025-04-03T07:46:30.122721+00:00,2025-04-03T07:46:30.053+00:00,asos.com
1322,20837,checkout,www.zalando.dk/cart,,0ada4805-fb70-490e-993a-072b09ae0229,3e7861f9-4b1b-4a7f-8afa-73b62ffd5830,2025-04-03T10:38:26.059313+00:00,2025-04-03T10:38:38.066+00:00,zalando.dk
1575,21089,checkout,www.apple.com/shop/bag,,7e54f2ee-060a-467e-b319-9e99d8e58e27,1cc92628-793b-4f04-a0f5-9aec98fdcfab,2025-04-04T02:26:37.767639+00:00,2025-04-04T02:26:36.965+00:00,apple.com
1728,21241,checkout,shop.app/checkout/14159740/cn/Z2NwLXVzLWVhc3Qx...,,a8813899-eb7b-49f9-9e97-7f91f995e944,ec0f7bae-0b79-4704-bb53-9d79f069823b,2025-04-04T14:36:14.813176+00:00,2025-04-04T14:36:14.246+00:00,shop.app


In [90]:
# One sample row per event type: keep only the first row in which each
# value of the "type" column appears.
is_first_of_type = ~df.duplicated(subset=["type"], keep="first")
df_unique = df[is_first_of_type]
df_unique


Unnamed: 0,id,type,url,payload,user_id,session_id,received_at,created_at,domain
0,19500,from-directs,www.lessextension.com/,"""anticonsumption""",less-website,none,2025-03-27T01:06:50.771229+00:00,2025-03-27T01:06:50.072+00:00,lessextension.com
5,19505,page-view,foto.foetex.dk/billeder.html,,390c0190-33a3-4029-ae09-687bff0b77d6,9697c64b-95f7-4e89-9e8b-69e6af4c301d,2025-03-27T07:23:39.255773+00:00,2025-03-27T07:23:38.148+00:00,foetex.dk
6,19506,time-spent,foto.foetex.dk/billeder.html,"{""duration"":5009}",390c0190-33a3-4029-ae09-687bff0b77d6,9697c64b-95f7-4e89-9e8b-69e6af4c301d,2025-03-27T07:23:43.734083+00:00,2025-03-27T07:23:43.128+00:00,foetex.dk
19,19519,on-onboarding,www.lessextension.com/onboarding,"""""",none,none,2025-03-27T07:24:55.70906+00:00,2025-03-27T07:24:56.412+00:00,lessextension.com
48,19547,checkout,foto.foetex.dk/checkout.html,,390c0190-33a3-4029-ae09-687bff0b77d6,9697c64b-95f7-4e89-9e8b-69e6af4c301d,2025-03-27T07:48:53.399341+00:00,2025-03-27T07:48:52.814+00:00,foetex.dk
143,19642,welcome-modal-seen,www.ticketmaster.dk/event/555879,,0b7cd927-52ef-434b-8b2e-9c63281c6327,da80964f-f8ac-464f-a121-8d935a191770,2025-03-27T15:27:14.463082+00:00,2025-03-27T15:27:13.93+00:00,ticketmaster.dk
203,19701,from-directs-cta,www.lessextension.com/,"""shoppingaddiction""",less-website,none,2025-03-27T21:32:22.421951+00:00,2025-03-27T21:32:22.176+00:00,lessextension.com
224,19722,uninstall,www.lessextension.com/goodbye,,1e42348b-77f0-4eb2-911b-93e0e27bd8a0,none,2025-03-29T16:14:40.952129+00:00,2025-03-29T16:14:36.678+00:00,lessextension.com
281,19779,open-popup,kcgblchgejkpnemehaojecgbamdiacml/popup.html,,5c7c5d5f-ad28-401e-b6f0-6d6f167cc726,c9edde6f-17f9-4b8a-af11-1d58e24e50ab,2025-03-31T09:32:24.938499+00:00,2025-03-31T09:32:24.233+00:00,shop.app
459,19971,add-to-cart,www.amazon.com/gp/product/1108724264/ref=as_li_tl,,a584c10c-d236-408a-b544-50b6219dee39,cba2072d-2ecd-48c8-9971-b9e58e7c8bd1,2025-04-02T12:11:11.798597+00:00,2025-04-02T12:11:11.089+00:00,amazon.com


In [91]:
# Number of events per event type, plus the overall row count
event_counts = df["type"].value_counts()
total_events = df.shape[0]

print(event_counts)
print("Total number of events: ", total_events)

type
time-spent                            10798
page-view                              1093
from-directs                            215
on-onboarding                           179
add-to-cart                              51
from-directs-cta                         42
checkout                                 38
uninstall                                30
open-popup                               28
enforce_wait_modal_shown                 25
welcome-modal-seen                       18
active                                   16
enforce_wait_canceled                    11
enforce_wait_info_expanded                9
questionary-popup                         6
open-options                              4
enforce_wait_permit_valid_on_click        2
questionary-closed                        2
place-order                               1
questionary-finished                      1
Name: count, dtype: int64
Total number of events:  12569


In [92]:
import re

# Count unique user IDs
# Define a regex pattern for UUIDv4
uuidv4_pattern = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$', re.IGNORECASE)

# Keep only events whose user_id is a UUIDv4 string; other values (including
# non-string / missing ids) are excluded rather than raising in the regex.
has_uuid_user = df["user_id"].apply(lambda x: isinstance(x, str) and bool(uuidv4_pattern.match(x)))
uuidv4 = df[has_uuid_user]
unique_users = uuidv4["user_id"].nunique()

# Uninstalls are counted within the same UUIDv4 population as unique_users,
# so the subtraction below compares like with like.
uninstalled = uuidv4[uuidv4["type"] == "uninstall"]
uninstalled_user_ids = set(uninstalled["user_id"])
unique_uninstalled = len(uninstalled_user_ids)
unique_installed = unique_users - unique_uninstalled

# Look at the last "active" event of each still-installed user. If that event
# has a payload of "false", the user has deactivated the extension.
# Uninstalled users are excluded here; otherwise a user who deactivated and
# then uninstalled would be subtracted twice from the active total.
last_active = uuidv4[(uuidv4["type"] == "active") & ~uuidv4["user_id"].isin(uninstalled_user_ids)]
# Sort explicitly so "last" means latest created_at regardless of df order.
last_active = last_active.sort_values("created_at", kind="stable")
last_active = last_active.drop_duplicates(subset=["user_id"], keep="last")
last_active = last_active[last_active["payload"] == "false"]
unique_deactivated = last_active["user_id"].nunique()
totally_active = unique_installed - unique_deactivated

# on-onboarding events whose payload is anything other than the literal '""'
on_onboarding = df[(df["type"] == "on-onboarding") & (df["payload"].apply(lambda x: x != '""'))]

print("Installations:", len(on_onboarding), "\nUser activity seen from", unique_users, "\nuninstalled:", unique_uninstalled, "\ndeactivated:", unique_deactivated, "\nTotal registered active users:", totally_active)

Installations: 8 
User activity seen from 58 
uninstalled: 30 
deactivated: 4 
Total registered active users: 24


In [93]:
def printEvents(df_events: pd.DataFrame):
    """Print a compact, chronological one-line-per-event log.

    Events are ordered by `created_at` ascending and events of type
    "time-spent" are left out. Each line shows characters 5-18 of
    `created_at`, the event type, the first five characters of the user id
    and of the session id, and the URL.

    Args:
        df_events: events with string columns `created_at`, `type`,
            `user_id`, `session_id` and a `url` column.
    """
    ordered = df_events.sort_values("created_at", ascending=True)
    visible = ordered[ordered['type'] != 'time-spent']

    columns = (
        visible['created_at'],
        visible['type'],
        visible['user_id'],
        visible['session_id'],
        visible['url'],
    )
    for created_at, event_type, user_id, session_id, url in zip(*columns):
        print(f"{created_at[5:19]}    {event_type}   U{user_id[:5]}, S{session_id[:5]} - {url}")

In [94]:

# For every enforce_wait_modal_shown type event, show the following 20 events
# (the next 20 rows of df in its current order, across all users).

# Find all enforce_wait_modal_shown events
enforce_wait_modal_shown_events = df[df['type'] == 'enforce_wait_modal_shown']

for event_label in enforce_wait_modal_shown_events.index:
    # Translate the row label into a position for iloc. Using the label of the
    # row itself avoids a full scan of df per event and stays unambiguous even
    # though injected checkout rows share their id with the event they copy.
    event_index = df.index.get_loc(event_label)

    # Get the next 20 events
    next_20_events = df.iloc[event_index + 1:event_index + 21]
    # Print the events
    print(next_20_events)



         id                        type  \
1649  21163                  time-spent   
1650  21164                  time-spent   
1651  21165  enforce_wait_info_expanded   
1652  21166                  time-spent   
1653  21167                  time-spent   
1654  21168                  time-spent   
1655  21169                  time-spent   
1656  21170                   uninstall   
1657  21171                from-directs   
1658  21172                from-directs   
1659  21173                from-directs   
1660  21174                from-directs   
1661  21175                from-directs   
1662  21176            from-directs-cta   
1663  21177                from-directs   
1664  21178            from-directs-cta   
1665  21179               on-onboarding   
1666  21180                   uninstall   
1667  21181               on-onboarding   
1668  21182               on-onboarding   

                                        url            payload  \
1649  secure6.store.apple.com/