In [124]:
shopify_domains = ["klaedeskabet.dk",
    "fashionnova.com",
    "kyliecosmetics.com",
    "colourpop.com",
    "jeffreestarcosmetics.com",
    "gymshark.com",
    "allbirds.com",
    "brooklinen.com",
    "ruggable.com",
    "ruggable.com",
    "chubbiesshorts.com",
    "chubbiesshorts.com",
    "puravidabracelets.com",
    "nativecos.com",
    "hauslabs.com",
    "skknbykim.com",
    "harney.com",
    "redbullshopus.com",
    "tula.com",
    "tula.com",
    "tesla.com",
    "spiritualgangster.com",
    "taylorstitch.com",
    "american-giant.com",
    "drsquatch.com",
    "mejuri.com",
    "mejuri.com",
    "peets.com",
    "deathwishcoffee.com",
    "hellotushy.com",
    "bando.com",
    "moroccanoil.com",
    "negativeunderwear.com",
    "birdies.com",
    "naadam.co",
    "popflexactive.com",
    "moderncitizen.com",
    "greatjonesgoods.com",
    "pinklily.com",
    "misen.com",
    "materialkitchen.com",
    "hedleyandbennett.com",
    "rumpl.com",
    "mizzenandmain.com",
    "ohpolly.com",
    "tecovas.com",
    "stance.com",
    "spongelle.com",
    "trueclassictees.com",
    "meundies.com",
    "studs.com",
    "jackhenry.co",
    "luxyhair.com",
    "juicycouture.com",
    "everlast.com",
    "skims.com",
    "feals.com",
    "foursigmatic.com",
    "golde.co",
    "liquid-iv.com",
    "thesill.com",
    "wearlively.com",
    "andieswim.com",
    "yourparade.com",
    "brightland.co",
    "omsom.com",
    "jenis.com",
    "snowehome.com",
    "graza.co",
    "flybyjing.com",
    "getmaude.com",
    "ugmonk.com",
    "shop.app"
]

In [125]:
import os
import requests
from dotenv import load_dotenv
import pandas as pd

# Load environment variables from .env file
load_dotenv()

SUPABASE_KEY = os.getenv("SUPABASE_KEY")

url = f"https://iukxcgvmzjfelwfrpkyi.supabase.co/rest/v1/analytics?apikey={SUPABASE_KEY}&select=*"
headers = {
    "apikey": SUPABASE_KEY,
    "Authorization": f"Bearer {SUPABASE_KEY}",
}

response = requests.get(url, headers=headers)

# Create a dataframe from the JSON response payload
data = response.json()


In [126]:
df = pd.DataFrame(data)
blacklisted_user_ids = []#["390c0190-33a3-4029-ae09-687bff0b77d6", "42a0286f-f933-490a-89aa-0c0037dae11e"]
should_delete = df[df["type"] == "delete-data"]
# Exclude all events where the url contains "music.apple" or "tv.apple"
df = df[~df["url"].str.contains("music.apple|tv.apple", na=False)]

blacklisted_user_ids.extend(should_delete["user_id"].tolist())
# Filter out rows where user_id is in the blacklisted_user_ids array
df = df[~df["user_id"].isin(blacklisted_user_ids)]

def get_top_domain(domain: str) -> str:
    if domain.startswith("www") or domain.startswith("ww2"):
        domain = domain[4:]
    
    d = domain.split("/")[0]

    return ".".join(d.split(".")[-3:]) if ".co.uk" in d else ".".join(d.split(".")[-2:])

# Add column that says if the user has a strategy or not
df["domain"] = df["url"].apply(lambda x: get_top_domain(x) if x != None else None)
# Define an empty array for blacklisted user ids


# AFTER:  2025-04-13 02:00
# BEFORE: 2025-04-26 10:00
df = df[(df["created_at"] >= "2025-03-27 01:00")]

In [127]:
# Fix shop.app URLs
shop_df = df[df["domain"] == "shop.app"]

# Count shop_df that contains /checkout and how many doesnt
checkout_df = shop_df[shop_df["url"].str.contains("/checkout|/pay/")]
no_checkout_df = shop_df[~shop_df["url"].str.contains("/checkout|/pay/")]

before_checkout_df_len = len(checkout_df)
before_no_checkout_df_len = len(no_checkout_df)

print("Filter shop.app URLs")

user_df = df.groupby("user_id")

# for each user_id iterate through the rows using index
for user_id, rows in user_df:
    # for each row in the user_id group, if url contains /checkout and is domain shop.app,
    previous_domain = None
    # then set domain to the previous domain
    for index, row in rows.iterrows():
        if not '.' in row["domain"] or row["domain"] == "shop.app" and ("/checkout" in row["url"] or "/pay/" in row["url"]):
            # Ensure index - 1 is valid
            if previous_domain is not None:
                if previous_domain in shopify_domains:
                    df.at[row.name, "domain"] = previous_domain
                else:
                    df.at[row.name, "domain"] = "shop.app"

        else:
            previous_domain = row["domain"]

shop_df = df[df["domain"] == "shop.app"]

# Count shop_df that contains /checkout and how many doesnt
checkout_df = shop_df[shop_df["url"].str.contains("/checkout|/pay/")]
no_checkout_df = shop_df[~shop_df["url"].str.contains("/checkout|/pay/")]

# print
print(f"Number of shop.app URLs with /checkout filtered from {before_checkout_df_len} to {len(checkout_df)}")
print(f"Number of shop.app URLs without /checkout filtered from {before_no_checkout_df_len} to {len(no_checkout_df)}")

Filter shop.app URLs
Number of shop.app URLs with /checkout filtered from 183 to 183
Number of shop.app URLs without /checkout filtered from 27 to 62


In [128]:
import uuid
from datetime import datetime, timedelta

print("Fix session ids")

# Ensure 'created_at' is in datetime format
df['date'] = pd.to_datetime(df['created_at'], format='mixed')

# Sort the dataframe by user_id and created_at
df = df.sort_values(by=['user_id', 'date']).reset_index(drop=True)

# Generate session IDs
current_session_id = None

for user_id, group in df.groupby('user_id'):
    current_session_id = str(uuid.uuid4())
    last_event_time = None

    for index, row in group.iterrows():
        if row['type'] != "uninstall" and row['session_id'] == "none":
            continue
        
        if last_event_time is None or (row['date'] - last_event_time) > timedelta(minutes=30):
            current_session_id = str(uuid.uuid4())
        df.at[index, 'session_id'] = current_session_id
        last_event_time = row['date']

df = df.drop(columns=['date'])

Fix session ids


In [129]:
# This script injects a checkout event into the dataframe for each session_id and domain pair.
# It will use the first URL that contains a checkout-related keyword as the basis for the new event.
"""# Add checkout events
checkout_keywords = ['/checkout', '/cart', '/shoppingcart', '/bag']

# Find first occurrence of checkout-related URLs per (session_id, domain)
def contains_checkout(url):
    if pd.isnull(url):
        return False
    return any(keyword in url for keyword in checkout_keywords)

# Filter rows that match checkout keywords
checkout_hits = df[df['url'].apply(contains_checkout)]

# Get the first hit per (session_id, domain)
first_checkout = (
    checkout_hits
    .sort_values('created_at')
    .groupby(['session_id', 'domain'], as_index=False)
    .first()
)

# Prepare new rows to inject
new_rows = []
for _, row in first_checkout.iterrows():
    new_row = row.copy()
    new_row['type'] = 'checkout'
    new_row['payload'] = None
    new_rows.append(new_row)

# Append new rows to df and re-sort by created_at
if new_rows:
    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)
    df = df.sort_values('created_at').reset_index(drop=True)"""

"# Add checkout events\ncheckout_keywords = ['/checkout', '/cart', '/shoppingcart', '/bag']\n\n# Find first occurrence of checkout-related URLs per (session_id, domain)\ndef contains_checkout(url):\n    if pd.isnull(url):\n        return False\n    return any(keyword in url for keyword in checkout_keywords)\n\n# Filter rows that match checkout keywords\ncheckout_hits = df[df['url'].apply(contains_checkout)]\n\n# Get the first hit per (session_id, domain)\nfirst_checkout = (\n    checkout_hits\n    .sort_values('created_at')\n    .groupby(['session_id', 'domain'], as_index=False)\n    .first()\n)\n\n# Prepare new rows to inject\nnew_rows = []\nfor _, row in first_checkout.iterrows():\n    new_row = row.copy()\n    new_row['type'] = 'checkout'\n    new_row['payload'] = None\n    new_rows.append(new_row)\n\n# Append new rows to df and re-sort by created_at\nif new_rows:\n    df = pd.concat([df, pd.DataFrame(new_rows)], ignore_index=True)\n    df = df.sort_values('created_at').reset_index(

In [130]:
# Add checkout events
checkout_keywords = ['/checkout', '/cart', '/shoppingcart', '/bag']

# Find first occurrence of checkout-related URLs per (session_id, domain)
def contains_checkout(url):
    if pd.isnull(url):
        return False
    return any(keyword in url for keyword in checkout_keywords)

# Filter rows that match checkout keywords
df["is_checkout"] = df['url'].apply(contains_checkout)

session_domain = df.sort_values('created_at').groupby(['session_id', 'domain'], as_index=False)

# Create list to store new checkout events
new_rows = []

def add_checkout_event(row):
    new_row = row.copy()
    new_row['type'] = 'checkout'
    new_row['payload'] = None
    new_rows.append(new_row)

# Process each group (session_id + domain combination)
for name, group in session_domain:
    session_id, domain = name
    
    # Find checkout events in this group
    checkout_events = group[group['is_checkout']]
    
    # If there are checkout events in this group, add a checkout type event
    if checkout_events.empty: continue

    checkout_cooldown = 0
    
    for _, row in group.iterrows():
        if not row['is_checkout']:
            if row['type'] == "page-view" and 0 < checkout_cooldown:
                checkout_cooldown -= 1
            continue

        if 0 < checkout_cooldown:
            checkout_cooldown = 3
            continue
        
        checkout_cooldown = 3
        add_checkout_event(row)


# Add the new checkout events to the dataframe
if new_rows:
    checkout_events_df = pd.DataFrame(new_rows)
    df = pd.concat([df, checkout_events_df], ignore_index=True)
    df = df.sort_values('created_at').reset_index(drop=True)

# Remove the temporary is_checkout column
df = df.drop(columns=['is_checkout'])

In [131]:
# TO CSV
df.to_csv("analytics.csv", index=False)


In [132]:
# For any domain that does not include a checkout event, write out all unique URLs visited
unique_urls = df[~df['session_id'].isin(first_checkout['session_id'])].groupby('domain')['url'].unique().reset_index()
# Print domains and their URLs that don't have checkout events
for _, row in unique_urls.iterrows():
    print(f"\n{row['domain']} URLs:")
    for url in row['url']:
        print(f"  - {url}")
    print("-" * 50)


aliexpress.com URLs:
  - www.aliexpress.com/item/1005007128414214.html
  - wp.aliexpress.com/wp.html
  - www.aliexpress.com/
  - www.aliexpress.com/w/wholesale-carbon-wheelset-700c.html
  - www.aliexpress.com/item/1005008929299411.html
  - www.aliexpress.com/w/wholesale-ent-disc-elitewheels.html
  - www.aliexpress.com/item/1005006902683405.html
--------------------------------------------------

amazon.co.uk URLs:
  - www.amazon.co.uk/
  - www.amazon.co.uk/stores/Amazon+Basics/page/5D96C4AA-F0F4-415F-90A4-B202C6B03A17&ref=gwais_337c3/
  - www.amazon.co.uk/ap/signin
  - www.amazon.co.uk/gp/remotepagelet/signin/checkout-perf-initiate-and-store.html
  - www.amazon.co.uk/Amazon-Basics-Enameled-Round-Covered/dp/B07B4VY154
--------------------------------------------------

amazon.com URLs:
  - www.amazon.com/Urnex-Dezcal-Activated-Remover-Tablets/dp/B00ZOUS6DO
  - www.amazon.com/dp/B084YSSJFN
  - www.amazon.com/gp/product/1108724264/ref=as_li_tl
  - read.amazon.com/sample/1108724264
  - ww

In [133]:
# List all with type = "place-order"
df[df["type"] == "uninstall"]

Unnamed: 0,id,type,url,payload,user_id,session_id,received_at,created_at,domain
224,19722,uninstall,www.lessextension.com/goodbye,,1e42348b-77f0-4eb2-911b-93e0e27bd8a0,7b4d2d17-01e9-4da8-a85e-30ae7030e4b1,2025-03-29T16:14:40.952129+00:00,2025-03-29T16:14:36.678+00:00,lessextension.com
254,19752,uninstall,www.lessextension.com/goodbye,,ab3ee523-dcce-4d92-a4e0-48a0eda7d4fb,0f8af110-db73-4c59-a863-eb37a9642edc,2025-03-31T08:34:03.7719+00:00,2025-03-31T08:34:03.004+00:00,lessextension.com
269,19767,uninstall,www.lessextension.com/goodbye,,c948e559-7720-4c5e-9794-980e7f6d6b94,e5642672-7ec4-45bc-a057-88a651caabe3,2025-03-31T08:40:14.051022+00:00,2025-03-31T08:40:13.94+00:00,lessextension.com
283,19781,uninstall,www.lessextension.com/goodbye,,5c7c5d5f-ad28-401e-b6f0-6d6f167cc726,651b509e-52e3-4e01-a17e-1d96c595b69b,2025-03-31T09:32:54.820853+00:00,2025-03-31T09:32:54.252+00:00,lessextension.com
1240,20755,uninstall,www.lessextension.com/goodbye,,10fb5e0c-f579-4234-ab46-b609a9cfb5d0,4df254f7-1a5a-4d20-a42f-e4d4fe9089c0,2025-04-03T09:05:39.570417+00:00,2025-04-03T09:05:39.381+00:00,lessextension.com
1657,21170,uninstall,www.lessextension.com/goodbye,,7e54f2ee-060a-467e-b319-9e99d8e58e27,22293b10-3bcf-4061-99a2-4546e2e5995f,2025-04-04T02:32:47.51322+00:00,2025-04-04T02:32:46.773+00:00,lessextension.com
1667,21180,uninstall,www.lessextension.com/goodbye,,37d04e88-8f59-43ad-9995-74233308bf80,ad1a6c11-5e50-469d-82ba-b497b55ae7fc,2025-04-04T07:36:51.522905+00:00,2025-04-04T07:36:50.737+00:00,lessextension.com
1673,21186,uninstall,www.lessextension.com/goodbye,,08dc12d7-599e-4c15-abd7-161efb715089,6f049df0-43a1-4517-926b-7698f56c8f92,2025-04-04T07:38:49.224459+00:00,2025-04-04T07:38:49.046+00:00,lessextension.com
1675,21188,uninstall,www.lessextension.com/goodbye,,610227fe-d8db-42aa-b8c7-c8b64cb2534a,ae6d2ec6-4329-42b8-88d5-2e1205efd633,2025-04-04T07:41:06.886669+00:00,2025-04-04T07:41:06.447+00:00,lessextension.com
2537,22039,uninstall,www.lessextension.com/goodbye,,f5a8c9e4-e013-4ee1-a850-ee409f5b1967,e9bc37d1-19a6-42f8-a1ce-b08aa6b6a095,2025-04-07T05:27:57.852175+00:00,2025-04-07T05:27:57.397+00:00,lessextension.com


In [134]:
# Create a new data frame, where it only includes a single sample of each type of event. The event is the "type" column.
df_unique = df.drop_duplicates(subset=["type"])
df_unique


Unnamed: 0,id,type,url,payload,user_id,session_id,received_at,created_at,domain
0,19500,from-directs,www.lessextension.com/,"""anticonsumption""",less-website,none,2025-03-27T01:06:50.771229+00:00,2025-03-27T01:06:50.072+00:00,lessextension.com
5,19505,page-view,foto.foetex.dk/billeder.html,,390c0190-33a3-4029-ae09-687bff0b77d6,5cf5697c-6443-4b1a-bb4a-75d8df28b073,2025-03-27T07:23:39.255773+00:00,2025-03-27T07:23:38.148+00:00,foetex.dk
6,19506,time-spent,foto.foetex.dk/billeder.html,"{""duration"":5009}",390c0190-33a3-4029-ae09-687bff0b77d6,5cf5697c-6443-4b1a-bb4a-75d8df28b073,2025-03-27T07:23:43.734083+00:00,2025-03-27T07:23:43.128+00:00,foetex.dk
19,19519,on-onboarding,www.lessextension.com/onboarding,"""""",none,none,2025-03-27T07:24:55.70906+00:00,2025-03-27T07:24:56.412+00:00,lessextension.com
48,19547,checkout,foto.foetex.dk/checkout.html,,390c0190-33a3-4029-ae09-687bff0b77d6,5cf5697c-6443-4b1a-bb4a-75d8df28b073,2025-03-27T07:48:53.399341+00:00,2025-03-27T07:48:52.814+00:00,foetex.dk
143,19642,welcome-modal-seen,www.ticketmaster.dk/event/555879,,0b7cd927-52ef-434b-8b2e-9c63281c6327,fc8da7cc-6173-4fa6-85d8-37efe5935f32,2025-03-27T15:27:14.463082+00:00,2025-03-27T15:27:13.93+00:00,ticketmaster.dk
203,19701,from-directs-cta,www.lessextension.com/,"""shoppingaddiction""",less-website,none,2025-03-27T21:32:22.421951+00:00,2025-03-27T21:32:22.176+00:00,lessextension.com
224,19722,uninstall,www.lessextension.com/goodbye,,1e42348b-77f0-4eb2-911b-93e0e27bd8a0,7b4d2d17-01e9-4da8-a85e-30ae7030e4b1,2025-03-29T16:14:40.952129+00:00,2025-03-29T16:14:36.678+00:00,lessextension.com
281,19779,open-popup,kcgblchgejkpnemehaojecgbamdiacml/popup.html,,5c7c5d5f-ad28-401e-b6f0-6d6f167cc726,651b509e-52e3-4e01-a17e-1d96c595b69b,2025-03-31T09:32:24.938499+00:00,2025-03-31T09:32:24.233+00:00,kcgblchgejkpnemehaojecgbamdiacml
459,19971,add-to-cart,www.amazon.com/gp/product/1108724264/ref=as_li_tl,,a584c10c-d236-408a-b544-50b6219dee39,d93cae7c-4cf9-4c13-92e5-4e8cf42ec2ef,2025-04-02T12:11:11.798597+00:00,2025-04-02T12:11:11.089+00:00,amazon.com


In [135]:
# count different types of events
event_counts = df["type"].value_counts()
print(event_counts)
print("Total number of events: ", len(df))

type
time-spent                            11711
page-view                              1187
from-directs                            220
on-onboarding                           181
checkout                                 54
add-to-cart                              54
from-directs-cta                         44
uninstall                                32
open-popup                               31
enforce_wait_modal_shown                 27
active                                   19
welcome-modal-seen                       18
questionary-popup                        15
enforce_wait_canceled                    12
enforce_wait_info_expanded               10
questionary-closed                        7
open-options                              4
enforce_wait_permit_valid_on_click        4
place-order                               2
questionary-finished                      2
Name: count, dtype: int64
Total number of events:  13634


In [136]:
import re

# Count unique user IDs
# Define a regex pattern for UUIDv4
uuidv4_pattern = re.compile(r'^[a-f0-9]{8}-[a-f0-9]{4}-4[a-f0-9]{3}-[89ab][a-f0-9]{3}-[a-f0-9]{12}$', re.IGNORECASE)

# Filter user_id's that match the UUIDv4 pattern
uuidv4 = df[df["user_id"].apply(lambda x: bool(uuidv4_pattern.match(x)))]
unique_users = uuidv4["user_id"].nunique()

uninstalled = df[df["type"] == "uninstall"]
unique_uninstalled = uninstalled["user_id"].nunique()
unique_installed = unique_users - unique_uninstalled

# look at the last active event for each unique user. If the last event has a payload of "false", this means they've deactivated it.
# I'd like to know how many of the current installed users have deactivated the extension.
last_active = df[df["type"] == "active"]
last_active = last_active.drop_duplicates(subset=["user_id"], keep="last")
last_active = last_active[last_active["payload"] == "false"]
unique_deactivated = last_active["user_id"].nunique()
totally_active = unique_installed - unique_deactivated
on_onboarding = df[(df["type"] == "on-onboarding") & (df["payload"].apply(lambda x: x != '""'))]

print("Installations:", len(on_onboarding), "\nUser activity seen from", unique_users, "\nuninstalled:", unique_uninstalled, "\ndeactivated:", unique_deactivated, "\nTotal registered active users:", totally_active)

Installations: 8 
User activity seen from 60 
uninstalled: 32 
deactivated: 5 
Total registered active users: 23


In [137]:
def printEvents(df_events: pd.DataFrame):
    df_events = df_events.sort_values("created_at", ascending=True)
    # Filter all events of type "time-spent"
    df_events = df_events[df_events['type'] != 'time-spent']

    for _, row in df_events.iterrows():
        short_user_id = row['user_id'][:5]
        short_session_id = row['session_id'][:5]
        short_time = row['created_at'][5:19]
        print(f"{short_time}    {row['type']}   U{short_user_id}, S{short_session_id} - {row['url']}")

In [138]:

# For every enforce_wait_modal_shown type event. Show the following 20 events

# Find all enforce_wait_modal_shown events
enforce_wait_modal_shown_events = df[df['type'] == 'enforce_wait_modal_shown']

# For each event, do something
for _, event in enforce_wait_modal_shown_events.iterrows():
    # Find index of the event in the original dataframe
    event_index = df.index[df['id'] == event['id']][0]
    
    # Get the next 20 events
    next_20_events = df.iloc[event_index + 1:event_index + 21]
    # Print the events
    print(next_20_events)



         id                        type  \
1650  21163                  time-spent   
1651  21164                  time-spent   
1652  21165  enforce_wait_info_expanded   
1653  21166                  time-spent   
1654  21167                  time-spent   
1655  21168                  time-spent   
1656  21169                  time-spent   
1657  21170                   uninstall   
1658  21171                from-directs   
1659  21172                from-directs   
1660  21173                from-directs   
1661  21174                from-directs   
1662  21175                from-directs   
1663  21176            from-directs-cta   
1664  21177                from-directs   
1665  21178            from-directs-cta   
1666  21179               on-onboarding   
1667  21180                   uninstall   
1668  21181               on-onboarding   
1669  21182               on-onboarding   

                                        url            payload  \
1650  secure6.store.apple.com/