In [1]:
# open csv into a list of dictionaries
import csv
import os


def read_csv(file, encoding=None):
    """Load a CSV file into a list of row dictionaries.

    Args:
        file: Path of the CSV file to read.
        encoding: Text encoding passed to ``open``. ``None`` (the default)
            keeps the platform default encoding, as before.

    Returns:
        A list with one dict per data row; the keys come from the first
        row of the file, which ``csv.DictReader`` uses as field names.
    """
    # newline="" hands newline handling to the csv module, so line breaks
    # embedded in quoted fields come back unchanged instead of being
    # rewritten by universal-newline translation.
    with open(file, "r", newline="", encoding=encoding) as f:
        return list(csv.DictReader(f))


# Load the rows of user_web.csv as a list of dicts (one per CSV row)
user_data = read_csv(file="user_web.csv")

In [2]:
# Preview only the first few rows: echoing the whole list floods the saved
# output and writes every clientIp / userId in the file into the notebook.
user_data[:3]

[{'timestamp': '1713439099772',
  '_e': '',
  '_sfAccountId': '',
  '_sfUserId': '',
  'accountName': '',
  'accountProfileObjects': '',
  'accountType': '',
  'action': 'Untagged Page',
  'apptegicFirstSeen': '',
  'clientIp': '85.222.134.0',
  'clientTimeZone': '',
  'company': '',
  'customerId': '',
  'emailAddress': '',
  'extAccountId': '',
  'extUserId': '',
  'profileId': '',
  'profileObjects': '',
  'sfmcContactKey': '',
  'title': '',
  'url': 'https://www.mercedes-benz.com/en/art-and-culture/zeitgeist/esports/',
  'urlref': '',
  'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
  'userEmail': '',
  'userId': '1bd5799dcca2fbb0',
  'userName': ''},
 {'timestamp': '1713439134897',
  '_e': '',
  '_sfAccountId': '',
  '_sfUserId': '',
  'accountName': '',
  'accountProfileObjects': '',
  'accountType': '',
  'action': 'Untagged Page',
  'apptegicFirstSeen': '',
  'clientIp': '85.222.134.0',
  'c

In [3]:
# Unique userId values found across the loaded rows
user_ids = {event["userId"] for event in user_data}
user_ids

{'1bd5799dcca2fbb0', 'd6a5978cf232a3b4'}

In [4]:
# group by userId
from collections import defaultdict


def group_by_user(data):
    """Bucket rows by their ``"userId"`` value.

    Args:
        data: Iterable of mappings that each provide a ``"userId"`` key.

    Returns:
        A ``defaultdict(list)`` mapping every userId to the rows carrying
        it, in the order the rows were encountered.
    """
    rows_by_user = defaultdict(list)
    for record in data:
        user_key = record["userId"]
        rows_by_user[user_key].append(record)
    return rows_by_user


# Bucket the loaded rows by userId
grouped_data = group_by_user(data=user_data)

In [5]:
# Pick the userId whose group holds the most rows
# (max() keeps the first one encountered when counts tie)
target_user = max(grouped_data.items(), key=lambda pair: len(pair[1]))[0]
target_user

'd6a5978cf232a3b4'

In [6]:
# All rows that belong to the selected user
target_user_data = grouped_data[target_user]
# Preview a few rows instead of echoing the full list, which bloats the saved
# output and repeats every clientIp / userId value in it.
target_user_data[:3]

[{'timestamp': '1713439831117',
  '_e': '',
  '_sfAccountId': '',
  '_sfUserId': '',
  'accountName': '',
  'accountProfileObjects': '',
  'accountType': '',
  'action': 'Untagged Page',
  'apptegicFirstSeen': '',
  'clientIp': '85.222.134.0',
  'clientTimeZone': '',
  'company': '',
  'customerId': '',
  'emailAddress': '',
  'extAccountId': '',
  'extUserId': '',
  'profileId': '',
  'profileObjects': '',
  'sfmcContactKey': '',
  'title': '',
  'url': 'https://www.mbusa.com/en/vehicles/inventory',
  'urlref': '',
  'userAgent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/124.0.0.0 Safari/537.36',
  'userEmail': '',
  'userId': 'd6a5978cf232a3b4',
  'userName': ''},
 {'timestamp': '1713439840018',
  '_e': '',
  '_sfAccountId': '',
  '_sfUserId': '',
  'accountName': '',
  'accountProfileObjects': '',
  'accountType': '',
  'action': 'Untagged Page',
  'apptegicFirstSeen': '',
  'clientIp': '85.222.134.0',
  'clientTimeZone': '',
  'c

In [7]:
# Distinct urls, client IPs and user agents seen for the selected user;
# each set is printed right after it is built.
urls = {event["url"] for event in target_user_data}
print(urls)
clientIps = {event["clientIp"] for event in target_user_data}
print(clientIps)
userAgents = {event["userAgent"] for event in target_user_data}
print(userAgents)

{'https://www.mbusa.com/en/vehicles/class/eqb/suv', 'https://www.mbusa.com/en/inventory?model=AMGEQEV4', 'https://www.mbusa.com/en/inventory/Mercedes-Benz-Manhattan/56113?class=GLA:SUV&model=GLA250W', 'https://www.mbusa.com/en/vehicles/inventory', 'https://www.mbusa.com/en/inventory/Mercedes-Benz-of-Edison/51146/G/SUV/G63W4/W1NYC7HJ6RX510902', 'https://www.mbusa.com/en/vehicles/class/eqs/sedan', 'https://www.mbusa.com/en/inventory/search?model=GLB35W4&year=2024&exteriorColor=BLK&interiorColor=BLK&trim=H62&wheel=X_WHL_19i&sortModel=relevance&zip=10012', 'https://www.mbusa.com/en/vehicles/build/glb/suv', 'https://www.mbusa.com/en/special-offers?class=EQS:SUV,EQB:SUV,EQE:SDN,EQE:SUV,EQS:SDN', 'https://www.mbusa.com/en/vehicles/build/g-class/suv', 'https://www.mbusa.com/en/all-vehicles#vbg-electric', 'https://www.mbusa.com/en/vehicles/build/g-class/suv/g63w4', 'https://www.mbusa.com/en/vehicles/build/glc/suv/glc300w4?category=options', 'https://www.mbusa.com/en/vehicles/class/eqe/sedan', '

In [8]:
def lookup_ip_location(ip):
    """Return a location string for ``ip``.

    This is a mock: ``ip`` is ignored and the same fixed value is
    returned for every call.
    """
    mocked_location = "Berlin, Germany"
    return mocked_location


# Resolve every distinct client IP to a location and de-duplicate the results
locations = set(map(lookup_ip_location, clientIps))
locations

{'Berlin, Germany'}

In [9]:
# Summary of the selected user: id plus the distinct urls, user agents and
# locations collected in the cells above.
# NOTE(review): this rebinds `user_data`, which the first cell bound to the
# list of CSV rows. Cells that iterate those rows will fail if re-run after
# this one until the CSV is loaded again; consider a separate name.
user_data = dict(
    userId=target_user,
    urls=urls,
    userAgents=userAgents,
    locations=locations,
)
user_data

{'userId': 'd6a5978cf232a3b4',
 'urls': {'https://www.mbusa.com/en/all-vehicles#vbg-electric',
  'https://www.mbusa.com/en/inventory/Mercedes-Benz-Manhattan/56113?class=GLA:SUV&model=GLA250W',
  'https://www.mbusa.com/en/inventory/Mercedes-Benz-of-Edison/51146/G/SUV/G63W4/W1NYC7HJ6RX510902',
  'https://www.mbusa.com/en/inventory/search?class=GLA:SUV&zip=10012',
  'https://www.mbusa.com/en/inventory/search?model=GLB35W4&year=2024&exteriorColor=BLK&interiorColor=BLK&trim=H62&wheel=X_WHL_19i&sortModel=relevance&zip=10012',
  'https://www.mbusa.com/en/inventory?model=AMGEQEV4',
  'https://www.mbusa.com/en/special-offers?class=EQS:SUV,EQB:SUV,EQE:SDN,EQE:SUV,EQS:SDN',
  'https://www.mbusa.com/en/vehicles/build/g-class/suv',
  'https://www.mbusa.com/en/vehicles/build/g-class/suv/g63w4',
  'https://www.mbusa.com/en/vehicles/build/glb/suv',
  'https://www.mbusa.com/en/vehicles/build/glc/suv/glc300w4?category=options',
  'https://www.mbusa.com/en/vehicles/class/eqb/suv',
  'https://www.mbusa.co

In [10]:
from backend.db.users import user_db, UserData

await user_db.reset()
await user_db.add(
    UserData(
        user_id=user_data["userId"],
        visited_urls=list(user_data["urls"]),
        user_agents=list(user_data["userAgents"]),
        locations=list(user_data["locations"]),
    )
)

RuntimeError: Storage folder storage/qdrant is already accessed by another instance of Qdrant client. If you require concurrent access, use Qdrant server instead.