In [7]:
import json
import os
from os import path
from tqdm import tqdm
from tld import get_fld

In [14]:
DATA_PATH = "/mnt/data/MT/pilot-crawl"

def isCrawlDataFile(filename: str, data_path=None) -> bool:
    """Tell whether `filename` names a crawl result file in the data directory.

    A crawl result file is a regular file whose name ends in ".json" and is
    not "metadata.json".

    Args:
        filename: Bare file name (not a full path), e.g. an entry returned by
            os.listdir.
        data_path: Directory in which `filename` is looked up. When omitted
            (None), the module-level DATA_PATH is used.

    Returns:
        True if all of the conditions above hold, False otherwise.
    """
    # Resolve the default at call time rather than at definition time, so a
    # DATA_PATH that is re-assigned in a later cell is still honoured.
    if data_path is None:
        data_path = DATA_PATH
    # Cheap name checks first; only touch the file system for candidates.
    if not filename.endswith(".json") or filename == "metadata.json":
        return False
    return path.isfile(path.join(data_path, filename))

# os.listdir returns directory entries in arbitrary order; sort them so the
# file list (and everything derived from it) is identical on every run.
json_files = sorted(f for f in os.listdir(DATA_PATH) if isCrawlDataFile(f))
print("Found %d JSON files" % len(json_files))

Found 748 JSON files


In [26]:
# Aggregate the crawl results: count the crawls whose "timeout" flag is set and
# group every saved Privacy Sandbox call by its "description" field.
nr_of_timeouts = 0
saved_calls_sorted_by_type = {}  # call description -> list of saved-call dicts

for filename in tqdm(json_files):
    # JSON text is UTF-8 by specification; pass the encoding explicitly so
    # parsing does not depend on the locale's default encoding.
    with open(path.join(DATA_PATH, filename), encoding="utf-8") as file:
        data = json.load(file)
    # The document is fully parsed here, so the file handle is released
    # before the records are processed.

    url = data["initialUrl"]
    if data["timeout"]:
        nr_of_timeouts += 1

    # Sort saved calls (iterating an empty list is a no-op, so no length check is needed)
    for saved_call in data["data"]["privacySandbox"]["savedCalls"]:
        saved_call["initialUrl"] = url # include initialUrl in the saved call data

        call_type = saved_call["description"]
        # setdefault keeps this a plain dict, so lookups of unknown call
        # types in later cells still raise KeyError instead of silently
        # creating empty entries.
        saved_calls_sorted_by_type.setdefault(call_type, []).append(saved_call)

print("Number of timeouts:", nr_of_timeouts)


100%|██████████| 748/748 [00:00<00:00, 839.72it/s]

Number of timeouts: 31





In [32]:
# For each call type, print how many calls were saved and the distinct
# initial URLs on which they were observed.
for call_type, saved_calls in saved_calls_sorted_by_type.items():
    distinct_sites = {call["initialUrl"] for call in saved_calls}
    print("%d calls for %s, on %d distinct sites:" % (len(saved_calls), call_type, len(distinct_sites)))
    # Iteration order of a set of strings changes between interpreter
    # sessions (hash randomization); sort so the listing is reproducible.
    for site in sorted(distinct_sites):
        print("\t%s" % site)

29 calls for Navigator.joinAdInterestGroup, on 9 distinct sites:
	http://aliexpress.com/
	http://zillow.com/
	http://costco.com/
	http://netgear.com/
	http://macys.com/
	http://alibaba.com/
	http://rakuten.co.jp/
	http://reg.ru/
	http://ui.com/


In [34]:
# Collect the "owner" field of argument "0" of every saved
# Navigator.joinAdInterestGroup call and list the distinct values.
# .get(..., []) keeps the cell working (reporting 0 owners) when the crawl
# recorded no such call, instead of failing with a KeyError.
join_ig_calls = saved_calls_sorted_by_type.get("Navigator.joinAdInterestGroup", [])
distinct_ig_owners = {call["arguments"]["0"]["owner"] for call in join_ig_calls}
print("%d distinct interest group owners:" % (len(distinct_ig_owners)))
# Sorted because set iteration order is not stable across interpreter sessions.
for owner in sorted(distinct_ig_owners):
    print("\t%s" % owner)

5 distinct interest group owners:
	https://fledge.teads.tv
	https://fledge.us.criteo.com
	https://fledge.as.criteo.com
	https://fledge.eu.criteo.com
	https://td.doubleclick.net
