In [1]:
import json
import os
from os import path
from tqdm import tqdm
from tld import get_fld

In [3]:
DATA_PATH = "/mnt/data/MT/pilot-crawl2"

def isCrawlDataFile(filename: str):
    return filename.endswith(".json") and filename != "metadata.json" and path.isfile(path.join(DATA_PATH, filename))

json_files = [f for f in os.listdir(DATA_PATH) if isCrawlDataFile(f)]
print("Found %d JSON files" % len(json_files))

Found 731 JSON files


In [10]:
nr_of_timeouts = 0
saved_calls_sorted_by_type = {}

for filename in tqdm(json_files):
    with open(path.join(DATA_PATH, filename)) as file:
        data = json.load(file)
        url = data["initialUrl"]
        if data["timeout"]:
            nr_of_timeouts += 1

        # Sort saved calls
        ps_data = data["data"]["privacySandbox"]
        if not ps_data:
            continue
        saved_calls = ps_data["savedCalls"]
        if len(saved_calls) > 0:
            for saved_call in saved_calls:
                saved_call["initialUrl"] = url # include initialUrl in the saved call data

                call_type = saved_call["description"]
                if call_type not in saved_calls_sorted_by_type:
                    saved_calls_sorted_by_type[call_type] = []

                saved_calls_sorted_by_type[call_type].append(saved_call)

print("Number of timeouts:", nr_of_timeouts)


100%|██████████| 731/731 [00:03<00:00, 185.39it/s]

Number of timeouts: 53





First pilot crawl: `Number of timeouts: 31`

In [13]:
for call_type, saved_calls in saved_calls_sorted_by_type.items():
    distinct_sites = {call["initialUrl"] for call in saved_calls}
    print("%d calls for %s, on %d distinct sites" % (len(saved_calls), call_type, len(distinct_sites)))
    if call_type != "Document.browsingTopics":
        for site in distinct_sites:
            print("\t%s" % site)

629 calls for Document.browsingTopics, on 98 distinct sites
141 calls for Navigator.joinAdInterestGroup, on 24 distinct sites
	http://applovin.com/
	http://rakuten.co.jp/
	http://macys.com/
	http://onet.pl/
	http://themeforest.net/
	http://costco.com/
	http://ui.com/
	http://grammarly.com/
	http://mcafee.com/
	http://bestbuy.com/
	http://scribd.com/
	http://ubnt.com/
	http://teamviewer.com/
	http://hp.com/
	http://hostgator.com.br/
	http://splashtop.com/
	http://dell.com/
	http://repubblica.it/
	http://wiley.com/
	http://aliexpress.com/
	http://airbnb.com/
	http://nature.com/
	http://netgear.com/
	http://criteo.net/
34 calls for Navigator.runAdAuction, on 4 distinct sites
	http://huffpost.com/
	http://chess.com/
	http://buzzfeed.com/
	http://speedtest.net/


First pilot crawl:
```
29 calls for Navigator.joinAdInterestGroup, on 9 distinct sites:
	http://aliexpress.com/
	http://zillow.com/
	http://costco.com/
	http://netgear.com/
	http://macys.com/
	http://alibaba.com/
	http://rakuten.co.jp/
	http://reg.ru/
	http://ui.com/
```

In [14]:
distinct_ig_owners = {call["arguments"]["0"]["owner"] for call in saved_calls_sorted_by_type["Navigator.joinAdInterestGroup"]}
print("%d distinct interest group owners:" % (len(distinct_ig_owners)))
for owner in distinct_ig_owners:
    print("\t%s" % owner)

4 distinct interest group owners:
	https://fledge.us.criteo.com
	https://fledge.as.criteo.com
	https://fledge.eu.criteo.com
	https://td.doubleclick.net


First pilot crawl:
```
5 distinct interest group owners:
	https://fledge.teads.tv
	https://fledge.us.criteo.com
	https://fledge.as.criteo.com
	https://fledge.eu.criteo.com
	https://td.doubleclick.net
```

In [15]:
distinct_action_sellers = {call["arguments"]["0"]["seller"] for call in saved_calls_sorted_by_type["Navigator.runAdAuction"]}
print("%d distinct auction sellers:" % (len(distinct_action_sellers)))
for seller in distinct_action_sellers:
    print("\t%s" % seller)

1 distinct auction sellers:
	https://securepubads.g.doubleclick.net
