In [1]:
from pathlib import Path

import pandas as pd

from src.metric_utils import load_tranco_list

# --- Notebook configuration ---

# Year and month handed to load_tranco_list to pick the TRANCO list to rank domains with.
YEAR_TO_CRAWL, MONTH_TO_CRAWL = "2024", "04"

# Threshold on the number of NEL-monitored resources a domain needs in order to be selected.
MIN_NEL_RESOURCES_TO_BE_CRAWLED = 20

# Parquet file the final domain selection is written to.
OUTPUT = "data/domains_to_crawl.parquet"

In [2]:
# Determine the input

# IMPORTANT: The input comes from the HTTP archive analysis
input_dir = Path("data/httparchive_metrics/nel_domain_resource_monitoring_stats")

# Path.glob yields entries in an unspecified, filesystem-dependent order, so the
# files are sorted explicitly; otherwise "the last element" is not reliably the
# newest file. Lexicographic order equals chronological order as long as the
# files keep the YYYY-MM.parquet naming (assumed from the file seen so far).
available_data_files = sorted(input_dir.glob("*.parquet"))
if not available_data_files:
    # Fail with a clear message instead of a bare IndexError on the [-1] below
    raise FileNotFoundError(f"No '*.parquet' input files found in '{input_dir}' - run the HTTP archive analysis first")

# Select data for the last available date
latest_file = available_data_files[-1]
latest_file

WindowsPath('data/httparchive_metrics/nel_domain_resource_monitoring_stats/2023-02.parquet')

In [3]:
# Load the TRANCO popular domain list selected by YEAR_TO_CRAWL / MONTH_TO_CRAWL
tranco_list = load_tranco_list(YEAR_TO_CRAWL, MONTH_TO_CRAWL)

# A None result is treated as "list not available": stop here, because every
# later cell needs the list to rank the domains.
if tranco_list is None:
    raise Exception(
        f"Fetch yourself a tranco list for date '{YEAR_TO_CRAWL}-{MONTH_TO_CRAWL}' in order for this to work"
    )

tranco_list

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,facebook.com
2,3,a-msedge.net
3,4,amazonaws.com
4,5,microsoft.com
...,...,...
4282597,4282598,72517.cn
4282598,4282599,enterpriseforce.co.uk
4282599,4282600,gamepulsepro.store
4282600,4282601,tucsonlifestyle.com


In [4]:
# Load the determined httparchive input and cast the column types in one pass
data = pd.read_parquet(latest_file).astype({
    'url_domain_hosted_resources_with_nel': "UInt32",
    'url_domain': "object",
})

# Keep only the rows whose url_domain appears in the TRANCO list.
# Per the original author's note, the url_domains are already unique here,
# so no group by or further filtering is applied.
is_popular_domain = data['url_domain'].isin(tranco_list['popular_domain_name'])
popular_domain_data = data[is_popular_domain]
popular_domain_data

Unnamed: 0,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
14,2023-02,0.plus,2,2,100.000000
16,2023-02,00.ge,34,33,97.059998
41,2023-02,00100.biz,92,92,100.000000
50,2023-02,001k.exchange,61,61,100.000000
58,2023-02,001xnxx.com,4,4,100.000000
...,...,...,...,...,...
2258969,2023-02,zzzttt01.com,1,1,100.000000
2258971,2023-02,zzztube.com,13,13,100.000000
2258972,2023-02,zzztube.tv,11,11,100.000000
2258977,2023-02,zzzzap.nl,37,37,100.000000


In [5]:
# Order the url_domains by popularity

# Attach each domain's TRANCO rank ('order'); the right join keeps every row of popular_domain_data
popular_domain_data_with_order = tranco_list.merge(
    popular_domain_data,
    how='right',
    left_on="popular_domain_name",
    right_on='url_domain',
)

# Drop the TRANCO-side join key (url_domain stays) and list the most popular domains first
ordered_popular_domain_data = (
    popular_domain_data_with_order
    .drop(columns='popular_domain_name')
    .sort_values(by='order')
)
ordered_popular_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
65578,1,2023-02,google.com,72623,1,0.000000
183693,8,2023-02,youtube.com,19324,1,0.010000
31319,15,2023-02,cloudflare.com,11,8,72.730003
173910,32,2023-02,wikipedia.org,9,9,100.000000
20257,37,2023-02,bing.com,40,34,85.000000
...,...,...,...,...,...,...
85324,4282447,2023-02,keyrenteraustin.com,22,22,100.000000
126907,4282454,2023-02,portugaldigital.com.br,50,50,100.000000
30211,4282535,2023-02,chatavenue.com,8,8,100.000000
149955,4282571,2023-02,spiritcatholicradio.com,97,97,100.000000


In [6]:
# Pick which domains to crawl

# Highest TRANCO rank (inclusive) a domain may have to be considered for crawling
MAX_TRANCO_RANK_TO_BE_CRAWLED = 1_000_000

eligible_domain_data = ordered_popular_domain_data[
    # Require a minimum amount of NEL monitored resources per url_domain.
    # The bound is inclusive: a domain with exactly MIN_NEL_RESOURCES_TO_BE_CRAWLED
    # monitored resources meets the minimum and is kept.
    (ordered_popular_domain_data['url_domain_hosted_resources_with_nel'] >= MIN_NEL_RESOURCES_TO_BE_CRAWLED)

    # Require that the url_domain's popularity rating is in the first million TRANCO popular domains rank.
    # Ranks start at 1, so the first million are ranks 1..1_000_000 inclusive.
    & (ordered_popular_domain_data['order'] <= MAX_TRANCO_RANK_TO_BE_CRAWLED)
]
eligible_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
20257,37,2023-02,bing.com,40,34,85.000000
185268,46,2023-02,zoom.us,2822,2822,100.000000
133432,90,2023-02,reddit.com,7073,7070,99.959999
182643,100,2023-02,yandex.ru,455605,453760,99.599998
42875,204,2023-02,discord.com,16993,16986,99.959999
...,...,...,...,...,...,...
45233,999937,2023-02,dreamspakistan.com,210,210,100.000000
45316,999943,2023-02,drhtv.com.pl,49,49,100.000000
45627,999956,2023-02,droppers.com.ar,268,268,100.000000
45796,999961,2023-02,dsccams.com,21,21,100.000000


In [7]:
# Write the selected domains to the parquet file configured in OUTPUT
eligible_domain_data.to_parquet(path=OUTPUT)