In [1]:
from pathlib import Path

import pandas as pd

from src.metric_utils import load_tranco_list

# --- Crawl configuration -------------------------------------------------

# Snapshot (year / month, kept as zero-padded strings) handed to
# load_tranco_list() to pick the Tranco popularity list.
YEAR_TO_CRAWL = "2024"
MONTH_TO_CRAWL = "04"

# Threshold applied to 'url_domain_hosted_resources_with_nel' when picking
# which domains are worth crawling.
MIN_NEL_RESOURCES_TO_BE_CRAWLED = 20

# Parquet file the final domain selection is written to.
OUTPUT = "data/domains_to_crawl.parquet"

In [2]:
# Determine the input

# IMPORTANT: The input comes from the HTTP archive analysis
# Path.glob() yields matches in arbitrary (filesystem-dependent) order, so the
# paths are sorted explicitly; without that, [-1] is not guaranteed to be the
# newest snapshot.
# NOTE(review): assumes the files are named with a zero-padded "YYYY-MM" stem
# (e.g. 2024-04.parquet) so that lexicographic order equals chronological order.
available_data_files = sorted(Path("data/httparchive_metrics/nel_domain_resource_monitoring_stats").glob("*.parquet"))

# Fail with an actionable message instead of an opaque IndexError on [-1]
if not available_data_files:
    raise FileNotFoundError(
        "No '*.parquet' files found in 'data/httparchive_metrics/nel_domain_resource_monitoring_stats'"
    )

# Select data for the last available date
latest_file = available_data_files[-1]
latest_file

WindowsPath('data/httparchive_metrics/nel_domain_resource_monitoring_stats/2024-04.parquet')

In [3]:
# Read the TRANCO popularity ranking that matches the configured crawl month.
# A None result from the loader is treated as "list not available" and stops
# the notebook right here with a hint on how to proceed.
tranco_list = load_tranco_list(YEAR_TO_CRAWL, MONTH_TO_CRAWL)

if tranco_list is None:
    raise Exception(
        f"Fetch yourself a tranco list for date '{YEAR_TO_CRAWL}-{MONTH_TO_CRAWL}' in order for this to work"
    )

tranco_list

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,facebook.com
2,3,a-msedge.net
3,4,amazonaws.com
4,5,microsoft.com
...,...,...
4282597,4282598,72517.cn
4282598,4282599,enterpriseforce.co.uk
4282599,4282600,gamepulsepro.store
4282600,4282601,tucsonlifestyle.com


In [4]:
# Read the selected HTTP Archive metrics file and normalise the column types
# in a single astype() call.
data = pd.read_parquet(latest_file).astype(
    {
        'url_domain_hosted_resources_with_nel': "UInt32",
        'url_domain': "object",
    }
)

# Keep only the rows whose url_domain appears in the TRANCO list.
# The url_domains contained here are already unique - no group by or extra filter needed.
is_popular_domain = data['url_domain'].isin(tranco_list['popular_domain_name'])
popular_domain_data = data.loc[is_popular_domain]
popular_domain_data

Unnamed: 0,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
18,2024-04,0.plus,2,2,100.000000
19,2024-04,00.ge,29,29,100.000000
37,2024-04,000phlboss.com,23,23,100.000000
38,2024-04,000xnxx.skin,9,9,100.000000
43,2024-04,00100.biz,92,92,100.000000
...,...,...,...,...,...
2608528,2024-04,zzzhc.com,1,1,100.000000
2608537,2024-04,zzztube.com,12,12,100.000000
2608538,2024-04,zzztube.tv,10,10,100.000000
2608540,2024-04,zzzzap.nl,193,192,99.480003


In [5]:
# Attach the TRANCO rank ('order') to every monitored domain. The right join
# keeps exactly the rows of popular_domain_data.
popular_domain_data_with_order = pd.merge(
    tranco_list,
    popular_domain_data,
    how='right',
    left_on="popular_domain_name",
    right_on='url_domain',
)

# 'popular_domain_name' duplicates 'url_domain' after the join, so it is dropped;
# rows are then arranged from most to least popular (rank ascending).
ordered_popular_domain_data = (
    popular_domain_data_with_order
    .drop(columns='popular_domain_name')
    .sort_values('order')
)
ordered_popular_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
86998,1,2024-04,google.com,96276,2,0.000000
236755,32,2024-04,wikipedia.org,1,1,100.000000
28145,37,2024-04,bing.com,27,23,85.190002
246497,46,2024-04,zoom.us,3177,3177,100.000000
242710,47,2024-04,yahoo.com,5,1,20.000000
...,...,...,...,...,...,...
42390,4282511,2024-04,chris.nl,11,11,100.000000
200253,4282516,2024-04,sports-insight.co.uk,165,165,100.000000
41205,4282535,2024-04,chatavenue.com,9,9,100.000000
199946,4282571,2024-04,spiritcatholicradio.com,101,101,100.000000


In [6]:
# Pick which domains to crawl

# Highest TRANCO rank that is still eligible. Ranks start at 1 (see the 'order'
# column of the list loaded above), so "the first million" is 1..1_000_000 inclusive.
MAX_TRANCO_RANK = 1_000_000

# Require a minimum amount of NEL monitored resources per url_domain.
# Inclusive (>=): a domain with exactly MIN_NEL_RESOURCES_TO_BE_CRAWLED resources
# meets the minimum (the previous strict '>' silently excluded it).
has_enough_nel_resources = (
    ordered_popular_domain_data['url_domain_hosted_resources_with_nel'] >= MIN_NEL_RESOURCES_TO_BE_CRAWLED
)

# Require that the url_domain's popularity rating is in the first million TRANCO popular domains rank.
# Inclusive (<=): the previous strict '<' kept only ranks 1..999_999.
is_within_top_ranks = ordered_popular_domain_data['order'] <= MAX_TRANCO_RANK

eligible_domain_data = ordered_popular_domain_data[has_enough_nel_resources & is_within_top_ranks]
eligible_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
28145,37,2024-04,bing.com,27,23,85.190002
246497,46,2024-04,zoom.us,3177,3177,100.000000
177915,90,2024-04,reddit.com,6293,6293,100.000000
241396,100,2024-04,yandex.ru,689904,689116,99.889999
57358,204,2024-04,discord.com,15146,15134,99.919998
...,...,...,...,...,...,...
61151,999957,2024-04,drrashelofficial.com.pk,96,96,100.000000
61347,999967,2024-04,dt5515.com,354,354,100.000000
61393,999972,2024-04,du4sas3t.xyz,33,33,100.000000
61475,999974,2024-04,ducadimorrone.com,107,107,100.000000


In [7]:
# Persist the selected domains to the configured output parquet file.
eligible_domain_data.to_parquet(path=OUTPUT)