In [1]:
from pathlib import Path

import pandas as pd

from src.metric_utils import load_tranco_list

# Destination parquet file; the final cell writes the selected domains here
OUTPUT = "data/domains_to_crawl.parquet"

# Date of the TRANCO list to load (passed to load_tranco_list as strings, month zero-padded)
YEAR_TO_CRAWL = "2024"
MONTH_TO_CRAWL = "04"

# Threshold on a domain's count of NEL-monitored resources, applied when picking the domains to crawl
MIN_NEL_RESOURCES_TO_BE_CRAWLED = 20

In [2]:
# Determine the input

# IMPORTANT: The input comes from the HTTP archive analysis
httparchive_input_dir = Path("data/httparchive_metrics/nel_domain_resource_monitoring_stats")

# Path.glob() yields matches in arbitrary (filesystem-dependent) order, so sort explicitly.
# The files appear to be named like '2024-04.parquet', in which case the lexicographic
# order of the paths is also the chronological one.
available_data_files = sorted(httparchive_input_dir.glob("*.parquet"))
if not available_data_files:
    # Fail with an actionable message instead of a bare IndexError on the [-1] below
    raise FileNotFoundError(
        f"No '*.parquet' files found in '{httparchive_input_dir}' - run the HTTP archive analysis first"
    )

# Select data for the last available date
latest_file = available_data_files[-1]
latest_file

WindowsPath('data/httparchive_metrics/nel_domain_resource_monitoring_stats/2024-04.parquet')

In [3]:
# Fetch the TRANCO popular-domain ranking for the configured year and month

tranco_list = load_tranco_list(YEAR_TO_CRAWL, MONTH_TO_CRAWL)

# The loader hands back None when it has no list to return; nothing below can work without it
if tranco_list is None:
    raise Exception(
        f"Fetch yourself a tranco list for date '{YEAR_TO_CRAWL}-{MONTH_TO_CRAWL}' in order for this to work"
    )

tranco_list

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,facebook.com
2,3,a-msedge.net
3,4,amazonaws.com
4,5,microsoft.com
...,...,...
999995,999996,dvd-collection.com
999996,999997,dvdweb.in
999997,999998,dvl666.net
999998,999999,dwa136.com


In [4]:
# Read the selected HTTP archive file and normalise the column types in one pass
data = pd.read_parquet(latest_file).astype(
    {
        'url_domain_hosted_resources_with_nel': "UInt32",
        'url_domain': "object",
    }
)

# Every url_domain occurs only once in this input, so a plain membership test against
# the TRANCO names is enough - no grouping or de-duplication required
is_popular_domain = data['url_domain'].isin(tranco_list['popular_domain_name'])
popular_domain_data = data.loc[is_popular_domain]
popular_domain_data

Unnamed: 0,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
37,2024-04,000phlboss.com,23,23,100.0
76,2024-04,0067.ru,4,4,100.0
93,2024-04,007soccerpicks.com,2,2,100.0
95,2024-04,007store.com,60,60,100.0
96,2024-04,007toto.com,17,17,100.0
...,...,...,...,...,...
2608504,2024-04,zzup.com,11,11,100.0
2608513,2024-04,zzxxtra.com,25,25,100.0
2608524,2024-04,zzzcode.ai,5,5,100.0
2608537,2024-04,zzztube.com,12,12,100.0


In [5]:
# Attach the TRANCO rank ('order') to every popular url_domain.
# A right join keeps each row of popular_domain_data and pulls in its matching rank.
popular_domain_data_with_order = pd.merge(
    tranco_list,
    popular_domain_data,
    how='right',
    left_on="popular_domain_name",
    right_on='url_domain',
)

# Most popular first; the join key from the TRANCO side duplicates url_domain, so drop it
ordered_popular_domain_data = (
    popular_domain_data_with_order
    .sort_values('order')
    .drop(columns=['popular_domain_name'])
)
ordered_popular_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
31048,1,2024-04,google.com,96276,2,0.000000
83408,32,2024-04,wikipedia.org,1,1,100.000000
10204,37,2024-04,bing.com,27,23,85.190002
87160,46,2024-04,zoom.us,3177,3177,100.000000
85721,47,2024-04,yahoo.com,5,1,20.000000
...,...,...,...,...,...,...
21972,999976,2024-04,duci.szex.hu,10,10,100.000000
21994,999978,2024-04,dudoan.me,6,6,100.000000
22015,999981,2024-04,duknulla.com,13,13,100.000000
22037,999987,2024-04,dungcuyeu.com,134,134,100.000000


In [6]:
# Pick which domains to crawl
#
# Both bounds are inclusive: a domain with exactly MIN_NEL_RESOURCES_TO_BE_CRAWLED monitored
# resources meets the minimum, and rank 1_000_000 is still part of the first million.

eligible_domain_data = ordered_popular_domain_data[
    # Require a minimum amount of NEL monitored resources per url_domain
    (ordered_popular_domain_data['url_domain_hosted_resources_with_nel'] >= MIN_NEL_RESOURCES_TO_BE_CRAWLED)

    # Require that the url_domain's popularity rating is in the first million TRANCO popular domains rank
    & (ordered_popular_domain_data['order'] <= 1_000_000)
]
eligible_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
10204,37,2024-04,bing.com,27,23,85.190002
87160,46,2024-04,zoom.us,3177,3177,100.000000
63076,90,2024-04,reddit.com,6293,6293,100.000000
85063,100,2024-04,yandex.ru,689904,689116,99.889999
20414,204,2024-04,discord.com,15146,15134,99.919998
...,...,...,...,...,...,...
21854,999957,2024-04,drrashelofficial.com.pk,96,96,100.000000
21919,999967,2024-04,dt5515.com,354,354,100.000000
21931,999972,2024-04,du4sas3t.xyz,33,33,100.000000
21962,999974,2024-04,ducadimorrone.com,107,107,100.000000


In [7]:
# Persist the selected domains to the configured OUTPUT parquet file
eligible_domain_data.to_parquet(OUTPUT)