In [26]:
from pathlib import Path

import pandas as pd

from src.metric_utils import load_tranco_list

# --- Notebook configuration -------------------------------------------------

# Year and month (zero-padded strings) passed to load_tranco_list below.
YEAR_TO_CRAWL = "2024"
MONTH_TO_CRAWL = "04"

# Threshold compared against 'url_domain_hosted_resources_with_nel' when
# picking the domains to crawl.
MIN_NEL_RESOURCES_TO_BE_CRAWLED = 20

# Parquet file the selected domains are written to at the end of the notebook.
OUTPUT = "data/crawl_and_store_eligible_domains.parquet"

In [2]:
# Determine the input

# IMPORTANT: The input comes from the HTTP archive analysis
# NOTE: the file picked here does not depend on YEAR_TO_CRAWL / MONTH_TO_CRAWL;
# it is simply the last '*.parquet' file (by sorted name) in the directory below.
stats_dir = Path("data/metrics/nel_domain_resource_monitoring_stats")

# Directory listings come back in filesystem-dependent order, so sort explicitly
# before taking the last entry. This assumes the files are named 'YYYY-MM.parquet'
# (as in the recorded output), which makes name order chronological.
available_data_files = sorted(stats_dir.glob("*.parquet"))

# Fail with an actionable message instead of an opaque IndexError on [-1].
if not available_data_files:
    raise FileNotFoundError(
        f"No '*.parquet' files found in '{stats_dir}' - run the HTTP archive analysis first"
    )

latest_file = available_data_files[-1]
latest_file

WindowsPath('data/metrics/nel_domain_resource_monitoring_stats/2022-02.parquet')

In [3]:
# Load the TRANCO popular-domain list for the configured year and month

tranco_list = load_tranco_list(YEAR_TO_CRAWL, MONTH_TO_CRAWL)

# A None result is treated as "no list available for that date": abort with a
# hint telling the user which list to fetch.
if tranco_list is None:
    missing_list_hint = f"Fetch yourself a tranco list for date '{YEAR_TO_CRAWL}-{MONTH_TO_CRAWL}' in order for this to work"
    raise Exception(missing_list_hint)

tranco_list

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,facebook.com
2,3,a-msedge.net
3,4,amazonaws.com
4,5,microsoft.com
...,...,...
4282597,4282598,72517.cn
4282598,4282599,enterpriseforce.co.uk
4282599,4282600,gamepulsepro.store
4282600,4282601,tucsonlifestyle.com


In [6]:
# Read the HTTP archive statistics file chosen above and normalise column types

data = pd.read_parquet(latest_file).astype({
    'url_domain_hosted_resources_with_nel': "UInt32",  # nullable unsigned integer
    'url_domain': "object",
})

# Keep only the rows whose url_domain appears in the TRANCO list.
# Per the original author, url_domain values are already unique in this input,
# so no group-by or de-duplication is performed here.
is_popular_domain = data['url_domain'].isin(tranco_list['popular_domain_name'])
popular_domain_data = data[is_popular_domain]
popular_domain_data

Unnamed: 0,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
15,2022-02,001k.exchange,49,49,100.000000
18,2022-02,001xnxx.com,5,5,100.000000
21,2022-02,002mag.com,33,32,96.970001
23,2022-02,003ms.ru,2,2,100.000000
26,2022-02,0067.ru,3,3,100.000000
...,...,...,...,...,...
972149,2022-02,zztt21.com,8,8,100.000000
972152,2022-02,zztt24.com,16,16,100.000000
972153,2022-02,zztt26.com,8,8,100.000000
972161,2022-02,zzzchan.xyz,8,8,100.000000


In [15]:
# Attach the TRANCO rank ('order') to every popular domain, then sort by that rank

# Right join: every row of popular_domain_data is kept and gains the TRANCO columns.
popular_domain_data_with_order = pd.merge(
    tranco_list,
    popular_domain_data,
    how='right',
    left_on="popular_domain_name",
    right_on='url_domain',
)

# 'popular_domain_name' is dropped after sorting; 'url_domain' is the column kept.
ordered_popular_domain_data = (
    popular_domain_data_with_order
    .sort_values('order')
    .drop(columns=['popular_domain_name'])
)
ordered_popular_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
18435,15,2022-02,cloudflare.com,7,6,85.709999
99100,32,2022-02,wikipedia.org,5,5,100.000000
11670,37,2022-02,bing.com,52,5,9.620000
11862,69,2022-02,bit.ly,2817,1,0.040000
18434,77,2022-02,cloudflare-dns.com,13,10,76.919998
...,...,...,...,...,...,...
24223,4282396,2022-02,disneydooney.com,22,22,100.000000
93201,4282404,2022-02,turne.ua,30,29,96.669998
48102,4282447,2022-02,keyrenteraustin.com,24,24,100.000000
70833,4282454,2022-02,portugaldigital.com.br,47,47,100.000000


In [27]:
# Select the domains to crawl: enough NEL-monitored resources AND a high enough rank

# NOTE(review): both comparisons are strict, so a domain with exactly
# MIN_NEL_RESOURCES_TO_BE_CRAWLED resources, or with rank exactly 1_000_000,
# is excluded - confirm this is intended.
has_enough_nel_resources = ordered_popular_domain_data['url_domain_hosted_resources_with_nel'].gt(
    MIN_NEL_RESOURCES_TO_BE_CRAWLED
)
is_within_rank_cutoff = ordered_popular_domain_data['order'].lt(1_000_000)

eligible_domain_data = ordered_popular_domain_data[has_enough_nel_resources & is_within_rank_cutoff]
eligible_domain_data

Unnamed: 0,order,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
101744,100,2022-02,yandex.ru,113457,113407,99.959999
24143,204,2022-02,discord.com,11118,11116,99.980003
99090,229,2022-02,wikimedia.org,417,417,100.000000
24144,346,2022-02,discord.gg,36,36,100.000000
100482,538,2022-02,xhamster.com,202,190,94.059998
...,...,...,...,...,...,...
25136,999894,2022-02,dosenppkn.com,34,34,100.000000
25553,999937,2022-02,dreamspakistan.com,122,122,100.000000
25587,999943,2022-02,drhtv.com.pl,22,22,100.000000
25768,999958,2022-02,drsportvip.ir,99,99,100.000000


In [28]:
# Persist the selected domains to the parquet file configured in OUTPUT
eligible_domain_data.to_parquet(path=OUTPUT)