In [127]:
import numpy as np
import pandas as pd

from pathlib import Path

from src.metric_utils import load_tranco_list_from_custom_path


### CONFIG

In [182]:
latest_month_metrics_fmt = "2024-04"
latest_month_text_fmt = "Apr 2024"

### Load necessary CRAWL data

In [129]:
crawled_deployment = pd.read_parquet(f"../data/crawled_metrics/nel_deployment/{latest_month_metrics_fmt}.parquet")
crawled_deployment

Unnamed: 0,date,total_crawled_resources,total_crawled_domains,total_crawled_resources_with_nel,total_crawled_domains_with_nel,total_crawled_resources_with_correct_nel,total_crawled_domains_with_correct_nel
0,2024-04,3962754,23183,3943304,23084,3943299,23083


In [130]:
crawled_collector_provider_usage = pd.read_parquet(f"../data/crawled_metrics/nel_collector_provider_usage/{latest_month_metrics_fmt}.parquet")

In [131]:
crawled_config_is = pd.read_parquet(f"../data/crawled_metrics/nel_config/include_subdomains_{latest_month_metrics_fmt}.parquet")

In [132]:
crawled_config_ff = pd.read_parquet(f"../data/crawled_metrics/nel_config/failure_fraction_{latest_month_metrics_fmt}.parquet")

In [133]:
crawled_config_sf = pd.read_parquet(f"../data/crawled_metrics/nel_config/success_fraction_{latest_month_metrics_fmt}.parquet")

In [134]:
crawled_config_ma = pd.read_parquet(f"../data/crawled_metrics/nel_config/max_age_{latest_month_metrics_fmt}.parquet")

In [135]:
crawled_resource_monitoring_stats = pd.read_parquet(f"../data/crawled_metrics/nel_domain_resource_monitoring_stats/{latest_month_metrics_fmt}.parquet")

In [136]:
crawled_monitored_resource_types = pd.read_parquet(f"../data/crawled_metrics/nel_monitored_resource_types/{latest_month_metrics_fmt}.parquet")

In [137]:
crawled_resources_config_variability = pd.read_parquet(f"../data/crawled_metrics/nel_resource_config_variability/{latest_month_metrics_fmt}.parquet")

#### Load TRANCO 

In [139]:
year = latest_month_metrics_fmt.split("-")[0]
month = latest_month_metrics_fmt.split("-")[1]
tranco_list = load_tranco_list_from_custom_path(Path(f"../resources/"), year, month)
tranco_list

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,facebook.com
2,3,a-msedge.net
3,4,amazonaws.com
4,5,microsoft.com
...,...,...
999995,999996,dvd-collection.com
999996,999997,dvdweb.in
999997,999998,dvl666.net
999998,999999,dwa136.com


## Prepare list of actually CRAWLED domains to be compared to HTTP ARCHIVE results 

In [165]:
crawled_domains = pd.read_parquet("../data/domains_to_crawl.parquet")

#### Domains that were intended to be crawled

In [168]:
intended_domains_to_compare = crawled_domains[['url_domain']]
intended_domains_to_compare

Unnamed: 0,url_domain
10204,bing.com
87160,zoom.us
63076,reddit.com
85063,yandex.ru
20414,discord.com
...,...
21854,drrashelofficial.com.pk
21919,dt5515.com
21931,du4sas3t.xyz
21962,ducadimorrone.com


#### Filter to domains that were actually attempted to be crawled 

In [169]:
crawl_data_raw_path = Path("../data/crawled_raw/blobs")

available_crawled_data_files = list(crawl_data_raw_path.glob('*.parquet'))
available_crawled_domains = list(map(lambda file: file.stem, available_crawled_data_files))
available_crawled_domains = pd.DataFrame({'url_domain': available_crawled_domains})

domains_crawl_attempted = available_crawled_domains[available_crawled_domains['url_domain'].isin(intended_domains_to_compare['url_domain'])]
domains_crawl_attempted

Unnamed: 0,url_domain
0,0115765.com
1,013info.rs
2,037hd.tv
3,0x00sec.org
4,1-800-phonesex.com
...,...
33065,zyciesokolowa.pl
33066,zygorguides.com
33067,zynca.se
33068,zzapomni.com


#### Filter to domains that were successfully crawled

In [170]:
latest_crawl_raw_data_path = list(Path("../data/crawled_raw/").glob("merged_*.parquet"))[-1]
latest_crawl_raw_data = pd.read_parquet(latest_crawl_raw_data_path)

domains_crawled = pd.DataFrame({'url_domain': latest_crawl_raw_data['url_domain'].unique()})

domains_crawled.size

23083

## RESULTS:

### Deployment stats

In [162]:
total_domains_crawl_size = domains_crawl_attempted.size
total_domains_crawl_size

33070

In [160]:
total_domains_crawl_failed = domains_crawl_attempted.size - domains_crawled.size
total_domains_crawl_failed

9987

In [171]:
crawled_deployment['total_crawled_domains'][0]

23183

In [172]:
crawled_deployment['total_crawled_domains_with_correct_nel'][0]

23083

In [164]:
crawled_deployment['total_crawled_domains_with_correct_nel'] / crawled_deployment['total_crawled_domains'] * 100

0    99.568649
dtype: float64

### Collector Provider Stats

In [145]:
crawled_collector_provider_usage

Unnamed: 0,date,providers,as_primary,share_as_primary,as_secondary,share_as_secondary,among_fallback
0,2024-04,cloudflare.com,22910.0,99.250531,0.0,0.0,0.0
1,2024-04,heroku.com,129.0,0.558853,0.0,0.0,0.0
2,2024-04,yandex.net,11.0,0.047654,8.0,80.0,0.0
3,2024-04,report-uri.com,10.0,0.043322,0.0,0.0,0.0
4,2024-04,uriports.com,5.0,0.021661,0.0,0.0,0.0
5,2024-04,hhdev.ru,5.0,0.021661,1.0,10.0,0.0
6,2024-04,3gl.net,3.0,0.012997,0.0,0.0,0.0
7,2024-04,gkd-re.de,3.0,0.012997,0.0,0.0,0.0
8,2024-04,wikimedia.org,2.0,0.008664,0.0,0.0,0.0
9,2024-04,csrtech.support,1.0,0.004332,0.0,0.0,0.0


### Configuration stats

In [173]:
crawled_config_is

Unnamed: 0,date,nel_include_subdomains,domain_count,domain_percent
0,2024-04,False,23061,99.904692
1,2024-04,True,22,0.095308


In [174]:
crawled_config_ff

Unnamed: 0,date,nel_failure_fraction,domain_count,domain_percent
0,2024-04,0.001,1,0.004332
1,2024-04,0.01,3,0.012997
2,2024-04,0.05,132,0.571849
3,2024-04,0.1,9,0.03899
4,2024-04,0.2,1,0.004332
5,2024-04,0.5,1,0.004332
6,2024-04,1.0,22936,99.363168


In [176]:
crawled_config_sf

Unnamed: 0,date,nel_success_fraction,domain_count,domain_percent
0,2024-04,0.0,21798,94.433133
1,2024-04,0.001,10,0.043322
2,2024-04,0.005,129,0.558853
3,2024-04,0.01,1145,4.96036
4,2024-04,0.05,1,0.004332


In [177]:
crawled_config_ma

Unnamed: 0,date,nel_max_age,domain_count,domain_percent
0,2024-04,100,7,0.030325
1,2024-04,600,1,0.004332
2,2024-04,3600,134,0.580514
3,2024-04,7200,3,0.012997
4,2024-04,86400,2,0.008664
5,2024-04,604800,22914,99.267859
6,2024-04,1209600,1,0.004332
7,2024-04,2592000,14,0.060651
8,2024-04,10886400,1,0.004332
9,2024-04,31536000,6,0.025993


### Resource stats

In [179]:
crawled_deployment['total_crawled_resources'][0]

3962754

In [178]:
crawled_resource_monitoring_stats

Unnamed: 0,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio
0,2024-04,013info.rs,168,168,100.000000
1,2024-04,037hd.tv,200,200,100.000000
2,2024-04,0x00sec.org,237,237,100.000000
3,2024-04,1-800-phonesex.com,182,182,100.000000
4,2024-04,1-win.casino,105,105,100.000000
...,...,...,...,...,...
23078,2024-04,zwaar.co,67,66,98.507463
23079,2024-04,zwiftinsider.com,345,345,100.000000
23080,2024-04,zwr.gg,142,141,99.295775
23081,2024-04,zycie.news,118,118,100.000000


In [183]:
from results.result_utils import get_first_or_0

distribution_result = pd.DataFrame({}, index=[
    '(0%-10%>',
    '(10%-25%>',
    '(25%-50%>',
    '(50%-75%>',
    '(75%-100%)',
    '<100%>',
])

distr_col = []

data = crawled_resource_monitoring_stats.copy()

# month_result['(0%-10%>']    
next_val = data[(data['url_domain_monitored_resources_ratio'] > 0.0) & (data['url_domain_monitored_resources_ratio'] <= 10.0)].count()
distr_col.append(get_first_or_0(next_val))

# month_result['(10%-25%>']    
next_val = data[(data['url_domain_monitored_resources_ratio'] > 10.0) & (data['url_domain_monitored_resources_ratio'] <= 25.0)].count()
distr_col.append(get_first_or_0(next_val))

# month_result['(25%-50%>']    
next_val = data[(data['url_domain_monitored_resources_ratio'] > 25.0) & (data['url_domain_monitored_resources_ratio'] <= 50.0)].count()
distr_col.append(get_first_or_0(next_val))

# month_result['(50%-75%>']    
next_val = data[(data['url_domain_monitored_resources_ratio'] > 50.0) & (data['url_domain_monitored_resources_ratio'] <= 75.0)].count()
distr_col.append(get_first_or_0(next_val))

# month_result['(75%-100%)']    
next_val = data[(data['url_domain_monitored_resources_ratio'] > 75.0) & (data['url_domain_monitored_resources_ratio'] < 100.0)].count()
distr_col.append(get_first_or_0(next_val))

# month_result['<100%>']
next_val = data[data['url_domain_monitored_resources_ratio'] == 100.0].count()
distr_col.append(get_first_or_0(next_val))


distribution_result[latest_month_text_fmt] = distr_col
distribution_result

Unnamed: 0,Apr 2024
(0%-10%>,2
(10%-25%>,628
(25%-50%>,38
(50%-75%>,27
(75%-100%),2452
<100%>,19936


In [193]:
distribution_result[distribution_result.index == '<100%>'][latest_month_text_fmt].iloc[0] / crawled_deployment['total_crawled_domains']

0    0.85994
Name: total_crawled_domains, dtype: float64

In [194]:
distribution_result[distribution_result.index == '(75%-100%)'][latest_month_text_fmt].iloc[0] / crawled_deployment['total_crawled_domains']

0    0.105767
Name: total_crawled_domains, dtype: float64

In [195]:
distribution_result[distribution_result.index == '(10%-25%>'][latest_month_text_fmt].iloc[0] / crawled_deployment['total_crawled_domains']

0    0.027089
Name: total_crawled_domains, dtype: float64

#### Config variability

In [204]:
config_variation_count_data = pd.DataFrame()
current_month_data = crawled_resources_config_variability[['date', 'url_domain', 'nel_include_subdomains']].copy()

config_variations_by_url_domain = current_month_data.groupby(['url_domain'])['nel_include_subdomains'].count()
current_month_data['variation_count'] = current_month_data['url_domain'].map(config_variations_by_url_domain)
current_month_data.drop(columns=['nel_include_subdomains'], inplace=True)
    
current_month_data = current_month_data.groupby(['variation_count']).agg(domains=('url_domain', 'count'))
current_month_data.reset_index(inplace=True)    

current_month_data['date'] = latest_month_text_fmt
current_month_data = current_month_data[['date', 'variation_count', 'domains']] 

current_month_data['share'] = current_month_data['domains'] / crawled_deployment['total_crawled_domains_with_correct_nel']

config_variation_count_data = pd.concat([config_variation_count_data, current_month_data])
    
config_variation_count_data

Unnamed: 0,date,variation_count,domains,share
0,Apr 2024,1,22842,0.989559
1,Apr 2024,2,480,
2,Apr 2024,3,3,


### Resource types

In [207]:
resource_type_data = pd.DataFrame()
current_month_data = crawled_monitored_resource_types.copy()

current_month_data = current_month_data.groupby(['date', 'type'], observed=True, as_index=False).agg({'count': 'sum'})
current_month_data.sort_values(by='count', ascending=False, inplace=True)
current_month_data.reset_index(inplace=True, drop=True)

current_month_data['date'] = latest_month_text_fmt

resource_type_data = pd.concat([resource_type_data, current_month_data])
    
resource_type_data

Unnamed: 0,date,type,count
0,Apr 2024,image,2457577
1,Apr 2024,script,483698
2,Apr 2024,html,482927
3,Apr 2024,css,312895
4,Apr 2024,other,100116
5,Apr 2024,,47643
6,Apr 2024,font,36835
7,Apr 2024,text,12527
8,Apr 2024,video,5624
9,Apr 2024,audio,2770
