In [10]:
from pathlib import Path

import pandas as pd

In [11]:
dates_to_visualize = [
    "2019-02",
    "2020-02",
    "2021-02",
    "2022-02",
    "2023-02",
    # "2024-02",
]

In [12]:
available_data_files = list(Path("../data/httparchive_metrics/nel_resource_config_variability").glob("*.parquet"))

used_data_files = list(filter(lambda file: file.stem in dates_to_visualize, available_data_files))
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2023-02.parquet')]

### Load TRANCO Popular domain list

In [13]:
from src.metric_utils import load_tranco_list_from_custom_path

tranco_dict = {}
for month in dates_to_visualize:
    months_list = load_tranco_list_from_custom_path(Path("../resources"), *month.split('-'))
    tranco_dict[month] = months_list

tranco_dict[month]

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,a-msedge.net
2,3,youtube.com
3,4,facebook.com
4,5,microsoft.com
...,...,...
999995,999996,zio.to
999996,999997,tostado.com.ar
999997,999998,comnewsvideo.jp
999998,999999,direweb.it


### Aggregate result to visualize

In [14]:
from results.result_utils import date_to_text_format, concat_data_from_files

result = concat_data_from_files(used_data_files)

result['date_formatted'] = result['date'].map(date_to_text_format)

result

Unnamed: 0,date,url_domain,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,resources_with_this_config,date_formatted
0,2019-02,0d38c32709c097507fb7d35fbc48545e.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
1,2019-02,1627f29bce741ebdc46108ecd8ebba3c.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
2,2019-02,18d36df5be4d2f0680090c55b489865d.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
3,2019-02,1e031bc28af67e84e052beae680ccd74.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
4,2019-02,21torr.report-uri.com,true,0.00001,0.0,3600,2,Feb 2019
...,...,...,...,...,...,...,...,...
1994396,2023-02,zzzw.de,false,1.0,0.0,604800,62,Feb 2023
1994397,2023-02,zzzz.bg,false,1.0,0.0,604800,6,Feb 2023
1994398,2023-02,zzzz.tw,false,1.0,0.0,604800,42,Feb 2023
1994399,2023-02,zzzzap.nl,false,1.0,0.0,604800,37,Feb 2023


### Filter result data to popular collector provider domains

In [15]:
popular_result = pd.DataFrame({})
for month in dates_to_visualize:
    month_result = result[result['date'] == month]
    
    month_result = month_result[month_result['url_domain'].isin(tranco_dict[month]['popular_domain_name'])]
    month_result.reset_index(drop=True, inplace=True)
    
    popular_result = pd.concat([popular_result, month_result])
    
popular_result

Unnamed: 0,date,url_domain,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,resources_with_this_config,date_formatted
0,2019-02,coinloan.io,true,1.0,0.0,31536000,51,Feb 2019
1,2019-02,easytithe.com,true,1.0,0.0,31536000,1,Feb 2019
2,2019-02,flightplandatabase.com,true,1.0,0.0,604800,1,Feb 2019
3,2019-02,linustechtips.com,false,0.001,0.0,0,73,Feb 2019
4,2019-02,nodecraft.com,true,1.0,0.0,31536000,1,Feb 2019
...,...,...,...,...,...,...,...,...
66629,2023-02,zztt86.com,false,1.0,0.0,604800,1,Feb 2023
66630,2023-02,zzup.com,false,1.0,0.0,604800,67,Feb 2023
66631,2023-02,zzztube.com,false,1.0,0.0,604800,13,Feb 2023
66632,2023-02,zzztube.tv,false,1.0,0.0,604800,11,Feb 2023


### TOP 3 most popular config variations ON POPULAR DOMAINS per month

In [16]:
monthly_data = pd.DataFrame()
for month in dates_to_visualize:
    current_month_data = popular_result[popular_result['date'] == month].copy()
    
    current_month_data = current_month_data.groupby(['nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age'], observed=True, as_index=False).agg(domains=('url_domain', 'count'))
    
    current_month_data.sort_values(by='domains', ascending=False, inplace=True)
    
    current_month_data['date'] = date_to_text_format(month)
    current_month_data = current_month_data.reindex(columns=['date', 'nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age', 'domains'])
    
    current_month_data.reset_index(drop=True, inplace=True)
    current_month_data = current_month_data[current_month_data.index < 3]
    
    monthly_data = pd.concat([monthly_data, current_month_data])
    
monthly_data

Unnamed: 0,date,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,domains
0,Feb 2019,True,1.0,0.0,31536000,7
1,Feb 2019,False,1.0,0.0,10886400,2
2,Feb 2019,False,0.001,0.0,0,1
0,Feb 2020,False,0.01,0.0001,2592000,5875
1,Feb 2020,True,1.0,0.0,31536000,8
2,Feb 2020,True,1.0,0.0,2592000,4
0,Feb 2021,False,1.0,0.0,604800,50800
1,Feb 2021,False,1.0,0.0001,2592000,6362
2,Feb 2021,False,0.1,0.001,86400,18
0,Feb 2022,False,1.0,0.0,604800,59048


### Number of NEL Config variations found on the analyzed POPULAR domains 

In [17]:
monthly_data = pd.DataFrame()
for month in dates_to_visualize:
    current_month_data = popular_result[popular_result['date'] == month][['date', 'url_domain', 'nel_include_subdomains']].copy()
    
    config_variations_by_url_domain = current_month_data.groupby(['url_domain'])['nel_include_subdomains'].count()
    current_month_data['variation_count'] = current_month_data['url_domain'].map(config_variations_by_url_domain)
    current_month_data.drop(columns=['nel_include_subdomains'], inplace=True)
        
    current_month_data = current_month_data.groupby(['variation_count']).agg(domains=('url_domain', 'count'))
    current_month_data.reset_index(inplace=True)    
    
    current_month_data['date'] = date_to_text_format(month)
    current_month_data = current_month_data[['date', 'variation_count', 'domains']]
    
    monthly_data = pd.concat([monthly_data, current_month_data])
    
monthly_data

Unnamed: 0,date,variation_count,domains
0,Feb 2019,1,13
0,Feb 2020,1,5903
1,Feb 2020,2,2
0,Feb 2021,1,56842
1,Feb 2021,2,402
2,Feb 2021,3,3
0,Feb 2022,1,59482
1,Feb 2022,2,70
0,Feb 2023,1,65625
1,Feb 2023,2,1006
