In [11]:
from pathlib import Path

import pandas as pd

In [12]:
dates_to_visualize = [
    "2019-02",
    "2020-02",
    "2021-02",
    "2022-02",
    "2023-02",
    # "2024-02",
]

In [13]:
available_data_files = list(Path("../data/httparchive_metrics/nel_resource_config_variability").glob("*.parquet"))

used_data_files = list(filter(lambda file: file.stem in dates_to_visualize, available_data_files))
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2023-02.parquet')]

### Aggregate result to visualize

In [14]:
from results.result_utils import date_to_text_format, concat_data_from_files

result = concat_data_from_files(used_data_files)

result['date_formatted'] = result['date'].map(date_to_text_format)

result

Unnamed: 0,date,url_domain,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,resources_with_this_config,date_formatted
0,2019-02,0d38c32709c097507fb7d35fbc48545e.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
1,2019-02,1627f29bce741ebdc46108ecd8ebba3c.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
2,2019-02,18d36df5be4d2f0680090c55b489865d.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
3,2019-02,1e031bc28af67e84e052beae680ccd74.report-uri.com,true,0.00001,0.0,3600,1,Feb 2019
4,2019-02,21torr.report-uri.com,true,0.00001,0.0,3600,2,Feb 2019
...,...,...,...,...,...,...,...,...
1994396,2023-02,zzzw.de,false,1.0,0.0,604800,62,Feb 2023
1994397,2023-02,zzzz.bg,false,1.0,0.0,604800,6,Feb 2023
1994398,2023-02,zzzz.tw,false,1.0,0.0,604800,42,Feb 2023
1994399,2023-02,zzzzap.nl,false,1.0,0.0,604800,37,Feb 2023


### TOP 3 most popular config variations per month

In [15]:
monthly_data = pd.DataFrame()
for month in dates_to_visualize:
    current_month_data = result[result['date'] == month].copy()
    
    current_month_data = current_month_data.groupby(['nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age'], observed=True, as_index=False).agg(domains=('url_domain', 'count'))
    
    current_month_data.sort_values(by='domains', ascending=False, inplace=True)
    
    current_month_data['date'] = date_to_text_format(month)
    current_month_data = current_month_data.reindex(columns=['date', 'nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age', 'domains'])
    
    current_month_data.reset_index(drop=True, inplace=True)
    current_month_data = current_month_data[current_month_data.index < 3]
    
    monthly_data = pd.concat([monthly_data, current_month_data])
    
# current_month_data
monthly_data

Unnamed: 0,date,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,domains
0,Feb 2019,True,1e-05,0.0,3600,223
1,Feb 2019,True,1.0,0.0,31536000,85
2,Feb 2019,True,1.0,0.0,2592000,29
0,Feb 2020,False,0.01,0.0001,2592000,108257
1,Feb 2020,True,1e-05,0.0,3600,293
2,Feb 2020,True,0.1,0.0,14400,254
0,Feb 2021,False,1.0,0.0,604800,784918
1,Feb 2021,False,1.0,0.0001,2592000,223554
2,Feb 2021,False,0.05,0.0,86400,1217
0,Feb 2022,False,1.0,0.0,604800,897825


### Number of NEL Config variations found on the analyzed domains 

In [16]:
monthly_data = pd.DataFrame()
for month in dates_to_visualize:
    current_month_data = result[result['date'] == month][['date', 'url_domain', 'nel_include_subdomains']].copy()
    
    config_variations_by_url_domain = current_month_data.groupby(['url_domain'])['nel_include_subdomains'].count()
    current_month_data['variation_count'] = current_month_data['url_domain'].map(config_variations_by_url_domain)
    current_month_data.drop(columns=['nel_include_subdomains'], inplace=True)
        
    current_month_data = current_month_data.groupby(['variation_count']).agg(domains=('url_domain', 'count'))
    current_month_data.reset_index(inplace=True)    
    
    current_month_data['date'] = date_to_text_format(month)
    current_month_data = current_month_data[['date', 'variation_count', 'domains']]
    
    monthly_data = pd.concat([monthly_data, current_month_data])
    
monthly_data

Unnamed: 0,date,variation_count,domains
0,Feb 2019,1,370
0,Feb 2020,1,109595
1,Feb 2020,2,18
0,Feb 2021,1,1003242
1,Feb 2021,2,9718
2,Feb 2021,3,6
0,Feb 2022,1,971459
1,Feb 2022,2,1266
2,Feb 2022,3,198
3,Feb 2022,4,4
