In [1]:
from pathlib import Path

import pandas as pd

In [2]:
dates_to_visualize = [
                                                                                            "2018-09", "2018-10", "2018-11", "2018-12",
    "2019-01", "2019-02", "2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11", "2019-12", 
    "2020-01", "2020-02", "2020-03", "2020-04", "2020-05", "2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11", "2020-12",
    "2021-01", "2021-02", "2021-03", "2021-04", "2021-05", "2021-06", "2021-07", "2021-08", "2021-09", "2021-10", "2021-11", "2021-12",
    "2022-01", "2022-02", "2022-03", "2022-04", "2022-05", "2022-06", "2022-07", "2022-08", "2022-09", "2022-10", "2022-11", "2022-12",
    "2023-01", "2023-02",
    # "2024-02",
]

In [3]:
available_data_files = list(Path("../data/httparchive_metrics/nel_resource_config_variability").glob("*.parquet"))

used_data_files = list(filter(lambda file: file.stem in dates_to_visualize, available_data_files))
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2018-09.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2018-10.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2018-11.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2018-12.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-03.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-04.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-05.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-06.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variability/2019-07.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_resource_config_variabil

### Aggregate result to visualize

In [4]:
from results.result_utils import date_to_text_format, concat_data_from_files

result = concat_data_from_files(used_data_files)

result['date_formatted'] = result['date'].map(date_to_text_format)

result

Unnamed: 0,date,url_domain,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,resources_with_this_config,date_formatted
0,2018-09,m2.cn.bing.com,false,1.0,0.1,604800,1,Sep 2018
1,2018-09,www4.bing.com,false,1.0,0.1,604800,1,Sep 2018
0,2018-10,classifieds.seloc.org,true,1.0,0.0,31536000,68,Oct 2018
1,2018-10,cpqa.catchpoint.com,true,1.0,0.0,2592000,1,Oct 2018
2,2018-10,forums.seloc.org,true,1.0,0.0,31536000,1,Oct 2018
...,...,...,...,...,...,...,...,...
2278116,2023-02,zzzw.de,false,1.0,0.0,604800,62,Feb 2023
2278117,2023-02,zzzz.bg,false,1.0,0.0,604800,6,Feb 2023
2278118,2023-02,zzzz.tw,false,1.0,0.0,604800,42,Feb 2023
2278119,2023-02,zzzzap.nl,false,1.0,0.0,604800,37,Feb 2023


### TOP 3 most popular config variations per month

In [5]:
monthly_data = pd.DataFrame()
for month in dates_to_visualize:
    current_month_data = result[result['date'] == month].copy()
    
    current_month_data = current_month_data.groupby(['nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age'], observed=True, as_index=False).agg(domains=('url_domain', 'count'))
    
    current_month_data.sort_values(by='domains', ascending=False, inplace=True)
    
    current_month_data['date'] = date_to_text_format(month)
    current_month_data = current_month_data.reindex(columns=['date', 'nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age', 'domains'])
    
    current_month_data.reset_index(drop=True, inplace=True)
    current_month_data = current_month_data[current_month_data.index < 3]
    
    monthly_data = pd.concat([monthly_data, current_month_data])
    
# current_month_data
monthly_data

Unnamed: 0,date,nel_include_subdomains,nel_failure_fraction,nel_success_fraction,nel_max_age,domains
0,Sep 2018,false,1.0,0.1,604800,2
0,Oct 2018,false,1.0,0.01,604800,4
1,Oct 2018,true,1.0,0.0,31536000,3
2,Oct 2018,false,1.0,0.0,10886400,1
0,Nov 2018,true,0.001,0.0,3600,126
...,...,...,...,...,...,...
1,Jan 2023,false,1.0,0.01,604800,627497
2,Jan 2023,false,0.1,0.0,2592000,23066
0,Feb 2023,false,1.0,0.0,604800,1605991
1,Feb 2023,false,1.0,0.01,604800,620940


### Number of NEL Config variations found on the analyzed domains 

In [6]:
monthly_data = pd.DataFrame()
for month in dates_to_visualize:
    current_month_data = result[result['date'] == month][['date', 'url_domain', 'nel_include_subdomains']].copy()
    
    config_variations_by_url_domain = current_month_data.groupby(['url_domain'])['nel_include_subdomains'].count()
    current_month_data['variation_count'] = current_month_data['url_domain'].map(config_variations_by_url_domain)
    current_month_data.drop(columns=['nel_include_subdomains'], inplace=True)
        
    current_month_data = current_month_data.groupby(['variation_count']).agg(domains=('url_domain', 'count'))
    current_month_data.reset_index(inplace=True)    
    
    current_month_data['date'] = date_to_text_format(month)
    current_month_data = current_month_data[['date', 'variation_count', 'domains']]
    
    monthly_data = pd.concat([monthly_data, current_month_data])
    
monthly_data

Unnamed: 0,date,variation_count,domains
0,Sep 2018,1.0,2
0,Oct 2018,1.0,10
0,Nov 2018,1.0,183
1,Nov 2018,2.0,8
0,Dec 2018,1.0,375
...,...,...,...
1,Feb 2023,2.0,38062
2,Feb 2023,3.0,66
3,Feb 2023,4.0,28
4,Feb 2023,5.0,75
