In [3]:
from pathlib import Path

import pandas as pd

In [4]:
# One February snapshot per year, 2019 through 2023, as "YYYY-MM" keys.
# These keys are matched against parquet file stems and against the 'date' column.
# Raise the upper bound to 2025 to bring the "2024-02" snapshot in.
dates_to_visualize = [f"{year}-02" for year in range(2019, 2024)]

In [7]:
# Path.glob yields directory entries in an OS-dependent order, so sort them:
# the file list (and the row order of anything concatenated from it) is then
# reproducible across machines and re-runs.
available_data_files = sorted(
    Path("../data/httparchive_metrics/nel_domain_resource_monitoring_stats").glob("*.parquet")
)

# Keep only the snapshots whose file stem (e.g. "2019-02") was requested.
used_data_files = [file for file in available_data_files if file.stem in dates_to_visualize]
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_domain_resource_monitoring_stats/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_domain_resource_monitoring_stats/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_domain_resource_monitoring_stats/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_domain_resource_monitoring_stats/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_domain_resource_monitoring_stats/2023-02.parquet')]

### Aggregate result to visualize

In [8]:
from results.result_utils import concat_data_from_files, date_to_text_format

# Load the selected monthly snapshot files into one frame.
result = concat_data_from_files(used_data_files)

# Add a display label derived from each row's 'date' value
# (e.g. "2019-02" is shown as "Feb 2019" in the output below).
date_labels = result['date'].map(date_to_text_format)
result['date_formatted'] = date_labels

result

Unnamed: 0,date,url_domain,url_domain_hosted_resources,url_domain_hosted_resources_with_nel,url_domain_monitored_resources_ratio,date_formatted
0,2019-02,0d38c32709c097507fb7d35fbc48545e.report-uri.com,1,1,100.000000,Feb 2019
1,2019-02,1627f29bce741ebdc46108ecd8ebba3c.report-uri.com,1,1,100.000000,Feb 2019
2,2019-02,18d36df5be4d2f0680090c55b489865d.report-uri.com,1,1,100.000000,Feb 2019
3,2019-02,1e031bc28af67e84e052beae680ccd74.report-uri.com,1,1,100.000000,Feb 2019
4,2019-02,21torr.report-uri.com,2,2,100.000000,Feb 2019
...,...,...,...,...,...,...
1977381,2023-02,zzzw.de,63,62,98.410004,Feb 2023
1977382,2023-02,zzzz.bg,6,6,100.000000,Feb 2023
1977383,2023-02,zzzz.tw,42,42,100.000000,Feb 2023
1977384,2023-02,zzzzap.nl,37,37,100.000000,Feb 2023


### NEL Domain monitored resources ratio distribution

In [16]:
from results.result_utils import get_first_or_0

RATIO_COLUMN = 'url_domain_monitored_resources_ratio'

# Each bucket is (row label, predicate over the ratio Series).
# Label notation: '<' / '>' mark an inclusive bound, '(' / ')' an exclusive one,
# so '(0%-10%>' means 0 < ratio <= 10 and '(75%-100%)' means 75 < ratio < 100.
# Keeping the label and its predicate side by side is the single source of truth
# for both the result index and the counting logic.
ratio_buckets = [
    ('<0%>',       lambda ratio: ratio == 0.0),
    ('(0%-10%>',   lambda ratio: (ratio > 0.0) & (ratio <= 10.0)),
    ('(10%-25%>',  lambda ratio: (ratio > 10.0) & (ratio <= 25.0)),
    ('(25%-50%>',  lambda ratio: (ratio > 25.0) & (ratio <= 50.0)),
    ('(50%-75%>',  lambda ratio: (ratio > 50.0) & (ratio <= 75.0)),
    ('(75%-100%)', lambda ratio: (ratio > 75.0) & (ratio < 100.0)),
    ('<100%>',     lambda ratio: ratio == 100.0),
]

# Rows are the buckets; one column per visualized month is added below.
distribution_result = pd.DataFrame({}, index=[label for label, _ in ratio_buckets])

for month in dates_to_visualize:
    # Boolean indexing already returns a new frame, so no extra .copy() is needed.
    month_rows = result[result['date'] == month]
    month_ratios = month_rows[RATIO_COLUMN]

    # DataFrame.count() gives per-column non-null counts; get_first_or_0 reduces
    # that to a single number (presumably the first column's count, or 0 when
    # there is nothing to take -- confirm in results.result_utils).
    distribution_result[date_to_text_format(month)] = [
        get_first_or_0(month_rows[in_bucket(month_ratios)].count())
        for _, in_bucket in ratio_buckets
    ]

distribution_result

Unnamed: 0,Feb 2019,Feb 2020,Feb 2021,Feb 2022,Feb 2023
<0%>,1,2,6,10,28
(0%-10%>,29,188,1324,19856,23214
(10%-25%>,7,250,1074,4717,2218
(25%-50%>,8,11858,23114,5214,5341
(50%-75%>,25,10461,23678,5207,7006
(75%-100%),29,5378,32986,64864,93968
<100%>,271,81467,925923,872300,1845611
