In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Snapshot months (formatted "YYYY-MM") that the rest of the notebook works with:
# the February snapshot of each year from 2019 through 2023.
# Raise the range end to 2025 to also include "2024-02".
dates_to_visualize = [f"{year}-02" for year in range(2019, 2024)]

In [3]:
# Directory with the parquet snapshots; each file is named after its month
# (e.g. "2019-02.parquet").
metrics_dir = Path("../data/httparchive_metrics/nel_monitored_resource_types")

# glob() yields entries in a filesystem-dependent order. Sort them so that every
# downstream step sees the files in the same order regardless of platform.
available_data_files = sorted(metrics_dir.glob("*.parquet"))

# Keep only the snapshots whose file stem matches one of the requested months.
used_data_files = [file for file in available_data_files if file.stem in dates_to_visualize]
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2023-02.parquet')]

In [4]:
from results.result_utils import date_to_text_format, concat_data_from_files

# Load all selected snapshot files into one frame and append a human-readable
# label column derived from 'date' (e.g. "2019-02" -> "Feb 2019").
result = concat_data_from_files(used_data_files).assign(
    date_formatted=lambda frame: frame['date'].map(date_to_text_format)
)

result

Unnamed: 0,date,url_domain,type,count,date_formatted
0,2019-02,0d38c32709c097507fb7d35fbc48545e.report-uri.com,text,1,Feb 2019
1,2019-02,1627f29bce741ebdc46108ecd8ebba3c.report-uri.com,text,1,Feb 2019
2,2019-02,18d36df5be4d2f0680090c55b489865d.report-uri.com,text,1,Feb 2019
3,2019-02,1e031bc28af67e84e052beae680ccd74.report-uri.com,text,1,Feb 2019
4,2019-02,21torr.report-uri.com,text,2,Feb 2019
...,...,...,...,...,...
7193686,2023-02,zzzzap.nl,script,14,Feb 2023
7193687,2023-02,zzzzap.nl,html,1,Feb 2023
7193688,2023-02,zzzzzz.me,css,1,Feb 2023
7193689,2023-02,zzzzzz.me,script,1,Feb 2023


### Monitored resources by type

In [5]:
# Per-month totals of monitored resources by type.
#
# For every requested month: sum 'count' per resource type, order the types from
# most to least frequent, restart the index at 0 (so the index doubles as the
# rank within that month), and replace the raw "YYYY-MM" date with its text form.
#
# The per-month frames are collected in a list and concatenated once, instead of
# growing a DataFrame with pd.concat inside the loop (which re-copies all
# previously accumulated rows on every iteration).
monthly_frames = [
    (
        result[result['date'] == month]
        .groupby(['date', 'type'], as_index=False)
        .agg({'count': 'sum'})
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
        .assign(date=lambda frame: frame['date'].map(date_to_text_format))
    )
    for month in dates_to_visualize
]

# pd.concat raises on an empty list; fall back to an empty frame so the cell
# still runs when no months are configured.
monthly_data = pd.concat(monthly_frames) if monthly_frames else pd.DataFrame()

monthly_data

Unnamed: 0,date,type,count
0,Feb 2019,image,1225
1,Feb 2019,script,536
2,Feb 2019,text,244
3,Feb 2019,html,211
4,Feb 2019,css,195
5,Feb 2019,font,102
6,Feb 2019,other,12
7,Feb 2019,video,5
8,Feb 2019,audio,2
0,Feb 2020,script,207804


### Most monitored resource type across all selected months (HTTPArchive data)

In [7]:
# Total 'count' per resource type over every loaded month, largest first.
(
    result
    .groupby('type')['count']
    .sum()
    .sort_values(ascending=False)
)

type
image     106921511
script     53763780
css        26911046
html        8067751
font        6324392
other       3103701
video        499209
text         474016
xml           91063
audio         83952
Name: count, dtype: UInt32