In [11]:
from pathlib import Path

import pandas as pd

In [12]:
# Months to visualize, as "YYYY-MM" strings: every month from 2018-12 through 2023-02.
# No TRANCO list is available before 2018-12, so the earlier months
# (2018-09, 2018-10, 2018-11) are deliberately left out.
dates_to_visualize = [
    f"{year}-{month_number:02d}"
    for year in range(2018, 2024)
    for month_number in range(1, 13)
    # Tuple comparison trims the partial first (2018) and last (2023) years.
    if (2018, 12) <= (year, month_number) <= (2023, 2)
]

In [13]:
# Collect the per-month parquet exports. Directory listings come back in a
# filesystem-dependent order, so sort them to make the load order (and hence
# the row order of the concatenated data) the same on every machine.
available_data_files = sorted(
    Path("../data/httparchive_metrics/nel_monitored_resource_types").glob("*.parquet")
)

# Keep only the files whose name (without the extension) is a selected month.
selected_months = set(dates_to_visualize)
used_data_files = [file for file in available_data_files if file.stem in selected_months]
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_monitored_resource_types/2023-02.parquet')]

### Load the TRANCO popular-domain list for each month


In [14]:
from src.metric_utils import load_tranco_list_from_custom_path

# Build a lookup from each selected "YYYY-MM" month to the TRANCO list loaded for it.
tranco_root = Path("../resources")
tranco_dict = {}
for month in dates_to_visualize:
    # The month string is split on '-' and its pieces are passed as positional arguments.
    tranco_dict[month] = load_tranco_list_from_custom_path(tranco_root, *month.split('-'))

# Display the list of the last month processed by the loop above.
tranco_dict[month]

Unnamed: 0,order,popular_domain_name
0,1,google.com
1,2,a-msedge.net
2,3,youtube.com
3,4,facebook.com
4,5,microsoft.com
...,...,...
999995,999996,zio.to
999996,999997,tostado.com.ar
999997,999998,comnewsvideo.jp
999998,999999,direweb.it


In [15]:
from results.result_utils import date_to_text_format, concat_data_from_files

# Load the selected monthly files into one frame and attach a display label
# computed from the 'date' column.
result = concat_data_from_files(used_data_files).assign(
    date_formatted=lambda loaded: loaded['date'].map(date_to_text_format)
)

result

Unnamed: 0,date,url_domain,type,count,date_formatted
0,2019-02,0d38c32709c097507fb7d35fbc48545e.report-uri.com,text,1,Feb 2019
1,2019-02,1627f29bce741ebdc46108ecd8ebba3c.report-uri.com,text,1,Feb 2019
2,2019-02,18d36df5be4d2f0680090c55b489865d.report-uri.com,text,1,Feb 2019
3,2019-02,1e031bc28af67e84e052beae680ccd74.report-uri.com,text,1,Feb 2019
4,2019-02,21torr.report-uri.com,text,2,Feb 2019
...,...,...,...,...,...
7193686,2023-02,zzzzap.nl,script,14,Feb 2023
7193687,2023-02,zzzzap.nl,html,1,Feb 2023
7193688,2023-02,zzzzzz.me,css,1,Feb 2023
7193689,2023-02,zzzzzz.me,script,1,Feb 2023


### Filter result data to popular collector provider domains

In [16]:
# For every selected month keep only the rows whose domain appears in that
# month's TRANCO list. The monthly pieces are collected in a list and
# concatenated once: growing a frame with pd.concat inside the loop re-copies
# all previously collected rows on every iteration, and concatenating empty
# frames is deprecated behaviour in recent pandas releases.
popular_month_frames = []
for month in dates_to_visualize:
    month_result = result[result['date'] == month]

    popular_domains = tranco_dict[month]['popular_domain_name']
    month_result = month_result[month_result['url_domain'].isin(popular_domains)]

    # Months without any matching rows contribute nothing to the final frame.
    if month_result.empty:
        continue

    # Each month is re-indexed from 0, so index values repeat across months.
    popular_month_frames.append(month_result.reset_index(drop=True))

if popular_month_frames:
    popular_result = pd.concat(popular_month_frames)
else:
    # Nothing matched at all: keep the columns and dtypes of `result`, with zero rows.
    popular_result = result.iloc[0:0].copy()

popular_result

Unnamed: 0,date,url_domain,type,count,date_formatted
0,2019-02,coinloan.io,css,2,Feb 2019
1,2019-02,coinloan.io,html,1,Feb 2019
2,2019-02,coinloan.io,image,45,Feb 2019
3,2019-02,coinloan.io,script,3,Feb 2019
4,2019-02,easytithe.com,html,1,Feb 2019
...,...,...,...,...,...
273141,2023-02,zzztube.tv,script,4,Feb 2023
273142,2023-02,zzztube.tv,html,1,Feb 2023
273143,2023-02,zzzzzz.me,css,1,Feb 2023
273144,2023-02,zzzzzz.me,script,1,Feb 2023


### Monitored resources on POPULAR domains by type

In [17]:
# Per month: total 'count' per resource type, largest first, with the month
# shown in its text form. The monthly summaries are collected in a list and
# concatenated once instead of growing a frame with pd.concat inside the loop
# (which re-copies everything each iteration and concatenates empty frames).
monthly_frames = []
for month in dates_to_visualize:
    current_month_data = (
        popular_result[popular_result['date'] == month]
        .groupby(['date', 'type'], as_index=False)
        .agg({'count': 'sum'})
        .sort_values(by='count', ascending=False)
        .reset_index(drop=True)
    )

    # Months without data produce an empty summary; leave them out.
    if current_month_data.empty:
        continue

    current_month_data['date'] = current_month_data['date'].map(date_to_text_format)
    monthly_frames.append(current_month_data)

if monthly_frames:
    # Each month keeps its own 0-based index, so index values repeat across months.
    monthly_data = pd.concat(monthly_frames)
else:
    monthly_data = pd.DataFrame(columns=['date', 'type', 'count'])

monthly_data

Unnamed: 0,date,type,count
0,Feb 2019,image,111
1,Feb 2019,script,28
2,Feb 2019,html,18
3,Feb 2019,css,15
4,Feb 2019,font,11
5,Feb 2019,text,9
6,Feb 2019,other,2
7,Feb 2019,video,2
8,Feb 2019,audio,1
0,Feb 2020,script,17323


### Most monitored resource types on POPULAR domains over the whole time period (HTTP Archive data)

In [19]:
# Total 'count' per resource type across all selected months, largest first.
(
    popular_result
    .groupby('type')['count']
    .sum()
    .sort_values(ascending=False)
)

type
image     6325145
script    3473579
css       1266593
html       780803
font       290743
other      254085
video       74516
text        60037
xml         29562
audio        5913
Name: count, dtype: UInt32