In [1]:
from pathlib import Path

import pandas as pd

In [2]:
pd.options.display.float_format = '{:.2f}'.format

In [3]:
dates_to_visualize = [
                                                                                          # No TRANCO list available before 2018-12
                                                                                          # "2018-09", "2018-10", "2018-11", 
                                                                                                                             "2018-12",
    "2019-01", "2019-02", "2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", "2019-09", "2019-10", "2019-11", "2019-12", 
    "2020-01", "2020-02", "2020-03", "2020-04", "2020-05", "2020-06", "2020-07", "2020-08", "2020-09", "2020-10", "2020-11", "2020-12",
    "2021-01", "2021-02", "2021-03", "2021-04", "2021-05", "2021-06", "2021-07", "2021-08", "2021-09", "2021-10", "2021-11", "2021-12",
    "2022-01", "2022-02", "2022-03", "2022-04", "2022-05", "2022-06", "2022-07", "2022-08", "2022-09", "2022-10", "2022-11", "2022-12",
    "2023-01", "2023-02",
    # "2024-02",
]

In [4]:
available_data_files = list(Path("../data/httparchive_metrics/popular_nel_collector_provider_usage").glob("*.parquet"))

used_data_files = list(filter(lambda file: file.stem in dates_to_visualize, available_data_files))
used_data_files

[WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2018-12.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-03.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-04.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-05.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-06.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-07.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-08.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-09.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-10.parquet'),
 WindowsPath('../dat

### Aggregate result to visualize


In [5]:
from results.result_utils import date_to_text_format, concat_data_from_files

result = concat_data_from_files(used_data_files)

result['date_formatted'] = result['date'].map(date_to_text_format)

result['as_primary'] = result['as_primary'].astype('UInt32')
result['as_secondary'] = result['as_secondary'].astype('UInt32')
result['among_fallback'] = result['among_fallback'].astype('UInt32')

result

Unnamed: 0,date,providers,as_primary,share_as_primary,as_secondary,share_as_secondary,among_fallback,date_formatted
0,2018-12,report-uri.com,8,88.89,0,0.00,0,Dec 2018
1,2018-12,krvtz.net,1,11.11,0,0.00,0,Dec 2018
0,2019-02,report-uri.com,11,84.62,0,0.00,0,Feb 2019
1,2019-02,krvtz.net,1,7.69,0,0.00,0,Feb 2019
2,2019-02,uriports.com,1,7.69,0,0.00,0,Feb 2019
...,...,...,...,...,...,...,...,...
65,2023-02,dj9s4kmieytgz.cloudfront.net,0,0.00,0,0.00,0,Feb 2023
66,2023-02,logflare.app,0,0.00,0,0.00,0,Feb 2023
67,2023-02,jeurissen.co,0,0.00,0,0.00,0,Feb 2023
68,2023-02,logq.net,0,0.00,0,0.00,0,Feb 2023


### Top 5 POPULAR Primary Collector Providers per month


In [6]:
top_5_data = result[result.index < 5][['date', 'providers', 'as_primary', 'share_as_primary']]

count_by_date = result.groupby(['date'])['as_primary'].agg(lambda group: len([x for x in group if x > 0]))
top_5_data['count'] = top_5_data['date'].map(count_by_date)

top_5_data = top_5_data.reindex(columns=['date', 'count', 'providers', 'as_primary', 'share_as_primary'])
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
# top_5_data.groupby(['date', 'count', 'providers']).first()
top_5_data.sort_values(['date', 'share_as_primary'], ascending=[True, False])

Unnamed: 0,date,count,providers,as_primary,share_as_primary
0,Apr 2019,4,shopifycloud.com,4775,99.69
1,Apr 2019,4,report-uri.com,12,0.25
2,Apr 2019,4,krvtz.net,1,0.02
3,Apr 2019,4,uriports.com,1,0.02
0,Apr 2020,10,shopifycloud.com,5931,99.45
...,...,...,...,...,...
0,Sep 2022,36,cloudflare.com,71514,99.55
1,Sep 2022,36,cafe24.com,170,0.24
2,Sep 2022,36,yandex.net,27,0.04
3,Sep 2022,36,report-uri.com,23,0.03


### Top 5 POPULAR Secondary Collector Providers per month


In [7]:
secondaries = result[['date', 'providers', 'as_secondary', 'share_as_secondary']]
secondaries = secondaries[secondaries['as_secondary'] > 0]

top_5_data = pd.DataFrame()
for month in dates_to_visualize:
    monthly_data = secondaries[secondaries['date'] == month]
    
    monthly_data.reset_index(drop=True, inplace=True)
    top_5_data = pd.concat([top_5_data, monthly_data])

top_5_data = top_5_data[top_5_data.index < 5]

count_by_date = result.groupby(['date'])['as_secondary'].agg(lambda group: len([x for x in group if x > 0]))
top_5_data['count'] = top_5_data['date'].map(count_by_date)

top_5_data = top_5_data.reindex(columns=['date', 'count', 'providers', 'as_secondary', 'share_as_secondary'])
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
# # top_5_data.groupby(['date', 'count', 'providers']).first()
top_5_data.sort_values(['date', 'share_as_secondary'], ascending=[True, False]).reset_index(drop=True)

Unnamed: 0,date,count,providers,as_secondary,share_as_secondary
0,Apr 2021,2,fastlylabs.com,1,50.00
1,Apr 2021,2,ijs.si,1,50.00
2,Apr 2022,2,fastlylabs.com,1,50.00
3,Apr 2022,2,ijs.si,1,50.00
4,Aug 2020,2,fastlylabs.com,1,50.00
...,...,...,...,...,...
64,Sep 2021,2,fastlylabs.com,1,50.00
65,Sep 2021,2,ijs.si,1,50.00
66,Sep 2022,3,fastlylabs.com,1,33.33
67,Sep 2022,3,ijs.si,1,33.33


### Top 5 POPULAR Collector Providers by occurrence

In [8]:
occurrence_data = result[['date', 'providers', 'as_primary', 'as_secondary', 'among_fallback']]

top_5_data = pd.DataFrame()
for month in dates_to_visualize:
    monthly_data = occurrence_data[occurrence_data['date'] == month].copy()
    
    monthly_data['total_occurrences'] = monthly_data['as_primary'] + monthly_data['as_secondary'] + monthly_data['among_fallback']
    monthly_data.drop(columns=['as_primary', 'as_secondary', 'among_fallback'], inplace=True)
    
    monthly_data['date'] = monthly_data['date'].map(date_to_text_format)
    
    monthly_data.reset_index(drop=True, inplace=True)
    
    top_5_data = pd.concat([top_5_data, monthly_data])
    
top_5_data = top_5_data[top_5_data.index < 5]
top_5_data

Unnamed: 0,date,providers,total_occurrences
0,Dec 2018,report-uri.com,8
1,Dec 2018,krvtz.net,1
0,Feb 2019,report-uri.com,11
1,Feb 2019,krvtz.net,1
2,Feb 2019,uriports.com,1
...,...,...,...
0,Feb 2023,cloudflare.com,77631
1,Feb 2023,cafe24.com,205
2,Feb 2023,yandex.net,47
3,Feb 2023,report-uri.com,25


### POPULAR NEL Collector Providers employed by N domains


In [9]:
from results.result_utils import get_first_or_0

employment_result = pd.DataFrame({}, index=[
    '1',
    '2',
    '3-10',
    '11-100',
    '101-1K',
    'More',
])

for date in dates_to_visualize:
    month_collector_data = result[result['date'] == date].copy()
    
    month_data_col = []
    
    # month_result['1']
    next_val = month_collector_data[month_collector_data['as_primary'] == 1].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['2'] 
    next_val = month_collector_data[month_collector_data['as_primary'] == 2].count()
    month_data_col.append(get_first_or_0(next_val))

    # month_result['3-10']
    next_val = month_collector_data[(month_collector_data['as_primary'] >= 3) & (month_collector_data['as_primary'] <= 10)].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['3-10']
    next_val = month_collector_data[(month_collector_data['as_primary'] >= 11) & (month_collector_data['as_primary'] <= 100)].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['3-10']
    next_val = month_collector_data[(month_collector_data['as_primary'] >= 101) & (month_collector_data['as_primary'] <= 1000)].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['3-10']
    next_val = month_collector_data[month_collector_data['as_primary'] >= 1001].count()
    month_data_col.append(get_first_or_0(next_val))
    
    
    employment_result[date_to_text_format(date)] = month_data_col
    
employment_result

Unnamed: 0,Dec 2018,Jan 2019,Feb 2019,Mar 2019,Apr 2019,May 2019,Jun 2019,Jul 2019,Aug 2019,Sep 2019,...,May 2022,Jun 2022,Jul 2022,Aug 2022,Sep 2022,Oct 2022,Nov 2022,Dec 2022,Jan 2023,Feb 2023
1,1,0,2,2,2,2,2,1,2,2,...,0,0,19,19,18,22,16,16,15,21
2,0,0,0,0,0,0,0,0,1,0,...,0,0,4,3,4,2,3,4,4,4
3-10,1,0,0,0,0,0,0,1,0,2,...,0,0,7,8,7,10,9,7,8,5
11-100,0,0,1,1,1,1,1,1,1,1,...,0,0,4,5,5,4,5,6,4,5
101-1K,0,0,0,0,0,0,0,0,0,0,...,0,0,1,1,1,1,1,1,1,1
More,0,0,0,0,1,1,1,1,1,1,...,0,0,1,1,1,1,1,1,1,1


### Total number of POPULAR collectors found during the analysed time period


In [10]:
len(result['providers'].unique())

70