In [1]:
from pathlib import Path

import pandas as pd

In [2]:
pd.options.display.float_format = '{:.2f}'.format

In [3]:
dates_to_visualize = [
    "2019-02",
    "2020-02",
    "2021-02",
    "2022-02",
    "2023-02",
    # "2024-02",
]

In [4]:
available_data_files = list(Path("../data/httparchive_metrics/nel_collector_provider_usage").glob("*.parquet"))

used_data_files = list(filter(lambda file: file.stem in dates_to_visualize, available_data_files))
used_data_files

[WindowsPath('../data/httparchive_metrics/nel_collector_provider_usage/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_collector_provider_usage/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_collector_provider_usage/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_collector_provider_usage/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/nel_collector_provider_usage/2023-02.parquet')]

### Aggregate result to visualize


In [5]:
from results.result_utils import date_to_text_format, concat_data_from_files

result = concat_data_from_files(used_data_files)

result['date_formatted'] = result['date'].map(date_to_text_format)

result['as_primary'] = result['as_primary'].astype('UInt32')
result['as_secondary'] = result['as_secondary'].astype('UInt32')
result['among_fallback'] = result['among_fallback'].astype('UInt32')

result

Unnamed: 0,date,providers,as_primary,share_as_primary,as_secondary,share_as_secondary,among_fallback,date_formatted
0,2019-02,report-uri.com,318,85.95,0,0.00,0,Feb 2019
1,2019-02,3gl.net,21,5.68,0,0.00,0,Feb 2019
2,2019-02,uriports.com,9,2.43,0,0.00,0,Feb 2019
3,2019-02,gvt2.com,5,1.35,5,100.00,8,Feb 2019
4,2019-02,seloc.club,5,1.35,0,0.00,0,Feb 2019
...,...,...,...,...,...,...,...,...
148,2023-02,loaney.in,1,0.00,0,0.00,0,Feb 2023
149,2023-02,loaney.es,1,0.00,0,0.00,0,Feb 2023
150,2023-02,skgeodesy.sk,1,0.00,0,0.00,0,Feb 2023
151,2023-02,ijs.si,0,0.00,3,0.70,0,Feb 2023


### Top 5 Primary Collector Providers per month


In [6]:
top_5_data = result[result.index < 5][['date', 'providers', 'as_primary', 'share_as_primary']]

count_by_date = result.groupby(['date'])['as_primary'].agg(lambda group: len([x for x in group if x > 0]))
top_5_data['count'] = top_5_data['date'].map(count_by_date)

top_5_data = top_5_data.reindex(columns=['date', 'count', 'providers', 'as_primary', 'share_as_primary'])
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
# top_5_data.groupby(['date', 'count', 'providers']).first()
top_5_data.sort_values(['date', 'share_as_primary'], ascending=[True, False])

Unnamed: 0,date,count,providers,as_primary,share_as_primary
0,Feb 2019,11,report-uri.com,318,85.95
1,Feb 2019,11,3gl.net,21,5.68
2,Feb 2019,11,uriports.com,9,2.43
3,Feb 2019,11,gvt2.com,5,1.35
4,Feb 2019,11,seloc.club,5,1.35
0,Feb 2020,40,shopifycloud.com,108255,98.77
1,Feb 2020,40,report-uri.com,623,0.57
2,Feb 2020,40,powerboutique.net,254,0.23
3,Feb 2020,40,linkedin.com,102,0.09
4,Feb 2020,40,fastly-insights.com,79,0.07


### Top 5 Secondary Collector Providers per month


In [7]:
secondaries = result[['date', 'providers', 'as_secondary', 'share_as_secondary']]
secondaries = secondaries[secondaries['as_secondary'] > 0]

top_5_data = pd.DataFrame()
for month in dates_to_visualize:
    monthly_data = secondaries[secondaries['date'] == month]
    
    monthly_data.reset_index(drop=True, inplace=True)
    top_5_data = pd.concat([top_5_data, monthly_data])

top_5_data = top_5_data[top_5_data.index < 5]

count_by_date = result.groupby(['date'])['as_secondary'].agg(lambda group: len([x for x in group if x > 0]))
top_5_data['count'] = top_5_data['date'].map(count_by_date)

top_5_data = top_5_data.reindex(columns=['date', 'count', 'providers', 'as_secondary', 'share_as_secondary'])
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
# # top_5_data.groupby(['date', 'count', 'providers']).first()
top_5_data.sort_values(['date', 'share_as_secondary'], ascending=[True, False]).reset_index(drop=True)

Unnamed: 0,date,count,providers,as_secondary,share_as_secondary
0,Feb 2019,1,gvt2.com,5,100.0
1,Feb 2020,1,gvt2.com,22,100.0
2,Feb 2021,4,fastlylabs.com,80,60.61
3,Feb 2021,4,yandex.net,38,28.79
4,Feb 2021,4,gvt2.com,11,8.33
5,Feb 2021,4,ijs.si,3,2.27
6,Feb 2022,6,fastlylabs.com,102,43.4
7,Feb 2022,6,yandex.net,97,41.28
8,Feb 2022,6,gvt2.com,28,11.91
9,Feb 2022,6,ecsvc.net,5,2.13


### Top 5 Collector Providers by occurrence

In [8]:
occurrence_data = result[['date', 'providers', 'as_primary', 'as_secondary', 'among_fallback']]

top_5_data = pd.DataFrame()
for month in dates_to_visualize:
    monthly_data = occurrence_data[occurrence_data['date'] == month].copy()
    
    monthly_data['total_occurrences'] = monthly_data['as_primary'] + monthly_data['as_secondary'] + monthly_data['among_fallback']
    monthly_data.drop(columns=['as_primary', 'as_secondary', 'among_fallback'], inplace=True)
    
    monthly_data['date'] = monthly_data['date'].map(date_to_text_format)
    
    monthly_data.reset_index(drop=True, inplace=True)
    
    top_5_data = pd.concat([top_5_data, monthly_data])
    
top_5_data = top_5_data[top_5_data.index < 5]
top_5_data

Unnamed: 0,date,providers,total_occurrences
0,Feb 2019,report-uri.com,318
1,Feb 2019,3gl.net,21
2,Feb 2019,uriports.com,9
3,Feb 2019,gvt2.com,18
4,Feb 2019,seloc.club,5
0,Feb 2020,shopifycloud.com,108255
1,Feb 2020,report-uri.com,623
2,Feb 2020,powerboutique.net,254
3,Feb 2020,linkedin.com,102
4,Feb 2020,fastly-insights.com,79


### NEL Collector Providers employed by N domains


In [9]:
from results.result_utils import get_first_or_0

employment_result = pd.DataFrame({}, index=[
    '1',
    '2',
    '3-10',
    '11-100',
    '101-1K',
    'More',
])

for date in dates_to_visualize:
    month_collector_data = result[result['date'] == date].copy()
    
    month_data_col = []
    
    # month_result['1']
    next_val = month_collector_data[month_collector_data['as_primary'] == 1].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['2'] 
    next_val = month_collector_data[month_collector_data['as_primary'] == 2].count()
    month_data_col.append(get_first_or_0(next_val))

    # month_result['3-10']
    next_val = month_collector_data[(month_collector_data['as_primary'] >= 3) & (month_collector_data['as_primary'] <= 10)].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['3-10']
    next_val = month_collector_data[(month_collector_data['as_primary'] >= 11) & (month_collector_data['as_primary'] <= 100)].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['3-10']
    next_val = month_collector_data[(month_collector_data['as_primary'] >= 101) & (month_collector_data['as_primary'] <= 1000)].count()
    month_data_col.append(get_first_or_0(next_val))
    
    # month_result['3-10']
    next_val = month_collector_data[month_collector_data['as_primary'] >= 1001].count()
    month_data_col.append(get_first_or_0(next_val))
    
    
    employment_result[date_to_text_format(date)] = month_data_col
    
employment_result

Unnamed: 0,Feb 2019,Feb 2020,Feb 2021,Feb 2022,Feb 2023
1,3,19,52,63,81
2,2,4,10,20,14
3-10,4,5,11,17,26
11-100,1,8,10,17,12
101-1K,1,3,5,9,9
More,0,1,4,3,9


### Total number of collectors found during the analysed time period


In [10]:
len(result['providers'].unique())

210