In [32]:
from pathlib import Path

import pandas as pd

In [33]:
# Render every float in displayed frames with exactly two decimal places.
pd.set_option('display.float_format', '{:.2f}'.format)

In [34]:
# One "YYYY-02" key per year from 2019 through 2023.
# Raise the upper bound to 2025 to include "2024-02" as well.
dates_to_visualize = [f"{year}-02" for year in range(2019, 2024)]

In [35]:
# Every parquet file found in the metric directory, whatever month it holds.
available_data_files = list(Path("../data/httparchive_metrics/popular_nel_collector_provider_usage").glob("*.parquet"))

# Keep only the files whose stem is one of the selected months.
# `Path.glob` yields entries in arbitrary, filesystem-dependent order, so the
# selection is sorted to make the downstream concatenation order reproducible.
used_data_files = sorted(
    data_file for data_file in available_data_files if data_file.stem in dates_to_visualize
)
used_data_files

[WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2019-02.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2020-02.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2021-02.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2022-02.parquet'),
 WindowsPath('../data/httparchive_metrics/popular_nel_collector_provider_usage/2023-02.parquet')]

### Aggregate result to visualize


In [36]:
from results.result_utils import date_to_text_format, concat_data_from_files

# Columns holding occurrence counters; all three are cast to pandas' nullable
# unsigned 32-bit integer dtype below.
count_columns = ['as_primary', 'as_secondary', 'among_fallback']

# Combine the selected monthly files into a single frame.
result = concat_data_from_files(used_data_files)

# Human-readable label derived from the raw `date` column.
result['date_formatted'] = result['date'].map(date_to_text_format)

result = result.astype({column: 'UInt32' for column in count_columns})

result

Unnamed: 0,date,providers,as_primary,share_as_primary,as_secondary,share_as_secondary,among_fallback,date_formatted
0,2019-02,report-uri.com,11,84.62,0,0.00,0,Feb 2019
1,2019-02,uriports.com,1,7.69,0,0.00,0,Feb 2019
2,2019-02,krvtz.net,1,7.69,0,0.00,0,Feb 2019
0,2020-02,shopifycloud.com,5875,99.51,0,0.00,0,Feb 2020
1,2020-02,report-uri.com,16,0.27,0,0.00,0,Feb 2020
...,...,...,...,...,...,...,...,...
49,2023-02,sakshat.ac.in,0,0.00,0,0.00,0,Feb 2023
50,2023-02,krvtz.net,0,0.00,0,0.00,0,Feb 2023
51,2023-02,dj9s4kmieytgz.cloudfront.net,0,0.00,0,0.00,0,Feb 2023
52,2023-02,nelreports.net,0,0.00,0,0.00,0,Feb 2023


### Top 5 POPULAR Primary Collector Providers per month


In [37]:
# Keep the rows labelled 0..4 of every month.
# NOTE(review): this presumes `concat_data_from_files` preserves each file's own
# 0..n index and that every file is already ordered by `as_primary` descending —
# confirm against results.result_utils and the data export.
# `.copy()` makes the slice an independent frame so the column assignment below
# cannot trigger SettingWithCopyWarning.
top_5_data = result.loc[
    result.index < 5,
    ['date', 'providers', 'as_primary', 'share_as_primary'],
].copy()

# Per month: how many providers have a non-zero `as_primary` count.
count_by_date = result['as_primary'].gt(0).groupby(result['date']).sum()
top_5_data['count'] = top_5_data['date'].map(count_by_date)

top_5_data = top_5_data.reindex(columns=['date', 'count', 'providers', 'as_primary', 'share_as_primary'])

# Sort while `date` still holds the raw key from the data (e.g. "2019-02"):
# sorting the formatted labels would be alphabetical ("Apr 2020" before
# "Feb 2019") and is only correct when every snapshot shares the same month.
top_5_data = top_5_data.sort_values(['date', 'share_as_primary'], ascending=[True, False])
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
top_5_data

Unnamed: 0,date,count,providers,as_primary,share_as_primary
0,Feb 2019,3,report-uri.com,11,84.62
1,Feb 2019,3,uriports.com,1,7.69
2,Feb 2019,3,krvtz.net,1,7.69
0,Feb 2020,10,shopifycloud.com,5875,99.51
1,Feb 2020,10,report-uri.com,16,0.27
2,Feb 2020,10,highwebmedia.com,3,0.05
3,Feb 2020,10,uriports.com,2,0.03
4,Feb 2020,10,europe-west1-bbc-otg-traf-mgr-bq-prod-4591.clo...,2,0.03
0,Feb 2021,22,cloudflare.com,50658,88.8
1,Feb 2021,22,shopifycloud.com,6311,11.06


### Top 5 POPULAR Secondary Collector Providers per month


In [38]:
# Providers that appear as a secondary collector at least once.
secondaries = result.loc[
    result['as_secondary'] > 0,
    ['date', 'providers', 'as_secondary', 'share_as_secondary'],
]

# Rank inside every selected month by `as_secondary` before taking five rows.
# Without the explicit sort, the "top 5" would simply be the first five matching
# rows in whatever order the source files list them.
top_5_data = (
    secondaries[secondaries['date'].isin(dates_to_visualize)]
    .sort_values(['date', 'as_secondary'], ascending=[True, False])
    .groupby('date', sort=False)
    .head(5)
    .copy()
)

# Per month: how many providers have a non-zero `as_secondary` count.
count_by_date = result['as_secondary'].gt(0).groupby(result['date']).sum()
top_5_data['count'] = top_5_data['date'].map(count_by_date)

top_5_data = top_5_data.reindex(columns=['date', 'count', 'providers', 'as_secondary', 'share_as_secondary'])

# Sort on the raw date key first; the formatted labels would sort
# alphabetically by month name rather than chronologically.
top_5_data = (
    top_5_data
    .sort_values(['date', 'share_as_secondary'], ascending=[True, False])
    .reset_index(drop=True)
)
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
top_5_data

Unnamed: 0,date,count,providers,as_secondary,share_as_secondary
0,Feb 2021,2,fastlylabs.com,1,50.0
1,Feb 2021,2,ijs.si,1,50.0
2,Feb 2022,2,ijs.si,1,50.0
3,Feb 2022,2,fastlylabs.com,1,50.0
4,Feb 2023,3,yandex.net,20,90.91
5,Feb 2023,3,fastlylabs.com,1,4.55
6,Feb 2023,3,ijs.si,1,4.55


### Top 5 POPULAR Collector Providers by occurrence

In [39]:
occurrence_data = result[['date', 'providers', 'as_primary', 'as_secondary', 'among_fallback']]

# Total occurrences = primary + secondary + fallback usages of a provider.
# Rows are ranked by that total inside every selected month; previously the
# total was computed but the five rows kept were just the first five per month.
ranked_occurrences = (
    occurrence_data[occurrence_data['date'].isin(dates_to_visualize)]
    .assign(
        total_occurrences=lambda frame: (
            frame['as_primary'] + frame['as_secondary'] + frame['among_fallback']
        )
    )
    .drop(columns=['as_primary', 'as_secondary', 'among_fallback'])
    .sort_values(['date', 'total_occurrences'], ascending=[True, False])
)

top_5_data = ranked_occurrences.groupby('date', sort=False).head(5).copy()

# Label each row with its 0-based rank within its month, matching the
# per-month 0..4 index the table has always been displayed with.
top_5_data.index = top_5_data.groupby('date', sort=False).cumcount().to_numpy()

# Format only after ranking so the sort used the raw date key.
top_5_data['date'] = top_5_data['date'].map(date_to_text_format)
top_5_data

Unnamed: 0,date,providers,total_occurrences
0,Feb 2019,report-uri.com,11
1,Feb 2019,uriports.com,1
2,Feb 2019,krvtz.net,1
0,Feb 2020,shopifycloud.com,5875
1,Feb 2020,report-uri.com,16
2,Feb 2020,highwebmedia.com,3
3,Feb 2020,uriports.com,2
4,Feb 2020,europe-west1-bbc-otg-traf-mgr-bq-prod-4591.clo...,2
0,Feb 2021,cloudflare.com,50658
1,Feb 2021,shopifycloud.com,6311


### POPULAR NEL Collector Providers employed by N domains


In [40]:
from results.result_utils import get_first_or_0

# Row label -> inclusive (lower, upper) bounds on `as_primary`.
# `None` as the upper bound means the bucket is open-ended.
employment_buckets = {
    '1': (1, 1),
    '2': (2, 2),
    '3-10': (3, 10),
    '11-100': (11, 100),
    '101-1K': (101, 1000),
    'More': (1001, None),
}

# One row per bucket; one column per visualized month is added in the loop.
employment_result = pd.DataFrame({}, index=list(employment_buckets))

for date in dates_to_visualize:
    month_collector_data = result[result['date'] == date]
    primary_counts = month_collector_data['as_primary']

    month_data_col = []
    for lower_bound, upper_bound in employment_buckets.values():
        in_bucket = primary_counts >= lower_bound
        if upper_bound is not None:
            in_bucket = in_bucket & (primary_counts <= upper_bound)

        # `.count()` gives the non-null count of every column of the filtered
        # frame; `get_first_or_0` is a project helper that reduces it to one
        # number (presumably the first entry, or 0 when there is none — confirm
        # in results.result_utils).
        next_val = month_collector_data[in_bucket].count()
        month_data_col.append(get_first_or_0(next_val))

    employment_result[date_to_text_format(date)] = month_data_col

employment_result

Unnamed: 0,Feb 2019,Feb 2020,Feb 2021,Feb 2022,Feb 2023
1,2,5,14,19,16
2,0,2,1,3,2
3-10,0,1,2,9,7
11-100,1,1,3,2,4
101-1K,0,0,0,1,1
More,0,1,2,1,1


### Total number of POPULAR collectors found during the analysed time period


In [41]:
# Distinct provider names across all visualized months.
unique_providers = pd.unique(result['providers'])
len(unique_providers)

54