In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import os

In [2]:
results_df_dict = {}

for root, dirs, files in os.walk("results/"):
    for file in sorted(files):
        file_path = f"{root}{file}"
        record_name = file.split('.')[2]
        tmp_df = pd.read_parquet(file_path)
        results_df_dict[record_name] = tmp_df

In [3]:
keys = list(results_df_dict.keys())
keys

['2018_01_01_desktop',
 '2019_02_01_desktop',
 '2020_01_01_desktop',
 '2021_01_01_desktop',
 '2022_01_01_desktop',
 '2023_01_01_desktop']

In [4]:
# nel_include_subdomains default false
# nel_success_fraction default 0.0
# nel_failure_fraction default 1.0

from operator import is_

for key in keys[1:]:
    tmp_df = results_df_dict[key]

    tmp_df['nel_include_subdomains'] = tmp_df['nel_include_subdomains'].astype('bool')
    tmp_df['nel_success_fraction'] = tmp_df['nel_success_fraction'].astype('float')
    tmp_df['nel_failure_fraction'] = tmp_df['nel_failure_fraction'].astype('float')

    tmp_df['nel_include_subdomains'] = np.where(np.vectorize(is_)(tmp_df['nel_include_subdomains'], None), False, tmp_df['nel_include_subdomains'])
    tmp_df['nel_success_fraction'] = np.where(np.vectorize(is_)(tmp_df['nel_success_fraction'], None), 0.0, tmp_df['nel_success_fraction'])
    tmp_df['nel_failure_fraction'] = np.where(np.vectorize(is_)(tmp_df['nel_failure_fraction'], None), 1.0, tmp_df['nel_failure_fraction'])

In [5]:
def get_virtue(results_df_dict, what, col_name):
    virtue_df = pd.DataFrame()

    for key in keys[1:]:
        tmp_df = results_df_dict[key][[what, 'requestid']].groupby(by=[what]).count().sort_values(by='requestid', ascending=False).reset_index()
        year = key.split('_')[0]
        tmp_df.columns = [f'{year}_{col_name}', f'{year}_cnt']
        tmp_df[f'{year}_prcnt'] = 100/len(results_df_dict[key])*tmp_df[f'{year}_cnt']
        virtue_df = pd.merge(virtue_df, tmp_df, how='outer', left_index=True, right_index=True)

    return virtue_df

In [6]:
success_df = get_virtue(results_df_dict, 'nel_success_fraction', 'success')
success_df

Unnamed: 0,2019_success,2019_cnt,2019_prcnt,2020_success,2020_cnt,2020_prcnt,2021_success,2021_cnt,2021_prcnt,2022_success,2022_cnt,2022_prcnt,2023_success,2023_cnt,2023_prcnt
0,0.0,23.0,6.478873,0.0001,79448.0,98.989521,0.0001,184574.0,23.455541,0.0,642181,91.058634,0.0,1181443,67.129808
1,0.01,2.0,0.56338,0.00066,31.0,0.038625,0.0,728.0,0.092514,0.01,50457,7.154596,0.01,539896,30.67699
2,0.25,2.0,0.56338,0.0,27.0,0.033641,0.001,176.0,0.022366,0.001,198,0.028076,0.001,9031,0.513143
3,0.0001,1.0,0.28169,0.001,25.0,0.031149,0.00066,39.0,0.004956,0.005,70,0.009926,0.005,573,0.032558
4,,,,0.05,5.0,0.00623,0.01,19.0,0.002415,0.00066,48,0.006806,0.2,510,0.028978
5,,,,0.25,2.0,0.002492,1.0,4.0,0.000508,0.0001,24,0.003403,0.00066,18,0.001023
6,,,,0.01,1.0,0.001246,0.05,3.0,0.000381,0.05,12,0.001702,1.0,17,0.000966
7,,,,0.1,1.0,0.001246,0.1,2.0,0.000254,0.2,9,0.001276,0.05,16,0.000909
8,,,,1.0,1.0,0.001246,,,,0.1,2,0.000284,0.0001,11,0.000625
9,,,,,,,,,,0.5,2,0.000284,0.1,10,0.000568


In [7]:
failure_df = get_virtue(results_df_dict, 'nel_failure_fraction', 'failure')
failure_df

Unnamed: 0,2019_failure,2019_cnt,2019_prcnt,2020_failure,2020_cnt,2020_prcnt,2021_failure,2021_cnt,2021_prcnt,2022_failure,2022_cnt,2022_prcnt,2023_failure,2023_cnt,2023_prcnt
0,1e-05,223.0,62.816901,0.01,79464.0,99.009457,0.05,634.0,0.080568,0.1,10415.0,1.476804,0.1,15254,0.866735
1,1.0,33.0,9.295775,1e-05,268.0,0.333919,1e-05,325.0,0.041301,0.05,695.0,0.098548,1.0,9741,0.553485
2,0.1,3.0,0.84507,1.0,118.0,0.147024,1.0,265.0,0.033676,1.0,414.0,0.058704,0.001,3319,0.188586
3,0.5,2.0,0.56338,0.1,14.0,0.017444,0.1,117.0,0.014868,1e-05,393.0,0.055726,0.05,880,0.050002
4,0.001,1.0,0.28169,0.5,4.0,0.004984,0.01,100.0,0.012708,0.01,76.0,0.010776,1e-05,584,0.033183
5,0.01,1.0,0.28169,0.0001,2.0,0.002492,0.25,5.0,0.000635,0.001,39.0,0.00553,0.01,119,0.006762
6,0.8,1.0,0.28169,0.001,2.0,0.002492,0.0001,4.0,0.000508,0.15,13.0,0.001843,0.2,79,0.004489
7,,,,0.2,2.0,0.002492,0.2,4.0,0.000508,0.5,7.0,0.000993,0.5,63,0.00358
8,,,,0.0,1.0,0.001246,0.5,3.0,0.000381,0.25,4.0,0.000567,1e-06,10,0.000568
9,,,,0.05,1.0,0.001246,0.001,1.0,0.000127,0.9,4.0,0.000567,0.25,7,0.000398


In [9]:
subdomains_df = get_virtue(results_df_dict, 'nel_include_subdomains', 'subdomains')
subdomains_df

Unnamed: 0,2019_subdomains,2019_cnt,2019_prcnt,2020_subdomains,2020_cnt,2020_prcnt,2021_subdomains,2021_cnt,2021_prcnt,2022_subdomains,2022_cnt,2022_prcnt,2023_subdomains,2023_cnt,2023_prcnt
0,True,348,98.028169,False,79502,99.056804,False,785793,99.858052,False,703790,99.794538,False,1746510,99.237019
1,False,7,1.971831,True,757,0.943196,True,1117,0.141948,True,1449,0.205462,True,13428,0.762981


In [10]:
results_df_dict[keys[5]].columns

Index(['requestid', 'type', 'ext', 'firstReq', 'status', 'url',
       'unique_domain_count_before_filtration',
       'unique_domain_firstreq_count_before_filtration', 'contains_nel',
       'nel_max_age', 'nel_failure_fraction', 'nel_success_fraction',
       'nel_include_subdomains', 'nel_report_to_group',
       'nel_count_before_filtration', 'rt_group', 'rt_endpoints', 'rt_url',
       'rt_url_sld'],
      dtype='object')

In [12]:
tmp_df = results_df_dict[keys[5]][['nel_failure_fraction', 'rt_url_sld', 'requestid']].groupby(by=['rt_url_sld', 'nel_failure_fraction']).count().sort_values(by='requestid', ascending=False).reset_index()


In [13]:
tmp_df

Unnamed: 0,rt_url_sld,nel_failure_fraction,requestid
0,cafe24.com,0.1000,14985
1,office.net,1.0000,8127
2,cloudfront.net,0.0010,3318
3,nelreports.net,1.0000,1200
4,wikimedia.org,0.0500,759
...,...,...,...
74,movizor-info.ru,1.0000,1
75,report-uri.com,0.0001,1
76,planer.io,1.0000,1
77,raiffeisen.ch,0.2500,1
