In [1]:
from pathlib import Path

import pandas as pd

In [2]:
# Months (as "YYYY-MM" strings) included in the analysis:
# every month from September 2018 through April 2024, inclusive.
dates_to_visualize = [
    f"{year}-{month:02d}"
    for year in range(2018, 2025)
    for month in range(1, 13)
    if (2018, 9) <= (year, month) <= (2024, 4)
]

In [3]:
# All parquet files found in the metrics directory.
# sorted(): directory listings come back in filesystem-dependent order, so sort
# the paths to make the order the files are loaded in identical on every run/machine.
available_data_files = sorted(Path("../data/httparchive_metrics/nel_resource_config_variability").glob("*.parquet"))

# Keep only the files whose name (without extension) is one of the analysed months.
used_data_files = [file for file in available_data_files if file.stem in dates_to_visualize]
used_data_files

### Aggregate result to visualize

In [4]:
from results.result_utils import concat_data_from_files, date_to_text_format

# Build the working frame from the selected files
# (presumably one frame with the rows of all files -- see concat_data_from_files).
result = concat_data_from_files(used_data_files)

# Text label for each row's date, stored alongside the raw 'date' value.
date_labels = result['date'].map(date_to_text_format)
result['date_formatted'] = date_labels

result

### TOP 3 most popular config variations per month

In [5]:
# Columns that together identify one NEL configuration variation.
config_columns = ['nel_include_subdomains', 'nel_failure_fraction', 'nel_success_fraction', 'nel_max_age']

# For every analysed month, rank the configuration variations by the number of
# rows (url_domain entries) using them and keep the three most frequent ones.
# Frames are collected in a list and concatenated once, instead of growing a
# DataFrame with pd.concat on every iteration.
monthly_top_variations = []
for month in dates_to_visualize:
    current_month_data = (
        result[result['date'] == month]
        .groupby(config_columns, observed=True, as_index=False)
        .agg(domains=('url_domain', 'count'))
        .sort_values(by='domains', ascending=False)
        # After this reset the index is the rank within the month (0 = most popular);
        # later cells rely on that rank being kept as the index.
        .reset_index(drop=True)
        .head(3)
        .assign(date=date_to_text_format(month))
    )
    monthly_top_variations.append(current_month_data[['date', *config_columns, 'domains']])

# The per-month rank index (0, 1, 2) is intentionally preserved by the concat.
monthly_variations = pd.concat(monthly_top_variations)
monthly_variations

#### TABLE - Annually most popular NEL config variation

In [47]:
# Keep only the most popular configuration of each month (index 0 is the top rank
# within a month in monthly_variations).
data = monthly_variations[monthly_variations.index == 0].reset_index(drop=True)

# Rows to report: every December plus the last analysed month. The labels are
# derived from dates_to_visualize with the same formatter that produced the
# 'date' column, so they stay in sync when the analysed range is extended.
target_dates = [date_to_text_format(month) for month in dates_to_visualize if month.endswith('-12')]
target_dates.append(date_to_text_format(dates_to_visualize[-1]))

popular_config_table = data[data['date'].isin(target_dates)].copy()
# Cast to float so that to_latex's float_format (thousands separator) is applied
# to the counts. float64 rather than float32: float32 cannot represent every
# integer above 2**24 exactly, which would silently alter large counts.
popular_config_table['domains'] = popular_config_table['domains'].astype('float64')

popular_config_table = popular_config_table.rename(columns={
    'date': 'Dátum',
    'nel_include_subdomains': 'IS',
    'nel_failure_fraction': 'FF',
    'nel_success_fraction': 'SF',
    'nel_max_age': 'MA',
    'domains': 'Počet domén',
})

popular_config_table = popular_config_table[['Dátum', 'IS', 'FF', 'SF', 'MA', 'Počet domén']]
popular_config_table.to_latex("tables/httparchive_nel_domain_config_popular.tex", float_format=lambda x: '{:,.0f}'.format(x), index=False)
popular_config_table.reset_index(drop=True)

### Number of NEL Config variations found on the analyzed domains 

In [6]:
# For every analysed month, count how many domains have 1, 2, 3, ... config rows.
#
# The aggregation is done in two steps on purpose:
#   1. one row per domain  -> its number of config rows (variation_count)
#   2. one row per variation_count -> number of domains with that count
# Counting on the original row-level frame instead would count a domain with
# k variations k times.
monthly_variation_counts = []
for month in dates_to_visualize:
    current_month_data = result.loc[result['date'] == month, ['url_domain', 'nel_include_subdomains']]

    # Step 1: non-null 'nel_include_subdomains' entries per domain.
    # observed=True keeps unused categories out of the result should
    # 'url_domain' be a categorical column.
    config_variations_by_url_domain = (
        current_month_data
        .groupby('url_domain', observed=True)['nel_include_subdomains']
        .count()
    )
    variations_per_domain = config_variations_by_url_domain.rename('variation_count').reset_index()

    # Step 2: each domain appears exactly once here, so 'count' is a domain count.
    current_month_data = (
        variations_per_domain
        .groupby('variation_count', as_index=False)
        .agg(domains=('url_domain', 'count'))
    )
    current_month_data.insert(0, 'date', date_to_text_format(month))

    monthly_variation_counts.append(current_month_data[['date', 'variation_count', 'domains']])

# Single concat instead of growing the frame inside the loop.
monthly_variation_count_data = pd.concat(monthly_variation_counts)
monthly_variation_count_data

#### TABLE - Variations pivot table

In [44]:
# Global pandas display option: show floats as integers with thousands separators.
# NOTE: this changes how floats are rendered in every cell executed afterwards.
pd.options.display.float_format = '{:,.0f}'.format

data = monthly_variation_count_data.copy()
# Convert the "Mon YYYY" text labels back to "YYYY-MM" strings, which are used
# as (chronologically sortable) column headers of the pivot table.
data['Dátum'] = pd.to_datetime(data['date'], format="%b %Y").dt.strftime('%Y-%m')

data = data.rename(columns={'variation_count': 'Počet variácií'})

data['domains'] = data['domains'].astype('int64')

# Rows: number of variations; columns: month; cells: number of domains.
# Combinations that never occur become NaN in the pivot and are shown as 0.
variations_table = data.pivot(index='Počet variácií', columns='Dátum', values='domains')
variations_table = variations_table.fillna(0)

# Last analysed month, taken from dates_to_visualize (already in "YYYY-MM" form)
# so it does not have to be edited by hand when the range is extended.
last_month_col_name = dates_to_visualize[-1]

# Report every December plus the last analysed month; the guard avoids a
# duplicated column when the last month is itself a December.
target_col_names = [col_name for col_name in variations_table.columns if col_name.endswith('-12')]
if last_month_col_name not in target_col_names:
    target_col_names.append(last_month_col_name)

variations_table = variations_table[target_col_names]


variations_table.to_latex("tables/httparchive_nel_domain_config_variations.tex", float_format=lambda x: '{:,.0f}'.format(x))
variations_table