In [1]:
from pathlib import Path

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import matplotlib.dates as mdates
import matplotlib.ticker as ticker
import seaborn as sns

In [2]:
# Render floats in DataFrame output with thousands separators and two decimals.
pd.set_option('display.float_format', '{:,.2f}'.format)

In [3]:
# Months to visualize: every month from September 2018 through April 2024,
# written as "YYYY-MM" because the entries are compared against data-file stems.
dates_to_visualize = pd.period_range("2018-09", "2024-04", freq="M").strftime("%Y-%m").tolist()

In [4]:
# Collect the monthly parquet files and keep only those whose stem ("YYYY-MM")
# is one of the months listed in `dates_to_visualize`.
available_data_files = list(Path("../data/httparchive_metrics/nel_deployment").glob("*.parquet"))

# Path.glob makes no ordering guarantee, so sort by stem: zero-padded "YYYY-MM"
# stems sort chronologically, which keeps the load order reproducible across machines.
selected_months = set(dates_to_visualize)
used_data_files = sorted(
    (file for file in available_data_files if file.stem in selected_months),
    key=lambda file: file.stem,
)
used_data_files

### Aggregate result to visualize

In [5]:
from results.result_utils import date_to_text_format, concat_data_from_files

# Counter columns that all receive the same normalisation below.
COUNT_COLUMNS = [
    'total_crawled_domains',
    'total_crawled_domains_with_nel',
    'total_crawled_domains_with_correct_nel',
    'total_crawled_resources',
    'total_crawled_resources_with_nel',
    'total_crawled_resources_with_correct_nel',
]

result = concat_data_from_files(used_data_files)

# Parse each counter as a number (unparseable values become missing), floor it,
# and store it as nullable UInt32 so missing values survive as <NA>.
for column in COUNT_COLUMNS:
    result[column] = np.floor(pd.to_numeric(result[column], errors='coerce')).astype('UInt32')

result['date'] = result['date'].map(date_to_text_format)

# Plain Series division keeps <NA> rows as <NA>; forcing the nullable columns
# through np.uint32(...) would fail as soon as one value is missing.
result['nel_percentage'] = (
    result['total_crawled_domains_with_correct_nel'] / result['total_crawled_domains'] * 100
)

result

### DOMAINS queried within each month and those responding with valid NEL headers

In [6]:
# Per-month domain totals next to the percentage of domains with correct NEL.
domain_summary_columns = [
    'date',
    'total_crawled_domains',
    'total_crawled_domains_with_correct_nel',
    'nel_percentage',
]
result[domain_summary_columns]

#### Deployment state graph

In [7]:
# Line chart: domains with a correctly deployed NEL header, one point per crawled month.
to_plot = result.copy()
# Parse the whole column at once with the explicit "%b %Y" format (the format
# this notebook already uses for the same column in the ratio chart) instead of
# guessing the format value by value.
to_plot['date_fmt'] = pd.to_datetime(to_plot['date'], format="%b %Y")

fig, ax = plt.subplots(1, 1, figsize=(10, 5))

sns.lineplot(data=to_plot, x='date_fmt', y='total_crawled_domains_with_correct_nel', color='green', ax=ax)

# Keep minor (month) ticks even where they coincide with a major (year) tick.
ax.xaxis.remove_overlapping_locs = False

# Major ticks: one per year; the leading newlines push the year below the month labels.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("\n\n%Y"))

# Minor ticks: one per month, labelled with the abbreviated month name, drawn vertically.
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))
ax.tick_params(axis='x', which="minor", labelrotation=90, labelsize=9)

ax.set(xlabel='Dátum', ylabel='Domény so správne nasadeným NEL')
ax.grid(True, which='minor', linestyle='--')

# Y axis: a tick every 150k domains, engineering notation, labels on the right-hand side.
# EngFormatter replaces the default formatter, so no scientific/offset tweaks are needed.
ax.yaxis.set_major_locator(ticker.MultipleLocator(150_000))
ax.yaxis.set_major_formatter(ticker.EngFormatter())
ax.tick_params(labelleft=False, labelright=True)

# Mark every monthly data point on the line.
ax.lines[0].set(marker='.', markersize=10, markeredgecolor='gray', markeredgewidth=1.5)

fig.tight_layout()
# Save through the figure object so the output never depends on pyplot's "current figure".
fig.savefig('graphs/httparchive_nel_deployment.pdf')


#### Deployment ratio values graph

In [8]:
# One horizontal bar chart per year: total crawled domains (gray) overlaid with
# the domains that have a correct NEL deployment (green), one bar pair per month.

# Every month of 2018-2024 gets a row; months without data are zero-filled so
# each yearly facet shows all twelve months.
all_months = pd.date_range('2018-01-01', '2024-12-01', freq='MS')

to_plot = result.copy()
to_plot.index = pd.to_datetime(to_plot['date'], format="%b %Y")
to_plot = (
    to_plot
    .reindex(all_months, fill_value=0)
    .drop(columns=['date'])
    .rename_axis('index')  # fix the name of the month column produced by reset_index()
    .reset_index()
)
to_plot['Rok'] = to_plot['index'].dt.year

g = sns.FacetGrid(to_plot, col='Rok', col_wrap=3, margin_titles=True, sharex=False, sharey=False, height=3.5, aspect=1.1)

# Layer 1: all crawled domains.
g.map_dataframe(sns.barplot, x="total_crawled_domains", y="index", color="lightgray")
for ax in g.axes_dict.values():
    total_bars = ax.containers[-1]
    ax.bar_label(total_bars, fmt='{:,.0f}', label_type='edge')
    # Leave 50 % headroom to the right of the longest bar for its label.
    # Bars start at zero, so the x axis does too.
    longest_bar = max(total_bars.datavalues)
    if longest_bar > 0:
        ax.set(xlim=(0, longest_bar * 1.5))

# Layer 2: domains with a correct NEL deployment, drawn over the totals.
g.map_dataframe(sns.barplot, x="total_crawled_domains_with_correct_nel", y="index", color="green")
for ax in g.axes_dict.values():
    ax.bar_label(ax.containers[-1], fmt='{:,.0f}', label_type='edge')

for ax in g.axes_dict.values():
    # The bar labels carry the values, so x tick marks are hidden entirely.
    ax.xaxis.set_major_locator(ticker.NullLocator())

    # The y tick texts are date strings. Keep only their "YYYY-MM" part, run it
    # through date_to_text_format and use the first word of the result as the label
    # (presumably the month name - confirm against date_to_text_format).
    tick_positions = ax.get_yticks()
    tick_labels = [
        date_to_text_format("-".join(label.get_text().split('-')[:2])).split(' ')[0]
        for label in ax.get_yticklabels()
    ]
    # Pin the tick positions before replacing the labels so the two stay in sync.
    ax.set_yticks(tick_positions)
    ax.set_yticklabels(tick_labels)

    ax.set(xlabel='NEL Domény / Domény celkom', ylabel='Mesiac')

# Lay out and save the FacetGrid's own figure, not a figure left over from an earlier cell.
g.tight_layout()
g.figure.savefig('graphs/httparchive_nel_deployment_ratio_values.pdf')


### RESOURCES queried within each month and those with valid NEL headers

In [9]:
# Percentage of crawled resources that carried a correct NEL header, per month.
resource_summary_columns = [
    'date',
    'total_crawled_resources',
    'total_crawled_resources_with_correct_nel',
    'nel_resource_percentage',
]
result['nel_resource_percentage'] = (
    result['total_crawled_resources_with_correct_nel']
    .div(result['total_crawled_resources'])
    .mul(100)
)
result[resource_summary_columns]

#### GRAPH - Resources with NEL Over time

In [13]:
# Line chart: resources served with a correct NEL header, one point per crawled month.
to_plot = result.copy()
# Parse the whole column at once with the explicit "%b %Y" format (the format
# this notebook already uses for the same column in the ratio chart) instead of
# guessing the format value by value.
to_plot['date_fmt'] = pd.to_datetime(to_plot['date'], format="%b %Y")

fig, ax = plt.subplots(1, 1, figsize=(10, 5))

sns.lineplot(data=to_plot, x='date_fmt', y='total_crawled_resources_with_correct_nel', color='darkgreen', ax=ax)

# Keep minor (month) ticks even where they coincide with a major (year) tick.
ax.xaxis.remove_overlapping_locs = False

# Major ticks: one per year; the leading newlines push the year below the month labels.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("\n\n%Y"))

# Minor ticks: one per month, labelled with the abbreviated month name, drawn vertically.
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))
ax.tick_params(axis='x', which="minor", labelrotation=90, labelsize=9)

ax.set(xlabel='Dátum', ylabel='Zdroje správne monitorované pomocou NEL')
ax.grid(True, which='minor', linestyle='--')

# Y axis: engineering notation, labels on the right-hand side.
# EngFormatter replaces the default formatter, so no scientific/offset tweaks are needed.
ax.yaxis.set_major_formatter(ticker.EngFormatter())
ax.tick_params(labelleft=False, labelright=True)

# Mark every monthly data point on the line.
ax.lines[0].set(marker='.', markersize=10, markeredgecolor='gray', markeredgewidth=1.5)

fig.tight_layout()
# Save through the figure object so the output never depends on pyplot's "current figure".
fig.savefig('graphs/httparchive_nel_deployment_resources.pdf')

### Incorrect NEL deployment

In [11]:
# Incorrect deployments = everything that sent a NEL header minus the correct ones,
# computed separately for domains and for resources.
for scope in ('domains', 'resources'):
    with_nel = result[f'total_crawled_{scope}_with_nel']
    with_correct_nel = result[f'total_crawled_{scope}_with_correct_nel']
    result[f'{scope}_with_incorrect_nel'] = with_nel.sub(with_correct_nel)

# TODO ratio to the correct NEL

result[['date', 'domains_with_incorrect_nel', 'resources_with_incorrect_nel']]

#### GRAPH - Incorrect NEL deployment, Domains

In [12]:
# Line chart: domains whose NEL header is present but incorrect, one point per crawled month.
to_plot = result.copy()
# Parse the whole column at once with the explicit "%b %Y" format (the format
# this notebook already uses for the same column in the ratio chart) instead of
# guessing the format value by value.
to_plot['date_fmt'] = pd.to_datetime(to_plot['date'], format="%b %Y")

fig, ax = plt.subplots(1, 1, figsize=(10, 5))
# The default y formatter stays in place for this chart, so switch off
# scientific notation and offsets to show the counts as plain numbers.
ax.yaxis.get_major_formatter().set_scientific(False)
ax.yaxis.get_major_formatter().set_useOffset(False)

sns.lineplot(data=to_plot, x='date_fmt', y='domains_with_incorrect_nel', color='red', ax=ax)

# Keep minor (month) ticks even where they coincide with a major (year) tick.
ax.xaxis.remove_overlapping_locs = False

# Major ticks: one per year; the leading newlines push the year below the month labels.
ax.xaxis.set_major_locator(mdates.YearLocator())
ax.xaxis.set_major_formatter(mdates.DateFormatter("\n\n%Y"))

# Minor ticks: one per month, labelled with the abbreviated month name, drawn vertically.
ax.xaxis.set_minor_locator(mdates.MonthLocator(interval=1))
ax.xaxis.set_minor_formatter(mdates.DateFormatter('%b'))
ax.tick_params(axis='x', which="minor", labelrotation=90, labelsize=9)

ax.set(xlabel='Dátum', ylabel='Domény s nesprávne nasadeným NEL')
ax.grid(True, which='minor', linestyle='--')

# Y axis: a tick every 200 domains, labels on the right-hand side.
ax.yaxis.set_major_locator(ticker.MultipleLocator(200))
ax.tick_params(labelleft=False, labelright=True)

# Mark every monthly data point on the line.
ax.lines[0].set(marker='.', markersize=10, markeredgecolor='gray', markeredgewidth=1.5)

fig.tight_layout()
# Save through the figure object so the output never depends on pyplot's "current figure".
fig.savefig('graphs/httparchive_nel_deployment_incorrect.pdf')