In [None]:
import pandas as pd 
import wandb

import plotly.graph_objects as go
from plotly.subplots import make_subplots

import numpy as np



In [None]:
# Log in with your W&B API key before running this cell.
# Downloading all runs takes about 10 minutes.

api = wandb.Api()

# The project is addressed as <entity>/<project-name>.
runs = api.runs("kriza-upjs/loss_security")

summary_list, config_list, name_list, create_time_list = [], [], [], []
for run in runs:
    # .summary holds the output keys/values for metrics like accuracy;
    # ._json_dict is used to omit large files.
    summary_list.append(run.summary._json_dict)

    # .config holds the hyperparameters; special keys starting with "_" are dropped.
    public_config = {
        key: value
        for key, value in run.config.items()
        if not key.startswith('_')
    }
    config_list.append(public_config)

    # .name is the human-readable run name, .created_at the creation timestamp.
    name_list.append(run.name)
    create_time_list.append(run.created_at)

# One row per run; summary and config stay as nested dicts in their cells.
runs_df = pd.DataFrame(
    dict(
        summary=summary_list,
        config=config_list,
        name=name_list,
        create_time=create_time_list,
    )
)

# The result can be saved to conserve time on later sessions:
# runs_df.to_pickle("runs_df.pkl.xz", compression="xz")

In [None]:
# # load if saved
# runs_df = pd.read_pickle("runs_df.pkl.xz", compression="xz")

In [None]:
# Display the runs table (columns: summary, config, name, create_time).
runs_df

In [None]:
# Put the runs in chronological order (oldest first) by creation time.
runs_df = runs_df.sort_values('create_time')
# Optional: uncomment to drop the two most recent runs.
# runs_df = runs_df.iloc[:-2]

In [None]:
# Flatten the nested "config"/"summary" dicts of every run into plain columns.
#
# Exactly one value per column is appended for each run, so the lists always
# line up with the rows of runs_df:
#   * runs without a "loss" entry in their config are labelled "NAIVE";
#   * runs without an "MAE" entry in their summary get the placeholder -1 for
#     MAE and MASE (downstream cells must treat -1 as "no result");
#   * best_epoch is 0 for NAIVE runs and for runs without metrics.
maes = []
mases = []
losses_old = []
best_epochs = []
for _, run in runs_df.iterrows():
    config = run["config"]
    summary = run["summary"]
    has_loss = "loss" in config
    has_metrics = "MAE" in summary

    losses_old.append(config["loss"] if has_loss else "NAIVE")

    if has_metrics:
        maes.append(summary["MAE"])
        # .get keeps a run that logged MAE but not MASE from aborting the whole cell.
        mases.append(summary.get("MASE", -1))
    else:
        maes.append(-1)
        mases.append(-1)

    if has_loss and has_metrics:
        best_epochs.append(summary.get("best_epoch", 0))
    else:
        best_epochs.append(0)

runs_df["MAE"] = maes
runs_df["MASE"] = mases
runs_df["loss"] = losses_old
runs_df["best_epoch"] = best_epochs
# From here on losses_old is the set of distinct loss labels, not the per-run list.
losses_old = set(losses_old)

In [None]:
# Display the runs table, now including the MAE, MASE, loss and best_epoch columns.
runs_df

In [None]:
# Keep only the rows whose loss label is not "NAIVE".
runs_df = runs_df[runs_df["loss"] != "NAIVE"]


In [None]:
# Display the runs table after the "NAIVE" rows were removed.
runs_df

In [None]:
# Loss identifiers as they appear in the run configs.
# The order here is the order in which the boxes are drawn in the plots.
list_of_losses = [
    "huber", "kernelMSE", "logCosh", "mae", "mape",
    "mase", "mbe", "mse", "msle", "nrmse",
    "poisson", "quantile25", "quantile75", "rrmse", "rse",
    "rae", "rmse", "rmsse", "rmsle", "angle_loss",
]

In [None]:
# Maps each loss identifier to the display name used on the plot axis.
dict_of_losses_to_paper_names = dict(
    huber="Huber Loss",
    kernelMSE="KernelMSE Loss",
    logCosh="Log-Cosh Loss",
    mae="MAE",
    mape="MAPE",
    mase="MASE",
    mbe="MBE",
    mse="MSE",
    msle="MSLE",
    nrmse="NRMSE",
    poisson="Poisson Loss",
    quantile25="Quantile Loss 0.25",
    quantile75="Quantile Loss 0.75",
    rrmse="RRMSE",
    rse="RSE",
    rae="RAE",
    rmse="RMSE",
    rmsle="RMSLE",
    rmsse="RMSSE",
    angle_loss="Angle Loss",
)

In [None]:
# For every known loss, collect its runs ordered by creation time (oldest first).
dict_of_losses = {
    loss: runs_df.loc[runs_df['loss'] == loss].sort_values(by='create_time', ascending=True)
    for loss in list_of_losses
}

In [None]:
# Show which loss names have an entry in dict_of_losses.
dict_of_losses.keys()

In [None]:
# Split each loss's chronologically ordered runs at position 108:
# the first 108 rows go to the "manual" dict, everything after to the "optuna" dict.
# NOTE(review): presumably the first 108 runs per loss were configured manually
# and the later ones come from Optuna — confirm the 108 boundary.
dict_of_losses_manual = {
    loss: dict_of_losses[loss].iloc[:108] for loss in list_of_losses
}
dict_of_losses_optuna = {
    loss: dict_of_losses[loss].iloc[108:] for loss in list_of_losses
}

In [None]:
# Inspect the first-108-rows subset of the runs trained with the 'mae' loss.
dict_of_losses_manual["mae"]

In [None]:
# Best (lowest) MAE and MASE per loss, rounded to two decimals.
#
# Runs that finished without metrics carry the placeholder -1 in the MAE/MASE
# columns. Both metrics are non-negative, so negative entries are excluded;
# otherwise the placeholder would be reported as the "best" value.
# A loss with no valid run yields NaN.
best_results_mae = {}
best_results_mase = {}
for loss_name, df in dict_of_losses.items():
    valid_mae = df.loc[df['MAE'] >= 0, 'MAE']
    valid_mase = df.loc[df['MASE'] >= 0, 'MASE']
    best_results_mae[loss_name] = round(valid_mae.min(), 2)
    best_results_mase[loss_name] = round(valid_mase.min(), 2)
print(best_results_mae)
print(best_results_mase)

In [None]:
def draw_box_plots(losses_structure, less_than=None, max_graph_y = None, line=None, height=800, min_graph_y=850):
    """Draw one MAE box per loss function on a single shared axis.

    Boxes are drawn in the order of the module-level ``list_of_losses`` and
    labelled through ``dict_of_losses_to_paper_names``; losses missing from
    ``losses_structure`` are skipped.

    Parameters
    ----------
    losses_structure : mapping of loss name -> DataFrame
        Each DataFrame must provide an ``'MAE'`` column.
    less_than : number, optional
        When given, MAE values greater than this are replaced by NaN before
        plotting, so they do not take part in the box.
    max_graph_y : number, optional
        When given, the y-axis range is fixed to ``[min_graph_y, max_graph_y]``.
        This only limits the visible range; it does not filter the data.
    line : number, optional
        When given, a dashed red horizontal line is drawn at this y value
        across the full plot width.
    height : int
        Figure height in pixels (the width is fixed at 1000).
    min_graph_y : number
        Lower end of the y-axis range; only used together with ``max_graph_y``.

    Returns
    -------
    plotly.graph_objects.Figure
        The assembled figure; the caller decides whether to show or save it.
    """
    # A single 1x1 subplot grid: all boxes share one set of axes.
    fig = make_subplots(
        rows=1,
        cols=1,
        shared_xaxes=True
    )

    for loss_name in list_of_losses:
        if loss_name not in losses_structure:
            continue
        mae_values = losses_structure[loss_name]['MAE']

        if less_than is not None:
            # Keep values <= less_than, turn everything else into NaN.
            mae_values = mae_values.where(mae_values <= less_than)

        fig.add_trace(
            go.Box(
                y=mae_values,
                name=dict_of_losses_to_paper_names[loss_name],  # x-axis label of the box
                boxmean=True,  # also mark the mean inside the box
                marker=dict(color='black', opacity=0.6),
                boxpoints="all",  # draw every individual run next to the box
            ),
            row=1,
            col=1
        )

    # Explicit None checks so that a value of 0 is still honoured.
    if line is not None:
        fig.add_shape(
            type="line",
            x0=0, x1=1,  # 0..1 in "paper" coordinates spans the full width
            y0=line, y1=line,
            xref="paper", yref="y",
            line=dict(color="red", width=2, dash="dash"),
        )

    fig.update_layout(
        xaxis_title={
            'text': 'Loss function',
            'font': {'size': 18}
        },
        yaxis_title={
            'text': 'mae metric',
            'font': {'size': 18}
        },
        font=dict(
            family="Computer Modern",
            size=20  # general font size, e.g. for tick labels
        ),
        width=1000,
        height=height,
        showlegend=False,  # box names are already on the x-axis
        margin=dict(l=5, r=5, t=5, b=10),
        xaxis=dict(
            tickangle=90  # rotate the box names vertically
        )
    )

    if max_graph_y is not None:
        fig.update_layout(
            yaxis=dict(range=[min_graph_y, max_graph_y])
        )

    return fig

In [None]:
# # box_plot = draw_box_plots(dict_of_losses, 1_000_000)
# box_plot = draw_box_plots({'mbe': dict_of_losses['mbe']})
# box_plot.write_image("box_plot.pdf", format="pdf")
# box_plot

In [None]:
# Box plots from the paper: all losses, with the y-axis range capped at 10000.
# No MAE values are filtered here (less_than is not passed); max_graph_y only
# restricts the visible axis range.
box_plot = draw_box_plots(dict_of_losses, max_graph_y=10000, height=500)
# Save the figure as a PDF file in the current working directory.
box_plot.write_image("box_plot_all_less_10k.pdf", format="pdf")
box_plot

In [None]:
# box_plot = draw_box_plots(dict_of_losses, max_graph_y=1600)
# box_plot

In [None]:
# Drop these losses from dict_of_losses so that the cells below leave them out.
# The sizes before and after are printed as a sanity check.
print(len(dict_of_losses))
for excluded_loss in ("mbe", "rrmse", "poisson", "msle", "mape", "rmsle"):
    # pop with a default: re-running this cell must not raise KeyError
    # once the entries have already been removed.
    dict_of_losses.pop(excluded_loss, None)
print(len(dict_of_losses))

In [None]:
# Box plots from the paper for the losses still present in dict_of_losses:
# MAE values above 1600 are replaced by NaN (less_than) and the y-axis range
# is capped at 1600 (max_graph_y).
box_plot = draw_box_plots(dict_of_losses, less_than=1600, max_graph_y=1600)
# Uncomment to save the figure as a PDF file.
# box_plot.write_image("box_plot_1600_removed.pdf", format="pdf")
box_plot

In [None]:
# box_plot = draw_box_plots(dict_of_losses_manual)
# # box_plot.write_image("box_plot.pdf", format="pdf")
# box_plot

In [None]:
# box_plot = draw_box_plots(dict_of_losses_optuna)
# # box_plot.write_image("box_plot.pdf", format="pdf")
# box_plot

In [None]:
# For every remaining loss, the MAE values greater than 1600, sorted ascending.
bigger_than = {
    loss_name: np.sort(df['MAE'].values[df['MAE'].values > 1600])
    for loss_name, df in dict_of_losses.items()
}
bigger_than
    