# Overview Plot Performance Estimates

ODD21: the paper plot that shows the different performance estimates for each algorithm.

## Imports

In [1]:
import pandas as pd
import numpy as np
from pathlib import Path

from tqdm import tqdm
import altair as alt

alt.data_transformers.disable_max_rows()
# alt.renderers.enable("png")
from collections import defaultdict

In [2]:
%load_ext autoreload
%autoreload 2

## Config

In [3]:
VERSION = 1

# Algorithms included in the overview plot.
algorithms = ["CBLOF", "HBOS", "IForest", "KNN", "LOF", "OCSVM"]

# Grid version per algorithm: any algorithm not listed here falls back to 1.
grid_versions_to_use = defaultdict(
    lambda: 1,
    {"HBOS": 2, "CBLOF": 2, "IForest": 2},
)

# Both directories are relative to the current working directory.
result_path = Path("results")
processed_path = Path("processed_results_v3")

## Functions

In [4]:
def make_chart_paper_ready(
    chart,
    axis_title_fs=25,
    axis_label_fs=25,
    legend_title_fs=20,
    legend_label_fs=20,
    title_fs=35,
    subtitle_fs=25,
):
    """Apply the axis, legend and title configuration used for paper figures.

    Args:
        chart: Object exposing ``configure_axis``; the result of each
            ``configure_*`` call must expose the next one in the chain
            (``configure_legend``, then ``configure_title``).
        axis_title_fs: Font size for axis titles.
        axis_label_fs: Font size for axis tick labels.
        legend_title_fs: Font size for the legend title.
        legend_label_fs: Font size for legend entries.
        title_fs: Font size for the chart title.
        subtitle_fs: Font size for the chart subtitle.

    Returns:
        Whatever the final ``configure_title`` call returns.
    """
    # labelLimit=0: presumably disables label truncation -- confirm against
    # the Vega-Lite axis documentation.
    axis_options = dict(
        labelFontSize=axis_label_fs,
        titleFontSize=axis_title_fs,
        labelLimit=0,
    )

    # Boxed legend (black 3px border, square corners) placed to the right
    # of the plot, with enlarged symbols.
    legend_options = dict(
        titleFontSize=legend_title_fs,
        labelFontSize=legend_label_fs,
        strokeColor="black",
        strokeWidth=3,
        padding=10,
        cornerRadius=0,
        symbolSize=300,
        symbolStrokeWidth=4,
        labelLimit=0,
        titleLimit=0,
        orient="right",
    )

    title_options = dict(fontSize=title_fs, subtitleFontSize=subtitle_fs)

    with_axis = chart.configure_axis(**axis_options)
    with_legend = with_axis.configure_legend(**legend_options)
    return with_legend.configure_title(**title_options)

In [5]:
def average_compare_plot(
    result_df, line_settings=["out-of-the-box", "peak"], sort="peak", reference=None
):
    """Build a layered chart comparing the AUC of each algorithm.

    A horizontal rule per algorithm spans the AUC of the two performance
    estimates named in ``line_settings``; on top of it, one point is drawn
    for every row of ``result_df``.

    Args:
        result_df: DataFrame with (at least) the columns
            ``"Performance Estimate"``, ``"algorithm_name"`` and ``"auc"``.
            The rows selected by ``line_settings`` must hold one value per
            (algorithm, estimate) pair, since they are pivoted.
        line_settings: Two ``"Performance Estimate"`` labels; the first is
            the start of the rule (``x``), the second its end (``x2``).
        sort: Currently unused; kept so existing callers keep working.
        reference: Optional ``"Performance Estimate"`` label. When given,
            every ``auc`` is replaced by its difference to the AUC of this
            estimate for the same algorithm.

    Returns:
        The layered Altair chart (rule layer below, point layer above).
    """
    # Work on a list copy: the default list is never mutated, and a tuple
    # passed by a caller would be read by ``.loc`` as a single key instead
    # of a list of labels.
    line_settings = list(line_settings)

    # Derive the shape domain from the frame that was passed in (the
    # previous version read a notebook-level global here).
    domains = result_df["Performance Estimate"].unique().tolist()
    shape_range = ["cross" if e in {"tuned"} else "circle" for e in domains]

    result_df = result_df.set_index("Performance Estimate")

    # Convert to improvement over the reference estimate.
    if reference is not None:
        reference_df = (
            result_df.loc[reference]
            .reset_index()[["algorithm_name", "auc"]]
            .reset_index(drop=True)
            .rename(columns={"auc": "reference_auc"})
        )
        result_df = result_df.reset_index().merge(reference_df, on=["algorithm_name"])
        result_df["auc"] = result_df["auc"] - result_df["reference_auc"]
        result_df = result_df.set_index("Performance Estimate")

    # One row per algorithm, one column per estimate in ``line_settings``.
    line_df = pd.pivot(
        result_df.loc[line_settings, :].reset_index(),
        index="algorithm_name",
        columns="Performance Estimate",
        values="auc",
    ).reset_index()

    # Every estimate (including those spanning the rule) is drawn as a point.
    point_df = result_df.reset_index()

    # Constant column so that all rules share a single colour.
    line_df["color"] = 0
    line_chart = (
        alt.Chart(line_df)
        .mark_rule(opacity=0.8, strokeWidth=4)
        .encode(
            y=alt.Y("algorithm_name:N", title=None),
            x=alt.X(
                f"{line_settings[0]}:Q",
                scale=alt.Scale(zero=False),
                title="average AUC",
            ),
            x2=f"{line_settings[1]}:Q",
            color=alt.Color("color", legend=None),
        )
    )

    point_chart = (
        alt.Chart(point_df)
        .mark_point(size=200, filled=True, opacity=1)
        .encode(
            y=alt.Y("algorithm_name:N"),
            x=alt.X(
                "auc:Q",
            ),
            # "tuned" is drawn as a cross, every other estimate as a circle.
            shape=alt.Shape(
                "Performance Estimate:N",
                scale=alt.Scale(range=shape_range, domain=domains),
                legend=None,
            ),
            color="Performance Estimate:N",
        )
    )
    return alt.layer(line_chart, point_chart)


#

## Make dataframe with all the relevant information in a good format to plot

In [48]:
# Load the comparison results and average them per
# (performance estimate, algorithm) pair.
comparison_csv = (
    Path() / 'comparisons' / 'statistical_validation_set_size' / 'comparison.csv'
)
raw_df = pd.read_csv(comparison_csv)

# Keep every non-"tuned" row; of the "tuned" rows keep only size == 0.05.
keep_rows = (raw_df["reference"] != "tuned") | (raw_df["size"] == 0.05)
df = (
    raw_df[keep_rows]
    .drop(columns="size")
    .groupby(["reference", "algorithm_name"])
    .mean()
    .reset_index()
    .rename(columns=dict(reference="performance", algorithm_name="algorithm"))
)

# adf = df.groupby("performance").mean().reset_index()
# adf["algorithm"] = "Average"
# df = pd.concat([df, adf])

# AUC of the "out-of-the-box" and "peak" estimates, one row per algorithm.
bounds_df = pd.pivot(
    df.set_index("performance").loc[["out-of-the-box", "peak"], :].reset_index(),
    index="algorithm",
    columns="performance",
    values="auc",
)
# Same mapping as a nested dict: {algorithm: {estimate: auc}}.
ld = bounds_df.to_dict(orient="index")

# Attach the bounds to every row by algorithm. These are vectorised lookups
# (no row-wise ``apply``); an algorithm without bounds yields NaN.
df["lb"] = df["algorithm"].map(bounds_df["out-of-the-box"])
df["ub"] = df["algorithm"].map(bounds_df["peak"])
df["bs"] = df["ub"] - df["lb"]  # width of the [lb, ub] interval
df["color"] = 1
# Marker size class: 10 for the "tuned" estimate, 1 for all others.
df["size"] = df["performance"].eq("tuned").map({True: 10, False: 1})
# df = df.sort_values(by="bs")

df = df.rename(columns=dict(performance="Performance", algorithm="Algorithm"))

df.head()

Unnamed: 0,Performance,Algorithm,auc,ap,lb,ub,bs,color,size
0,best-default,CBLOF,0.829088,0.425568,0.803797,0.867125,0.063328,1,1
1,best-default,HBOS,0.800956,0.361452,0.799944,0.829418,0.029473,1,1
2,best-default,IForest,0.839045,0.395585,0.821566,0.880479,0.058913,1,1
3,best-default,KNN,0.83291,0.390402,0.823568,0.864912,0.041344,1,1
4,best-default,LOF,0.811393,0.375329,0.767826,0.867635,0.099809,1,1


# Variations of plot df to make other figures
Change `df_to_plot` below to `first_df`, `second_df` or `third_df`.

In [49]:
# Row masks shared by the three variants.
not_tuned = df['Performance'] != 'tuned'
not_best_default = df['Performance'] != 'best-default'

# Variant 1: neither the "tuned" nor the "best-default" estimate.
first_df = df[not_tuned & not_best_default]

# Variant 2: no "best-default", and "tuned" removed for CBLOF and OCSVM only.
tuned_rows_to_exclude = [('tuned', 'CBLOF'), ('tuned', 'OCSVM')]
second_df = (
    df[not_best_default]
    .set_index(['Performance', 'Algorithm'])
    .drop(tuned_rows_to_exclude)
    .reset_index()
)

# Variant 3: everything except "best-default".
third_df = df[not_best_default]

# Actual plot figure

In [54]:
# Frame to draw and the column names the encodings refer to.
df_to_plot = df
performance_col = "Performance"
algorithm_col = "Algorithm"
tuned_performance_name = "tuned"

# Encoding field strings (nominal type) for the two categorical columns.
performance_field = f"{performance_col}:N"
algorithm_field = f"{algorithm_col}:N"

# The tuned estimate is drawn as a diamond, every other estimate as a square.
shape_domain = df_to_plot[performance_col].unique().tolist()
shape_range = []
for estimate in shape_domain:
    if estimate == tuned_performance_name:
        shape_range.append("diamond")
    else:
        shape_range.append("square")

# The rule width is the same for both figure variants; only the base marker
# size differs (30 without a tuned estimate, 10x the stroke size with one).
stroke_size = 10
has_tuned = tuned_performance_name in shape_domain
shape_small_size = stroke_size * 10 if has_tuned else 30


# size_range = [600 if e in {tuned_performance_name} else 100 for e in shape_domain]
# print(size_range)

# Shape encoding with an explicit domain; not referenced by the charts below.
shape = alt.Shape(
    performance_field,
    scale=alt.Scale(range=shape_range, domain=shape_domain),
    legend=None,
)

# Both layers order the algorithms by the "bs" column, descending.
bs_sort_args = dict(field="bs", op="min", order="descending")

# Layer 1: faint gray rule from "lb" to "ub" for each algorithm.
rule_y = alt.Y(
    algorithm_field,
    title=None,
    sort=alt.EncodingSortField(**bs_sort_args),
)
rule_x = alt.X("lb", scale=alt.Scale(zero=False), title="average AUC")
l = (
    alt.Chart(df_to_plot)
    .mark_rule(opacity=0.05, strokeWidth=stroke_size, color="Gray")
    .encode(y=rule_y, x=rule_x, x2="ub")
)

# Layer 2: one point per (estimate, algorithm) row, sized by the "size" column.
point_y = alt.Y(algorithm_field, sort=alt.EncodingSortField(**bs_sort_args))
point_x = alt.X("auc:Q", scale=alt.Scale(domain = [0.74, 0.90]))
point_shape = alt.Shape(performance_field, scale=alt.Scale(range=shape_range))
point_size = alt.Size(
    "size",
    scale=alt.Scale(range=[shape_small_size, shape_small_size * 5]),
    legend=None,
)
point_color = alt.Color(performance_field, scale=alt.Scale(scheme='category10'))
p = (
    alt.Chart(df_to_plot)
    .mark_point(filled=True, opacity=1)
    .encode(
        y=point_y,
        x=point_x,
        shape=point_shape,
        size=point_size,
        color=point_color,
        tooltip=["auc"],
    )
)

f = l + p
make_chart_paper_ready(f).properties(height=300, width=400)