In [1]:
import os
import operator
import math

import matplotlib
import networkx

import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from scipy.stats import wilcoxon
from scipy.stats import friedmanchisquare

In [2]:
matplotlib.rcParams['font.family'] = 'sans-serif'
matplotlib.rcParams['font.sans-serif'] = 'Arial'
matplotlib.use('agg')

In [3]:
# Author: Hassan Ismail Fawaz <hassan.ismail-fawaz@uha.fr>
#         Germain Forestier <germain.forestier@uha.fr>
#         Jonathan Weber <jonathan.weber@uha.fr>
#         Lhassane Idoumghar <lhassane.idoumghar@uha.fr>
#         Pierre-Alain Muller <pierre-alain.muller@uha.fr>
# License: GPL3

# inspired from orange3 https://docs.orange.biolab.si/3/data-mining-library/reference/evaluation.cd.html
def graph_ranks(avranks, names, p_values, cd=None, cdmethod=None, lowv=None, highv=None,
                width=6, textspace=1, reverse=False, filename=None, labels=False, **kwargs):
    """
    Draws a CD graph, which is used to display  the differences in methods'
    performance. See Janez Demsar, Statistical Comparisons of Classifiers over
    Multiple Data Sets, 7(Jan):1--30, 2006.

    Needs matplotlib to work.

    The image is ploted on `plt` imported using
    `import matplotlib.pyplot as plt`.

    Args:
        avranks (list of float): average ranks of methods.
        names (list of str): names of methods.
        cd (float): Critical difference used for statistically significance of
            difference between methods.
        cdmethod (int, optional): the method that is compared with other methods
            If omitted, show pairwise comparison of methods
        lowv (int, optional): the lowest shown rank
        highv (int, optional): the highest shown rank
        width (int, optional): default width in inches (default: 6)
        textspace (int, optional): space on figure sides (in inches) for the
            method names (default: 1)
        reverse (bool, optional):  if set to `True`, the lowest rank is on the
            right (default: `False`)
        filename (str, optional): output file name (with extension). If not
            given, the function does not write a file.
        labels (bool, optional): if set to `True`, the calculated avg rank
        values will be displayed
    """
    try:
        import matplotlib
        import matplotlib.pyplot as plt
        from matplotlib.backends.backend_agg import FigureCanvasAgg
    except ImportError:
        raise ImportError("Function graph_ranks requires matplotlib.")

    width = float(width)
    textspace = float(textspace)

    def nth(l, n):
        """
        Returns only nth elemnt in a list.
        """
        n = lloc(l, n)
        return [a[n] for a in l]

    def lloc(l, n):
        """
        List location in list of list structure.
        Enable the use of negative locations:
        -1 is the last element, -2 second last...
        """
        if n < 0:
            return len(l[0]) + n
        else:
            return n

    def mxrange(lr):
        """
        Multiple xranges. Can be used to traverse matrices.
        This function is very slow due to unknown number of
        parameters.

        >>> mxrange([3,5])
        [(0, 0), (0, 1), (0, 2), (1, 0), (1, 1), (1, 2)]

        >>> mxrange([[3,5,1],[9,0,-3]])
        [(3, 9), (3, 6), (3, 3), (4, 9), (4, 6), (4, 3)]

        """
        if not len(lr):
            yield ()
        else:
            # it can work with single numbers
            index = lr[0]
            if isinstance(index, int):
                index = [index]
            for a in range(*index):
                for b in mxrange(lr[1:]):
                    yield tuple([a] + list(b))

    def print_figure(fig, *args, **kwargs):
        canvas = FigureCanvasAgg(fig)
        canvas.print_figure(*args, **kwargs)

    sums = avranks

    nnames = names
    ssums = sums

    if lowv is None:
        lowv = min(1, int(math.floor(min(ssums))))
    if highv is None:
        highv = max(len(avranks), int(math.ceil(max(ssums))))

    cline = 0.4

    k = len(sums)

    lines = None

    linesblank = 0
    scalewidth = width - 2 * textspace

    def rankpos(rank):
        if not reverse:
            a = rank - lowv
        else:
            a = highv - rank
        return textspace + scalewidth / (highv - lowv) * a

    distanceh = 0.25

    cline += distanceh

    # calculate height needed height of an image
    minnotsignificant = max(2 * 0.2, linesblank)
    height = cline + ((k + 1) / 2) * 0.2 + minnotsignificant

    fig = plt.figure(figsize=(width, height))
    fig.set_facecolor('white')
    ax = fig.add_axes([0, 0, 1, 1])  # reverse y axis
    ax.set_axis_off()

    hf = 1. / height  # height factor
    wf = 1. / width

    def hfl(l):
        return [a * hf for a in l]

    def wfl(l):
        return [a * wf for a in l]

    # Upper left corner is (0,0).
    ax.plot([0, 1], [0, 1], c="w")
    ax.set_xlim(0, 1)
    ax.set_ylim(1, 0)

    def line(l, color='k', **kwargs):
        """
        Input is a list of pairs of points.
        """
        ax.plot(wfl(nth(l, 0)), hfl(nth(l, 1)), color=color, **kwargs)

    def text(x, y, s, *args, **kwargs):
        ax.text(wf * x, hf * y, s, *args, **kwargs)

    line([(textspace, cline), (width - textspace, cline)], linewidth=2)

    bigtick = 0.3
    smalltick = 0.15
    linewidth = 2.0
    linewidth_sign = 4.0

    tick = None
    for a in list(np.arange(lowv, highv, 0.5)) + [highv]:
        tick = smalltick
        if a == int(a):
            tick = bigtick
        line([(rankpos(a), cline - tick / 2),
              (rankpos(a), cline)],
             linewidth=2)

    for a in range(lowv, highv + 1):
        text(rankpos(a), cline - tick / 2 - 0.05, str(a),
             ha="center", va="bottom", size=16)

    k = len(ssums)

    def filter_names(name):
        return name

    space_between_names = 0.24

    for i in range(math.ceil(k / 2)):
        chei = cline + minnotsignificant + i * space_between_names
        line([(rankpos(ssums[i]), cline),
              (rankpos(ssums[i]), chei),
              (textspace - 0.1, chei)],
             linewidth=linewidth)
        if labels:
            text(textspace + 0.3, chei - 0.075, format(ssums[i], '.4f'), ha="right", va="center", size=10)
        text(textspace - 0.2, chei, filter_names(nnames[i]), ha="right", va="center", size=16)

    for i in range(math.ceil(k / 2), k):
        chei = cline + minnotsignificant + (k - i - 1) * space_between_names
        line([(rankpos(ssums[i]), cline),
              (rankpos(ssums[i]), chei),
              (textspace + scalewidth + 0.1, chei)],
             linewidth=linewidth)
        if labels:
            text(textspace + scalewidth - 0.3, chei - 0.075, format(ssums[i], '.4f'), ha="left", va="center", size=10)
        text(textspace + scalewidth + 0.2, chei, filter_names(nnames[i]),
             ha="left", va="center", size=16)

    # no-significance lines
    def draw_lines(lines, side=0.05, height=0.1):
        start = cline + 0.2

        for l, r in lines:
            line([(rankpos(ssums[l]) - side, start),
                  (rankpos(ssums[r]) + side, start)],
                 linewidth=linewidth_sign)
            start += height
            print('drawing: ', l, r)

    # draw_lines(lines)
    start = cline + 0.2
    side = -0.02
    height = 0.1

    # draw no significant lines
    # get the cliques
    cliques = form_cliques(p_values, nnames)
    i = 1
    achieved_half = False
    print(nnames)
    for clq in cliques:
        if len(clq) == 1:
            continue
        print(clq)
        min_idx = np.array(clq).min()
        max_idx = np.array(clq).max()
        if min_idx >= len(nnames) / 2 and achieved_half == False:
            start = cline + 0.25
            achieved_half = True
        line([(rankpos(ssums[min_idx]) - side, start),
              (rankpos(ssums[max_idx]) + side, start)],
             linewidth=linewidth_sign)
        start += height


def form_cliques(p_values, nnames):
    """
    This method forms the cliques
    """
    # first form the numpy matrix data
    m = len(nnames)
    g_data = np.zeros((m, m), dtype=np.int64)
    for p in p_values:
        if p[3] == False:
            i = np.where(nnames == p[0])[0][0]
            j = np.where(nnames == p[1])[0][0]
            min_i = min(i, j)
            max_j = max(i, j)
            g_data[min_i, max_j] = 1

    g = networkx.Graph(g_data)
    return networkx.find_cliques(g)


def draw_cd_diagram(df_perf=None, alpha=0.05, title=None, labels=False, filename='cd_diagram.png'):
    """
    Draws the critical difference diagram given the list of pairwise classifiers that are
    significant or not
    """
    p_values, average_ranks, _ = wilcoxon_holm(df_perf=df_perf, alpha=alpha)

    print(average_ranks)

    for p in p_values:
        print(p)


    graph_ranks(average_ranks.values, average_ranks.keys(), p_values,
                cd=None, reverse=True, width=9, textspace=1.5, labels=labels)

    font = {'family': 'sans-serif',
        'color':  'black',
        'weight': 'normal',
        'size': 22,
        }
    if title:
        plt.title(title,fontdict=font, y=0.9, x=0.5)
    plt.savefig(filename ,bbox_inches='tight')

def wilcoxon_holm(alpha=0.05, df_perf=None):
    """
    Applies the wilcoxon signed rank test between each pair of algorithm and then use Holm
    to reject the null's hypothesis
    """
    print(pd.unique(df_perf['algorithm']))
    # count the number of tested datasets per classifier
    df_counts = pd.DataFrame({'count': df_perf.groupby(
        ['algorithm']).size()}).reset_index()
    # get the maximum number of tested datasets
    max_nb_datasets = df_counts['count'].max()
    # get the list of classifiers who have been tested on nb_max_datasets
    classifiers = list(df_counts.loc[df_counts['count'] == max_nb_datasets]
                       ['algorithm'])
    # test the null hypothesis using friedman before doing a post-hoc analysis
    friedman_p_value = friedmanchisquare(*(
        np.array(df_perf.loc[df_perf['algorithm'] == c]['score'])
        for c in classifiers))[1]
    if friedman_p_value >= alpha:
        # then the null hypothesis over the entire classifiers cannot be rejected
        print('the null hypothesis over the entire classifiers cannot be rejected')
        exit()
    # get the number of classifiers
    m = len(classifiers)
    # init array that contains the p-values calculated by the Wilcoxon signed rank test
    p_values = []
    # loop through the algorithms to compare pairwise
    for i in range(m - 1):
        # get the name of classifier one
        classifier_1 = classifiers[i]
        # get the performance of classifier one
        perf_1 = np.array(df_perf.loc[df_perf['algorithm'] == classifier_1]['score']
                          , dtype=np.float64)
        for j in range(i + 1, m):
            # get the name of the second classifier
            classifier_2 = classifiers[j]
            # get the performance of classifier one
            perf_2 = np.array(df_perf.loc[df_perf['algorithm'] == classifier_2]
                              ['score'], dtype=np.float64)
            # calculate the p_value
            p_value = wilcoxon(perf_1, perf_2, zero_method='pratt')[1]
            # appen to the list
            p_values.append((classifier_1, classifier_2, p_value, False))
    # get the number of hypothesis
    k = len(p_values)
    # sort the list in acsending manner of p-value
    p_values.sort(key=operator.itemgetter(2))

    # loop through the hypothesis
    for i in range(k):
        # correct alpha with holm
        new_alpha = float(alpha / (k - i))
        # test if significant after holm's correction of alpha
        if p_values[i][2] <= new_alpha:
            p_values[i] = (p_values[i][0], p_values[i][1], p_values[i][2], True)
        else:
            # stop
            break
    # compute the average ranks to be returned (useful for drawing the cd diagram)
    # sort the dataframe of performances
    sorted_df_perf = df_perf.loc[df_perf['algorithm'].isin(classifiers)]. \
        sort_values(['algorithm', 'dataset_name'])
    # get the rank data
    rank_data = np.array(sorted_df_perf['score']).reshape(m, max_nb_datasets)

    # create the data frame containg the accuracies
    df_ranks = pd.DataFrame(data=rank_data, index=np.sort(classifiers), columns=
    np.unique(sorted_df_perf['dataset_name']))

    # number of wins
    dfff = df_ranks.rank(ascending=False)
    print(dfff[dfff == 1.0].sum(axis=1))

    # average the ranks
    average_ranks = df_ranks.rank(ascending=False).mean(axis=1).sort_values(ascending=False)
    # return the p-values and the average ranks
    return p_values, average_ranks, max_nb_datasets

In [4]:
ALGOS = ['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN']
DATASETS = ['MNIST', 'FMNIST', 'CIFAR10', 'PATHMNIST']
OVERLAPS = ['overlaping', 'nonoverlaping']
BALANCES = ['balanced', 'imbalanced']
EXPERIMENTS = ['experiment1A', 'experiment1B', 'experiment1C']
TOTAL_ROWS = len(ALGOS) * len(DATASETS) * len(OVERLAPS) * len(BALANCES)
EXPERIMENTS_ROOT = os.path.join(os.getcwd(), 'explanations')

# Section I: Calculation of a Mean INDE values across clusters

## Section I Part A (In-Distribution Experiments)

In [5]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[0])
avg_results = []
for dataset in DATASETS:
    for overlap in OVERLAPS:
        for balance in BALANCES:
            for algo in ALGOS:
                frame = pd.read_csv(os.path.join(root, dataset, f"{dataset}_{overlap}_{balance}_15_{algo}.csv"))
                frame_info = frame[['ins_mean', 'del_mean']].mean(axis=0)
                avg_results.append(
                    {
                        'algorithm': algo,
                        'dataset_name': f'{dataset} {overlap} {balance}',
                        'insertion': frame_info['ins_mean'],
                        'deletion': frame_info['del_mean']
                    }
                )

expa_frame = pd.DataFrame(avg_results)
expa_frame.to_csv(os.path.join(root, 'INDE_avg.csv'))

## Section I Part B (Out-of-Distribution Experiments)

In [7]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[1])
avg_results = []
for dataset in DATASETS:
    for overlap in OVERLAPS:
        for balance in BALANCES:
            for algo in ALGOS:
                frame = pd.read_csv(os.path.join(root, dataset, f"{dataset}_{overlap}_{balance}_15_{algo}.csv"))
                frame_info = frame[['ins_mean', 'del_mean']].mean(axis=0)
                avg_results.append(
                    {
                        'algorithm': algo,
                        'dataset_name': f'{dataset} {overlap} {balance}',
                        'insertion': frame_info['ins_mean'],
                        'deletion': frame_info['del_mean']
                    }
                )

expb_frame = pd.DataFrame(avg_results)
expb_frame.to_csv(os.path.join(root, 'INDE_avg.csv'))

## Section I Part C (Orchestrator Test Set Experiments)

In [8]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[2])
avg_results = []
for dataset in DATASETS:
    for overlap in OVERLAPS:
        for balance in BALANCES:
            for algo in ALGOS:
                frame = pd.read_csv(os.path.join(root, dataset, f"{dataset}_{overlap}_{balance}_15_{algo}.csv"))
                frame_info = frame[['ins_mean', 'del_mean']].mean(axis=0)
                avg_results.append(
                    {
                        'algorithm': algo,
                        'dataset_name': f'{dataset} {overlap} {balance}',
                        'insertion': frame_info['ins_mean'],
                        'deletion': frame_info['del_mean']
                    }
                )

expb_frame = pd.DataFrame(avg_results)
expb_frame.to_csv(os.path.join(root, 'INDE_avg.csv'))

# Section II: Critical Difference Plots

## Section II Part A (In-Distribution Experiments)

### Insertion

In [15]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[0])
expa_frame = pd.read_csv(os.path.join(root, 'INDE_avg.csv'))
expa_frame.drop(['deletion'], axis=1, inplace=True)
expa_frame.rename(columns={"insertion":"score"}, inplace=True)
expa_frame['algorithm'] = expa_frame['algorithm'].replace(['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN'], ['BNC', 'BCL', 'SCL', 'OCFL-KM', 'OCFL-AFF', 'OCFL-MS', 'OCFL-HDB'])
draw_cd_diagram(df_perf=expa_frame, title='Insertion', labels=True, filename=os.path.join(root, 'Insertion_CD_plot.png'))

['BNC' 'BCL' 'SCL' 'OCFL-KM' 'OCFL-AFF' 'OCFL-MS' 'OCFL-HDB']
BCL         5.0
BNC         0.0
OCFL-AFF    1.0
OCFL-HDB    5.0
OCFL-KM     2.0
OCFL-MS     3.0
SCL         0.0
dtype: float64
BNC         6.0625
SCL         5.7500
OCFL-AFF    5.0625
BCL         3.6250
OCFL-MS     2.9375
OCFL-KM     2.5000
OCFL-HDB    2.0625
dtype: float64
('BNC', 'OCFL-HDB', 3.0517578125e-05, True)
('BNC', 'OCFL-KM', 3.0517578125e-05, True)
('BNC', 'OCFL-MS', 3.0517578125e-05, True)
('OCFL-HDB', 'SCL', 3.0517578125e-05, True)
('OCFL-KM', 'SCL', 3.0517578125e-05, True)
('OCFL-AFF', 'OCFL-HDB', 0.000152587890625, True)
('OCFL-AFF', 'OCFL-KM', 0.000152587890625, True)
('OCFL-MS', 'SCL', 0.000762939453125, True)
('OCFL-AFF', 'OCFL-MS', 0.002685546875, True)
('BCL', 'BNC', 0.00335693359375, True)
('BCL', 'SCL', 0.00335693359375, True)
('BCL', 'OCFL-HDB', 0.02899169921875, False)
('BNC', 'OCFL-AFF', 0.02899169921875, False)
('OCFL-AFF', 'SCL', 0.05767822265625, False)
('BCL', 'OCFL-KM', 0.065399169921875, False)

### Deletion

In [16]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[0])
expa_frame = pd.read_csv(os.path.join(root, 'INDE_avg.csv'))
expa_frame.drop(['insertion'], axis=1, inplace=True)
expa_frame.rename(columns={"deletion":"score"}, inplace=True)
expa_frame['algorithm'] = expa_frame['algorithm'].replace(['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN'], ['BNC', 'BCL', 'SCL', 'OCFL-KM', 'OCFL-AFF', 'OCFL-MS', 'OCFL-HDB'])
draw_cd_diagram(df_perf=expa_frame, title='Deletion', labels=True, filename=os.path.join(root, 'Deletion_CD_plot.png'))

['BNC' 'BCL' 'SCL' 'OCFL-KM' 'OCFL-AFF' 'OCFL-MS' 'OCFL-HDB']
BCL         2.0
BNC         0.0
OCFL-AFF    2.0
OCFL-HDB    4.0
OCFL-KM     3.0
OCFL-MS     5.0
SCL         0.0
dtype: float64
BNC         6.1250
SCL         5.2500
BCL         3.9375
OCFL-AFF    3.5625
OCFL-MS     3.3750
OCFL-HDB    2.8750
OCFL-KM     2.8750
dtype: float64
('BNC', 'OCFL-KM', 0.000213623046875, True)
('BNC', 'OCFL-MS', 0.00030517578125, True)
('BNC', 'OCFL-HDB', 0.00042724609375, True)
('BNC', 'OCFL-AFF', 0.001312255859375, True)
('OCFL-HDB', 'SCL', 0.004180908203125, False)
('OCFL-KM', 'SCL', 0.00762939453125, False)
('OCFL-AFF', 'SCL', 0.009185791015625, False)
('OCFL-MS', 'SCL', 0.021392822265625, False)
('BCL', 'BNC', 0.02496337890625, False)
('BCL', 'OCFL-HDB', 0.02496337890625, False)
('BNC', 'SCL', 0.09344482421875, False)
('BCL', 'OCFL-KM', 0.14385986328125, False)
('BCL', 'SCL', 0.252227783203125, False)
('OCFL-AFF', 'OCFL-KM', 0.274444580078125, False)
('OCFL-AFF', 'OCFL-HDB', 0.40374755859375, Fal

## Section II Part B (Out-of-Distribution Experiments)

### Insertion

In [17]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[1])
expa_frame = pd.read_csv(os.path.join(root, 'INDE_avg.csv'))
expa_frame.drop(['deletion'], axis=1, inplace=True)
expa_frame.rename(columns={"insertion":"score"}, inplace=True)
expa_frame['algorithm'] = expa_frame['algorithm'].replace(['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN'], ['BNC', 'BCL', 'SCL', 'OCFL-KM', 'OCFL-AFF', 'OCFL-MS', 'OCFL-HDB'])
draw_cd_diagram(df_perf=expa_frame, title='Insertion', labels=True, filename=os.path.join(root, 'Insertion_CD_plot.png'))

['BNC' 'BCL' 'SCL' 'OCFL-KM' 'OCFL-AFF' 'OCFL-MS' 'OCFL-HDB']
BCL         1.0
BNC         0.0
OCFL-AFF    2.0
OCFL-HDB    7.0
OCFL-KM     3.0
OCFL-MS     3.0
SCL         0.0
dtype: float64
BCL         3.555556
OCFL-AFF    3.214286
OCFL-MS     2.866667
OCFL-KM     2.250000
OCFL-HDB    2.125000
BNC              NaN
SCL              NaN
dtype: float64
('BCL', 'BNC', nan, False)
('BCL', 'OCFL-AFF', nan, False)
('BCL', 'OCFL-HDB', nan, False)
('BCL', 'OCFL-KM', nan, False)
('BCL', 'OCFL-MS', nan, False)
('BCL', 'SCL', nan, False)
('BNC', 'OCFL-AFF', nan, False)
('BNC', 'OCFL-HDB', nan, False)
('BNC', 'OCFL-KM', nan, False)
('BNC', 'OCFL-MS', nan, False)
('BNC', 'SCL', nan, False)
('OCFL-AFF', 'OCFL-HDB', nan, False)
('OCFL-AFF', 'OCFL-KM', nan, False)
('OCFL-AFF', 'OCFL-MS', nan, False)
('OCFL-AFF', 'SCL', nan, False)
('OCFL-HDB', 'OCFL-KM', 0.668548583984375, False)
('OCFL-HDB', 'OCFL-MS', nan, False)
('OCFL-HDB', 'SCL', nan, False)
('OCFL-KM', 'OCFL-MS', nan, False)
('OCFL-KM', 'SCL', nan

### Deletion

In [18]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[1])
expa_frame = pd.read_csv(os.path.join(root, 'INDE_avg.csv'))
expa_frame.drop(['insertion'], axis=1, inplace=True)
expa_frame.rename(columns={"deletion":"score"}, inplace=True)
expa_frame['algorithm'] = expa_frame['algorithm'].replace(['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN'], ['BNC', 'BCL', 'SCL', 'OCFL-KM', 'OCFL-AFF', 'OCFL-MS', 'OCFL-HDB'])
draw_cd_diagram(df_perf=expa_frame, title='Deletion', labels=True, filename=os.path.join(root, 'Deletion_CD_plot.png'))

['BNC' 'BCL' 'SCL' 'OCFL-KM' 'OCFL-AFF' 'OCFL-MS' 'OCFL-HDB']
BCL         0.0
BNC         0.0
OCFL-AFF    8.0
OCFL-HDB    4.0
OCFL-KM     2.0
OCFL-MS     2.0
SCL         0.0
dtype: float64
BCL         3.777778
OCFL-MS     3.000000
OCFL-KM     2.812500
OCFL-HDB    2.500000
OCFL-AFF    1.857143
BNC              NaN
SCL              NaN
dtype: float64
('BCL', 'BNC', nan, False)
('BCL', 'OCFL-AFF', nan, False)
('BCL', 'OCFL-HDB', nan, False)
('BCL', 'OCFL-KM', nan, False)
('BCL', 'OCFL-MS', nan, False)
('BCL', 'SCL', nan, False)
('BNC', 'OCFL-AFF', nan, False)
('BNC', 'OCFL-HDB', nan, False)
('BNC', 'OCFL-KM', nan, False)
('BNC', 'OCFL-MS', nan, False)
('BNC', 'SCL', nan, False)
('OCFL-AFF', 'OCFL-HDB', nan, False)
('OCFL-AFF', 'OCFL-KM', nan, False)
('OCFL-AFF', 'OCFL-MS', nan, False)
('OCFL-AFF', 'SCL', nan, False)
('OCFL-HDB', 'OCFL-KM', 0.5618896484375, False)
('OCFL-HDB', 'OCFL-MS', nan, False)
('OCFL-HDB', 'SCL', nan, False)
('OCFL-KM', 'OCFL-MS', nan, False)
('OCFL-KM', 'SCL', nan, 

## Section II Part C (Orchestrator Test Set Experiments)

### Insertion

In [19]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[2])
expa_frame = pd.read_csv(os.path.join(root, 'INDE_avg.csv'))
expa_frame.drop(['deletion'], axis=1, inplace=True)
expa_frame.rename(columns={"insertion":"score"}, inplace=True)
expa_frame['algorithm'] = expa_frame['algorithm'].replace(['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN'], ['BNC', 'BCL', 'SCL', 'OCFL-KM', 'OCFL-AFF', 'OCFL-MS', 'OCFL-HDB'])
draw_cd_diagram(df_perf=expa_frame, title='Insertion', labels=True, filename=os.path.join(root, 'Insertion_CD_plot.png'))

['BNC' 'BCL' 'SCL' 'OCFL-KM' 'OCFL-AFF' 'OCFL-MS' 'OCFL-HDB']
BCL         7.0
BNC         0.0
OCFL-AFF    0.0
OCFL-HDB    5.0
OCFL-KM     2.0
OCFL-MS     2.0
SCL         0.0
dtype: float64
BNC         5.7500
SCL         5.0625
OCFL-AFF    4.8125
OCFL-MS     3.4375
BCL         3.3750
OCFL-HDB    2.8750
OCFL-KM     2.6875
dtype: float64
('BNC', 'OCFL-KM', 0.000152587890625, True)
('BNC', 'OCFL-HDB', 0.00030517578125, True)
('BNC', 'OCFL-MS', 0.00030517578125, True)
('OCFL-KM', 'SCL', 0.000579833984375, True)
('OCFL-HDB', 'SCL', 0.001007080078125, True)
('OCFL-AFF', 'OCFL-KM', 0.001312255859375, True)
('OCFL-AFF', 'OCFL-HDB', 0.00213623046875, True)
('BCL', 'BNC', 0.009185791015625, False)
('OCFL-AFF', 'OCFL-MS', 0.021392822265625, False)
('BCL', 'SCL', 0.02899169921875, False)
('OCFL-MS', 'SCL', 0.02899169921875, False)
('BCL', 'OCFL-AFF', 0.033538818359375, False)
('BNC', 'OCFL-AFF', 0.07391357421875, False)
('BNC', 'SCL', 0.116668701171875, False)
('BCL', 'OCFL-KM', 0.375457763671875, 

### Deletion

In [20]:
root = os.path.join(EXPERIMENTS_ROOT, EXPERIMENTS[2])
expa_frame = pd.read_csv(os.path.join(root, 'INDE_avg.csv'))
expa_frame.drop(['insertion'], axis=1, inplace=True)
expa_frame.rename(columns={"deletion":"score"}, inplace=True)
expa_frame['algorithm'] = expa_frame['algorithm'].replace(['baseline', 'briggs', 'sattler', 'kmeans', 'affinity', 'meanshift', 'HDBSCAN'], ['BNC', 'BCL', 'SCL', 'OCFL-KM', 'OCFL-AFF', 'OCFL-MS', 'OCFL-HDB'])
draw_cd_diagram(df_perf=expa_frame, title='Deletion', labels=True, filename=os.path.join(root, 'Deletion_CD_plot.png'))

['BNC' 'BCL' 'SCL' 'OCFL-KM' 'OCFL-AFF' 'OCFL-MS' 'OCFL-HDB']
BCL         6.0
BNC         1.0
OCFL-AFF    0.0
OCFL-HDB    3.0
OCFL-KM     1.0
OCFL-MS     4.0
SCL         1.0
dtype: float64
BNC         5.6875
SCL         5.0625
BCL         4.0000
OCFL-AFF    3.8750
OCFL-KM     3.3750
OCFL-MS     3.2500
OCFL-HDB    2.7500
dtype: float64
('BNC', 'OCFL-KM', 0.001007080078125, True)
('BNC', 'OCFL-MS', 0.001007080078125, True)
('BNC', 'OCFL-HDB', 0.001312255859375, True)
('BNC', 'OCFL-AFF', 0.002685546875, True)
('OCFL-KM', 'SCL', 0.00628662109375, False)
('OCFL-HDB', 'SCL', 0.00762939453125, False)
('BCL', 'BNC', 0.021392822265625, False)
('OCFL-MS', 'SCL', 0.033538818359375, False)
('OCFL-AFF', 'SCL', 0.05767822265625, False)
('OCFL-AFF', 'OCFL-HDB', 0.09344482421875, False)
('OCFL-AFF', 'OCFL-KM', 0.129730224609375, False)
('OCFL-AFF', 'OCFL-MS', 0.14385986328125, False)
('BCL', 'OCFL-HDB', 0.1590576171875, False)
('BCL', 'OCFL-KM', 0.2978515625, False)
('BNC', 'SCL', 0.375457763671875, F