# For reproducibility, we used the following repository versions (pinned commits):
- MINOTAUR: https://github.com/Mirandatz/minotaur/tree/10ef885019e86dd4ff659584ae3445a5f6c3d076
- Datasets and scripts used in pre-processing: https://github.com/Mirandatz/minotaur.datasets/tree/82b2a67206cdcc352160ba842d71c4d6021531e0
- Python scripts used to run MINOTAUR: https://github.com/Mirandatz/minotaur.experiments/blob/fd9e9e53a5305b0aff547923ba7e50415dcdaa44/minotaur.python/main.py

In [1]:
from itertools import product
from pathlib import Path
from typing import Dict, Iterable, Optional

import numpy as np
import pandas as pd

In [2]:
# Number of cross-validation folds whose logs are read for each run (fold-0 .. fold-9).
FOLD_COUNT = 10

# Dataset names, grouped by task type; they double as directory names inside the output dir.
SINGLE_LABEL_DATASETS = [
    'iris',
    'breast-cancer-wisconsin',
    'wine',
    'madelon',
]
MULTI_LABEL_DATASETS = [
    'yeast',
    'emotions',
    'scene',
    'CAL500',
]

# Root directory under which the MINOTAUR results are looked up.
MINOTAUR_OUTPUT_DIR = Path('c:/', 'source', 'minotaur.output')

We ran MINOTAUR multiple times on each dataset, changing the value of the 'cfsbe-target-instance-coverage' hyperparameter.

The results of each run are stored in different directories, with the last part of the directory name indicating the hyperparameter value used.

E.g.: in the directory 'iris-cfsbe-16' we have the results of running MINOTAUR with --cfsbe-target-instance-coverage=16

The function below lists, for a given dataset, the hyperparameter values that were used, by reading them from those directory names.

In [3]:
def get_cfsbe_values(dataset_name: str, output_dir: Optional[Path] = None) -> Iterable[int]:
    """Yield the 'cfsbe-target-instance-coverage' values used in the runs of a dataset.

    Run directories are expected to be named '<dataset_name>-cfsbe-<value>' and to
    sit directly inside '<output_dir>/<dataset_name>'. Every other entry found
    there (plain files, directories with a different prefix) is ignored, so stray
    files do not break the parsing.

    Args:
        dataset_name: Name of the dataset, e.g. 'iris'.
        output_dir: Root directory holding the results. When omitted,
            MINOTAUR_OUTPUT_DIR is used.

    Yields:
        One integer per matching run directory, in no particular order.

    Raises:
        ValueError: If a matching directory has a non-integer suffix.
    """
    root = MINOTAUR_OUTPUT_DIR if output_dir is None else output_dir
    dataset_dir = root / dataset_name
    prefix = f'{dataset_name}-cfsbe-'

    for entry in dataset_dir.glob('*'):
        # Only the entry's own name is inspected: dashes elsewhere in the path
        # must not influence which part is parsed as the value.
        if entry.is_dir() and entry.name.startswith(prefix):
            yield int(entry.name[len(prefix):])

In [4]:
def parse_minotaur_output(dataset_name: str, cfsbe_value: int, fold_nr: int) -> pd.DataFrame:
    """Read the 'generations_log.csv' of one run/fold into a DataFrame.

    Args:
        dataset_name: Name of the dataset, e.g. 'iris'.
        cfsbe_value: Hyperparameter value that identifies the run directory.
        fold_nr: Number of the fold whose log is read.

    Returns:
        A DataFrame with the columns 'generation', 'id', 'parent_id',
        'train_fitness' and 'test_fitness', plus a constant 'cfsbe_value' column.
    """
    # The dataset name appears several times in the hierarchy because the script
    # that ran MINOTAUR stores it redundantly.
    run_dir = MINOTAUR_OUTPUT_DIR / dataset_name / f'{dataset_name}-cfsbe-{cfsbe_value}'
    log_file = run_dir / dataset_name / f'fold-{fold_nr}' / 'generations_log.csv'

    # Column name -> dtype, in file order. The header row written by MINOTAUR is
    # skipped and replaced by these names because it does not say which metric
    # belongs to the train dataset and which to the test dataset.
    schema = {
        'generation': 'int32',
        'id': 'int64',
        'parent_id': 'int64',
        'train_fitness': 'float32',
        'test_fitness': 'float32',
    }

    frame = pd.read_csv(
        filepath_or_buffer=log_file,
        skiprows=1,
        header=None,
        names=list(schema),
        dtype=schema,
    )
    frame['cfsbe_value'] = int(cfsbe_value)
    return frame

In [5]:
def get_cross_fold_fitness(dataset_name: str, cfsbe_value: int) -> float:
    """Average, over all folds, the highest test fitness logged in each fold.

    Args:
        dataset_name: Name of the dataset, e.g. 'iris'.
        cfsbe_value: Hyperparameter value that identifies the run.

    Returns:
        The mean of the per-fold maxima of the 'test_fitness' column.
    """
    best_per_fold = [
        parse_minotaur_output(
            dataset_name=dataset_name,
            cfsbe_value=cfsbe_value,
            fold_nr=fold,
        )['test_fitness'].max()
        for fold in range(FOLD_COUNT)
    ]
    return np.mean(best_per_fold)

In [6]:
def get_cfsbe_values_and_fitnesses(dataset_name: str) -> Dict[int, float]:
    """Map every hyperparameter value used on a dataset to its cross-fold fitness.

    Args:
        dataset_name: Name of the dataset, e.g. 'iris'.

    Returns:
        A dict whose keys are the values reported by get_cfsbe_values and whose
        values are the matching results of get_cross_fold_fitness.
    """
    return {
        value: get_cross_fold_fitness(dataset_name=dataset_name, cfsbe_value=value)
        for value in get_cfsbe_values(dataset_name)
    }

In [7]:
# Dataset name -> {cfsbe value -> cross-fold fitness} for the single-label datasets.
single_label_datasets_fitnesses = {
    dataset: get_cfsbe_values_and_fitnesses(dataset)
    for dataset in SINGLE_LABEL_DATASETS
}

# One column per dataset, one row per cfsbe value (ascending).
sl_df = (
    pd.DataFrame(data=single_label_datasets_fitnesses)
    .sort_index()
    .rename_axis('cfsbe_value')
)
# Null cells are rendered as '-', everything else with two decimals.
sl_df.style.format(lambda cell: '-' if pd.isnull(cell) else f'{cell:.2f}')

Unnamed: 0_level_0,iris,breast-cancer-wisconsin,wine,madelon
cfsbe_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.51,0.56,0.24,0.33
8,0.85,0.77,0.47,-
16,0.89,0.90,0.65,0.33
32,0.93,0.94,0.85,-
64,0.93,0.96,0.91,0.33
128,0.87,0.97,-,0.34
256,-,0.97,-,0.38
512,-,-,-,0.48
1024,-,-,-,0.57
2048,-,-,-,0.56


In [8]:
# Dataset name -> {cfsbe value -> cross-fold fitness} for the multi-label datasets.
multi_label_datasets_fitnesses = {
    dataset: get_cfsbe_values_and_fitnesses(dataset)
    for dataset in MULTI_LABEL_DATASETS
}

# One column per dataset, one row per cfsbe value (ascending).
ml_df = (
    pd.DataFrame(data=multi_label_datasets_fitnesses)
    .sort_index()
    .rename_axis('cfsbe_value')
)
# Null cells are rendered as '-', everything else with two decimals.
ml_df.style.format(lambda cell: '-' if pd.isnull(cell) else f'{cell:.2f}')

Unnamed: 0_level_0,yeast,emotions,scene,CAL500
cfsbe_value,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
2,0.48,0.00,0.01,0.30
8,0.48,0.00,0.01,0.30
16,0.48,0.05,0.01,0.31
32,0.48,0.12,0.02,0.32
64,0.49,0.27,0.04,0.32
128,0.51,0.41,0.12,0.33
256,0.53,0.47,0.27,0.34
512,0.55,0.47,0.31,-
1024,0.54,-,0.31,-
2048,0.54,-,0.34,-
