# For reproducibility, we used the following 'versions' of the code and datasets
- MINOTAUR commit used: https://github.com/Mirandatz/minotaur/tree/511ad218c26c864a8258d7426c3c53e95ae630c3
- DATASET commit used: https://github.com/Mirandatz/minotaur.datasets/tree/4b8bf15bc2b46884d1e30c8b468ef137bef937ef

In [1]:
import subprocess
from pathlib import Path

import pandas as pd

In [2]:
# Line of "=" characters that MINOTAUR writes between individuals in its output files.
MINOTAUR_LINE_SEP = '==============================================================================='

# Location of the MINOTAUR executable (release build).
MINOTAUR_PATH = Path(
    'c:/', 'source', 'minotaur', 'minotaur', 'minotaur',
    'bin', 'x64', 'release', 'netcoreapp3.0', 'Minotaur.exe',
)

# Directory holding the iris dataset folds, already prepared for MINOTAUR.
DATASET_DIR = Path('c:/', 'source', 'minotaur.datasets', 'iris', '2-ready-for-minotaur')

# Results are written below the directory the notebook is executed from.
OUTPUT_DIR = Path.cwd().joinpath('iris-output')

# Number of dataset folds (fold-0 ... fold-9).
FOLD_COUNT = 10

In [3]:
# Make sure the root output directory and one sub-directory per fold exist;
# directories that are already there are left untouched.
OUTPUT_DIR.mkdir(exist_ok=True, parents=True)

for fold_nr in range(FOLD_COUNT):
    fold_output_dir = OUTPUT_DIR.joinpath(f'fold-{fold_nr}')
    fold_output_dir.mkdir(exist_ok=True, parents=True)

#### In the next cells we run MINOTAUR for each fold of the dataset.

The following command line arguments are mandatory; they are paths to csv files:
- `--train-data`
- `--train-labels`
- `--test-data`
- `--test-labels`

The train files are used to guide the evolutionary process.
The test files are used to compute the final population's fitnesses.

The `--output-directory` argument defines the directory where MINOTAUR creates its output files: `final-population-fitnesses.txt`, containing the fitnesses of the individuals from the final population, and `final-population-individuals.txt`, containing the individuals from the final population. Both files are stored in a human-readable format; individuals are separated by a line containing only "=" symbols.

The `--classification-type` argument defines how the dataset labels should be parsed and how the consequent of rules should be generated / modified. It also defines how some metrics (like FScore) are computed.

The remaining arguments are self-explanatory

In [4]:
def run_minotaur(formated_args: str):
    """Run the MINOTAUR executable with the given command-line arguments.

    Args:
        formated_args: All arguments already formatted as a single
            space-separated string (e.g. ``'--max-generations=100 ...'``).

    Raises:
        subprocess.CalledProcessError: If MINOTAUR exits with a non-zero
            status, so that a failed run is not silently followed by the
            parsing of missing or stale output files.
    """
    command = f'{MINOTAUR_PATH} {formated_args}'
    # check=True surfaces a failed run instead of discarding the exit code.
    subprocess.run(command, check=True)

In [5]:
%%time
# `dfs` collects the per-fold dataframes that are parsed in a later cell.
dfs = []

# Run MINOTAUR once per fold, pointing it at that fold's train/test files
# and at the matching output directory.
for fold_nr in range(FOLD_COUNT):
    fold_name = f'fold-{fold_nr}'
    fold_dataset_dir = DATASET_DIR / fold_name
    args = ' '.join([
        f"--train-data={fold_dataset_dir / 'train-data.csv'}",
        f"--train-labels={fold_dataset_dir / 'train-labels.csv'}",
        f"--test-data={fold_dataset_dir / 'test-data.csv'}",
        f"--test-labels={fold_dataset_dir / 'test-labels.csv'}",
        f"--output-directory={OUTPUT_DIR / fold_name}",
        '--classification-type=singlelabel',
        '--fitness-metrics=fscore',
        '--fitness-metrics=average-rule-volume',
        '--max-generations=100',
        '--population-size=100',
        '--mutants-per-generation=150',
        '--cfsbe-target-instance-coverage=50',
        '--sanity-checks=false',
    ])
    run_minotaur(args)

Wall time: 8.75 s


#### In the next cells we load, parse and post-process the outputs of MINOTAUR 
One limitation of MINOTAUR's current implementation is that it assumes that all objectives must be maximized; thus, in order to optimize the `rule-count` metric, we must compute it as a negative value during MINOTAUR's execution.
In order to make `rule-count` more readable, after parsing it we multiply it by -1 to obtain the correct value.

In [6]:
def parse_fitnesses(path: Path) -> pd.DataFrame:
    """Parse a MINOTAUR fitnesses file into a dataframe.

    The file is stored in a human-readable format: one line per individual
    holding its comma-separated fitness values, optionally wrapped in
    '[' and ']', with separator lines of "=" symbols in between.

    Args:
        path: Path to the fitnesses file written by MINOTAUR.

    Returns:
        A dataframe with one row per individual. The first two values of
        each line become the float columns 'fscore' and 'avg-rule-vol'.
        If the file holds no fitness lines, an empty dataframe with those
        two float columns is returned.
    """
    rows = []
    for raw_line in path.read_text().splitlines():
        # Strip surrounding whitespace (including stray end-of-line
        # characters) rather than blindly dropping the last character,
        # which would eat a digit on a line with no trailing character.
        line = raw_line.strip()

        # Skip blank lines and the separators between individuals
        if not line or MINOTAUR_LINE_SEP in line:
            continue

        rows.append(line.replace('[', '').replace(']', '').split(','))

    if not rows:
        # Nothing to parse: keep the schema so callers can still concatenate
        return pd.DataFrame({
            'fscore': pd.Series(dtype='float'),
            'avg-rule-vol': pd.Series(dtype='float'),
        })

    # Creating the dataframe, fixing the column names and data types
    df = pd.DataFrame(rows)
    df = df.rename(columns={0: 'fscore', 1: 'avg-rule-vol'})
    df = df.astype({'fscore': 'float', 'avg-rule-vol': 'float'})
    return df

In [7]:
# Parse the fitnesses of every fold and stack them into a single dataframe.
# The list is rebuilt from scratch here so that re-running this cell does not
# append the same folds a second time.
dfs = []
for fold_nr in range(FOLD_COUNT):
    fold_path = OUTPUT_DIR / f'fold-{fold_nr}' / 'final-population-fitnesses.txt'
    fold_df = parse_fitnesses(fold_path)
    # Tag every individual with the fold it came from
    fold_df['fold'] = fold_nr
    dfs.append(fold_df)
df = pd.concat(dfs)

In [8]:
# Let's take a peek at the dataframe. Each row holds the fitness of one individual in a given fold.
df.head()

Unnamed: 0,fscore,avg-rule-vol,fold
0,0.738721,7.283749,0
1,0.738721,7.283749,0
2,0.738721,7.283749,0
3,0.738721,7.283749,0
4,0.738721,7.283749,0


In [9]:
# Select the best individual of each fold, where 'best' means highest f-score.
# Individuals sharing the same f-score within a fold are first collapsed into a single row.
distinct_values = df.sort_values(by='fscore', ascending=False).drop_duplicates(['fold', 'fscore'])
# The aggregation is passed by name: handing the builtin `max` to transform is deprecated in recent pandas.
indices_of_best = distinct_values.groupby('fold')['fscore'].transform('max') == distinct_values['fscore']
best_individuals = distinct_values[indices_of_best].sort_values(by='fold')

In [10]:
# Show the best individual selected for each fold.
best_individuals

Unnamed: 0,fscore,avg-rule-vol,fold
0,0.738721,7.283749,0
98,0.93266,3.37315,1
72,0.93266,2.412199,2
94,0.865993,3.562074,3
93,0.93266,6.98,4
0,0.861111,12.196065,5
19,0.866667,7.408399,6
88,0.866667,2.1613,7
55,0.796296,4.665333,8
54,0.87037,3.27775,9


In [11]:
# Mean f-score of the best individuals across all folds.
best_individuals.drop(['fold', 'avg-rule-vol'], axis='columns').mean()

fscore    0.866381
dtype: float64