# For reproducibility, we used the following 'versions' of the code and datasets
- MINOTAUR commit used: https://github.com/Mirandatz/minotaur/tree/29c97d1ebc8b701c0dc4692df2f20b64616acbe3
- DATASET commit used: https://github.com/Mirandatz/minotaur.datasets/tree/99240f2ead1037c6396aec0230caabb24b0ff443

In [1]:
import os
import subprocess
import sys
from typing import Optional

import pandas as pd

In [2]:
# Separator line found between individuals in MINOTAUR's output files;
# lines containing it are discarded when the fitnesses are parsed
MINOTAUR_LINE_SEP = '==============================================================================='
# Path of the MINOTAUR executable that is launched once per fold
MINOTAUR_PATH = r'C:\Source\minotaur\Minotaur\Minotaur\bin\x64\Release\netcoreapp3.0\Minotaur.exe'
# Directory holding one `fold-<n>` sub-directory per fold, each with the
# train/test data and label csv files
DATASET_DIR = r'C:\Source\minotaur.datasets\iris\2-ready-for-minotaur'
# Directory under which one `fold-<n>` output sub-directory is created per fold
OUTPUT_DIR = r'C:\Source\minotaur.experiments\notebooks\iris-temp'
# Number of folds; folds are numbered 0 .. FOLD_COUNT - 1
FOLD_COUNT = 10

In [3]:
# Create the output directory and one sub-directory per fold, if necessary.
# os.makedirs(..., exist_ok=True) also creates missing parent directories and
# avoids the check-then-create race of an isdir()/mkdir() pair.
os.makedirs(OUTPUT_DIR, exist_ok=True)

for fold_nr in range(FOLD_COUNT):
    os.makedirs(os.path.join(OUTPUT_DIR, f'fold-{fold_nr}'), exist_ok=True)

#### In the next cells we run MINOTAUR for each fold of the dataset.

The following command line arguments are mandatory; they are paths to csv files:
- `--train-data`
- `--train-labels`
- `--test-data`
- `--test-labels`

The train files are used to guide the evolutionary process.
The test files are used to compute the final population's fitnesses.

The `--output-directory` argument defines a path for MINOTAUR to create its output files: `final-population-fitnesses.txt`, containing the fitnesses of the individuals from the final population, and `final-population-individuals.txt`, containing the individuals from the final population. Both files are stored in a human-readable format; individuals are separated from each other by a line containing only "=" symbols.

The `--classification-type` argument defines how the dataset labels should be parsed and how the consequent of rules should be generated / modified. It also defines how some metrics (like FScore) are computed.

The remaining arguments are self-explanatory

In [4]:
def run_minotaur(formated_args: str):
    """Run the MINOTAUR executable and wait for it to finish.

    Args:
        formated_args: The already formatted, space-separated command line
            arguments; they are appended verbatim after the executable path.

    Raises:
        subprocess.CalledProcessError: If MINOTAUR exits with a non-zero
            status, so that a failed run is not silently followed by the
            parsing of missing or stale output files.
    """
    subprocess_args = MINOTAUR_PATH + ' ' + formated_args
    # check=True surfaces a failed run instead of discarding the exit code
    subprocess.run(subprocess_args, check=True)

In [12]:
%%time
dfs = []
for fold_nr in range(FOLD_COUNT):
    # All input files of a fold live in the same per-fold directory
    fold_input_dir = f'{DATASET_DIR}\\fold-{fold_nr}'
    fold_output_dir = f'{OUTPUT_DIR}\\fold-{fold_nr}'

    # One entry per command line option; they are joined with single spaces
    cli_options = [
        f'--train-data={fold_input_dir}\\train-data.csv',
        f'--train-labels={fold_input_dir}\\train-labels.csv',
        f'--test-data={fold_input_dir}\\test-data.csv',
        f'--test-labels={fold_input_dir}\\test-labels.csv',
        f'--output-directory={fold_output_dir}',
        '--classification-type=singlelabel',
        '--max-generations=100',
        '--population-size=100',
    ]
    args = ' '.join(cli_options)
    run_minotaur(args)

Wall time: 9.4 s


#### In the next cells we load, parse and post-process the outputs of MINOTAUR 
One limitation of MINOTAUR's current implementation is that it assumes that all objectives must be maximized; thus, in order to optimize the `rule-count` metric, we must compute it as a negative value during MINOTAUR's execution.
In order to make `rule-count` more readable, after we parse it we multiply it by -1 to obtain the correct value.

In [6]:
def parse_fitnesses(filename: str, line_sep: Optional[str] = None) -> pd.DataFrame:
    """Parse a MINOTAUR fitnesses file into a dataframe.

    The file is stored in a human-readable format: one line per individual,
    holding comma-separated values optionally wrapped in '[' and ']', with
    separator lines between individuals.

    Args:
        filename: Path of the fitnesses file to parse.
        line_sep: Text identifying a separator line; every line containing it
            is skipped. Defaults to MINOTAUR_LINE_SEP.

    Returns:
        A dataframe with the columns 'fscore' (float), 'rule-count' (int) and
        'avg-rule-vol' (float), one row per individual. The dataframe is
        empty, with the same columns, when the file holds no individuals.
    """
    if line_sep is None:
        line_sep = MINOTAUR_LINE_SEP

    rows = []
    with open(filename, 'r') as file:
        for raw_line in file:
            # Strip only the line terminator: blindly dropping the last
            # character would eat a digit when the final line of the file
            # does not end with a newline
            line = raw_line.rstrip('\r\n')

            # Skip separators and blank lines, which hold no fitness values
            if line_sep in line or not line.strip():
                continue

            rows.append(line.replace('[', '').replace(']', '').split(','))

    if rows:
        df = pd.DataFrame(rows)
        df = df.rename(columns={0: 'fscore', 1: 'rule-count', 2: 'avg-rule-vol'})
    else:
        # Without rows there are no positional columns to rename
        df = pd.DataFrame(columns=['fscore', 'rule-count', 'avg-rule-vol'])

    # The values were read as text, so fix the data types
    df = df.astype({'fscore': 'float', 'rule-count': 'int', 'avg-rule-vol': 'float'})
    return df

In [7]:
# Start from an empty list so that re-running this cell does not append the
# same folds a second time
dfs = []
for fold_nr in range(FOLD_COUNT):
    fold_df = parse_fitnesses(f'{OUTPUT_DIR}\\fold-{fold_nr}\\final-population-fitnesses.txt')
    # MINOTAUR stores `rule-count` as a negative value; flip the sign back
    fold_df['rule-count'] = fold_df['rule-count'] * -1
    # Remember which fold each individual came from
    fold_df['fold'] = fold_nr
    dfs.append(fold_df)
df = pd.concat(dfs)

In [8]:
# Let's take a peek at the dataframe. Each row holds the fitness of one individual in a given fold.
df.head()

Unnamed: 0,fscore,rule-count,avg-rule-vol,fold
0,0.738721,3,4.6876,0
1,0.738721,3,4.6876,0
2,0.731313,2,6.486499,0
3,0.678115,3,4.785832,0
4,0.678115,3,4.785832,0


In [9]:
# Here we select the best individual of each fold, with 'best' meaning highest f-score.
# Individuals sharing the same f-score within a fold are collapsed to a single row first.
distinct_values = df.sort_values(by='fscore', ascending=False).drop_duplicates(['fold', 'fscore'])
# The string alias 'max' is used instead of the builtin `max`, whose use as an
# aggregation callable is deprecated in recent pandas versions
indices_of_best = distinct_values.groupby('fold')['fscore'].transform('max') == distinct_values['fscore']
best_individuals = distinct_values[indices_of_best].sort_values(by='fold')

In [10]:
# Display the individuals selected as the best of their fold
best_individuals

Unnamed: 0,fscore,rule-count,avg-rule-vol,fold
0,0.738721,3,4.6876,0
0,0.93266,2,7.592149,1
1,0.78022,2,7.535749,2
56,0.865993,4,3.2206,3
58,0.93266,4,3.369175,4
26,0.866667,4,3.02435,5
13,0.866667,2,4.807399,6
9,0.866667,2,5.750599,7
0,0.664647,2,7.456299,8
5,1.0,3,8.559334,9


In [11]:
# Mean of each metric over the selected individuals; the 'fold' column is
# dropped first so that the fold number is not averaged as if it were a metric
best_individuals.drop(columns='fold').mean()

fscore          0.851490
rule-count      2.800000
avg-rule-vol    5.600325
dtype: float64