In [None]:
import os
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from utils import get_directories

In [None]:
# Look up the project directories and register a dedicated CSV sub-directory
# for the current experiment.
# NOTE(review): get_directories is a project helper; it is used here as a
# mapping of name -> directory path (indexed by key, iterated with .items()).
DIRS = get_directories(os.path.join(os.path.abspath(''), 'explore.ipynb'))
exp_id = 'ES_exp2'
DIRS['csv_exp'] = DIRS['csv'] + exp_id + os.sep
# makedirs(exist_ok=True) creates missing parent directories as well and
# avoids the check-then-create race of `if not exists: mkdir`.
os.makedirs(DIRS['csv_exp'], exist_ok=True)

# print all known directories for a quick sanity check
for n, p in DIRS.items():
    print(f'{n}: {p}')

In [None]:
def plot_split(dfs: list[pd.DataFrame], headers: list[str], title: str) -> plt.Figure:
    """
    Plot the "raw" results of 2 to 4 dataframes side by side, each overlaid with a histogram.

    Every dataframe gets its own subplot: the values of its `max_avg_value`
    column are drawn as tick markers (in reversed row order) against their
    position, and a horizontal histogram of the same values is overlaid on a
    twin axis. All subplots use the same y-limits and the same histogram bins,
    derived from the values of all dataframes together.
    2 & 3 dfs will be shown in one row, 4 will result in a 2x2 grid.

    Parameters
    ----------
    dfs : list of 2, 3 or 4 dataframes, each with a numeric `max_avg_value` column.
    headers : titles for the subplots, in the same order as the dataframes.
    title : title for the whole figure.

    Returns
    -------
    The created matplotlib figure.

    Raises
    ------
    AssertionError
        If the number of dataframes is not 2, 3 or 4, or if the number of
        headers differs from the number of dataframes.
    """
    assert len(dfs) > 1 and len(dfs) < 5, "Please only give 2, 3 or 4 dataframes"
    assert len(dfs) == len(headers), "Please give as many headers as dataframes"
    ndfs = len(dfs)
    n_bins = 50

    if ndfs != 4:
        fig, axes = plt.subplots(1, ndfs, figsize=(5*ndfs, 5), sharey=True)
    else:
        fig, axes = plt.subplots(2, 2, figsize=(10, 10))
        # spread axes so we can loop over them
        axes = list(axes[0]) + list(axes[1])

    # coerce to float arrays once; the column may be stored with object dtype
    values = [df['max_avg_value'].to_numpy(dtype=float) for df in dfs]

    # common bounds for all subplots, computed over every dataframe
    all_values = np.concatenate(values)
    finite = all_values[np.isfinite(all_values)]
    if finite.size:
        low, high = float(finite.min()), float(finite.max())
    else:
        # nothing to plot: fall back to an arbitrary unit range instead of NaN limits
        low, high = 0.0, 1.0
    span = high - low
    # small delta to avoid clipping; when all values coincide the span is 0,
    # so use a fixed margin to keep the limits and the bins non-degenerate
    delta = span * 0.01 if span > 0 else 0.01 * max(abs(low), 1.0)
    bounds = (low - delta, high + delta)

    # linspace includes the upper bound. np.arange(lo, hi, step) stops one
    # step short of `hi`, which left the largest values outside every bin.
    bins = np.linspace(bounds[0], bounds[1], n_bins + 1)

    for i, (vals, ax) in enumerate(zip(values, axes)):
        # observations: x is the row position, values are drawn in reversed row order
        ax.scatter(np.arange(len(vals)), vals[::-1], c=f'C{i}', marker='|', s=10)
        ax.set_title(headers[i])
        if i == 0:
            ax.set_ylabel('validation accuracy')
        ax.set_xlabel('frequency / ranking')
        ax.set_xticks([])
        ax.set_ylim(bounds)

        # histogram on a twin axis so its count axis does not disturb the scatter
        ax2 = ax.twinx()
        ax2.hist(vals, bins=bins, orientation='horizontal', color=f'C{i}', alpha=0.5)
        ax2.grid(False)
        ax2.set_yticks([])
        ax2.set_xticks([])
        ax2.set_title('')
        ax2.set_ylim(bounds)

    fig.suptitle(title, fontsize=16, weight='bold')
    fig.tight_layout()
    return fig

In [None]:
# Build one summary row per run from the CSV files of this experiment.
# File names are expected to look like
#   ES_<pop_size>_<mu>_<lambda>_<recombination>_<sigma>_<tau>_<chunk_size>_<individual_sigmas>.csv
# The parameters are kept as the strings found in the file name.
param_names = ['pop_size', 'mu', 'lambda', 'recombination', 'sigma', 'tau', 'chunk_size', 'individual_sigmas']
file_prefix, file_suffix = 'ES_', '.csv'

records = []
# sorted() makes the row order independent of the file system's listing order
for file in sorted(os.listdir(DIRS['csv_exp'])):
    if not file.endswith(file_suffix):
        continue
    if not file.startswith(file_prefix):
        raise ValueError(f"Unexpected file name {file!r}: expected prefix {file_prefix!r}")

    run_id = file[len(file_prefix):-len(file_suffix)]  # trim 'ES_' and '.csv'
    params = run_id.split('_')
    if len(params) != len(param_names):
        raise ValueError(
            f"Unexpected file name {file!r}: expected {len(param_names)} "
            f"'_'-separated parameters, got {len(params)}"
        )

    df_ = pd.read_csv(os.path.join(DIRS['csv_exp'], file), index_col=0)
    # mean across all columns, taken from the last row of the file
    # NOTE(review): stored as "max" value; presumably the last row is the final
    # generation and holds the best result — confirm against the experiment code.
    max_val = df_.mean(axis=1).iloc[-1]
    records.append((run_id, max_val, *params))

# create the frame once instead of growing it row by row; this also keeps
# max_avg_value numeric instead of object dtype
df_ES = pd.DataFrame(records, columns=['run_id', 'max_avg_value', *param_names]).set_index('run_id')

# sort by highest value (stable sort, so ties keep the file-name order)
df_ES = df_ES.sort_values(by='max_avg_value', ascending=False, kind='mergesort')

# save to csv with same name as dir where all individual csv files are stored
df_ES.to_csv(os.path.join(DIRS['csv'], f'{exp_id}.csv'))

In [None]:
# display the aggregated results table (one row per run_id)
df_ES

In [None]:
# remove outliers: keep only runs whose max_avg_value is above the cutoff
# NOTE(review): the comment here used to say 0.93 while the code filtered at
# 0.92; the code's value is kept — confirm which cutoff was intended.
OUTLIER_CUTOFF = 0.92
df_ES = df_ES[df_ES['max_avg_value'] > OUTLIER_CUTOFF]

In [None]:
# (mu,lambda) vs. (mu+lambda) selection
# runs where lambda equals pop_size are treated as comma selection, all others as plus selection
df_comma = df_ES[df_ES['lambda'] == df_ES['pop_size']]
df_plus = df_ES[df_ES['lambda'] != df_ES['pop_size']]
# labels use the conventional order (mu first), matching the comment above
fig = plot_split([df_comma, df_plus], ['\u03bc,\u03bb', '\u03bc+\u03bb'], '\u03bc,\u03bb vs. \u03bc+\u03bb in ES')
# NOTE(review): the file name is hard-coded with an 'ES1' suffix while exp_id is
# 'ES_exp2' — confirm this does not overwrite the plots of another experiment.
fig.savefig(DIRS['plots'] + 'elitism_ES1.png', dpi=300)

In [None]:
# population size: 40 vs. 100 (the column is compared against string values)
pop_size_col = df_ES['pop_size']
df_40 = df_ES.loc[pop_size_col == '40']
df_100 = df_ES.loc[pop_size_col == '100']
fig = plot_split(
    [df_40, df_100],
    ['40', '100'],
    'population size in ES',
)
fig.savefig(DIRS['plots'] + 'pop_size_ES1.png', dpi=300)

In [None]:
# compare the tau settings 0.1, 0.2, 0.5 and 0.99 (matched as strings)
tau_levels = ['0.1', '0.2', '0.5', '0.99']
df_tau1, df_tau2, df_tau5, df_tau99 = (
    df_ES[df_ES['tau'] == level] for level in tau_levels
)
fig = plot_split(
    [df_tau1, df_tau2, df_tau5, df_tau99],
    tau_levels,
    r'$\mathbf{\tau}$ in ES',
)
fig.savefig(DIRS['plots'] + 'tau_ES1.png', dpi=300)

In [None]:
# compare the sigma settings 0.01, 0.1 and 0.5 (matched as strings)
sigma_frames = {
    level: df_ES[df_ES['sigma'] == level]
    for level in ('0.01', '0.1', '0.5')
}
df_sig1, df_sig2, df_sig5 = sigma_frames.values()
fig = plot_split(
    list(sigma_frames.values()),
    list(sigma_frames.keys()),
    r'$\mathbf{\sigma}$ in ES',
)
fig.savefig(DIRS['plots'] + 'sigma_ES1.png', dpi=300)

In [None]:
# compare the recombination methods; keys are the codes stored in the
# 'recombination' column, values the subplot titles
recombination_labels = {
    'd': 'discrete',
    'i': 'intermediate',
    'dg': 'discrete global',
    'ig': 'intermediate global',
}
df_d, df_i, df_dg, df_ig = (
    df_ES[df_ES['recombination'] == code] for code in recombination_labels
)
fig = plot_split(
    [df_d, df_i, df_dg, df_ig],
    list(recombination_labels.values()),
    'recombination in ES',
)
fig.savefig(DIRS['plots'] + 'recombination_ES1.png', dpi=300)

In [None]:
# runs with vs. without individual sigmas (flag is matched as the strings 'True' / 'False')
isig_flag = df_ES['individual_sigmas']
df_isig = df_ES.loc[isig_flag == 'True']
df_no_isig = df_ES.loc[isig_flag == 'False']
fig = plot_split(
    [df_isig, df_no_isig],
    ['True', 'False'],
    'individual sigmas in ES',
)
fig.savefig(DIRS['plots'] + 'individual_sigmas_ES1.png', dpi=300)

In [None]:
# compare chunk sizes 3 and 7 (matched as strings)
chunk_sizes = ('3', '7')
df_chsz3, df_chsz7 = [df_ES[df_ES['chunk_size'] == size] for size in chunk_sizes]
fig = plot_split(
    [df_chsz3, df_chsz7],
    list(chunk_sizes),
    'chunk size in ES',
)