In [None]:
import random
import pandas as pd
from copy import deepcopy

In [None]:
from sklearn.metrics import mean_absolute_error, r2_score
from sklearn.model_selection import train_test_split

In [None]:
from genopt.optimizer import SGA
from genopt.scalers import sigma_trunc_scaling
from genopt.selectors import tournament_selection, roulette_wheel_selection, linear_rank_selection
from genopt.crossovers import one_point_crossover, two_point_crossover
from genopt.mutators import uniform_mutation

## Consensus search

In [None]:
# random search
def random_search(x, n_iter=1000, cons_size=10, seed=None):
    """Find a good consensus of models by random sampling.

    Draws ``n_iter`` random subsets of model columns, scores the row-wise
    mean prediction of each subset against ``x['TRUE']`` with R2, and
    returns the best-scoring subset (the first one found in case of ties).

    Parameters
    ----------
    x : pandas.DataFrame
        Prediction table. Column 0 must be the ``'TRUE'`` target column;
        every other column holds the predictions of one model.
    n_iter : int, default 1000
        Number of random subsets to evaluate.
    cons_size : int, default 10
        Number of models per consensus. Clamped to the number of available
        model columns so that small tables do not make ``sample`` fail.
    seed : int or None, default None
        If given, a private ``random.Random(seed)`` is used so the search
        is reproducible; otherwise the global ``random`` state is used.

    Returns
    -------
    list of int
        Positional column indices (into ``x.columns``) of the best consensus.

    Raises
    ------
    ValueError
        If ``x`` has no model columns, or ``n_iter``/``cons_size`` is < 1.
    """
    if n_iter < 1 or cons_size < 1:
        raise ValueError('n_iter and cons_size must both be >= 1')
    # Position 0 is TRUE; every remaining position, including the last
    # one, is a candidate model.
    candidates = range(1, len(x.columns))
    if len(candidates) == 0:
        raise ValueError('x must contain at least one model column besides TRUE')

    k = min(cons_size, len(candidates))
    rng = random if seed is None else random.Random(seed)
    y_true = x['TRUE']

    best_cons, best_acc = None, float('-inf')
    for _ in range(n_iter):
        cons = rng.sample(candidates, k=k)
        acc = r2_score(y_true, x[x.columns[cons]].mean(axis=1))
        # Strict ">" keeps the earliest subset among equally good ones.
        if best_cons is None or acc > best_acc:
            best_cons, best_acc = cons, acc
    return best_cons

# systematic search
def systematic_search(x):
    """Pick the best "top-n" consensus from a prediction table.

    Evaluates the consensus made of the first 1, 2, ..., N model columns
    (in the column order of ``x``; the benchmark cell in this notebook
    passes them ordered best-first) and returns the prefix whose row-wise
    mean prediction has the highest R2 against ``x['TRUE']``.

    Parameters
    ----------
    x : pandas.DataFrame
        Prediction table. Column 0 must be the ``'TRUE'`` target column;
        every other column holds the predictions of one model.

    Returns
    -------
    list or None
        Column labels of the best consensus (the shortest prefix in case
        of ties), or ``None`` when ``x`` has no model columns.
    """
    y_true = x['TRUE']
    model_cols = x.columns[1:]  # skip TRUE column

    # R2 can be negative, so the running best must start below any real
    # score; starting at 0 would leave best_cons as None on weak models.
    best_acc, best_cons = float('-inf'), None
    for n in range(1, len(model_cols) + 1):
        current_cons = model_cols[:n]
        acc = r2_score(y_true, x[current_cons].mean(axis=1))
        if acc > best_acc:
            best_acc, best_cons = acc, list(current_cons)

    return best_cons

# genetic search
def genetic_search(x, verbose=False, cons_size=10, pop_size=50, n_iter=200):
    """Search for a consensus of models with a genetic algorithm.

    Configures a ``genopt`` ``SGA`` optimizer to maximize the R2 between
    ``x['TRUE']`` and the row-wise mean prediction of the selected model
    columns, runs it, and returns the optimizer's best individual.

    Parameters
    ----------
    x : pandas.DataFrame
        Prediction table. Column 0 must be the ``'TRUE'`` target column;
        every other column holds the predictions of one model.
    verbose : bool, default False
        Forwarded to ``SGA.run``.
    cons_size : int, default 10
        Passed to ``SGA.initialize`` as ``steps``.
        NOTE(review): presumably the number of genes, i.e. models per
        consensus - confirm against genopt.
    pop_size : int, default 50
        Population size passed to ``SGA``.
    n_iter : int, default 200
        Number of iterations passed to ``SGA.run``.

    Returns
    -------
    The value of ``SGA.best_individual()``. The benchmark cell uses it to
    index ``x.columns``, so it is expected to be a sequence of positional
    column indices.
    """
    y_true = x['TRUE']

    def objective(cons):
        # Fitness of one individual: R2 of the averaged predictions of
        # the columns it selects.
        return r2_score(y_true, x[x.columns[cons]].mean(axis=1))

    # Position 0 is TRUE; every remaining position, including the last
    # one, is a candidate model.
    space = range(1, len(x.columns))

    ga = SGA(task='maximize', pop_size=pop_size, cross_prob=0.8, mut_prob=0.2, elitism=True)
    ga.set_fitness(objective)
    ga.set_crossover_type(one_point_crossover)
    ga.set_mutator_type(uniform_mutation)
    ga.set_selector_type(tournament_selection)
    ga.set_scaler_type(sigma_trunc_scaling)
    ga.initialize(space, steps=cons_size)
    ga.run(n_iter=n_iter, verbose=verbose)

    return ga.best_individual()

## All benchmarks

In [None]:
import os
import numpy as np
from sklearn.model_selection import StratifiedKFold

In [None]:
# Seed the RNGs once so the stochastic searches give the same numbers on
# every Restart & Run All (same value as the split's random_state).
# numpy is seeded as well in case the GA draws from it - not visible here.
SEED = 42
random.seed(SEED)
np.random.seed(SEED)

BENCH_DIR = 'bench_results'


def _consensus_r2(frame, cols):
    """R2 of the row-wise mean of columns `cols` against frame['TRUE']."""
    return r2_score(frame['TRUE'], frame[cols].mean(axis=1))


# os.listdir order is arbitrary, so sort for a deterministic dataset order.
# Hidden entries and directories (e.g. .ipynb_checkpoints) are not
# benchmark tables and would make read_csv fail.
bench_files = sorted(
    fname for fname in os.listdir(BENCH_DIR)
    if not fname.startswith('.') and os.path.isfile(os.path.join(BENCH_DIR, fname))
)

RESULTS_LIST = []
for n, fname in enumerate(bench_files):
    # load data
    df = pd.read_csv(os.path.join(BENCH_DIR, fname), index_col='Unnamed: 0')
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=42)

    # Rank the model columns by validation R2 (best first) and put TRUE
    # explicitly in position 0, which the search functions rely on.
    # Ranking TRUE together with the models could push it out of position
    # 0 when a model also reaches R2 == 1.0.
    model_cols = [col for col in df_val.columns if col != 'TRUE']
    ranked = sorted(model_cols, key=lambda col: r2_score(df_val['TRUE'], df_val[col]), reverse=True)
    df_val_sorted = df_val[['TRUE'] + ranked]

    # best model
    best_model = df_val_sorted.columns[1:2]
    # random search
    best_individual_random = random_search(df_val_sorted)
    best_cons_random = df_val_sorted.columns[best_individual_random]
    # systematic search
    best_cons_sys = systematic_search(df_val_sorted)
    # genetic search
    best_individual = genetic_search(df_val_sorted, verbose=False)
    best_cons_ga = df_val_sorted.columns[best_individual]

    # Score every strategy on the validation split (used for the search)
    # and on the held-out test split, all through the same code path.
    consensuses = {
        'BEST_MODEL': best_model,
        'SYSTEMATIC': best_cons_sys,
        'RANDOM': best_cons_random,
        'GENETIC': best_cons_ga,
    }
    results = pd.DataFrame()
    for label, cons in consensuses.items():
        results.loc[label, 'VAL'] = _consensus_r2(df_val_sorted, cons)
        results.loc[label, 'TEST'] = _consensus_r2(df_test, cons)
    results = results.round(2)
    #
    RESULTS_LIST.append(results)

    # Running average over the datasets processed so far.
    AVERAGE_ACC = sum(RESULTS_LIST) / len(RESULTS_LIST)
    AVERAGE_ACC = AVERAGE_ACC.round(2)

    # '\r' rewrites one status line instead of flooding the output.
    print(n,
          AVERAGE_ACC.loc['BEST_MODEL', 'TEST'],
          AVERAGE_ACC.loc['SYSTEMATIC', 'TEST'],
          AVERAGE_ACC.loc['RANDOM', 'TEST'],
          AVERAGE_ACC.loc['GENETIC', 'TEST'],
          end='\r')

In [None]:
# Report how many benchmark datasets were evaluated, then show the
# element-wise mean of their score tables rounded to two decimals.
print(f'Total number of datasets: {len(RESULTS_LIST)}')
AVERAGE_ACC = (sum(RESULTS_LIST) / len(RESULTS_LIST)).round(2)
AVERAGE_ACC

In [None]:
# Display the averaged score table again.
AVERAGE_ACC