In [1]:
import os
import random
random.seed(42)
import pandas as pd
from copy import deepcopy

In [2]:
from sklearn.metrics import mean_absolute_error, r2_score, root_mean_squared_error
from sklearn.model_selection import train_test_split

In [3]:
from genopt.optimizer import GeneticAlgorithm
from genopt.scaler import sigma_trunc_scaling
from genopt.selector import tournament_selection, roulette_wheel_selection, linear_rank_selection
from genopt.crossover import one_point_crossover, two_point_crossover
from genopt.mutator import uniform_mutation

## Shared

In [4]:
def calc_accuracy(y_true, y_pred, metric='mae'):
    if metric == 'mae':
        acc = mean_absolute_error(y_true, y_pred)
    elif metric == 'r2':
        acc = r2_score(y_true, y_pred)
    elif metric == 'rmse':
        acc = root_mean_squared_error(y_true, y_pred)

    return acc

## Random search

In [5]:
class RandomSearchRegressor:
    def __init__(self, cons_size=10, n_iter=5000, metric='mae'):
        super().__init__()

        self.cons_size = cons_size
        self.n_iter = n_iter
        self.metric = metric

    def run(self, x, y):
        
        cons_list = []
        for _ in range(self.n_iter):
            
            random_cons = random.sample(range(len(x.columns)), k=self.cons_size) # skip TRUE column
            y_cons = x[x.columns[random_cons]].mean(axis=1)
            
            acc = calc_accuracy(y, y_cons, metric=self.metric)
            cons_list.append((random_cons, acc))
            #
            if self.metric in ['mae', 'rmse']:
                cons_list = sorted(cons_list, key=lambda x: x[1], reverse=False) # minimize
            elif self.metric in ['r2']:
                cons_list = sorted(cons_list, key=lambda x: x[1], reverse=True) # maximize
                
            best_cons = cons_list[0][0]
            best_cons = x.columns[best_cons]
            
            return best_cons

## Systematic search

In [6]:
class SystematicSearchRegressor:
    def __init__(self, cons_size=10, metric='mae'):
        super().__init__()

        self.cons_size = cons_size
        self.metric = metric

    def run(self, x, y):

        tmp = []
        for model in x.columns:
            acc = calc_accuracy(y, x[model], metric=self.metric)
            tmp.append((model, acc))
            
        if self.metric in ['mae', 'rmse']:
            tmp_sorted = sorted(tmp, key=lambda x:x[1], reverse=False) # minimize
        elif self.metric in ['r2']:
            tmp_sorted = sorted(tmp, key=lambda x:x[1], reverse=True) # maximize 

        x_sorted = x[[i[0] for i in tmp_sorted]]
            
        best_cons = x_sorted.columns[:self.cons_size]
        
        return best_cons

## Genetic search

In [7]:
# version without model duplication in consensus
def one_point_crossover(mother, father):
    sister = deepcopy(mother)
    brother = deepcopy(father)

    for _ in range(100):
        cut = random.randint(1, len(mother) - 1)
    
        sister[cut:] = father[cut:]
        brother[cut:] = mother[cut:]

        if len(set(sister.container)) == len(set(brother.container)) == len(sister.container):
            break

    return sister, brother

# version without model duplication in consensus
def uniform_mutation(individual, space, prob=0):

    for _ in range(100):
        for n, gen in enumerate(individual):
            if random.random() < prob:
                individual[n] = random.choice(space)
        if len(set(individual.container)) == len(individual.container):
            return individual
            
    return individual

class GeneticSearchRegressor:
    def __init__(self, cons_size=10, n_iter=200, metric='mae'):
        super().__init__()

        self.cons_size = cons_size
        self.n_iter = n_iter
        self.metric = metric

    def run(self, x, y):
        
        def objective(cons):
            y_cons = x[x.columns[cons]].mean(axis=1)
            acc = calc_accuracy(y, y_cons, metric=self.metric)
            return acc
        #
        space = range(len(x.columns))
        if self.metric in ['mae', 'rmse']:
            task = 'minimize'
        elif self.metric in ['r2']:
            task = 'maximize'
        #
        ga = GeneticAlgorithm(task=task, pop_size=50, cross_prob=0.8, mut_prob=0.2, elitism=True)
        ga.set_fitness(objective)
        ga.set_selector_type(tournament_selection)
        ga.set_crossover_type(one_point_crossover)
        ga.set_mutator_type(uniform_mutation)
        ga.set_scaler_type(sigma_trunc_scaling)
        #
        ga.initialize(space, steps=self.cons_size)   
        ga.run(n_iter=200, verbose=False)
        #
        best_cons = ga.best_individual()
        best_cons = x.columns[best_cons]
        #
        return best_cons

## Single dataset benchmark

In [8]:
method_list = [(SystematicSearchRegressor(cons_size=1, metric='rmse'), 'Best'),
               (RandomSearchRegressor(cons_size=10, n_iter=5000, metric='rmse'), 'Random'),
               (SystematicSearchRegressor(cons_size=10, metric='rmse'), 'Systematic'),
               (GeneticSearchRegressor(cons_size=10, metric='rmse'), 'Genetic')]

In [9]:
res_df_val = pd.DataFrame()
res_df_test = pd.DataFrame()
#
bench_file = 'benchmark/md/md_dens_2D_SIL.csv'

# load data
df = pd.read_csv(bench_file)
df_val, df_test = train_test_split(df, test_size=0.3, random_state=42)
#
x_val, y_val = df_val.iloc[:, 1:], df_val.iloc[:, 0]
x_test, y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0]

# run ensemble search
for method_func, method_name in method_list:
    cons = method_func.run(x_val, y_val)
    y_pred = df_val[cons].mean(axis=1)
    #
    res_df_val.loc[method_name, 'RMSE'] = calc_accuracy(y_val, x_val[cons].mean(axis=1), metric='rmse')
    res_df_val.loc[method_name, 'MAE'] = calc_accuracy(y_val, x_val[cons].mean(axis=1), metric='mae')
    res_df_val.loc[method_name, 'R2'] = calc_accuracy(y_val, x_val[cons].mean(axis=1), metric='r2')
    #
    res_df_test.loc[method_name, 'RMSE'] = calc_accuracy(y_test, x_test[cons].mean(axis=1), metric='rmse')
    res_df_test.loc[method_name, 'MAE'] = calc_accuracy(y_test, x_test[cons].mean(axis=1), metric='mae')
    res_df_test.loc[method_name, 'R2'] = calc_accuracy(y_test, x_test[cons].mean(axis=1), metric='r2')

In [10]:
res_df_val.round(2)

Unnamed: 0,RMSE,MAE,R2
Best,79.23,57.59,0.8
Random,213.59,183.76,-0.47
Systematic,65.02,45.27,0.86
Genetic,55.09,36.78,0.9


In [11]:
res_df_test.round(2)

Unnamed: 0,RMSE,MAE,R2
Best,97.16,69.97,0.68
Random,203.86,179.27,-0.39
Systematic,71.52,55.48,0.83
Genetic,75.01,57.21,0.81


## Many datasets benchmark

In [12]:
import os
import numpy as np
from collections import Counter
from tqdm import tqdm

In [13]:
method_list = [(SystematicSearchRegressor(cons_size=1, metric='rmse'), 'Best'),
               (RandomSearchRegressor(cons_size=10, n_iter=5000, metric='rmse'), 'Random'),
               (SystematicSearchRegressor(cons_size=10, metric='rmse'), 'Systematic'),
               (GeneticSearchRegressor(cons_size=10, metric='rmse'), 'Genetic')]

In [14]:
res_df_val = pd.DataFrame()
res_df_test = pd.DataFrame()

# bench_dir = 'benchmark/chembl'
bench_dir = 'benchmark/molnet'
# bench_dir = 'benchmark/cliff'
# bench_dir = 'benchmark/md'
# bench_dir = 'benchmark/denmark'

data_path = os.listdir(bench_dir)
for bench_file in tqdm(data_path):

    bench_name = bench_file.split('.')[0]

    # load data
    df = pd.read_csv(os.path.join(bench_dir, bench_file))
    df_val, df_test = train_test_split(df, test_size=0.3, random_state=42)
    #
    x_val, y_val = df_val.iloc[:, 1:], df_val.iloc[:, 0]
    x_test, y_test = df_test.iloc[:, 1:], df_test.iloc[:, 0]
    #
    for method_func, method_name in method_list:
        cons = method_func.run(x_val, y_val)
        y_pred = x_val[cons].mean(axis=1)
        #
        res_df_val.loc[bench_name, method_name] = calc_accuracy(y_val, x_val[cons].mean(axis=1), metric='r2')
        res_df_test.loc[bench_name, method_name] = calc_accuracy(y_test, x_test[cons].mean(axis=1), metric='r2')

100%|█| 10/10 [01:44<00:00, 10.40s


In [15]:
res_df_val.round(2)

Unnamed: 0,Best,Random,Systematic,Genetic
malaria,0.38,0.33,0.4,0.42
lmc_mouse,0.29,0.23,0.3,0.33
PDBbindF,0.46,0.48,0.49,0.51
PDBbindR,0.41,0.41,0.43,0.46
solubility,0.89,0.87,0.92,0.93
lipophilicity,0.59,0.59,0.63,0.73
freesolv,0.93,0.82,0.94,0.94
lmc_human,0.42,0.38,0.45,0.49
PDBbindC,0.41,0.42,0.43,0.47
lmc_rat,0.43,0.44,0.46,0.52


In [16]:
res_df_test.round(2)

Unnamed: 0,Best,Random,Systematic,Genetic
malaria,0.27,0.26,0.33,0.34
lmc_mouse,0.41,0.49,0.47,0.43
PDBbindF,0.42,0.48,0.49,0.5
PDBbindR,0.55,0.53,0.58,0.6
solubility,0.9,0.81,0.89,0.86
lipophilicity,0.63,0.63,0.67,0.75
freesolv,0.95,0.84,0.95,0.95
lmc_human,0.41,0.4,0.45,0.47
PDBbindC,0.56,0.55,0.57,0.59
lmc_rat,0.43,0.55,0.52,0.54


## Meta stat

In [17]:
tmp = []
for dataset in res_df_test.index:
    min_i = res_df_test.loc[dataset].argmin()
    max_i = res_df_test.loc[dataset].argmax()
    #
    tmp.append(res_df_test.loc[dataset].index[max_i])

In [18]:
Counter(tmp)

Counter({'Genetic': 6, 'Random': 2, 'Best': 2})

In [19]:
res_df_val.mean(axis=0).round(2)

Best          0.52
Random        0.50
Systematic    0.55
Genetic       0.58
dtype: float64

In [20]:
res_df_test.mean(axis=0).round(2)

Best          0.55
Random        0.55
Systematic    0.59
Genetic       0.60
dtype: float64