In [26]:
import sys
import os
sys.path.insert(0, os.path.abspath(".."))

import ast
from slim_gsgp.datasets.data_loader import load_pandas_df
import pandas as pd
import numpy as np
from slim_gsgp.main_gp import gp
from slim_gsgp.main_slim import slim
from slim_gsgp.main_gsgp import gsgp
from slim_gsgp.evaluators.fitness_functions import *

from imblearn.over_sampling import SMOTENC, SMOTE

In [27]:
def load_and_adapt_data_info(filepath):
    #load
    data_info = pd.read_csv(filepath)
    #make dtype list for
    data_info['test_indices'] = data_info['test_indices'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    data_info['train_indices'] = data_info['train_indices'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    data_info['categoricals'] = data_info['categoricals'].apply(lambda x: ast.literal_eval(x) if isinstance(x, str) else x)
    return data_info

In [28]:
data_info = load_and_adapt_data_info('data/data_info.csv')
data_info

Unnamed: 0,name,n_samples,n_features,imbalance,categoricals,n_categoricals,train_indices,test_indices
0,blood,748,4,0.237968,[],0,"[[255, 174, 420, 341, 186, 512, 635, 190, 450,...","[[343, 644, 607, 22, 82, 105, 681, 292, 451, 5..."
1,clima,540,18,0.085185,[],0,"[[409, 132, 372, 52, 215, 156, 410, 76, 128, 2...","[[495, 424, 200, 281, 221, 500, 81, 515, 432, ..."
2,eeg,14980,14,0.448798,[],0,"[[4155, 5750, 12565, 7790, 12551, 963, 14035, ...","[[5237, 10277, 9601, 1187, 5396, 7470, 10820, ..."
3,fertility,100,9,0.12,"[season, child_diseases, accident, surgical_in...",7,"[[61, 5, 1, 70, 24, 40, 63, 52, 91, 99, 32, 47...","[[74, 46, 2, 33, 82, 25, 26, 11, 90, 15, 71, 8..."
4,gina,3153,970,0.491595,[],0,"[[2470, 503, 112, 1455, 1167, 1112, 2227, 1308...","[[49, 918, 536, 2682, 1079, 2440, 1168, 2148, ..."
5,hill,1212,100,0.5,[],0,"[[113, 547, 693, 906, 753, 537, 979, 803, 169,...","[[331, 231, 1044, 85, 882, 835, 155, 570, 253,..."
6,ilpd,583,10,0.286449,[],0,"[[213, 76, 343, 434, 44, 226, 532, 70, 493, 56...","[[208, 273, 442, 479, 478, 319, 224, 522, 474,..."
7,kc,2109,21,0.154576,[],0,"[[571, 2078, 248, 1348, 549, 603, 1085, 466, 1...","[[81, 968, 757, 193, 823, 1191, 1079, 1858, 17..."
8,liver,345,6,0.42029,[],0,"[[194, 281, 228, 140, 78, 40, 302, 285, 43, 15...","[[130, 52, 238, 125, 249, 204, 257, 307, 4, 79..."
9,musk,476,166,0.434874,[],0,"[[246, 107, 32, 180, 92, 46, 319, 132, 396, 14...","[[442, 423, 363, 43, 237, 407, 283, 271, 244, ..."


In [29]:
def oversample(df, categoricals = []):
    
    #if list is empty
    if not categoricals:
        sm = SMOTE(random_state = 42, categorical_features = categoricals)
        
    else:
        sm = SMOTENC(random_state = 42)
    
    sm
    
    return sm.fit_resample(df)

In [30]:
def return_train_test(df, train_indices, test_indices, oversampling = False, categoricals = []):
    
    train = df.iloc[train_indices]
    test = df.iloc[test_indices]
    
    if oversampling:
        train = oversample(df, categoricals)
    
    X_train, y_train = load_pandas_df(train, X_y=True)
    X_test, y_test = load_pandas_df(test, X_y=True)
        
    return X_train, y_train, X_test, y_test
    

In [31]:
def train_model(dataset_name, X_train, y_train, X_test, y_test, model, **model_config):
    
    if model== 'gp':
        best_individual = gp(
                    dataset_name=dataset_name, 
                    X_train=X_train, 
                    y_train = y_train, 
                    X_test = X_test, 
                    y_test = y_test, 
                    **model_config
                    )
    
    if model == 'slim':
        best_individual = slim(  
                    dataset_name=dataset_name, 
                    X_train=X_train, 
                    y_train = y_train, 
                    X_test = X_test, 
                    y_test = y_test, 
                    **model_config
                    )
    
    if model == 'gsgp':
        best_individual = gsgp(
                    dataset_name=dataset_name, 
                    X_train=X_train, 
                    y_train = y_train, 
                    X_test = X_test, 
                    y_test = y_test, 
                    **model_config
                    )
    
    return best_individual

In [None]:
def evaluate_prediction(y_true, y_pred):
    acc = accuracy(y_true, y_pred)
    roc = roc_auc(y_true, y_pred)
    f1 = f1_score(y_true, y_pred)
    prec = precision(y_true, y_pred)
    rec = recall(y_true, y_pred)
    
    return acc, roc, f1, prec, rec
    
    

In [33]:
def monte_carlo_cv(dataset_name, data_filepath, model_config, n_runs=30):
    
    df = pd.read_csv(f"{data_filepath}data_prepared/{dataset_name}.csv")
    data_info = load_and_adapt_data_info(f"{data_filepath}data_info.csv")
    categoricals = data_info.loc[data_info['name']== dataset_name, 'categoricals'][0]
    
    for i in range(1, n_runs+1, 1):
        
        model_config['seed'] = i
        
        train_indices = data_info.loc[data_info['name']== dataset_name, 'train_indices'][0][i-1]
        test_indices = data_info.loc[data_info['name']== dataset_name, 'test_indices'][0][i-1]
        
        X_train, y_train, X_test, y_test = return_train_test(df, train_indices, test_indices, model_config['oversampling'], categoricals)
        best_individual = train_model(dataset_name, X_train, y_train, X_test, y_test, model_config['name'], **model_config['config'])
        
        final_prediction =  best_individual.predict(X_test)
        acc = accuracy(y_test, final_prediction)
        f1 = f1_score(y_test, final_prediction)
        roc = roc_auc(y_test, final_prediction)
        
        
        print(f"Run {i} - Accuracy: {acc} - F1: {f1} - ROC: {roc}")
        
        

In [34]:
config_all = {
    'pop_size': 10, #100
    'n_iter': 100,  #2000
    'elitism': True,
    'n_elites': 1,
    'init_depth': 6,
    'initializer': 'rhh',
    'tournament_size': 2,
    'prob_const': 0.2,
    'tree_functions': ['add', 'subtract', 'multiply', 'divide'],
    'tree_constants': np.linspace(-10, 10, num=201).round(1).tolist(),
    
    'fitness_function': None, #must be defined
    'minimization' : None, #must be defined
    'seed': None, #must be defined
    
    'log_path' : None, #must be defined
    'verbose': False, 
    'log_level': 1,
    'test_elite': True
}

In [35]:
config_gp = {
    'p_xo': 0.8,
    'max_depth': 17
}

In [36]:
config_gsgp = {
    'p_xo': 0,
    'ms_lower': 0,
    'ms_upper': 1,
    'reconstruct' : True
}

In [37]:
config_slim = {
    'version': 'SLIM+SIG2',
    'ms_lower': 0,
    'ms_upper': 1,
    'p_inflate': 0.5,
    'reconstruct': True,
    'copy_parents': True
}

In [38]:
config_gsgp = config_all | config_gsgp
model_config_gsgp = {'name': 'gsgp', 'oversampling': False, 'config': config_gsgp}
model_config_gsgp['config']['fitness_function'] = 'f1_score'
model_config_gsgp['config']['minimization'] = False
model_config_gsgp['config']['log_path'] = 'logs/a.csv'
model_config_gsgp['config']['seed'] = 42



In [39]:
config_gp = config_all | config_gp
model_config_gp = {'name': 'gp', 'config': config_gp}
model_config_gp

{'name': 'gp',
 'config': {'pop_size': 10,
  'n_iter': 100,
  'elitism': True,
  'n_elites': 1,
  'init_depth': 6,
  'initializer': 'rhh',
  'tournament_size': 2,
  'prob_const': 0.2,
  'tree_functions': ['add', 'subtract', 'multiply', 'divide'],
  'tree_constants': [-10.0,
   -9.9,
   -9.8,
   -9.7,
   -9.6,
   -9.5,
   -9.4,
   -9.3,
   -9.2,
   -9.1,
   -9.0,
   -8.9,
   -8.8,
   -8.7,
   -8.6,
   -8.5,
   -8.4,
   -8.3,
   -8.2,
   -8.1,
   -8.0,
   -7.9,
   -7.8,
   -7.7,
   -7.6,
   -7.5,
   -7.4,
   -7.3,
   -7.2,
   -7.1,
   -7.0,
   -6.9,
   -6.8,
   -6.7,
   -6.6,
   -6.5,
   -6.4,
   -6.3,
   -6.2,
   -6.1,
   -6.0,
   -5.9,
   -5.8,
   -5.7,
   -5.6,
   -5.5,
   -5.4,
   -5.3,
   -5.2,
   -5.1,
   -5.0,
   -4.9,
   -4.8,
   -4.7,
   -4.6,
   -4.5,
   -4.4,
   -4.3,
   -4.2,
   -4.1,
   -4.0,
   -3.9,
   -3.8,
   -3.7,
   -3.6,
   -3.5,
   -3.4,
   -3.3,
   -3.2,
   -3.1,
   -3.0,
   -2.9,
   -2.8,
   -2.7,
   -2.6,
   -2.5,
   -2.4,
   -2.3,
   -2.2,
   -2.1,
   -2.0,
   -1

In [40]:
#monte_carlo_cv('blood', 'data/', model_config_gsgp, n_runs=30, oversampling = False)

In [61]:
class Experiment():
    
    def __init__ (
        self,
        dataset_name,
        data_filepath,
        model_configs, #list of dictionaries
        n_runs=30,
        log = False,
        log_path = None,
        verbose = False
    ):
        self.dataset_name = dataset_name
        self.data_filepath = data_filepath
        self.model_configs = model_configs
        self.n_runs = n_runs
        self.data_info = load_and_adapt_data_info(f"{data_filepath}data_info.csv")
        self.data = pd.read_csv(f"{data_filepath}data_prepared/{dataset_name}.csv")
        self.verbose = verbose
    
    def run(self):
        
        for model_config in self.model_configs:
            
            for i in range(1, self.n_runs+1, 1):
                
                model_config['seed'] = i

                train_indices = self.data_info.loc[self.data_info['name']== self.dataset_name, 'train_indices'][0][i-1]
                test_indices = self.data_info.loc[self.data_info['name']== self.dataset_name, 'test_indices'][0][i-1]

                X_train, y_train, X_test, y_test = return_train_test(
                                                                        df = self.data, 
                                                                        train_indices = train_indices, 
                                                                        test_indices = test_indices, 
                                                                        oversampling = model_config['oversampling'], 
                                                                        categoricals = data_info.loc[data_info['name']== self.dataset_name, 'categoricals'][0]
                                                                    )

                best_individual = train_model(
                                                dataset_name = self.dataset_name, 
                                                X_train = X_train, 
                                                y_train = y_train, 
                                                X_test = X_test, 
                                                y_test = y_test,
                                                model = model_config['name'], 
                                                **model_config['config']
                                            )

                train_metrics = evaluate_prediction(y_train, best_individual.predict(X_train))
                test_metrics = evaluate_prediction(y_test, best_individual.predict(X_test))
                
                if self.verbose:
                    print(
                        f"Run {i} - Accuracy: {torch.round(train_metrics[0] * 1000) / 1000:.3f} | "
                        f"{torch.round(test_metrics[0] * 1000) / 1000:.3f} - "
                        f"F1: {torch.round(train_metrics[2] * 1000) / 1000:.3f} | "
                        f"{torch.round(test_metrics[2] * 1000) / 1000:.3f} - "
                        f"ROC: {torch.round(train_metrics[1] * 1000) / 1000:.3f} | "
                        f"{torch.round(test_metrics[1] * 1000) / 1000:.3f}"
                        )


        
        return None
        
        

In [62]:
experiment = Experiment(
    dataset_name='blood',
    data_filepath = 'data/',
    model_configs = [model_config_gsgp, model_config_gsgp],
    n_runs = 30,
    verbose=True
)
experiment.run()

Run 1 - Accuracy: 0.648 | 0.680 - F1: 0.497 | 0.532 - ROC: 0.741 | 0.754
Run 2 - Accuracy: 0.641 | 0.698 - F1: 0.489 | 0.553 - ROC: 0.723 | 0.794
Run 3 - Accuracy: 0.663 | 0.644 - F1: 0.522 | 0.474 - ROC: 0.765 | 0.701
Run 4 - Accuracy: 0.667 | 0.636 - F1: 0.511 | 0.500 - ROC: 0.755 | 0.721
Run 5 - Accuracy: 0.658 | 0.658 - F1: 0.507 | 0.510 - ROC: 0.746 | 0.739
Run 6 - Accuracy: 0.656 | 0.662 - F1: 0.503 | 0.519 - ROC: 0.733 | 0.771
Run 7 - Accuracy: 0.641 | 0.698 - F1: 0.484 | 0.564 - ROC: 0.722 | 0.796
Run 8 - Accuracy: 0.644 | 0.689 - F1: 0.486 | 0.557 - ROC: 0.721 | 0.798
Run 9 - Accuracy: 0.644 | 0.689 - F1: 0.500 | 0.527 - ROC: 0.743 | 0.748
Run 10 - Accuracy: 0.646 | 0.684 - F1: 0.490 | 0.548 - ROC: 0.723 | 0.794
Run 11 - Accuracy: 0.671 | 0.627 - F1: 0.522 | 0.475 - ROC: 0.755 | 0.721
Run 12 - Accuracy: 0.656 | 0.662 - F1: 0.511 | 0.500 - ROC: 0.753 | 0.728
Run 13 - Accuracy: 0.654 | 0.667 - F1: 0.507 | 0.510 - ROC: 0.744 | 0.747
Run 14 - Accuracy: 0.656 | 0.662 - F1: 0.505 | 

KeyboardInterrupt: 