In [1]:
#! git clone https://github.com/LeonardoDiCaterina/Neural_Evo_Learn

In [2]:
#! pip install git+https://github.com/DALabNOVA/slim.git

In [3]:
# Standard library imports
import itertools
import json
import datetime
import pathlib

# Third-party imports
import pandas as pd
import torch
from sklearn.model_selection import KFold

# Slim-GSGP imports
from slim_gsgp.datasets.data_loader import load_pandas_df
from slim_gsgp.utils.utils import train_test_split
#from slim_gsgp.main_gp import gp
from slim_gsgp.main_gsgp import gsgp
#from slim_gsgp.main_slim import slim
from slim_gsgp.evaluators.fitness_functions import rmse

import statistics
from collections import defaultdict
import itertools

from sklearn.model_selection import KFold
import numpy as np
import os
import random
from itertools import product

In [4]:

# os.chdir(os.path.join(os.getcwd(), os.pardir))

## Aux Functions

In [5]:
def fit_model_GridSearch(gp_model, fixed_params, param_grid, seed):
    """
    Fit one model per hyperparameter combination (exhaustive grid search).

    Args:
        gp_model (callable): Called as ``gp_model(**params, seed=seed)``. The object it
            returns must expose ``fitness`` and ``test_fitness``, each with ``.item()``.
        fixed_params (dict): Keyword arguments shared by every run. Not modified.
        param_grid (dict): Maps a parameter name to the list of values to try. Grid
            values override entries of ``fixed_params`` that have the same name.
            An empty grid results in a single run that uses ``fixed_params`` only.
        seed (int): Seed forwarded unchanged to every run.

    Returns:
        list[dict]: One record per combination, in ``itertools.product`` order, with the
            keys 'model', 'rmse_train', 'rmse_test' and 'dynamic_params'.
    """
    # Separate names and value lists without unpacking ``zip(*items)``: that form
    # fails on an empty grid, whereas ``product()`` with no arguments yields exactly
    # one empty combination.
    keys = list(param_grid)
    value_lists = [param_grid[key] for key in keys]

    models = []
    for combo in product(*value_lists):
        dynamic_params = dict(zip(keys, combo))
        model = gp_model(**{**fixed_params, **dynamic_params}, seed=seed)
        models.append({
            'model': model,
            'rmse_train': model.fitness.item(),
            'rmse_test': model.test_fitness.item(),
            'dynamic_params': dynamic_params,
        })
    return models

In [6]:
def group_and_median_rmse(results_data):
    """
    Compute the median 'rmse_test' of every distinct 'dynamic_params' combination.

    Args:
        results_data (list): A list of lists. Every inner list holds dictionaries
                             with the keys 'dynamic_params' and 'rmse_test'.

    Returns:
        list: One dictionary per distinct combination, in order of first appearance:
              {'dynamic_params': {...}, 'rmse_test_median': float}
    """
    # Flatten the per-fold lists into one sequence of result records.
    records = list(itertools.chain.from_iterable(results_data))

    buckets = {}
    for record in records:
        params = record['dynamic_params']
        score = record['rmse_test']

        # A dict is not hashable; its sorted item tuple is, and it does not depend
        # on the order in which the parameters were inserted.
        group_key = tuple(sorted(params.items()))

        bucket = buckets.setdefault(group_key, {'dynamic_params': params, 'rmse_test': []})
        bucket['rmse_test'].append(score)

    return [
        {
            'dynamic_params': bucket['dynamic_params'],
            'rmse_test_median': statistics.median(bucket['rmse_test']),
        }
        for bucket in buckets.values()
    ]

## Load Data

In [7]:
# %cd Neural_Evo_Learn/

In [8]:
# Read the semicolon-separated dataset and discard the two columns that are not
# used here: 'WING TAG' and 'EMPTY MUSCULAR STOMACH'.
df = pd.read_csv("../data/sustavianfeed.csv", sep=';').drop(
    columns=['WING TAG', 'EMPTY MUSCULAR STOMACH']
)

# Reorder the columns so that 'CRUDE PROTEIN' is the last one.
df = df[[col for col in df.columns if col != 'CRUDE PROTEIN'] + ['CRUDE PROTEIN']]

# Nested CV with Grid Search

In [9]:
seed = 42
random.seed(seed)

# Select the algorithm to run by leaving exactly one MODEL_NAME line uncommented;
# the dataset name and the log directory are derived from it.

#MODEL_NAME = 'GP'
MODEL_NAME = 'GSGP'
#MODEL_NAME = 'SLIM-GSGP'

DATASET_NAME = MODEL_NAME +'_sustavianfeed'
LOG_DIR = './log/' + MODEL_NAME + '/'

LOG_LEVEL = 2

# exist_ok=True creates the directory (and missing parents) in one step, without a
# separate existence check that could race with another process.
os.makedirs(LOG_DIR, exist_ok=True)

In [10]:
# Number of folds of the outer (evaluation) and inner (tuning) cross-validation loops.
k_outer, k_inner = 10, 5

In [11]:
# Turn df into the feature matrix X and the target vector y (torch.Tensors).
# NOTE(review): presumably load_pandas_df takes the last column ('CRUDE PROTEIN',
# moved there when the data was loaded) as the target - confirm against slim_gsgp.
X, y = load_pandas_df(df, X_y=True)

In [12]:
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True

# Approximate fold sizes, obtained with floor division. The folds actually produced
# by KFold can differ by a sample or two (the run below reports 68-69 training and
# 17-18 validation rows), so these figures are only an orientation.
total_instances = X.shape[0]
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

print(
    f'Total Instances:\t{total_instances}\n--',
    f'Outer Train set:\t{outer_train_size}',
    f'Test set:\t\t{outer_test_size}\n--',
    f'Inner Train set:\t{inner_train_size}',
    f'Validation set:\t\t{inner_val_size}\n',
    sep='\n',
)

Total Instances:	96
--
Outer Train set:	87
Test set:		9
--
Inner Train set:	70
Validation set:		17



In [13]:
POP_SIZE = 20

# Arguments shared by every run. For each grid-search combination the entries of
# param_grid (below) are added on top of these. Commented-out entries are the
# alternatives kept for the GP / SLIM-GSGP variants.
fixed_params = dict(
    # --- Search space
    initializer='rhh',
    init_depth=2,
    tree_constants=[random.uniform(-1, 1) for _ in range(9)],
    tree_functions=['add', 'subtract', 'multiply', 'divide'],
    # prob_const=0.1,

    # --- Problem instance
    dataset_name=DATASET_NAME,
    fitness_function='rmse',
    minimization=True,

    # --- Model instance
    # tournament_size=int(POP_SIZE*0.02) if POP_SIZE>100 else 2,
    pop_size=POP_SIZE,

    # --- Solve settings
    n_iter=500,
    elitism=True,
    n_elites=2,
    test_elite=True,
    log_level=LOG_LEVEL,
    verbose=0,
    n_jobs=1,

    # --- GP unique settings
    # max_depth=10,
    # p_xo=0.5,

    # --- GSGP unique settings
    # p_xo=0.5,
    ms_lower=0,
    ms_upper=0.5,
    # reconstruct=False,

    # --- SLIM unique settings
    # max_depth=10,
    # p_inflate=0.70,
    # slim_version='SLIM+SIG2',
    # copy_parent=True,
    # ms_lower=0,
    # ms_upper=1,
    # reconstruct=False,
)


#SLIM_VERSIONS = ['SLIM+SIG2', 'SLIM+SIG1', 'SLIM+ABS', 'SLIM*SIG2', 'SLIM*SIG1', 'SLIM*ABS']

# Hyperparameters explored by the grid search; tournament sizes are expressed as
# fractions of the population size.
param_grid = dict(
    p_xo=[0.5, 0.7],
    tournament_size=[int(POP_SIZE * fraction) for fraction in (0.20, 0.10, 0.15)],
    prob_const=[0.1, 0.7],
)


In [14]:
def gp_nested_cross_validation(X, y, gp_model,  k_outer, k_inner, fixed_params, param_grid, seed, LOG_DIR, DATASET_NAME):
    """
    Perform nested cross-validation for a given model and dataset.

    For every outer fold, an inner K-fold grid search scores each combination of
    ``param_grid``; the combination with the lowest median validation RMSE over the
    inner folds is then trained on the whole outer learning set and evaluated on
    the outer test fold.

    Args:
        X (torch.Tensor): Feature matrix.
        y (torch.Tensor): Target vector.
        gp_model (callable): The gp model to be evaluated.
        k_outer (int): Number of outer folds.
        k_inner (int): Number of inner folds.
        fixed_params (dict): Fixed parameters for the model. The dictionary is not
            modified; the data splits and the log path are added to a copy per run.
        param_grid (dict): Parameter grid for hyperparameter tuning.
        seed (int): Random seed for reproducibility. Used for both KFold splitters;
            models are fitted with ``seed + k_inner`` (inner) and ``seed + k_outer`` (outer).
        LOG_DIR (str): Prefix of every log file path; concatenated as-is, so it must
            end with a path separator.
        DATASET_NAME (str): Name used as the stem of the log file names.

    Returns:
        list: One dictionary per outer fold with the keys 'model', 'rmse_train',
            'rmse_test' and 'dynamic_params' (the selected combination).
    """
    cv_outer = KFold(n_splits=k_outer, random_state=seed, shuffle=True)
    cv_inner = KFold(n_splits=k_inner, random_state=seed, shuffle=True)

    models = []

    for i, (learning_ix, test_ix) in enumerate(cv_outer.split(X, y)):
        print(f'Outer fold {i+1}/{k_outer}')
        X_learning, y_learning = X[learning_ix], y[learning_ix]
        X_test, y_test = X[test_ix], y[test_ix]

        # Inner cross-validation: one list of grid-search records per inner fold.
        inner_results = []

        for j, (inner_train_ix, val_ix) in enumerate(cv_inner.split(X_learning, y_learning)):
            print(f'-----\n Inner fold {j+1}/{k_inner}')
            X_inner_train, y_inner_train = X_learning[inner_train_ix], y_learning[inner_train_ix]
            X_inner_val, y_inner_val = X_learning[val_ix], y_learning[val_ix]

            print(f'Training shape: {X_inner_train.shape}\nValidation shape: {X_inner_val.shape}\n')

            # Start every inner fold with a fresh log file.
            log_path = LOG_DIR+DATASET_NAME+'_'+'outer'+'_'+str(i)+'_'+'inner'+'_'+str(j)+'.csv'
            if os.path.exists(log_path):
                os.remove(log_path)

            # Build the per-fold arguments on a copy so the caller's dictionary is
            # never filled with tensors or log paths. The validation split is
            # passed through the model's "test" arguments.
            inner_params = {
                **fixed_params,
                'X_train': X_inner_train, 'y_train': y_inner_train,
                'X_test': X_inner_val, 'y_test': y_inner_val,
                'log_path': log_path,
            }

            # NOTE: the model seed is the same for every inner fold (seed + k_inner);
            # kept unchanged so previously obtained results stay reproducible.
            inner_results.append(
                fit_model_GridSearch(gp_model=gp_model, fixed_params=inner_params,
                                     param_grid=param_grid, seed=(seed+k_inner))
            )

        medians = group_and_median_rmse(inner_results)

        # The combination with the lowest median validation RMSE wins.
        best_dynamic_combo_median = min(medians, key=lambda x: x['rmse_test_median'])

        print(f'Best inner combination: {best_dynamic_combo_median["dynamic_params"]} with median RMSE: {best_dynamic_combo_median["rmse_test_median"]}')

        # Train the best combination on the entire learning set
        print('Training best combination on entire learning set')

        best_hyper_combo = best_dynamic_combo_median['dynamic_params']

        log_path = LOG_DIR+DATASET_NAME+'_'+'outer'+'_'+str(i)+'.csv'
        if os.path.exists(log_path):
            os.remove(log_path)

        # The selected hyperparameters come last so they override fixed entries.
        full_params = {
            **fixed_params,
            'X_train': X_learning, 'y_train': y_learning,
            'X_test': X_test, 'y_test': y_test,
            'log_path': log_path,
            **best_hyper_combo,
        }

        outer_model = gp_model(**full_params, seed=(seed+k_outer))

        models.append({
            'model': outer_model,
            'rmse_train': outer_model.fitness.item(),
            'rmse_test': outer_model.test_fitness.item(),
            'dynamic_params': best_hyper_combo,
        })

    return models

In [15]:
# Run the full nested cross-validation for GSGP; one result record per outer fold.
outer_results = gp_nested_cross_validation(
    X,
    y,
    gp_model=gsgp,
    k_outer=k_outer,
    k_inner=k_inner,
    fixed_params=fixed_params,
    param_grid=param_grid,
    seed=seed,
    LOG_DIR=LOG_DIR,
    DATASET_NAME=DATASET_NAME,
)

Outer fold 1/10
-----
 Inner fold 1/5
Training shape: torch.Size([68, 12])
Validation shape: torch.Size([18, 12])

-----
 Inner fold 2/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 3/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 4/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 5/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

Best inner combination: {'p_xo': 0.5, 'tournament_size': 3, 'prob_const': 0.1} with median RMSE: 8.442061424255371
Training best combination on entire learning set
Outer fold 2/10
-----
 Inner fold 1/5
Training shape: torch.Size([68, 12])
Validation shape: torch.Size([18, 12])

-----
 Inner fold 2/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 3/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner

In [None]:
# One row per outer fold. The 'model' column holds model objects, so the CSV
# stores only their text representation.
outer_results_df = pd.DataFrame(outer_results)
outer_results_df.to_csv(f'{LOG_DIR}{DATASET_NAME}_outer_results.csv', index=False)


Unnamed: 0,model,rmse_train,rmse_test,dynamic_params
0,<slim_gsgp.algorithms.GSGP.representations.tre...,9.992109,20.563236,"{'p_xo': 0.5, 'tournament_size': 3, 'prob_cons..."
1,<slim_gsgp.algorithms.GSGP.representations.tre...,11.179463,13.075053,"{'p_xo': 0.5, 'tournament_size': 2, 'prob_cons..."
2,<slim_gsgp.algorithms.GSGP.representations.tre...,11.272282,12.082158,"{'p_xo': 0.5, 'tournament_size': 4, 'prob_cons..."
3,<slim_gsgp.algorithms.GSGP.representations.tre...,11.479264,11.100879,"{'p_xo': 0.5, 'tournament_size': 4, 'prob_cons..."
4,<slim_gsgp.algorithms.GSGP.representations.tre...,12.01131,4.89237,"{'p_xo': 0.5, 'tournament_size': 3, 'prob_cons..."
5,<slim_gsgp.algorithms.GSGP.representations.tre...,11.675235,8.870728,"{'p_xo': 0.5, 'tournament_size': 2, 'prob_cons..."
6,<slim_gsgp.algorithms.GSGP.representations.tre...,11.39606,11.496885,"{'p_xo': 0.5, 'tournament_size': 2, 'prob_cons..."
7,<slim_gsgp.algorithms.GSGP.representations.tre...,11.755133,8.659302,"{'p_xo': 0.5, 'tournament_size': 4, 'prob_cons..."
8,<slim_gsgp.algorithms.GSGP.representations.tre...,11.677572,7.032644,"{'p_xo': 0.5, 'tournament_size': 3, 'prob_cons..."
9,<slim_gsgp.algorithms.GSGP.representations.tre...,11.529833,11.594569,"{'p_xo': 0.5, 'tournament_size': 3, 'prob_cons..."


# Visualizations

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import ast
from collections import defaultdict

In [None]:
# Per-generation run log and the matching settings log (neither file has a header row).
# Note that `df` is rebound here: from this cell on it is the run log, not the dataset.
df = pd.read_csv("../log/gp.csv", header=None)
settings_df = pd.read_csv("../log/gp_settings.csv", header=None)

# Keep the first settings row for each value of column 0 (the combination id).
unique_settings_df = settings_df.drop_duplicates(subset=0)

In [None]:
# Preview the first rows of the de-duplicated settings table.
unique_settings_df.head()

In [None]:
# 0  - Algorithm
# 1  - Instance ID
# 2  - Dataset
# 3  - Seed
# 4  - Generation
# 5  - Fitness
# 6  - Running time
# 7  - Population nodes
# 8  - Test fitness
# 9  - Elite nodes
# 10 - niche entropy
"""From here on, it doesn't appear on df"""
# 11 - sd(pop.fit)
# 12 - Log level

In [None]:
'''def get_combination_str(setting_str, unique_settings_df = pd.DataFrame(df[1].unique())):
    comb_str = unique_settings_df[unique_settings_df[0]==setting_str][1][0]
    return comb_str'''

def param_in_combination(param:str, comb_str: str):
    """
    Extract the value of one parameter from the text form of a settings dictionary.

    ``comb_str`` is expected to look like ``"{'p_xo': 0.5, 'pop_size': 20, ...}"``.

    Parameters that the function can receive:
    'log' / 'verbose'/ 'test_elite' / 'n_jobs' / 'max_depth' / 'n_elites' / 'elistism' / 'n_iter'
    'settings_dict' / 'p_xo' / 'pop_size' / 'seed' / 'p_m' / 'p_c' / 'init_depth' / 'init_pop_size'

    Returns:
        The value parsed with ``ast.literal_eval``. ``None`` (after printing a
        message) when the parameter is missing, when its value is an object repr
        (``<...>``) or a list (``[...]``), or when it is not a Python literal.
    """
    # Look for the quoted key first, so that e.g. 'log' does not match 'log_path'.
    match_at = -1
    value_start = -1
    for quote in ("'", '"'):
        token = f'{quote}{param}{quote}:'
        match_at = comb_str.find(token)
        if match_at != -1:
            value_start = match_at + len(token)
            break

    if match_at == -1:
        # Fall back to a plain substring match.
        match_at = comb_str.find(param)
        if match_at == -1:
            print(f'Parameter does not exist in the combination string or it cannot be accessed. ({param!r} not found)')
            return None
        # Advance 3 steps to account for the quote, the colon and the space.
        value_start = match_at + len(param) + 3

    value_end = comb_str.find(',', value_start)
    if value_end == -1:
        # Last parameter: there is no trailing comma, only the closing brace of
        # the dictionary text (if the text is wrapped in braces at all).
        param_value = comb_str[value_start:].strip()
        if param_value.endswith('}') and comb_str.lstrip().startswith('{'):
            param_value = param_value[:-1].strip()
    else:
        param_value = comb_str[value_start:value_end].strip()

    if param_value.startswith('<') or param_value.startswith('['):
        print('The value of the parameter given cannot be converted to its original class:')
        return None

    try:
        return ast.literal_eval(param_value)
    except (SyntaxError, ValueError) as err:
        # literal_eval raises ValueError (not SyntaxError) for valid expressions
        # that are not literals, such as a bare function name.
        print(f'Parameter does not exist in the combination string or it cannot be accessed. ({err})')
        return None

#example
#print(get_combination_str('865732c8-3014-11f0-b37b-baa0ecd080fe'))
#print(param_in_combination('log', get_combination_str('865732c8-3014-11f0-b37b-baa0ecd080fe')))



In [None]:
'''def pop_fitness_diversity(df, train_color='blue'):
     """
     Out of Bounds
     """
     dif_combs = np.unique(df[[1]])
     for comb in dif_combs:
          y = df[df[1]==comb]
          #comb_dict = get_combination(comb)
          fig = go.Figure()
          fig.add_trace(go.Scatter(y=y.iloc[:,11].values,
                                   mode='lines', name='Train', line=dict(color=train_color)))
          fig.update_layout(
          height=400, width=800,
          margin=dict(t=50),
          yaxis_range=[0,None],
          title_text=f'GP - Population Fitness Diversity\nCombination:',
          xaxis_title='Generation', yaxis_title='Fitness Standard Deviation'
          )
          fig.show()'''



In [None]:
def train_test_fit(df, train_color='blue', test_color='orange', rows=5, cols=4):
    """
    Plot train vs. test fitness per generation, one subplot per combination.

    Args:
        df (pd.DataFrame): Run log without header. Column 0 is the algorithm name,
            column 1 the instance (combination) id, column 5 the train fitness and
            column 8 the test fitness.
        train_color (str): Line colour of the train curve.
        test_color (str): Line colour of the test curve.
        rows (int): Number of subplot rows.
        cols (int): Number of subplot columns.

    Raises:
        AssertionError: If ``rows * cols`` differs from the number of combinations.
    """
    dif_combs = df[1].unique()  # Get unique combinations
    num_plots = len(dif_combs)
    assert rows*cols==num_plots, "The number of combinations does not correspond to the grid size defined (rows/cols)."

    # A combination is labelled with its position in df[1].unique(), the same index
    # that the `comb_idxs` argument of plot_combs_together_test refers to. The label
    # is derived from `df` itself, not from a table defined in another cell.
    fig = sp.make_subplots(rows=rows, cols=cols,
                           subplot_titles=[f"Combination index: {comb_position}"
                                           for comb_position in range(num_plots)])

    for i, comb in enumerate(dif_combs):
        y = df[df[1] == comb]
        algo = y.iloc[0,0]
        row = (i // cols) + 1  #Calculate row position
        col = (i % cols) + 1   #Calculate column position

        # Only the first subplot contributes legend entries, to avoid duplicates.
        fig.add_trace(
            go.Scatter(y=y.iloc[:, 5].values, mode='lines', name='Train', line=dict(color=train_color),
                       showlegend=(i==0)),
            row=row, col=col
        )

        fig.add_trace(
            go.Scatter(y=y.iloc[:, 8].values, mode='lines', name='Test', line=dict(color=test_color),
                       showlegend=(i==0)),
            row=row, col=col
        )

        fig.update_yaxes(range=[0, None], row=row, col=col)

    fig.update_layout(
        height=150 * rows,
        width=250 * cols,
        margin=dict(t=50),
        title_text=f'{algo} - Train vs Test Fitness (x=Generation, y=RMSE)',
        showlegend=True
    )

    fig.update_annotations(font_size=10)
    fig.show()

In [None]:
def train_test_fit_and_size(df, comb_idxs: list | int | None = None,
                            train_color='blue', test_color='orange'):
    """
    Show, for each selected combination, fitness evolution next to size evolution.

    Args:
        df (pd.DataFrame): Run log without header. Column 0 is the algorithm name,
            column 1 the instance (combination) id, column 5 the train fitness,
            column 8 the test fitness and column 9 the elite node count.
        comb_idxs (list | int | None): Positions of the combinations to plot, as
            indices into ``df[1].unique()``. A single int selects one combination;
            ``None`` (default) selects every combination of ``df``.
        train_color (str): Line colour of the train curve.
        test_color (str): Line colour of the test curve.
    """
    unique_combs = df[1].unique()

    # Resolve the selection at call time from the frame that was passed in. A
    # default built in the signature would be computed once, at definition time,
    # from whatever global `df` existed then.
    if comb_idxs is None:
        comb_idxs = range(len(unique_combs))
    elif isinstance(comb_idxs, int):
        comb_idxs = [comb_idxs]

    for comb_idx in comb_idxs:
        comb = unique_combs[comb_idx]
        y = df[df[1]==comb]
        algo = y.iloc[0,0]
        fig = make_subplots(
            rows=1, cols=2,
            subplot_titles=(f'{algo} - Fitness evolution\nCombination:', f'{algo} - Size evolution')
        )

        fig.add_trace(go.Scatter(y=y.iloc[:,5].values,
                                 mode='lines', name='Train', line=dict(color=train_color)), row=1, col=1)
        fig.add_trace(go.Scatter(y=y.iloc[:,8].values,
                                 mode='lines', name='Test', line=dict(color=test_color)), row=1, col=1)
        fig.add_trace(go.Scatter(y=y.iloc[:,9].values,
                                 mode='lines', name='Size'), row=1, col=2)

        fig.update_xaxes(title_text="Generation")

        fig.update_layout(
            width=1000,
            height=400,
            showlegend=True,
            yaxis_range=[0,None],
        )
        fig.show()

In [None]:
def niche_entropy(df, train_color='blue', rows=5, cols=4):
    """
    Plot the niche entropy per generation, one subplot per combination.

    Args:
        df (pd.DataFrame): Run log without header. Column 0 is the algorithm name,
            column 1 the instance (combination) id and column 10 the niche entropy.
        train_color (str): Line colour of the entropy curve.
        rows (int): Number of subplot rows.
        cols (int): Number of subplot columns.

    Raises:
        AssertionError: If ``rows * cols`` differs from the number of combinations.
    """
    dif_combs = df[1].unique()  # Get unique combinations
    num_plots = len(dif_combs)
    assert rows*cols==num_plots, "The number of combinations does not correspond to the grid size defined (rows/cols)."

    # A combination is labelled with its position in df[1].unique(); the label is
    # derived from `df` itself, not from a table defined in another cell.
    fig = sp.make_subplots(rows=rows, cols=cols,
                           subplot_titles=[f"Combination index: {comb_position}"
                                           for comb_position in range(num_plots)])

    for i, comb in enumerate(dif_combs):
        y = df[df[1] == comb]
        algo = y.iloc[0,0]
        row = (i // cols) + 1
        col = (i % cols) + 1

        # Only the first subplot contributes a legend entry, to avoid duplicates.
        fig.add_trace(
            go.Scatter(
                y=y.iloc[:, 10].values,
                mode='lines',
                name='Niche Entropy',
                line=dict(color=train_color),
                showlegend=(i == 0)), row=row, col=col
                )

    fig.update_layout(
        height=150 * rows,
        width=250 * cols,
        margin=dict(t=50),
        title_text=f'{algo} - Niche Entropy (x=Generation, y=Entropy)',
    )

    fig.show()

In [None]:
def plot_combs_together_test(df, comb_idxs: list | int | None = None,
                             colors = ['#FF0000', '#0000FF', '#00FF00', '#FFA500', '#800080',
                                       '#FF00FF', '#00FFFF', '#FFFF00', '#1F77B4', '#FF7F0E',
                                       '#2CA02C', '#D62728', '#9467BD', '#8C564B', '#E377C2',
                                       '#7F7F7F', '#AEC7E8', '#FFBB78', '#98DF8A', '#FF9896'],
                              ):
    """
    Overlay the test-fitness curves of several combinations in a single figure.

    Args:
        df (pd.DataFrame): Run log without header. Column 0 is the algorithm name,
            column 1 the instance (combination) id and column 8 the test fitness.
        comb_idxs (list | int | None): Positions of the combinations to plot, as
            indices into ``df[1].unique()``. A single int selects one combination;
            ``None`` (default) selects every combination of ``df``.
        colors (list): Line colours, one per selected combination (read only).

    Raises:
        ValueError: If no combination is selected.
        AssertionError: If there are fewer colours than selected combinations.
    """
    unique_combs = df[1].unique()

    # Resolve the selection at call time from the frame that was passed in. A
    # default built in the signature would be computed once, at definition time,
    # from whatever global `df` existed then.
    if comb_idxs is None:
        comb_idxs = list(range(len(unique_combs)))
    elif isinstance(comb_idxs, int):
        comb_idxs = [comb_idxs]
    else:
        comb_idxs = list(comb_idxs)

    if not comb_idxs:
        raise ValueError("No combination selected: there is nothing to plot.")

    assert len(colors)>=len(comb_idxs), "Not enough colors for all combinations"

    fig = go.Figure()
    for i, comb_idx in enumerate(comb_idxs):
        comb = unique_combs[comb_idx]
        y = df[df[1]==comb]
        algo = y.iloc[0,0]

        fig.add_trace(go.Scatter(y=y.iloc[:,8].values,
                                 mode='lines', name=f'Test Comb {comb_idx}',
                                 line=dict(color=colors[i])))

    # The axis title is the same for every trace, so it is set once.
    fig.update_xaxes(title_text="Generation")

    fig.update_layout(
        width=1000,
        height=400,
        title_text = f"{algo} - Test Fitness (Combinations indexes: {comb_idxs})",
        showlegend=True,
        yaxis_range=[0,None],
        )

    fig.show()



In [None]:
# Train vs. test fitness curves, one subplot per unique value of column 1 in df.
# With the default 5x4 grid the log must hold exactly 20 combinations, otherwise
# the assertion inside train_test_fit fails.
train_test_fit(df)

In [None]:
# Fitness and size evolution for the combinations at positions 3 and 19.
train_test_fit_and_size(df, comb_idxs=[3,19])

In [None]:
# Niche-entropy curve (column 10 of df) per combination, on the default 5x4 grid.
niche_entropy(df)

In [None]:
# Overlay the test-fitness curves (column 8 of df) of combinations 1, 2 and 3.
plot_combs_together_test(df,comb_idxs=[1,2,3])

Modular functions for different versions / hyperparameter combinations

Assuming that:
- TODO: list the assumptions these functions rely on.

In [None]:
SLIM_VERSIONS = ['SLIM+SIG2', 'SLIM+SIG1', 'SLIM+ABS', 'SLIM*SIG2', 'SLIM*SIG1', 'SLIM*ABS']

# NOTE(review): assumed to be the unique instance ids found in column 1 of df;
# confirm that this is what COMBINATIONS is meant to hold.
COMBINATIONS = list(df[1].unique())

"""param_grid = {
    'slim_version': SLIM_VERSIONS
}"""

In [None]:
# Group the test RMSE of every result record by hyperparameter combination.
# NOTE(review): `results` (one list of result dicts per split) is not created by any
# visible top-level cell; gp_nested_cross_validation only builds such a list
# internally. It has to be defined before this cell can run.
rmse_by_config = defaultdict(list)

for split in results:
    for result in split:
        # The label lists every parameter as "name: value", each followed by an HTML
        # line break (presumably so the tick label wraps - confirm in the figure).
        key = ''.join(f'{k}: {v} <br /> ' for k, v in result['dynamic_params'].items())
        rmse_by_config[key].append(result['rmse_test'])

# One box per combination, with the individual scores drawn on top of it.
fig = go.Figure()
for config, rmse_values in rmse_by_config.items():
    fig.add_trace(go.Box(
        y=rmse_values,
        boxpoints='all',
        jitter=0.5,
        pointpos=0,
        line=dict(color='orange'),
        name=config
    ))

fig.update_layout(
    title=DATASET_NAME+' dataset',
    xaxis_title='',
    yaxis_title='Test RMSE',
    height=500, width=1100,
    xaxis_tickangle=-90,
    yaxis_range=[0,None],
    margin=dict(l=50, r=50, t=50, b=20),
    showlegend=False,
    template='plotly_white'
)

fig.show()

In [None]:
# Plot settings
# Read one log file per inner fold, tag its rows with the fold number in a 'cv'
# column and stack everything into a single frame.
# NOTE(review): files named 'slim_<dataset>_<fold>.csv' are not written by the
# visible cells (they log to '<dataset>_outer_<i>[_inner_<j>].csv') - confirm the names.
df_log = pd.concat(
    [
        pd.read_csv(LOG_DIR+'slim_'+DATASET_NAME+'_'+str(i_inner)+'.csv', header=None).assign(cv=i_inner)
        for i_inner in range(k_inner)
    ],
    ignore_index=True,
)

n_rows = 2
n_cols = 3

In [None]:
# NOTE(review): make_evolution_plots is neither defined nor imported in the visible
# cells; it has to be provided before this cell can run.
make_evolution_plots(n_rows, n_cols, SLIM_VERSIONS, df_log,
                     plot_title = 'SLIM - Train vs Test Fitness ('+DATASET_NAME+' dataset)')

# 'tournament_size' is tuned through param_grid and is not a key of fixed_params,
# so it is read with .get() (shown as None) instead of raising KeyError.
[fixed_params['pop_size'], fixed_params.get('tournament_size')]

In [None]:
# Same grid of plots as the fitness figure, but showing the 'size' variable.
# NOTE(review): make_evolution_plots is neither defined nor imported in the visible cells.
make_evolution_plots(n_rows, n_cols, SLIM_VERSIONS, df_log, var='size',
                     plot_title = 'SLIM -Size ('+DATASET_NAME+' dataset)')