# Neural and Evolutionary Learning Project

Group members: 

- Iris Moreira - 20240659
- Leonardo Di Caterina - 20240485
- Rafael Borges - 20240497

## Third Delivery - SLIM-GSGP

In [None]:
# Third-party imports
import pandas as pd #type: ignore
import torch #type: ignore
from sklearn.model_selection import KFold #type: ignore

# Slim-GSGP imports
from slim_gsgp.datasets.data_loader import load_pandas_df #type: ignore
from slim_gsgp.utils.utils import train_test_split #type: ignore
#from slim_gsgp.main_gp import gp
#from slim_gsgp.main_gsgp import gsgp
from slim_gsgp.main_slim import slim #type: ignore
from slim_gsgp.evaluators.fitness_functions import rmse #type: ignore

from collections import defaultdict

import numpy as np #type: ignore
import os
import random
from itertools import product

In [4]:
os.chdir(os.path.join(os.getcwd(), os.pardir))
from utils.grid_search import gp_nested_cross_validation
from utils.visualization_funcs import *
%cd notebooks/

c:\Users\rafas\Documents\University\NEL\Neural_Evo_Learn\notebooks


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Load Data

In [5]:
# Load the raw dataset (semicolon-separated) and discard the two columns
# that are not used in this notebook: 'WING TAG' and 'EMPTY MUSCULAR STOMACH'
df = pd.read_csv("../data/sustavianfeed.csv", sep=';').drop(
    columns=['WING TAG', 'EMPTY MUSCULAR STOMACH']
)

# Reorder the columns so that 'CRUDE PROTEIN' ends up as the last one
df = df[df.columns.drop('CRUDE PROTEIN').tolist() + ['CRUDE PROTEIN']]

# Nested CV with Grid Search

In [6]:
seed = 42
# Seed every RNG library imported by this notebook, not only `random`,
# so that results do not depend on leftover kernel state.
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Edit the name and log directory based on the model you want to run

#MODEL_NAME = 'GP'
#MODEL_NAME = 'GSGP'
MODEL_NAME = 'SLIM-GSGP'

DATASET_NAME = MODEL_NAME +'_sustavianfeed'
LOG_DIR = './log/' + MODEL_NAME + '/'

LOG_LEVEL = 2
# exist_ok=True makes the cell safe to re-run and avoids the
# check-then-create race of a separate os.path.exists() test.
os.makedirs(LOG_DIR, exist_ok=True)

In [7]:
# Number of folds for the outer (evaluation) and inner (model selection) CV loops
k_outer, k_inner = 10, 5

In [8]:
# Turning df into X and y torch.Tensors
# NOTE(review): presumably the last column ('CRUDE PROTEIN', moved there on purpose)
# is used as the target y — confirm against load_pandas_df.
X, y = load_pandas_df(df, X_y=True)

In [9]:
# Objective configuration for the search
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True

# Fold-size estimates obtained by floor division. These are only for display;
# the splitting itself happens inside gp_nested_cross_validation
# (TODO confirm the real fold sizes match these estimates).
total_instances = X.shape[0]
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

print(
    f'Total Instances:\t{total_instances}\n--',
    f'Outer Train set:\t{outer_train_size}',
    f'Test set:\t\t{outer_test_size}\n--',
    f'Inner Train set:\t{inner_train_size}',
    f'Validation set:\t\t{inner_val_size}\n',
    sep='\n',
)

Total Instances:	96
--
Outer Train set:	87
Test set:		9
--
Inner Train set:	70
Validation set:		17



In [10]:
POP_SIZE = 50

# Settings shared by every grid-search combination.
# NOTE: 'tree_constants' draws from the global `random` generator, so its values
# depend on the RNG state left by the seeding cell; re-running only this cell
# yields different constants.
fixed_params = {
    # ---
    # Search Space
    'initializer': 'rhh',
    'init_depth': 2,
    'tree_constants': [random.uniform(-1, 1) for _ in range(9)],
    'tree_functions': ['add', 'subtract','multiply','divide'],
    #'prob_const': 0.1,
    # ---
    # Problem Instance
    'dataset_name': DATASET_NAME,
    # Reuse the notebook-level constants instead of repeating the literals,
    # so the objective is configured in a single place.
    'fitness_function': FITNESS_FUNCTION,
    'minimization': MINIMIZATION,
    # ---
    # Model instance
    #'tournament_size': int(POP_SIZE*0.02) if POP_SIZE>100 else 2,
    'pop_size': POP_SIZE,
    'ms_lower': 0,
    'ms_upper': 0.5,


    # ---
    # Solve settings
    'n_iter': 250,
    'elitism': True,
    'n_elites': 2,
    'test_elite': True,
    'log_level': LOG_LEVEL,
    'verbose': 0,
    'n_jobs': 1,

    # ---
    # GP unique settings
    #'max_depth': 10,
    #'p_xo' : 0.5,


    # ---
    # GSGP unique settings
    #'p_xo' : 0.5,

    #'reconstruct': False,


    # ---
    # SLIM unique settings
    #'max_depth': 10,
    #'p_inflate': 0.70,
    #'copy_parent': True,
    #'reconstruct': False,


}


# Hyperparameters explored by the grid search (every combination is evaluated).
param_grid = {

        'max_depth': [10, 15],
        # Tournament sizes as 5%, 10% and 15% of the population, truncated to int
        'tournament_size': [int(POP_SIZE*0.05), int(POP_SIZE*0.10), int(POP_SIZE*0.15)],
        'prob_const': [0.1,0.4, 0.7],
        'slim_version':  ['SLIM+SIG2','SLIM+SIG1','SLIM+ABS', 'SLIM*SIG2','SLIM*SIG1', 'SLIM*ABS'],
}


In [None]:
# Run the nested cross-validation (outer loop for evaluation, inner loop for
# grid search) with the SLIM-GSGP solver and collect the per-outer-fold results.
outer_results = gp_nested_cross_validation(
    X,
    y,
    gp_model=slim,
    k_outer=k_outer,
    k_inner=k_inner,
    fixed_params=fixed_params,
    param_grid=param_grid,
    seed=seed,
    LOG_DIR=LOG_DIR,
    DATASET_NAME=DATASET_NAME,
)

Saving results and configs to a .csv 

In [12]:
# Persist the outer-fold results next to the run logs
outer_results_df = pd.DataFrame(outer_results)
outer_results_df.to_csv(f"{LOG_DIR}{DATASET_NAME}_outer_results.csv", index=False)


## Visualizations 

In [11]:
train_test_best_combs(model_name=MODEL_NAME)         

In [12]:
test_best_combs(model_name=MODEL_NAME)

In [11]:
# Use the configured number of outer folds rather than a hard-coded 10,
# so this plot stays in sync with the other visualization calls.
fit_and_size_per_outer(k_outer=k_outer, model_name=MODEL_NAME)

In [None]:
fit_or_size_per_comb(k_outer, MODEL_NAME)

## Brief Bloat, Overfitting and Hyperparameter Discussion 


- A constant probability (`prob_const`) of 0.7 was always selected in the inner cycle.

- SLIM variants using multiplication consistently outperformed those using summation in the inner cycle.

- All models converged within the first few iterations and then plateaued.

- The lowest error occurred at max_depth=10, tournament_size=5, prob_const=0.7, slim_version='SLIM*SIG1'; with only one sample, we can’t confirm these settings are statistically superior.

- The parameter sets in plots 1 and 4 show underfitting: the model is possibly not learning all it could from the data.

- In plot 6, one run exhibits overfitting.

- After plateauing, model complexity continued to grow, indicating bloating.

In [15]:
fit_or_size_per_comb(k_outer, MODEL_NAME, size=True)

In [27]:
niche_entropy(k_outer, MODEL_NAME, skip_n_gens=2)

In [12]:
def pop_fitness_diversity(k_outer, model_name, skip_n_gens:int = None):
    """Plot the logged population fitness standard deviation for each outer fold.

    For every outer fold ``i`` in ``range(k_outer)`` the log file
    ``./log/<model_name>/<model_name>_sustavianfeed_outer_<i>.csv`` is read
    (no header row) and one Plotly figure is shown.

    Args:
        k_outer: Number of outer folds, i.e. how many log files are read.
        model_name: Model identifier used to build the log path and the title.
        skip_n_gens: If truthy, the first ``skip_n_gens`` rows of each log are
            left out of the plot. ``None`` or ``0`` plots every row.

    Notes:
        Relies on ``pd`` (pandas) and ``go`` (plotly.graph_objects) being
        available in the notebook namespace.
        Assumes each log row corresponds to one generation, starting at 0 —
        TODO confirm against the logger.
    """
    log_prefix = "./log/" + model_name + "/" + model_name + "_sustavianfeed"
    for i_outer in range(k_outer):
        log_path = log_prefix + f"_outer_{i_outer}.csv"
        log_df = pd.read_csv(log_path, header=None)
        # Column 13 of the first row holds the parameter-combination string
        param_str = log_df.iloc[0, 13]
        if skip_n_gens:
            # Positional slice keeps the original row labels, so the x-axis
            # below still shows the true generation numbers after skipping.
            log_df = log_df.iloc[skip_n_gens:]

        fig = go.Figure()
        fig.add_trace(
            go.Scatter(
                # Explicit x so skipped generations do not shift the axis to 0
                x=log_df.index.to_numpy(),
                # Column 11 is plotted as the fitness standard deviation
                y=log_df.iloc[:, 11].values,
                mode="lines",
                name="Train",
                line=dict(color="orange"),
            )
        )
        fig.update_layout(
            height=400,
            width=900,
            yaxis_range=[0, None],
            title_text=f"{model_name} - Population Fitness Diversity<br>(Outer fold {i_outer}: Comb {param_str})",
            title_font = dict(size=15),
            xaxis_title="Generation",
            yaxis_title="Fitness Standard Deviation",
            title_y=0.93,
            margin=dict(t=80, l=50, r=80, b=50),
        )
        fig.show()

In [13]:
pop_fitness_diversity(k_outer, MODEL_NAME, skip_n_gens=2)

## Early Convergence Discussion 

- Diversity peaks initially, reflecting the random initialization of the population.

- The population seems to sustain a decent level of diversity across subsequent generations.


## Future Work

- In class, we observed that widening the mutation step bounds (`ms_lower`, `ms_upper`) improved the SLIM variants using summation; this could be worth further exploration.
 