# Neural and Evolutionary Learning Project

Group members: 

- Iris Moreira - 20240659
- Leonardo Di Caterina - 20240485
- Rafael Borges - 20240497

## Second Delivery - GSGP

In [1]:
# Third-party imports
import pandas as pd
import torch
from sklearn.model_selection import KFold

# Slim-GSGP imports
from slim_gsgp.datasets.data_loader import load_pandas_df
from slim_gsgp.utils.utils import train_test_split
#from slim_gsgp.main_gp import gp
from slim_gsgp.main_gsgp import gsgp
#from slim_gsgp.main_slim import slim
from slim_gsgp.evaluators.fitness_functions import rmse

import statistics
from collections import defaultdict
import itertools

from sklearn.model_selection import KFold
import numpy as np
import os
import random
from itertools import product

In [2]:
# Import the project-local helpers that live one level above this notebook's
# directory. The working directory is switched to the project root only while
# the imports run and is always restored afterwards, so a failed import can no
# longer leave the kernel stranded in the parent directory, and the cell no
# longer depends on the notebook folder being called "notebooks".
_notebook_dir = os.getcwd()
os.chdir(os.path.join(_notebook_dir, os.pardir))
try:
    from utils.grid_search import gp_nested_cross_validation
    # Explicit names instead of `import *`, so it is clear where every
    # plotting helper called further down comes from.
    from utils.visualization_funcs import (
        fit_and_size_per_comb,
        fit_and_size_per_outer,
        pop_fitness_diversity,
        test_best_combs,
        train_test_best_combs,
    )
finally:
    os.chdir(_notebook_dir)

c:\Users\irism\OneDrive - NOVAIMS\Msc-NEL\Neural_Evo_Learn\notebooks


## Load Data

In [3]:
# Read the semicolon-separated dataset, drop the 'WING TAG' and
# 'EMPTY MUSCULAR STOMACH' columns, and move the 'CRUDE PROTEIN' column to the
# last position of the dataframe.
df = (
    pd.read_csv("../data/sustavianfeed.csv", sep=';')
    .drop(columns=['WING TAG', 'EMPTY MUSCULAR STOMACH'])
    .pipe(
        lambda frame: frame[
            [name for name in frame.columns if name != 'CRUDE PROTEIN'] + ['CRUDE PROTEIN']
        ]
    )
)

## Nested CV with Grid Search

In [4]:
# One seed shared by every random number generator this notebook relies on
# (Python's `random`, NumPy and PyTorch), so a Restart & Run All is repeatable.
seed = 42
random.seed(seed)
np.random.seed(seed)
torch.manual_seed(seed)

# Edit the name and log directory based on the model you want to run

#MODEL_NAME = 'GP'
MODEL_NAME = 'GSGP'
#MODEL_NAME = 'SLIM-GSGP'

# Both the dataset label and the log directory are derived from the model name.
DATASET_NAME = MODEL_NAME + '_sustavianfeed'
LOG_DIR = './log/' + MODEL_NAME + '/'

LOG_LEVEL = 2

# exist_ok=True creates the directory only when it is missing, without the
# separate existence check (which could race with another process).
os.makedirs(LOG_DIR, exist_ok=True)

In [5]:
# Fold counts handed to the nested cross-validation: outer loop, inner loop.
k_outer, k_inner = 10, 5

In [6]:
# Turning df into X and y torch.Tensors
# NOTE(review): 'CRUDE PROTEIN' was moved to the last column when the data was
# loaded; presumably load_pandas_df(..., X_y=True) takes the last column as the
# target y and the remaining columns as X — confirm against the slim_gsgp
# data_loader documentation.
X, y = load_pandas_df(df, X_y=True)

In [7]:
# Problem definition: the metric to optimise and the direction of optimisation.
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True

# Partition sizes implied by the fold counts. Floor division is used, so these
# are estimates; the folds actually produced may differ slightly.
total_instances = X.shape[0]
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

# A single print with a newline separator emits the same text as five prints.
print(
    f'Total Instances:\t{total_instances}\n--',
    f'Outer Train set:\t{outer_train_size}',
    f'Test set:\t\t{outer_test_size}\n--',
    f'Inner Train set:\t{inner_train_size}',
    f'Validation set:\t\t{inner_val_size}\n',
    sep='\n',
)

Total Instances:	96
--
Outer Train set:	87
Test set:		9
--
Inner Train set:	70
Validation set:		17



In [8]:
POP_SIZE = 50

# Dedicated generator for the tree constants. Drawing from the global `random`
# state made this cell return different constants every time it was re-run
# without re-running the seeding cell; a local generator seeded with `seed`
# yields the same nine values on every execution.
_constants_rng = random.Random(seed)

# Settings that stay the same for every combination of the grid search.
fixed_params = {
    # ---
    # Search Space
    'initializer': 'rhh',
    'init_depth': 2,
    'tree_constants': [_constants_rng.uniform(-1, 1) for _ in range(9)],
    'tree_functions': ['add', 'subtract', 'multiply', 'divide'],
    #'prob_const': 0.1,
    # ---
    # Problem Instance
    'dataset_name': DATASET_NAME,
    # Reuse the notebook-level constants instead of repeating the literals,
    # so the problem definition lives in one place only.
    'fitness_function': FITNESS_FUNCTION,
    'minimization': MINIMIZATION,
    # ---
    # Model instance
    #'tournament_size': int(POP_SIZE*0.02) if POP_SIZE>100 else 2,
    'pop_size': POP_SIZE,

    # ---
    # Solve settings
    'n_iter': 250,
    'elitism': True,
    'n_elites': 2,
    'test_elite': True,
    'log_level': LOG_LEVEL,
    'verbose': 0,
    'n_jobs': 1,

    # ---
    # GP unique settings
    #'max_depth': 10,
    #'p_xo' : 0.5,

    # ---
    # GSGP unique settings
    #'p_xo' : 0.5,
    'ms_lower': 0,
    'ms_upper': 0.5,
    #'reconstruct': False,

    # ---
    # SLIM unique settings
    #'max_depth': 10,
    #'p_inflate': 0.70,
    #'slim_version': 'SLIM+SIG2',
    #'copy_parent': True,
    #'ms_lower': 0,
    #'ms_upper': 1,
    #'reconstruct': False,
}

# Hyper-parameters explored by the grid search (every combination is tried).
param_grid = {
    'p_xo': [0.5, 0.7],
    # Tournament sizes as 5%, 10% and 15% of the population, truncated to int.
    'tournament_size': [int(POP_SIZE * 0.05), int(POP_SIZE * 0.10), int(POP_SIZE * 0.15)],
    'prob_const': [0.1, 0.4, 0.7],
}


In [9]:
# Run the nested cross-validation with grid search for the selected model;
# one keyword argument per line keeps the call readable.
outer_results = gp_nested_cross_validation(
    X,
    y,
    gp_model=gsgp,
    k_outer=k_outer,
    k_inner=k_inner,
    fixed_params=fixed_params,
    param_grid=param_grid,
    seed=seed,
    LOG_DIR=LOG_DIR,
    DATASET_NAME=DATASET_NAME,
)

Outer fold 1/10
-----
 Inner fold 1/5
Training shape: torch.Size([68, 12])
Validation shape: torch.Size([18, 12])

-----
 Inner fold 2/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 3/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 4/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 5/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

Best inner combination: {'p_xo': 0.7, 'tournament_size': 5, 'prob_const': 0.7} with median RMSE: 8.26526927947998
Training best combination on entire learning set
Outer fold 2/10
-----
 Inner fold 1/5
Training shape: torch.Size([68, 12])
Validation shape: torch.Size([18, 12])

-----
 Inner fold 2/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner fold 3/5
Training shape: torch.Size([69, 12])
Validation shape: torch.Size([17, 12])

-----
 Inner 

Saving results and configs to a .csv 

In [10]:
# Tabulate the nested-CV results and store them as a CSV inside the log directory.
outer_results_df = pd.DataFrame(outer_results)
outer_results_df.to_csv(f'{LOG_DIR}{DATASET_NAME}_outer_results.csv', index=False)


## Visualizations 

In [11]:
# Plot helper from utils.visualization_funcs.
# NOTE(review): presumably compares train and test fitness of the best
# combinations using the results saved for MODEL_NAME — confirm in the helper.
train_test_best_combs(model_name=MODEL_NAME)

In [12]:
# Plot helper from utils.visualization_funcs.
# NOTE(review): presumably shows the test fitness of the best combinations
# for MODEL_NAME — confirm in the helper.
test_best_combs(model_name=MODEL_NAME)

In [13]:
# Pass the notebook's k_outer constant instead of repeating the literal 10, so
# the plot stays in sync with the number of outer folds actually run.
fit_and_size_per_outer(k_outer=k_outer, model_name=MODEL_NAME)

## Brief Bloat and Overfitting Discussion 

- For every hyper-parameter setting we tried, the RMSE plateaus after roughly 100–200 iterations, yet the program trees keep growing. That continued growth—typical of GSGP—signals classic bloating.

- Signs of overfitting are inconclusive: some runs show the expected pattern (training RMSE < test RMSE), while others reverse it (test RMSE < training RMSE). The conflicting results are probably a side-effect of the small dataset.


In [18]:
# Fitness and size curves per combination, laid out as a 5 x 1 grid.
# NOTE(review): the leading 10 presumably mirrors k_outer — confirm against
# the helper's signature in utils.visualization_funcs.
fit_and_size_per_comb(
    10,
    MODEL_NAME,
    n_rows=5,
    n_cols=1,
)

In [None]:
# Plot helper from utils.visualization_funcs.
# NOTE(review): the leading 10 presumably mirrors k_outer — confirm against
# the helper's signature.
pop_fitness_diversity(10, MODEL_NAME)

## Early Convergence Discussion 

- The final plots reveal a sharp drop in population diversity during the first few iterations, likely a consequence of the small population size used in the grid search combined with tournament selection. Selection pressure needs to be reviewed.

## Future Work

- Increase population size 
- Review selection pressure 