In [13]:
# Standard library imports
import itertools
import json
import datetime
import pathlib

# Third-party imports
import pandas as pd
import torch
from sklearn.model_selection import KFold

# Slim-GSGP imports
from slim_gsgp.datasets.data_loader import load_pandas_df  
from slim_gsgp.utils.utils import train_test_split  
from slim_gsgp.main_gp import gp
#from slim_gsgp.main_gsgp import gsgp
from slim_gsgp.main_slim import slim
from slim_gsgp.evaluators.fitness_functions import rmse

import statistics
from collections import defaultdict
import itertools 

from sklearn.model_selection import KFold
import numpy as np
import os 
import random
from itertools import product

In [14]:

# os.chdir(os.path.join(os.getcwd(), os.pardir))

## Aux Functions

In [15]:
def fit_model_GridSearch(gp_model, fixed_params, param_grid, seed):
    """
    Fit ``gp_model`` once for every combination of values in ``param_grid``.

    Args:
        gp_model (callable): Called as ``gp_model(**params, seed=seed)``. The object it
            returns must expose ``fitness`` and ``test_fitness`` attributes, each
            providing an ``.item()`` method.
        fixed_params (dict): Keyword arguments shared by every fit. Not modified.
        param_grid (dict): Maps a parameter name to an iterable of candidate values.
            An empty grid produces a single fit that uses only ``fixed_params``.
        seed (int): Seed forwarded unchanged to every fit.

    Returns:
        list: One dictionary per combination, in ``itertools.product`` order, with the
        keys 'model', 'rmse_train', 'rmse_test' and 'dynamic_params'.
    """
    # Split the grid into names and value lists without unpacking a zip, so an
    # empty grid does not raise; product() of nothing yields one empty combination.
    keys = list(param_grid)
    value_lists = [param_grid[key] for key in keys]

    models = []
    for combo in product(*value_lists):
        dynamic_params = dict(zip(keys, combo))
        # Grid values take precedence over fixed values that share the same name.
        full_params = {**fixed_params, **dynamic_params}
        model = gp_model(**full_params, seed=seed)
        models.append({
            'model': model,
            'rmse_train': model.fitness.item(),
            'rmse_test': model.test_fitness.item(),
            'dynamic_params': dynamic_params,
        })
    return models

In [16]:
def group_and_median_rmse(results_data):
    """
    Compute the median 'rmse_test' of every distinct 'dynamic_params' combination.

    Args:
        results_data (list): A list of lists; each inner list holds dictionaries
                             carrying the keys 'dynamic_params' and 'rmse_test'.

    Returns:
        list: One dictionary per distinct combination, in order of first appearance:
              {'dynamic_params': {...}, 'rmse_test_median': float}
    """
    first_seen_params = {}
    scores_by_combo = {}

    for fold_results in results_data:
        for entry in fold_results:
            params = entry['dynamic_params']
            score = entry['rmse_test']

            # Dicts are unhashable, so the sorted item tuple serves as the grouping
            # key; sorting makes the key independent of insertion order.
            combo_key = tuple(sorted(params.items()))

            if combo_key not in scores_by_combo:
                # Remember the first dict seen for this combination for the output.
                first_seen_params[combo_key] = params
                scores_by_combo[combo_key] = []
            scores_by_combo[combo_key].append(score)

    return [
        {
            'dynamic_params': first_seen_params[combo_key],
            'rmse_test_median': statistics.median(scores),
        }
        for combo_key, scores in scores_by_combo.items()
    ]

## Load Data 

In [17]:
# Reading the desired dataset
df = pd.read_csv("../data/sustavianfeed.csv", sep=';')

# Dropping the first column (index) and renaming the columns
df = df.drop(columns= ['WING TAG', 'EMPTY MUSCULAR STOMACH'])

# Moving crude protein to the end of the dataframe
df = df[[col for col in df.columns if col != 'CRUDE PROTEIN'] + ['CRUDE PROTEIN']] 

# Nested CV with Grid Search

In [18]:
# Base seed used by the cross-validation splitters and the model fits below.
seed = 42

# Edit the name and log directory based on the model you want to run

#MODEL_NAME = 'GP'
#MODEL_NAME = 'GSGP'
MODEL_NAME = 'SLIM-GSGP'

# Prefix of every log file name and the directory the log files are written to.
DATASET_NAME = MODEL_NAME +'_sustavianfeed'
LOG_DIR = './log/' + MODEL_NAME + '/'

LOG_LEVEL = 2

# exist_ok=True makes the cell safe to re-run and avoids the check-then-create race.
os.makedirs(LOG_DIR, exist_ok=True)

In [19]:
# Number of folds of the outer (test) and inner (validation) cross-validation loops.
k_outer, k_inner = 5, 3

In [20]:
# Turning df into X and y torch.Tensors
# Turning df into X and y torch.Tensors
# NOTE(review): presumably the last column ('CRUDE PROTEIN') becomes y — confirm
# against load_pandas_df.
X, y = load_pandas_df(df, X_y=True)

In [21]:
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True

# Nominal partition sizes obtained by integer division. The KFold splits used in
# the nested CV may differ from these by one instance when the sizes do not
# divide evenly.
total_instances = X.shape[0]
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

# A single print call; the newline separator reproduces the line-by-line layout.
print(
    f'Total Instances:\t{total_instances}\n--',
    f'Outer Train set:\t{outer_train_size}',
    f'Test set:\t\t{outer_test_size}\n--',
    f'Inner Train set:\t{inner_train_size}',
    f'Validation set:\t\t{inner_val_size}\n',
    sep='\n',
)

Total Instances:	96
--
Outer Train set:	77
Test set:		19
--
Inner Train set:	52
Validation set:		25



In [22]:
POP_SIZE = 20

# Dedicated generator seeded with the notebook-level `seed`, so the random tree
# constants are identical on every run. The module-level `random` generator is not
# seeded in any of the visible cells, which made these constants irreproducible.
_constants_rng = random.Random(seed)

# Keyword arguments passed unchanged to every model fit; the data tensors and the
# log path are added per fold by the nested cross-validation routine.
fixed_params = {
    # ---
    # Search Space
    'initializer': 'rhh',
    'init_depth': 2,
    # Nine constants drawn with uniform(0, 1) plus the fixed constant -1.
    'tree_constants': [_constants_rng.uniform(0, 1) for _ in range(9)] + [-1.],
    'tree_functions': ['add', 'subtract'],
    'prob_const': 0.1,
    # ---
    # Problem Instance
    'dataset_name': DATASET_NAME,
    # Reuse the notebook-level constants instead of repeating their literals.
    'fitness_function': FITNESS_FUNCTION,
    'minimization': MINIMIZATION,
    # ---
    # Model instance 
    # 2% of the population when POP_SIZE > 100, otherwise a tournament of 2.
    'tournament_size': int(POP_SIZE*0.02) if POP_SIZE>100 else 2,
    'pop_size': POP_SIZE,
    'ms_lower': 0,
    'ms_upper': 0.5,
    'reconstruct': False,
    # ---
    # Solve settings
    'n_iter': 500,
    'elitism': True,
    'n_elites': 2, 
    'test_elite': True,
    'log_level': LOG_LEVEL,
    'verbose': 0,
    'n_jobs': 1,

    # ---
    # GP unique settings
    #'max_depth': 10, 
    #'p_xo' : 0.5,

    # ---
    # GSGP unique settings
    

    # ---
    # SLIM unique settings
    'max_depth': 10,
    'p_inflate': 0.70,
    #'slim_version': 'SLIM+SIG2',
    #'copy_parent': True,


}


SLIM_VERSIONS = ['SLIM+SIG2', 'SLIM+SIG1', 'SLIM+ABS', 'SLIM*SIG2', 'SLIM*SIG1', 'SLIM*ABS']

# Hyperparameters explored by the inner grid search: name -> candidate values.
param_grid = {
    'slim_version': SLIM_VERSIONS,
    
}


In [23]:
def gp_nested_cross_validation(X, y, gp_model,  k_outer, k_inner, fixed_params, param_grid, seed, LOG_DIR, DATASET_NAME):
    """
    Perform nested cross-validation for a given model and dataset.

    For each outer fold, every combination in ``param_grid`` is fitted on each inner
    fold; the combination with the lowest median validation RMSE is then refitted on
    the whole outer learning set and evaluated on the outer test set.

    Args:
        X (torch.Tensor): Feature matrix.
        y (torch.Tensor): Target vector.
        gp_model (callable): The gp model to be evaluated.
        k_outer (int): Number of outer folds.
        k_inner (int): Number of inner folds.
        fixed_params (dict): Fixed parameters for the model. Not modified; the data
            and 'log_path' entries are added to per-fold copies.
        param_grid (dict): Parameter grid for hyperparameter tuning.
        seed (int): Random seed for reproducibility.
        LOG_DIR (str): Directory the per-fold log files are written to.
        DATASET_NAME (str): Prefix of every log file name.

    Returns:
        list: One dictionary per outer fold with the keys 'model', 'rmse_train',
        'rmse_test' and 'dynamic_params' (the selected combination).
    """
    cv_outer = KFold(n_splits=k_outer, random_state=seed, shuffle=True)
    cv_inner = KFold(n_splits=k_inner, random_state=seed, shuffle=True)

    models = []

    for i, (learning_ix, test_ix) in enumerate(cv_outer.split(X, y)):
        print(f'Outer fold {i+1}/{k_outer}')
        X_learning, y_learning = X[learning_ix], y[learning_ix]
        X_test, y_test = X[test_ix], y[test_ix]

        # Inner cross-validation: one list of grid-search results per inner fold.
        results = []

        # The inner indices are relative to the learning subset, hence the distinct
        # name (the outer index variable is no longer shadowed).
        for j, (inner_train_ix, val_ix) in enumerate(cv_inner.split(X_learning, y_learning)):

            print(f'-----\n Inner fold {j+1}/{k_inner}')
            X_inner_train, y_inner_train = X_learning[inner_train_ix], y_learning[inner_train_ix]
            X_inner_val, y_inner_val = X_learning[val_ix], y_learning[val_ix]

            print(f'Training shape: {X_inner_train.shape}\nValidation shape: {X_inner_val.shape}\n')

            # Build a per-fold parameter dict instead of updating the caller's dict.
            # The validation split is passed as the model's "test" data.
            inner_params = {
                **fixed_params,
                'X_train': X_inner_train, 'y_train': y_inner_train,
                'X_test': X_inner_val, 'y_test': y_inner_val,
                'log_path': _fresh_log_path(LOG_DIR, f'{DATASET_NAME}_outer_{i}_inner_{j}.csv'),
            }

            # NOTE(review): seed + k_inner is the same value for every inner fold;
            # confirm whether a per-fold seed (e.g. seed + j) was intended.
            res = fit_model_GridSearch(gp_model=gp_model, fixed_params=inner_params, param_grid=param_grid, seed=(seed+k_inner))

            results.append(res)

        medians = group_and_median_rmse(results)

        # The combination with the lowest median validation RMSE wins.
        best_dynamic_combo_median = min(medians, key=lambda x: x['rmse_test_median'])

        print(f'Best inner combination: {best_dynamic_combo_median["dynamic_params"]} with median RMSE: {best_dynamic_combo_median["rmse_test_median"]}')

        # Train the best model on the entire training set
        print('Training best combination on entire learning set')

        best_hyper_combo = best_dynamic_combo_median['dynamic_params']

        # Selected hyperparameters come last so they override any fixed value that
        # shares the same name.
        full_params = {
            **fixed_params,
            'X_train': X_learning, 'y_train': y_learning,
            'X_test': X_test, 'y_test': y_test,
            'log_path': _fresh_log_path(LOG_DIR, f'{DATASET_NAME}_outer_{i}.csv'),
            **best_hyper_combo,
        }

        # NOTE(review): seed + k_outer is the same value for every outer fold;
        # confirm whether a per-fold seed (e.g. seed + i) was intended.
        outer_model = gp_model(**full_params, seed=(seed+k_outer))

        models.append({
            'model': outer_model,
            'rmse_train': outer_model.fitness.item(),
            'rmse_test': outer_model.test_fitness.item(),
            'dynamic_params': best_hyper_combo,
        })

    return models


def _fresh_log_path(log_dir, file_name):
    """Return the path of ``file_name`` inside ``log_dir``, deleting any file already there."""
    # os.path.join gives the same path as plain concatenation when log_dir ends with
    # a separator, and still works when it does not.
    log_path = os.path.join(log_dir, file_name)
    if os.path.exists(log_path):
        os.remove(log_path)
    return log_path

In [24]:
# Bind the results to a name so later cells can use them without relying on `_` /
# `Out[...]`; the trailing expression keeps the cell's displayed output.
nested_cv_results = gp_nested_cross_validation(X, y, gp_model=slim, k_outer=k_outer, k_inner=k_inner, fixed_params=fixed_params, param_grid=param_grid, seed=seed, LOG_DIR=LOG_DIR, DATASET_NAME=DATASET_NAME)
nested_cv_results

Outer fold 1/5
-----
 Inner fold 1/3
Training shape: torch.Size([50, 12])
Validation shape: torch.Size([26, 12])

-----
 Inner fold 2/3
Training shape: torch.Size([51, 12])
Validation shape: torch.Size([25, 12])

-----
 Inner fold 3/3
Training shape: torch.Size([51, 12])
Validation shape: torch.Size([25, 12])

Best inner combination: {'slim_version': 'SLIM*ABS'} with median RMSE: 12.191201210021973
Training best combination on entire learning set
Outer fold 2/5
-----
 Inner fold 1/3
Training shape: torch.Size([51, 12])
Validation shape: torch.Size([26, 12])

-----
 Inner fold 2/3
Training shape: torch.Size([51, 12])
Validation shape: torch.Size([26, 12])

-----
 Inner fold 3/3
Training shape: torch.Size([52, 12])
Validation shape: torch.Size([25, 12])

Best inner combination: {'slim_version': 'SLIM*SIG2'} with median RMSE: 14.297585487365723
Training best combination on entire learning set
Outer fold 3/5
-----
 Inner fold 1/3
Training shape: torch.Size([51, 12])
Validation shape: torch

[{'model': <slim_gsgp.algorithms.SLIM_GSGP.representations.individual.Individual at 0x1bdfe41e2b0>,
  'rmse_train': 12.13796329498291,
  'rmse_test': 20.066356658935547,
  'dynamic_params': {'slim_version': 'SLIM*ABS'}},
 {'model': <slim_gsgp.algorithms.SLIM_GSGP.representations.individual.Individual at 0x1bdfe41e150>,
  'rmse_train': 11.974928855895996,
  'rmse_test': 12.361209869384766,
  'dynamic_params': {'slim_version': 'SLIM*SIG2'}},
 {'model': <slim_gsgp.algorithms.SLIM_GSGP.representations.individual.Individual at 0x1bdfe41c680>,
  'rmse_train': 12.581201553344727,
  'rmse_test': 10.441032409667969,
  'dynamic_params': {'slim_version': 'SLIM*SIG2'}},
 {'model': <slim_gsgp.algorithms.SLIM_GSGP.representations.individual.Individual at 0x1bdfe41dc80>,
  'rmse_train': 12.042012214660645,
  'rmse_test': 12.532037734985352,
  'dynamic_params': {'slim_version': 'SLIM*SIG2'}},
 {'model': <slim_gsgp.algorithms.SLIM_GSGP.representations.individual.Individual at 0x1bdfe41d5a0>,
  'rmse_tr

# Visualizations

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import ast  
from collections import defaultdict

from utils.vizualizations_functions import test_best_combs, train_test_best_combs, train_test_fit_and_size2
from utils.vizualizations_functions import train_test_fit, train_test_fit_and_size1, niche_entropy, plot_combs_together_test

In [None]:
# Load the headerless run log and its settings file for the visualizations below.
# NOTE(review): this rebinds `df`, which earlier in the notebook held the
# sustavianfeed dataset; re-running earlier cells afterwards needs a fresh reload.
# NOTE(review): these paths point to '../log/', while the nested-CV cells write
# to LOG_DIR ('./log/<MODEL_NAME>/') — confirm these are the intended files.
df = pd.read_csv("../log/gp.csv", header=None)
settings_df = pd.read_csv("../log/gp_settings.csv", header=None)
#unique_settings_df = settings_df.drop_duplicates(0) #the comb id is now indexed


In [None]:
# Legend: positional column index -> meaning for the headerless log dataframe `df`.
# 0  - Algorithm
# 1  - Instance ID
# 2  - Dataset
# 3  - Seed
# 4  - Generation
# 5  - Fitness
# 6  - Running time
# 7  - Population nodes
# 8  - Test fitness
# 9  - Elite nodes
# 10 - niche entropy
# The bare string below is the cell's only expression, so it is echoed as the
# cell output; the columns listed after it are not present in `df`.
"""From here on, it doesnt appear on df"""
# 11 - sd(pop.fit)
# 12 - Log level 

'From here on, it doesnt appear on df'

In [None]:
# Disabled helper: the whole definition is wrapped in a string literal, so it is
# never defined or executed (the cell only echoes the string).
# NOTE(review): it reads column 11 (sd of population fitness), which according to
# the column legend in this notebook is not present in `df` — presumably the reason
# for the "Out of Bounds" remark. Restore or delete once the log includes it.
'''def pop_fitness_diversity(df, train_color='blue'):
     """
     Out of Bounds
     """
     dif_combs = np.unique(df[[1]])
     for comb in dif_combs:
          y = df[df[1]==comb]
          #comb_dict = get_combination(comb)
          fig = go.Figure()
          fig.add_trace(go.Scatter(y=y.iloc[:,11].values, 
                                   mode='lines', name='Train', line=dict(color=train_color)))
          fig.update_layout(
          height=400, width=800, 
          margin=dict(t=50),
          yaxis_range=[0,None],
          title_text=f'GP - Population Fitness Diversity\nCombination:',
          xaxis_title='Generation', yaxis_title='Fitness Standard Deviation'
          )
          fig.show()'''



'def pop_fitness_diversity(df, train_color=\'blue\'):\n     """\n     Out of Bounds\n     """\n     dif_combs = np.unique(df[[1]])\n     for comb in dif_combs:\n          y = df[df[1]==comb]\n          #comb_dict = get_combination(comb)\n          fig = go.Figure()\n          fig.add_trace(go.Scatter(y=y.iloc[:,11].values, \n                                   mode=\'lines\', name=\'Train\', line=dict(color=train_color)))\n          fig.update_layout(\n          height=400, width=800, \n          margin=dict(t=50),\n          yaxis_range=[0,None],\n          title_text=f\'GP - Population Fitness Diversity\nCombination:\',\n          xaxis_title=\'Generation\', yaxis_title=\'Fitness Standard Deviation\'\n          )\n          fig.show()'

In [None]:
# Presumably plots train vs. test fitness from the loaded log `df` — confirm in
# utils.vizualizations_functions.
train_test_fit(df)

In [None]:
# Restricts the view to the combinations with ids 3 and 19.
# NOTE(review): presumably these ids refer to column 1 (Instance ID) of `df` — confirm.
train_test_fit_and_size1(df, comb_idxs=[3,19])

In [None]:
# Presumably plots the niche entropy column (index 10 in the legend) of `df` — confirm.
niche_entropy(df)

In [None]:
# Compares the combinations with ids 1, 2 and 3 (presumably on test fitness, going
# by the helper's name — confirm).
plot_combs_together_test(df,comb_idxs=[1,2,3])

# Secondary Visualizations

In [None]:
# NOTE(review): k_outer=2 is hard-coded here, while the nested-CV configuration in
# this notebook sets k_outer = 5 — confirm which run these plots are meant for.
test_best_combs(k_outer=2)

In [None]:
"""
I didnt experiment it yet with the new conditions for the new csv names
"""

# TODO: not yet verified against the new log CSV naming scheme (see the note above).
# NOTE(review): k_outer=2 is hard-coded, while the nested-CV configuration in this
# notebook sets k_outer = 5 — confirm.
train_test_fit_and_size2(k_outer=2, k_inner=3)

In [None]:
# NOTE(review): k_outer=2 and model_name='SLIM-GSGP' are hard-coded rather than
# taken from k_outer / MODEL_NAME defined earlier in the notebook — confirm they
# match the run being visualized.
train_test_best_combs(k_outer=2, n_rows=1, n_cols=2, model_name='SLIM-GSGP')