In [15]:
#! git clone https://github.com/LeonardoDiCaterina/Neural_Evo_Learn

In [16]:
#! pip install git+https://github.com/DALabNOVA/slim.git

In [17]:
# Standard library imports
import itertools
import json
import datetime
import pathlib

# Third-party imports
import pandas as pd
import torch
from sklearn.model_selection import KFold

# Slim-GSGP imports
from slim_gsgp.datasets.data_loader import load_pandas_df
from slim_gsgp.utils.utils import train_test_split
from slim_gsgp.main_gp import gp
#from slim_gsgp.main_gsgp import gsgp
#from slim_gsgp.main_slim import slim
from slim_gsgp.evaluators.fitness_functions import rmse

import statistics
from collections import defaultdict
import itertools

from sklearn.model_selection import KFold
import numpy as np
import os
import random
from itertools import product

In [18]:
# Step up to the parent directory before importing the project-local `utils`
# package (presumably the repository root, where `utils/` lives — confirm),
# then return to notebooks/ so the relative paths used further down
# (e.g. "../data/...") keep resolving from this folder.
os.chdir(os.path.join(os.getcwd(), os.pardir))
from utils.grid_search import gp_nested_cross_validation
# NOTE(review): the wildcard import hides which helper names this notebook
# actually relies on; consider importing them explicitly.
from utils.new_visualization_funcs import *
%cd notebooks/

c:\Users\rafas\Documents\University\NEL\Neural_Evo_Learn\notebooks


  self.shell.db['dhist'] = compress_dhist(dhist)[-100:]


## Load Data

In [19]:
# Read the semicolon-separated dataset and discard the 'WING TAG' and
# 'EMPTY MUSCULAR STOMACH' columns in a single chained expression.
df = pd.read_csv("../data/sustavianfeed.csv", sep=';').drop(
    columns=['WING TAG', 'EMPTY MUSCULAR STOMACH']
)

# Reorder the columns so that 'CRUDE PROTEIN' comes last; all other columns
# keep their relative order.
df = df.loc[:, [name for name in df.columns if name != 'CRUDE PROTEIN'] + ['CRUDE PROTEIN']]

# Nested CV with Grid Search

In [20]:
# Seed Python's global `random` generator; the same seed value is also passed
# on to the nested cross-validation call later in the notebook.
seed = 42
random.seed(seed)

# Edit the name and log directory based on the model you want to run
MODEL_NAME = 'TEST_GP'
#MODEL_NAME = 'GSGP'
#MODEL_NAME = 'SLIM-GSGP'

DATASET_NAME = MODEL_NAME +'_sustavianfeed'
# Keep the trailing slash: file names are appended to LOG_DIR by plain string
# concatenation further down.
LOG_DIR = './log/' + MODEL_NAME + '/'

LOG_LEVEL = 2

# exist_ok=True makes the cell safely re-runnable and avoids the race between
# checking for the directory and creating it.
os.makedirs(LOG_DIR, exist_ok=True)

In [21]:
# Fold counts for the nested cross-validation: k_outer sizes the outer
# (test) split and k_inner the inner (validation) split computed below;
# both are also passed to gp_nested_cross_validation.
k_outer = 10
k_inner = 5

In [22]:
# Turning df into X and y torch.Tensors
# NOTE(review): presumably load_pandas_df takes the last column as the target y,
# which is why 'CRUDE PROTEIN' is moved to the end of df — confirm against the
# slim_gsgp data loader.
X, y = load_pandas_df(df, X_y=True)

In [23]:
FITNESS_FUNCTION, MINIMIZATION = 'rmse', True

# Fold sizes implied by k_outer / k_inner (floor division, so these are the
# sizes of the smaller folds when the data does not divide evenly).
total_instances = X.shape[0]
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

# One print call, one line per entry; the output is identical to printing
# each entry separately.
print(
    f'Total Instances:\t{total_instances}\n--',
    f'Outer Train set:\t{outer_train_size}',
    f'Test set:\t\t{outer_test_size}\n--',
    f'Inner Train set:\t{inner_train_size}',
    f'Validation set:\t\t{inner_val_size}\n',
    sep='\n',
)

Total Instances:	96
--
Outer Train set:	87
Test set:		9
--
Inner Train set:	70
Validation set:		17



In [None]:
POP_SIZE = 20

# Settings that stay constant across every grid-search combination.
# NOTE: 'tree_constants' is drawn from the global `random` generator, so the
# values depend on the generator state when this cell runs (re-running the
# cell without re-seeding yields different constants).
fixed_params = {
    # ---
    # Search Space
    'initializer': 'rhh',
    'init_depth': 2,
    'tree_constants': [random.uniform(-1, 1) for _ in range(9)],
    'tree_functions': ['add', 'subtract','multiply','divide'],
    #'prob_const': 0.1,
    # ---
    # Problem Instance
    'dataset_name': DATASET_NAME,
    # Reuse the notebook-level constants so the problem definition lives in
    # exactly one place instead of being duplicated as literals here.
    'fitness_function': FITNESS_FUNCTION,
    'minimization': MINIMIZATION,
    # ---
    # Model instance
    #'tournament_size': int(POP_SIZE*0.02) if POP_SIZE>100 else 2,
    'pop_size': POP_SIZE,
    # ---
    # Solve settings
    'n_iter': 500,
    'elitism': True,
    'n_elites': 2,
    'test_elite': True,
    'log_level': LOG_LEVEL,
    'verbose': 0,
    'n_jobs': 1,

    # ---
    # GP unique settings
    #'max_depth': 10,
    #'p_xo' : 0.5,

    # ---
    # GSGP unique settings
    #'ms_lower': 0,
    #'ms_upper': 0.5,
    # ---
    # SLIM unique settings
    #'max_depth': 10,
    #'p_inflate': 0.70,
    #'slim_version': 'SLIM+SIG2',
    #'copy_parent': True,
    #'ms_lower': 0,
    #'ms_upper': 0.5,
    #'reconstruct': False,

}


# Hyper-parameters explored by the grid search; each key maps to its list of
# candidate values. Tournament sizes are 20%, 10% and 15% of POP_SIZE,
# truncated to an integer.
param_grid = {

        'p_xo' : [0.5, 0.7],
        'tournament_size': [int(POP_SIZE*0.20), int(POP_SIZE*0.10), int(POP_SIZE*0.15)],
        'prob_const': [0.1, 0.7],

}


In [None]:
# Run the nested cross-validation with grid search for the GP model, using
# the fold counts, fixed settings and parameter grid defined above.
outer_results = gp_nested_cross_validation(
    X,
    y,
    gp_model=gp,
    k_outer=k_outer,
    k_inner=k_inner,
    fixed_params=fixed_params,
    param_grid=param_grid,
    seed=seed,
    LOG_DIR=LOG_DIR,
    DATASET_NAME=DATASET_NAME,
)

In [None]:
# Tabulate the nested-CV results and write them as CSV into the log directory
# (LOG_DIR already ends with a path separator).
outer_results_df = pd.DataFrame(outer_results)
outer_results_df.to_csv(f'{LOG_DIR}{DATASET_NAME}_outer_results.csv', index=False)

In [None]:
#!zip -r logs.zip log/

  adding: log/ (stored 0%)
  adding: log/gp.csv (deflated 89%)
  adding: log/gp_settings.csv (deflated 94%)
  adding: log/GP/ (stored 0%)
  adding: log/GP/GP_sustavianfeed_outer_0.csv (deflated 84%)
  adding: log/GP/GP_sustavianfeedouter0inner_4.csv (deflated 85%)
  adding: log/GP/GP_sustavianfeed_outer_8.csv (deflated 84%)
  adding: log/GP/GP_sustavianfeedouter0inner_1.csv (deflated 84%)
  adding: log/GP/GP_sustavianfeedouter5inner_1_settings.csv (deflated 93%)
  adding: log/GP/GP_sustavianfeedouter1inner_2_settings.csv (deflated 93%)
  adding: log/GP/GP_sustavianfeedouter8inner_4.csv (deflated 85%)
  adding: log/GP/GP_sustavianfeedouter6inner_0.csv (deflated 85%)
  adding: log/GP/GP_sustavianfeedouter6inner_2.csv (deflated 85%)
  adding: log/GP/GP_sustavianfeedouter4inner_0.csv (deflated 85%)
  adding: log/GP/GP_sustavianfeedouter1inner_1_settings.csv (deflated 93%)
  adding: log/GP/GP_sustavianfeed_outer_8_settings.csv (deflated 65%)
  adding: log/GP/GP_sustavianfeedouter5inner_4_se

# Visualizations

In [None]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.subplots as sp
from plotly.subplots import make_subplots
import ast
from collections import defaultdict

In [None]:
# Both log files are read with header=None, so their columns are addressed by
# integer position from here on.
# NOTE(review): `df` is rebound here from the dataset frame loaded at the top of
# the notebook to the GP run log; re-running earlier cells after this one will
# pick up the wrong frame.
df = pd.read_csv("../log/gp.csv", header=None)
settings_df = pd.read_csv("../log/gp_settings.csv", header=None)
# Keep only the first settings row for each distinct value of column 0.
unique_settings_df = settings_df.drop_duplicates(subset=0)

In [None]:
# Preview the first de-duplicated settings rows (rich display as cell output).
unique_settings_df.head()

In [None]:
# Legend: positional columns of the header-less GP log loaded into `df`
# from ../log/gp.csv (index - meaning).
# 0  - Algorithm
# 1  - Instance ID
# 2  - Dataset
# 3  - Seed
# 4  - Generation
# 5  - Fitness
# 6  - Running time
# 7  - Population nodes
# 8  - Test fitness
# 9  - Elite nodes
# 10 - Niche entropy
# NOTE: the bare string below is the last expression in this cell, so Jupyter
# echoes it as the cell output; the columns listed after it are not in `df`.
"""From here on, it doesnt appear on df"""
# 11 - sd(pop.fit)
# 12 - Log level

In [None]:
'''def get_combination_str(setting_str, unique_settings_df = pd.DataFrame(df[1].unique())):
    comb_str = unique_settings_df[unique_settings_df[0]==setting_str][1][0]
    return comb_str'''

def param_in_combination(param:str, comb_str: str):
    """
    Extract the value of a single parameter from a settings ("combination") string.

    The combination string is expected to look like the text form of a dict,
    e.g. ``"{'p_xo': 0.5, 'pop_size': 20, 'seed': 42}"``. The key is matched as a
    whole quoted token followed by ``": "``, so asking for ``'log'`` does not
    accidentally match a longer key that merely contains it (e.g. ``'log_path'``).

    Parameters that the function can receive:
    'log' / 'verbose'/ 'test_elite' / 'n_jobs' / 'max_depth' / 'n_elites' / 'elitism' / 'n_iter'
    'settings_dict' / 'p_xo' / 'pop_size' / 'seed' / 'p_m' / 'p_c' / 'init_depth' / 'init_pop_size'

    Args:
        param: Name of the parameter (dict key) to look up, without quotes.
        comb_str: The combination string to search.

    Returns:
        The parameter value parsed with ``ast.literal_eval``. ``None`` is returned
        (after printing an explanation) when the key is absent, when the value is
        an object repr (``<...>``) or a list, or when the extracted text is not a
        valid Python literal.
    """
    value_start = -1
    # str(dict) renders keys with single quotes; accept double quotes as well.
    for quote in ("'", '"'):
        key_token = f"{quote}{param}{quote}: "
        key_pos = comb_str.find(key_token)
        if key_pos != -1:
            value_start = key_pos + len(key_token)
            break

    if value_start == -1:
        print(f'Parameter does not exist in the combination string or it cannot be accessed. ({param!r} not found)')
        return None

    # The value runs up to the next comma. Values that themselves contain commas
    # are therefore truncated and end up rejected by literal_eval below.
    value_end = comb_str.find(',', value_start)
    if value_end == -1:
        # Last entry: there is no trailing comma, so drop the closing brace instead.
        param_value = comb_str[value_start:].strip().rstrip('}').strip()
    else:
        param_value = comb_str[value_start:value_end].strip()

    if param_value.startswith('<') or param_value.startswith('['):
        print('The value of the parameter given cannot be converted to its original class:')
        return None

    try:
        return ast.literal_eval(param_value)
    except (SyntaxError, ValueError) as s:
        # literal_eval raises ValueError (not SyntaxError) for bare names such as `nan`.
        print(f'Parameter does not exist in the combination string or it cannot be accessed. ({s})')
        return None

#example
#print(get_combination_str('865732c8-3014-11f0-b37b-baa0ecd080fe'))
#print(param_in_combination('log', get_combination_str('865732c8-3014-11f0-b37b-baa0ecd080fe')))



In [None]:
'''def pop_fitness_diversity(df, train_color='blue'):
     """
     Out of Bounds
     """
     dif_combs = np.unique(df[[1]])
     for comb in dif_combs:
          y = df[df[1]==comb]
          #comb_dict = get_combination(comb)
          fig = go.Figure()
          fig.add_trace(go.Scatter(y=y.iloc[:,11].values,
                                   mode='lines', name='Train', line=dict(color=train_color)))
          fig.update_layout(
          height=400, width=800,
          margin=dict(t=50),
          yaxis_range=[0,None],
          title_text=f'GP - Population Fitness Diversity\nCombination:',
          xaxis_title='Generation', yaxis_title='Fitness Standard Deviation'
          )
          fig.show()'''



In [None]:
def plot_combs_together_test(df, comb_idxs: list | int | None = None,
                             colors = ['#FF0000', '#0000FF', '#00FF00', '#FFA500', '#800080',
                                       '#FF00FF', '#00FFFF', '#FFFF00', '#1F77B4', '#FF7F0E',
                                       '#2CA02C', '#D62728', '#9467BD', '#8C564B', '#E377C2',
                                       '#7F7F7F', '#AEC7E8', '#FFBB78', '#98DF8A', '#FF9896'],
                              ):
    """
    Plot the test-fitness curves of several runs on one shared figure.

    Runs are identified by the distinct values of column 1 of ``df`` (the
    instance ID in the notebook's column legend), in order of first appearance.
    For each selected run, column 8 (test fitness) is drawn as a line in row
    order; the algorithm name shown in the title is read from column 0.

    Args:
        df: Header-less log frame whose columns are addressed by integer label.
        comb_idxs: Positions (into the distinct IDs of column 1) of the runs to
            plot. A single int selects one run; ``None`` (the default) selects
            every run present in the ``df`` that was passed in.
        colors: Line colours, consumed in order; at least one per selected run.

    Raises:
        AssertionError: If there are fewer colours than selected runs.
        IndexError: If an index in ``comb_idxs`` is out of range.
    """
    unique_ids = df[1].unique()

    # Resolve the default from the frame actually passed in. A default computed in
    # the signature would be frozen at definition time from whatever global `df`
    # existed then.
    if comb_idxs is None:
        comb_idxs = list(range(len(unique_ids)))
    elif isinstance(comb_idxs, (int, np.integer)):
        # The annotation allows a bare int; normalise it to a one-element list.
        comb_idxs = [comb_idxs]

    assert len(colors)>=len(comb_idxs), "Not enough colors for all combinations"

    # Fallback title prefix so an empty selection still renders an (empty) figure.
    algo = df.iloc[0, 0] if len(df) else ''

    fig = go.Figure()
    for color, comb_idx in zip(colors, comb_idxs):
        run_rows = df[df[1] == unique_ids[comb_idx]]
        algo = run_rows.iloc[0, 0]

        fig.add_trace(go.Scatter(y=run_rows.iloc[:, 8].values,
                                 mode='lines', name=f'Test Comb {comb_idx}',
                                 line=dict(color=color)))

    # Axis title does not depend on the trace, so it is set once after the loop.
    fig.update_xaxes(title_text="Generation")

    fig.update_layout(
        width=1000,
        height=400,
        title_text = f"{algo} - Test Fitness (Combinations indexes: {comb_idxs})",
        showlegend=True,
        yaxis_range=[0,None],
        )

    fig.show()

