# Global settings

## Imports

In [None]:
import os
import pickle
import datetime
import torch
import random

import pandas as pd
import numpy as np
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset

from slim_gsgp.datasets.data_loader import *
from slim_gsgp.main_gsgp import gsgp


## Settings

In [None]:
# One seed shared by the random sources used in this notebook: NumPy's
# global generator, Python's `random` module, the K-fold shuffle and the
# GSGP run all receive this value.
seed = 1111
random.seed(seed)
np.random.seed(seed)


In [None]:
# Line colours used by every plot below (train curve, test curve)
train_color, test_color = 'blue', 'orange'


<br />

## Data simulation


$$
f(X) = x_1^2 + x_2^2 + x_3^2 + x_1 x_2 x_3 + e^{x_1} + N(0, 1)
$$


In [None]:
# Synthetic regression data: three features drawn from different
# distributions, combined non-linearly, plus unit-variance Gaussian noise.
# NOTE: all draws consume the same global NumPy random stream, so the order
# of the four sampling statements must not change.
n_samples = 100

feature_1 = np.random.uniform(low=-10, high=10, size=n_samples)
feature_2 = np.random.normal(loc=0, scale=5, size=n_samples)
feature_3 = np.random.beta(a=2, b=5, size=n_samples) * 20  # rescaled to [0, 20]
noise = np.random.normal(loc=0, scale=1, size=n_samples)

# Target = sum of squares + three-way interaction + exp(feature_1) + noise.
# The terms are accumulated left to right to keep the floating-point result
# unchanged.
squared_terms = feature_1**2 + feature_2**2 + feature_3**2
interaction_term = feature_1 * feature_2 * feature_3
target = squared_terms + interaction_term + np.exp(feature_1) + noise

df = pd.DataFrame(
    dict(
        feature_1=feature_1,
        feature_2=feature_2,
        feature_3=feature_3,
        target=target,
    )
)

# Notebook display: first rows of the simulated dataset
df.head()


# Cross-validation objects

In [None]:
# 10-fold splitter; shuffling is seeded so the folds are reproducible
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=seed,
)


<hr />

# GSGP

## Step 1: Problem Instance definition

- `X` and `y`: which dataset will be used?
- `fitness_function`: the fitness function used to measure how well the algorithm is learning.
- `minimization`: is this a minimization problem?


In [None]:
# Problem instance: pick the dataset by uncommenting exactly one DATASET line.
#   'syn'    - the synthetic DataFrame `df` simulated earlier in this notebook
#   'boston' - loaded through load_boston
#   'bike'   - loaded through load_bike_sharing
# DATASET = 'syn'
# DATASET = 'boston'
DATASET = 'bike'

if DATASET == 'syn':
    # The first three columns of df are the features, the fourth is the target
    X = torch.tensor(df.values[:, :3], dtype=torch.float32)
    y = torch.tensor(df.values[:, 3], dtype=torch.float32)
    DATASET_NAME = 'Synthetic'
elif DATASET == 'boston':
    X, y = load_boston(X_y=True)
    DATASET_NAME = 'Boston'
elif DATASET == 'bike':
    X, y = load_bike_sharing(X_y=True)
    # X = X[:, :11]
    DATASET_NAME = 'Bike'
else:
    # Fail here rather than continuing with X, y and DATASET_NAME undefined
    # (or left over from a previous run of this cell).
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected 'syn', 'boston' or 'bike'."
    )

# RMSE is an error measure, so lower fitness is better
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True


In [None]:
# Use only the first fold produced by the splitter: its test indices become
# the validation set, the remaining indices the training set.
train_ix, test_ix = next(cv.split(X, y))
data_cv = [train_ix, test_ix]

# Train and validation tensors
X_train_tensor, y_train_tensor = X[train_ix, :], y[train_ix]
X_val_tensor, y_val_tensor = X[test_ix, :], y[test_ix]

# Notebook display: shapes of the four tensors
[
    X_train_tensor.shape,
    y_train_tensor.shape,
    X_val_tensor.shape,
    y_val_tensor.shape,
]


## Step 2: Search space definition

- `initializer`: how new random trees are initialized. See [`slim_gsgp` initializers](https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/initializers/initializers.py);
- `tree_constants`: the constants to be used in the terminal set;
- `tree_functions`: the function set (tree internal nodes);
- `prob_const`: the probability for choosing constants instead of dataset features on tree terminals;
- `init_depth`: max depth for tree initialisation;


In [None]:
# Search space: how trees are initialised and what they are built from
INITIALIZER = 'rhh'
MAX_INIT_DEPTH = 4

# Function set (internal nodes)
TREE_FUNCTIONS = ['add', 'subtract']

# Terminal constants: nine values drawn uniformly from [0, 1] with the
# seeded `random` module, followed by the fixed constant -1
TREE_CONSTANTS = [random.uniform(0, 1) for _ in range(9)]
TREE_CONSTANTS.append(-1.)

# Probability of picking a constant rather than a dataset feature as terminal
PROB_CONSTANT = 0.9


In [None]:
# Notebook display of the terminal constants. The commented list below
# appears to be the output recorded from an earlier run -- TODO confirm it
# still matches the current seed.
TREE_CONSTANTS
# [0.21760077176688164,
#  0.3443807346030824,
#  0.6422536234699076,
#  0.36413206493253214,
#  0.08358916437841302,
#  0.5040914040192876,
#  0.18743462930144428,
#  0.8842252761132199,
#  0.33821341140965044,
#  -1.0]

## Step 3: GSGP Instance

The following hyperparameter options are the same as for GP:

- `pop_size`: the size of the population of candidate solutions.
- `p_xo`: the probability of applying the cross-over genetic operator to candidate solutions.
- `elitism`: should the elite(s) be preserved at each generation?
- `n_elites`: if using elitism, how many solutions should be kept?
- Selection method. Only tournament selection is available in the `slim_gsgp` library, as it is the most commonly used. It requires the `tournament_size` hyperparameter: how many solutions take part in each tournament?

**Additionally, GSGP requires:**

- `ms_lower`: lower bound for generating the random number used as mutation step.
- `ms_upper`: upper bound for generating the random number used as mutation step.
- `reconstruct`: whether to store the structure of individuals.


In [None]:
# Population and variation
POP_SIZE = 1000
P_XO = 0.9

# Elitism: preserve the N_ELITES best solutions at each generation.
# (The name ELISTISM is kept as spelt because the gsgp call reads it.)
ELISTISM, N_ELITES = True, 2

# Tournament selection with 7% of the population in each tournament
TOURNAMENT_SIZE = int(0.07 * POP_SIZE)
print(f'TOURNAMENT_SIZE: {TOURNAMENT_SIZE}')

# GSGP-specific settings: bounds of the random mutation step, and whether
# the structure of individuals is stored
MS_LOWER, MS_UPPER = 0, 1
RECONSTRUCT = True


## Step 4: Solve settings

Same as available for GP:

In [None]:
# Solve settings: number of generations, console verbosity and logging
GENERATIONS = 10
VERBOSE = 1

LOG_LEVEL = 2
LOG_DIR = './log/PC3/'
LOG_PATH = os.path.join(LOG_DIR, 'gsgp_' + DATASET_NAME + '.csv')

# exist_ok=True replaces the separate "exists?" check, so there is no window
# between checking for the directory and creating it.
os.makedirs(LOG_DIR, exist_ok=True)

# Start from a clean log file so the plots below read only this run's rows
# (assumes the logger appends to an existing file -- TODO confirm).
try:
    os.remove(LOG_PATH)
except FileNotFoundError:
    # No log from a previous run: nothing to clean up
    pass

print(f'Total evaluations: {POP_SIZE*GENERATIONS}\n')


## Solve 

In [None]:
# Run GSGP with the settings defined in Steps 1-4 and keep the returned
# object as `model`. Keyword arguments are grouped by the step that defines
# them; their order has no effect on the call.
model = gsgp(
    # --- Step 1: problem instance ---
    X_train=X_train_tensor,
    y_train=y_train_tensor,
    X_test=X_val_tensor,
    y_test=y_val_tensor,
    dataset_name=DATASET_NAME,
    fitness_function=FITNESS_FUNCTION,
    minimization=MINIMIZATION,
    # --- Step 2: search space ---
    initializer=INITIALIZER,
    tree_constants=TREE_CONSTANTS,
    tree_functions=TREE_FUNCTIONS,
    prob_const=PROB_CONSTANT,
    init_depth=MAX_INIT_DEPTH,
    # max_depth=MAX_DEPTH,
    # --- Step 3: GSGP instance ---
    pop_size=POP_SIZE,
    p_xo=P_XO,
    elitism=ELISTISM,
    n_elites=N_ELITES,
    tournament_size=TOURNAMENT_SIZE,
    ms_lower=MS_LOWER,
    ms_upper=MS_UPPER,
    reconstruct=RECONSTRUCT,
    # --- Step 4: solve settings ---
    n_iter=GENERATIONS,
    verbose=VERBOSE,
    log_path=LOG_PATH,
    log_level=LOG_LEVEL,
    test_elite=True,
    n_jobs=1,
    seed=seed,
)


In [None]:
# Layout of the log file written at log level 2. The file is read with
# header=None, so columns are addressed by zero-based position:
# -----------
# 0  - Algorithm
# 1  - Instance ID
# 2  - Dataset
# 3  - Seed
# 4  - Generation
# 5  - Fitness
# 6  - Running time
# 7  - Population nodes
# 8  - Test fitness
# 9  - Elite nodes
# 10 - Genotype diversity: gsgp_pop_div_from_vectors (Calculate the diversity of a population from semantic vectors)
# 11 - Phenotype diversity: sd(pop.fit)
# 12 - Log level
# Notebook display: first rows of the log
pd.read_csv(LOG_PATH, header=None).head()


In [None]:
# Train vs test fitness per generation.
# The log is parsed once and reused for both traces (it was previously read
# from disk once per trace). Column 5 holds the fitness, column 8 the test
# fitness.
log_df = pd.read_csv(LOG_PATH, header=None)

fig = go.Figure()
fig.add_trace(go.Scatter(y=log_df.iloc[:, 5].values,
                         mode='lines', name='Train', line=dict(color=train_color)))
fig.add_trace(go.Scatter(y=log_df.iloc[:, 8].values,
                         mode='lines', name='Test', line=dict(color=test_color)))
fig.update_layout(
    height=400, width=800,
    margin=dict(t=50),
    title_text='GSGP - Train vs Test Fitness ('+DATASET_NAME+' dataset)',
    xaxis_title='Generation', yaxis_title='RMSE'
)
# Anchor the y axis at zero; the upper bound stays automatic
fig.update_yaxes(range=[0, None])
fig.show()


In [None]:
# Size of the elite solution per generation (column 9 of the log: elite nodes)
elite_nodes = pd.read_csv(LOG_PATH, header=None).iloc[:, 9].values

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        y=elite_nodes,
        mode='lines',
        name='Train',
        line={'color': train_color},
    )
)
fig.update_layout(
    title_text=f'GSGP - Solution size ({DATASET_NAME} dataset)',
    xaxis_title='Generation',
    yaxis_title='Nodes count',
    # yaxis_type='log',
    height=400,
    width=800,
    margin={'t': 50},
)
# Anchor the y axis at zero; the upper bound stays automatic
fig.update_yaxes(range=[0, None])
fig.show()


In [None]:
def _logged_scalar_to_float(value):
    """Convert one logged diversity entry to a float.

    Accepts entries that are already numeric as well as strings holding a
    tensor repr such as ``tensor(1.23)`` or ``tensor(1.23, device='cuda:0')``.

    Args:
        value: one cell of the diversity column, as returned by pandas.

    Returns:
        The entry as a Python float.

    Raises:
        ValueError: if the remaining text is not a valid float literal.
    """
    text = str(value).strip()
    if text.startswith('tensor('):
        # Keep only the first argument of the repr: drop any trailing
        # "device=..." / "dtype=..." fields and the closing parenthesis.
        text = text[len('tensor('):].split(',', 1)[0].rstrip(')')
    return float(text)


# Semantic diversity of the population per generation (column 10 of the log)
fig = go.Figure()
div_vector_log = pd.read_csv(LOG_PATH, header=None).iloc[:,10].values
div_vector_values = np.array([_logged_scalar_to_float(x) for x in div_vector_log])
fig.add_trace(go.Scatter(y=div_vector_values,
                         mode='lines', name='Train', line=dict(color=train_color)))
fig.update_layout(
    height=400, width=800,
    margin=dict(t=50),
    title_text='GSGP - Population Semantic Diversity ('+DATASET_NAME+' dataset)',
    yaxis_range=[0,None],
    xaxis_title='Generation', yaxis_title='Semantic Diversity'
)
fig.show()


In [None]:
# Spread of fitness values in the population per generation
# (column 11 of the log: standard deviation of the population fitness)
fitness_sd = pd.read_csv(LOG_PATH, header=None).iloc[:, 11].values

fig = go.Figure()
fig.add_trace(
    go.Scatter(
        y=fitness_sd,
        mode='lines',
        name='Train',
        line={'color': train_color},
    )
)
fig.update_layout(
    title_text=f'GSGP - Population Fitness Diversity ({DATASET_NAME} dataset)',
    xaxis_title='Generation',
    yaxis_title='Fitness Standard Deviation',
    yaxis_range=[0, None],
    height=400,
    width=800,
    margin={'t': 50},
)
fig.show()


In [None]:
# model.predict(X_val_tensor)


In [None]:
# Notebook display: fitness and test fitness stored on the object returned
# by gsgp (presumably the final elite -- TODO confirm against slim_gsgp).
[model.fitness, model.test_fitness]


In [None]:
# Notebook display: `nodes` attribute of the returned model
# (the name suggests its node count -- TODO confirm against slim_gsgp).
model.nodes

In [None]:
from slim_gsgp.algorithms.GSGP.representations.tree import Tree
def print_nested_list(nested_list):
    """Print the stored structure of a GSGP individual as an infix expression.

    The structure is walked recursively and written to stdout:

    - A ``list`` is read as ``[operator, *operands]`` and dispatched on
      ``operator.__name__``:

      * ``geometric_crossover`` prints ``T1 * TR + T2 * (1 - TR)`` followed
        by a blank line, where ``T1``, ``T2`` and ``TR`` are elements 1, 2
        and 3 of the list.
      * ``standard_geometric_mutation`` prints ``T + (ms * (TR1 - TR2))``
        followed by a newline, where ``T``, ``TR1``, ``TR2`` and ``ms`` are
        elements 1, 2, 3 and 4 of the list.

      An empty list, or a list whose first element is any other operator (or
      has no ``__name__``), prints nothing.
    - A ``tuple`` is printed as is, followed by a newline.
    - Any other object must expose a ``structure`` attribute. A list
      structure is expanded recursively; anything else (a tuple, or a plain
      string) is printed without a trailing newline.

    Args:
        nested_list: a structure list, a tuple, or an object with a
            ``structure`` attribute.

    Returns:
        None. The expression is written to stdout.

    Raises:
        AttributeError: if a non-list, non-tuple argument has no
            ``structure`` attribute.
    """
    if isinstance(nested_list, list):
        if not nested_list:
            # Nothing to describe; avoids an IndexError on nested_list[0]
            return

        operator_name = getattr(nested_list[0], '__name__', None)

        if operator_name == 'geometric_crossover':
            # T1 * TR
            print_nested_list(nested_list[1])
            print(' * ', end='')
            print_nested_list(nested_list[3])

            # + T2 * (1 - TR)
            print(' + ', end='')
            print_nested_list(nested_list[2])
            print(' * (1 - ', end='')
            print_nested_list(nested_list[3])
            print(')\n')

        elif operator_name == 'standard_geometric_mutation':
            # T
            print_nested_list(nested_list[1])

            # + (ms
            print(' + (', end='')
            print(nested_list[4], end='')

            # * (TR1 - TR2))
            print(' * (', end='')
            print_nested_list(nested_list[2])
            print(' - ', end='')
            print_nested_list(nested_list[3])
            # Two parentheses were opened above (' + (' and ' * ('), so two
            # must be closed here.
            print('))')
    elif isinstance(nested_list, tuple):
        print(nested_list)
    else:
        structure = nested_list.structure
        if isinstance(structure, list):
            print_nested_list(structure)
        else:
            # Leaf: a tuple, or a plain string (which previously caused an
            # AttributeError by being recursed into).
            print(structure, end='')

# The structure of individuals is only stored when reconstruct is enabled,
# so the expression is printed only in that case.
if RECONSTRUCT:
    print_nested_list(model.structure)


<br />
<hr />

# Exercises (not graded)

- Experiment with different synthetic datasets.
- Run nested cross-validation for hyperparameter tuning using the Boston dataset.

<br />