# Global settings

## Imports

In [None]:
import os
import pickle
import datetime
import torch
import random

import pandas as pd
import numpy as np
import plotly.graph_objects as go

from plotly.subplots import make_subplots
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset

from slim_gsgp.datasets.data_loader import *
from slim_gsgp.main_gp import gp


## Settings

In [None]:
# One seed shared by the global NumPy and Python random generators,
# applied in that order.
seed = 1111
for _seed_rng in (np.random.seed, random.seed):
    _seed_rng(seed)


In [None]:
# Line colours used by every train/test plot in this notebook.
train_color, test_color = 'blue', 'orange'


<br />

## Data simulation

<br />

In addition to the real datasets, simulated data can be a good approach for exploring the algorithms. As an example, the following equation generates a dataset with three features ($x_1, x_2, x_3$) that are used to generate the target values $f(X)$, with some randomness drawn from a Normal distribution, $N(0, 3)$:

$$
f(X) = x_1^2 + x_2^2 + x_3^2 + x_1 x_2 x_3 + e^{x_1} + N(0, 3)
$$


In [None]:
n_samples = 100

# All draws use NumPy's global RNG, so the order of the four calls below
# (uniform, normal, beta, normal) determines the generated values.
feature_1 = np.random.uniform(-10, 10, size=n_samples)
feature_2 = np.random.normal(0, 5, size=n_samples)
feature_3 = 20 * np.random.beta(2, 5, size=n_samples)
noise = np.random.normal(0, 3, size=n_samples)

# Target = sum of squares + three-way interaction + exp(feature_1) + noise.
sum_of_squares = feature_1**2 + feature_2**2 + feature_3**2
interaction = feature_1 * feature_2 * feature_3
target = sum_of_squares + interaction + np.exp(feature_1) + noise

# Three feature columns followed by the target column.
df = pd.DataFrame(dict(
    feature_1=feature_1,
    feature_2=feature_2,
    feature_3=feature_3,
    target=target,
))

df.head()


# Cross-validation object

In [None]:
# Shuffled 10-fold splitter; the fixed random_state makes the folds repeatable.
cv = KFold(
    n_splits=10,
    shuffle=True,
    random_state=seed,
)


<hr />

# Exploring the slim_gsgp library


To begin exploring the slim_gsgp library, the first recommended resource is the official documentation: [Slim Documentation](https://slim-library.readthedocs.io/en/latest/).

Reading through the source code is also highly informative. _How should you navigate the slim_gsgp source code?_

<br />

<center>
    <img src='slim_framework.png' width=650 />
    <br />
    Figure 01. Overview of the slim_gsgp framework.
</center>

- To run an algorithm, use the method named after the algorithm in its **main script (MAIN module in Figure 01)**. For example, to explore GP, open the main GP script: [https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/main_gp.py](https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/main_gp.py).
- This main method **instantiates an object of the algorithm's class (ALGORITHMS module in Figure 01)**. The class implementation can be found in the file named after the algorithm. For GP, see: [https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/algorithms/GP/gp.py](https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/algorithms/GP/gp.py).
- Finally, it can be helpful to inspect the **configuration file (CONFIG module in Figure 01)** for the algorithm. These files contain, for example, the default hyperparameter settings. For GP, refer to: [https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/config/gp_config.py](https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/config/gp_config.py).

##### These steps are also recommended for the other algorithms of the library.

<br />

##### _How to extend the library (implement your own methods or modify it)?_
You can either create a branch on the library's GitHub repository or download the source code and work locally. Check the **Developer tutorial** for instructions on how to modify the library: [https://github.com/DALabNOVA/slim/blob/main/CONTRIBUTING.md](https://github.com/DALabNOVA/slim/blob/main/CONTRIBUTING.md).

<br />

<hr />

# GP


When solving a symbolic regression problem with GP, it is good practice to start by defining the problem instance and the search space. This is what is done in the next cells.

## Step 1: Problem Instance definition

- `X` and `y`: which dataset will be used?
- `fitness_function`: the fitness function that will be used to measure the algorithm's learning.
- `minimization`: is this a minimization problem?


In [None]:
# Dataset used in this run. Supported keys:
#   'syn'    - the synthetic DataFrame `df` (first three columns are the
#              features, the fourth column is the target)
#   'boston' - loaded with `load_boston`
#   'bike'   - loaded with `load_bike_sharing`
#              (https://archive.ics.uci.edu/dataset/275/bike+sharing+dataset)
DATASET = 'bike'

if DATASET == 'syn':
    X = torch.tensor(df.values[:, :3], dtype=torch.float32)
    y = torch.tensor(df.values[:, 3], dtype=torch.float32)
    DATASET_NAME = 'Synthetic'
elif DATASET == 'boston':
    X, y = load_boston(X_y=True)
    DATASET_NAME = 'Boston'
elif DATASET == 'bike':
    X, y = load_bike_sharing(X_y=True)
    # X = X[:, :11]
    DATASET_NAME = 'Bike'
else:
    # Fail here instead of leaving X, y and DATASET_NAME undefined (or stale
    # from a previous run of this cell) and breaking a later cell.
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected 'syn', 'boston' or 'bike'."
    )

# Fitness measure handed to the algorithm and whether lower values are better.
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True


In [None]:
# Only the first fold of the splitter is used in this notebook, so take it
# straight from the split iterator instead of materialising every fold.
train_ix, test_ix = next(iter(cv.split(X, y)))
data_cv = [train_ix, test_ix]

# Train / validation split for that fold.
X_train_tensor = X[train_ix, :]
y_train_tensor = y[train_ix]
X_val_tensor = X[test_ix, :]
y_val_tensor = y[test_ix]

# Display the four shapes as a quick sanity check.
[X_train_tensor.shape, y_train_tensor.shape, X_val_tensor.shape, y_val_tensor.shape]


## Step 2: Search space definition

- `initializer`: how new random trees are initialized. See [`slim_gsgp` initializers](https://github.com/DALabNOVA/slim/blob/main/slim_gsgp/initializers/initializers.py);
- `tree_constants`: the constants to be used in the terminal set;
- `tree_functions`: the function set (tree internal nodes);
- `prob_const`: the probability for choosing constants instead of dataset features on tree terminals;
- `init_depth`: max depth for tree initialisation;
- `max_depth`: max depth of trees during algorithm evolution.


In [None]:
# Tree initialisation strategy and depth limits.
INITIALIZER = 'grow'
MAX_INIT_DEPTH = 4
MAX_DEPTH = 7

# Terminal set: nine constants drawn uniformly from [0, 1], then the fixed
# constant -1.
TREE_CONSTANTS = [random.uniform(0, 1) for _ in range(9)]
TREE_CONSTANTS.append(-1.)
PROB_CONSTANT = 0.9

# Function set (internal nodes of the trees).
TREE_FUNCTIONS = ['add', 'subtract']


In [None]:
# Display the terminal constants sampled in the previous cell.
TREE_CONSTANTS

# Output recorded from one run (the nine random values depend on the state of
# Python's `random` generator when the constants were sampled):
# [0.21760077176688164,
#  0.3443807346030824,
#  0.6422536234699076,
#  0.36413206493253214,
#  0.08358916437841302,
#  0.5040914040192876,
#  0.18743462930144428,
#  0.8842252761132199,
#  0.33821341140965044,
#  -1.0]

## Step 3: GP Instance

This step is library-dependent. In `slim_gsgp`, the following customization options are available:

- `pop_size`: the size of the population of candidate solutions.
- `p_xo`: the probability of applying the cross-over genetic operator to candidate solutions.
- `elitism`: should the elite(s) be preserved at each generation?
- `n_elites`: if using elitism, how many solutions should be kept?
- Selection method. Only tournament selection is available in the `slim_gsgp` library, as this is the most commonly used method. It requires the definition of the `tournament_size` hyperparameter: **how many solutions should participate in each tournament of the tournament selection?**


In [None]:
# Population and variation.
POP_SIZE, P_XO = 50, 0.9

# Selection.
TOURNAMENT_SIZE = 2

# Elitism (the name ELISTISM is kept as-is: the solve cell refers to it).
ELISTISM, N_ELITES = True, 1


## Step 4: Solve settings

In [None]:
GENERATIONS = 30
VERBOSE = 1

# Log level 2 - columns written to the CSV log
# --------------------------------------------
# 0  - Algorithm
# 1  - Instance ID
# 2  - Dataset
# 3  - Seed
# 4  - Generation
# 5  - Fitness
# 6  - Running time
# 7  - Population nodes
# 8  - Test fitness
# 9  - Elite nodes
# 10 - Genotype diversity: niche entropy
# 11 - Phenotype diversity: sd(pop.fit)
# 12 - Log level
LOG_LEVEL = 2
LOG_DIR = './log/PC2/'
LOG_PATH = os.path.join(LOG_DIR, 'gp_'+DATASET_NAME+'.csv')

# Create the log directory if needed; exist_ok avoids the check-then-create
# race of a separate os.path.exists() test.
os.makedirs(LOG_DIR, exist_ok=True)

# Start from a clean log so rows from earlier runs do not mix with this run.
try:
    os.remove(LOG_PATH)
except FileNotFoundError:
    pass

print(f'Total evaluations: {POP_SIZE*GENERATIONS}\n')

## Solve 

In [None]:
# Keyword arguments for `gp`, grouped by the steps defined above.
search_space = dict(
    init_depth=MAX_INIT_DEPTH,
    max_depth=MAX_DEPTH,
    tree_constants=TREE_CONSTANTS,
    tree_functions=TREE_FUNCTIONS,
    prob_const=PROB_CONSTANT,
)

problem_instance = dict(
    X_train=X_train_tensor,
    y_train=y_train_tensor,
    X_test=X_val_tensor,
    y_test=y_val_tensor,
    dataset_name=DATASET_NAME,
    fitness_function=FITNESS_FUNCTION,
    minimization=MINIMIZATION,
)

gp_instance = dict(
    pop_size=POP_SIZE,
    p_xo=P_XO,
    initializer=INITIALIZER,
    tournament_size=TOURNAMENT_SIZE,
)

solve_settings = dict(
    n_iter=GENERATIONS,
    elitism=ELISTISM,
    n_elites=N_ELITES,
    test_elite=True,
    log_path=LOG_PATH,
    log_level=LOG_LEVEL,
    verbose=VERBOSE,
    n_jobs=1,
    seed=2,  # run seed passed to gp; distinct from the notebook-level `seed`
)

# Run GP; the returned object is used below for printing and prediction.
model = gp(
    **search_space,
    **problem_instance,
    **gp_instance,
    **solve_settings,
)


In [None]:
# Preview the first rows of the run log (the CSV has no header row).
log_preview = pd.read_csv(LOG_PATH, header=None)
log_preview.head()
# Column index -> meaning:
# 0  - Algorithm
# 1  - Instance ID
# 2  - Dataset
# 3  - Seed
# 4  - Generation
# 5  - Fitness
# 6  - Running time
# 7  - Population nodes
# 8  - Test fitness
# 9  - Elite nodes
# 10 - niche entropy
# 11 - sd(pop.fit)
# 12 - Log level


In [None]:
# Read the run log once (it was previously parsed once per trace).
# Column 5 is the training fitness and column 8 the test fitness.
log_df = pd.read_csv(LOG_PATH, header=None)

fig = go.Figure()
fig.add_trace(go.Scatter(y=log_df.iloc[:, 5].values,
                         mode='lines', name='Train', line=dict(color=train_color)))
fig.add_trace(go.Scatter(y=log_df.iloc[:, 8].values,
                         mode='lines', name='Test', line=dict(color=test_color)))
fig.update_layout(
    height=400, width=800,
    margin=dict(t=50),
    # Anchor the y-axis at 0 and let the upper bound autoscale; this is set
    # once here instead of again through a separate update_yaxes call.
    yaxis_range=[0, None],
    title_text='GP - Train vs Test Fitness ('+DATASET_NAME+' dataset)',
    xaxis_title='Generation', yaxis_title='RMSE'
)
fig.show()


In [None]:
from plotly.subplots import make_subplots

# Read the run log once (it was previously parsed once per trace).
# Columns: 5 = training fitness, 8 = test fitness, 9 = elite node count.
log_df = pd.read_csv(LOG_PATH, header=None)

fig = make_subplots(
    rows=1, cols=2,
    subplot_titles=('GP - Fitness evolution ('+DATASET_NAME+' dataset)', 'GP - Size evolution ('+DATASET_NAME+' dataset)')
)

# Left panel: train vs test fitness.
fig.add_trace(go.Scatter(y=log_df.iloc[:, 5].values,
                         mode='lines', name='Train', line=dict(color=train_color)), row=1, col=1)
fig.add_trace(go.Scatter(y=log_df.iloc[:, 8].values,
                         mode='lines', name='Test', line=dict(color=test_color)), row=1, col=1)
# Right panel: size of the elite.
fig.add_trace(go.Scatter(y=log_df.iloc[:, 9].values,
                         mode='lines', name='Size'), row=1, col=2)
fig.update_layout(
    width=1000,
    height=400,
    showlegend=True,
    # Applies to the first y-axis (the fitness panel) only.
    yaxis_range=[0, None],
    # Horizontal legend centred below the two panels.
    legend=dict(
        orientation='h',
        yanchor='bottom',
        y=-0.3,
        xanchor='center',
        x=0.5
    )
)
fig.show()


In [None]:
# Print the tree representation of the model returned by `gp`.
model.print_tree_representation()
# Uncomment to display the model's node count instead:
# model.node_count


In [None]:
# Column 10 of the log holds the niche entropy (genotype diversity).
entropy_values = pd.read_csv(LOG_PATH, header=None).iloc[:, 10].values
entropy_trace = go.Scatter(
    y=entropy_values,
    mode='lines',
    name='Train',
    line=dict(color=train_color),
)

fig = go.Figure()
fig.add_trace(entropy_trace)
fig.update_layout(
    title_text='GP - Niche entropy ('+DATASET_NAME+' dataset)',
    xaxis_title='Generation',
    yaxis_title='Entropy',
    yaxis_range=[0, None],
    width=800,
    height=400,
    margin=dict(t=50),
)
fig.show()


In [None]:
# Column 11 of the log holds the standard deviation of the population fitness
# (phenotype diversity).
fitness_sd_values = pd.read_csv(LOG_PATH, header=None).iloc[:, 11].values
fitness_sd_trace = go.Scatter(
    y=fitness_sd_values,
    mode='lines',
    name='Train',
    line=dict(color=train_color),
)

fig = go.Figure()
fig.add_trace(fitness_sd_trace)
fig.update_layout(
    title_text='GP - Population Fitness Diversity ('+DATASET_NAME+' dataset)',
    xaxis_title='Generation',
    yaxis_title='Fitness Standard Deviation',
    yaxis_range=[0, None],
    width=800,
    height=400,
    margin=dict(t=50),
)
fig.show()


In [None]:
# Predictions of the model returned by `gp` for the validation-fold inputs.
model.predict(X_val_tensor)


In [None]:
# Validation-fold targets, shown for comparison with the predictions.
y_val_tensor


In [None]:
# Fitness values stored on the returned model (presumably the training fitness
# and the fitness on the X_test/y_test data passed to `gp` - confirm in the
# library documentation).
[model.fitness, model.test_fitness]

<br />
<hr />

# Exercises (not graded)

- Experiment with new datasets.
- Run hyperparameter tuning for the Boston or synthetic data.
    
<br />
