# Global settings

## Imports

In [None]:
import os
import pickle
import datetime
import torch
import random

import pandas as pd
import numpy as np
import plotly.graph_objects as go

from itertools import product
from collections import defaultdict
from sklearn.model_selection import KFold
from torch.utils.data import DataLoader, TensorDataset

from slim_gsgp.datasets.data_loader import *
from slim_gsgp.main_slim import slim

from plots import *


## Settings

In [None]:
# One seed for the whole notebook: it initialises the stdlib and NumPy
# generators here and is reused wherever a `seed` argument is required.
seed = 1111
random.seed(seed)
np.random.seed(seed)


# Cross-validation objects

In [None]:
# Number of folds for the outer and the inner cross-validation.
k_outer, k_inner = 15, 10

# Both splitters shuffle the data and share the notebook seed, so the
# partitions are identical from one run to the next.
cv_outer, cv_inner = (
    KFold(n_splits=n_folds, shuffle=True, random_state=seed)
    for n_folds in (k_outer, k_inner)
)


# Problem Instance definition

- `X` and `y`: which dataset will be used?
- `fitness_function`: the fitness function used to measure how well the algorithm is learning.
- `minimization`: is this a minimization problem?


In [None]:
# Choose the dataset by key; uncomment one of the alternatives to switch.
# DATASET = 'boston'
# DATASET = 'concrete_strength'
DATASET = 'bike'

if DATASET == 'boston':
    X, y = load_boston(X_y=True)
    DATASET_NAME = 'Boston'
elif DATASET == 'bike':
    X, y = load_bike_sharing(X_y=True)
    # X = X[:, :11]
    DATASET_NAME = 'Bike'
elif DATASET == 'concrete_strength':
    X, y = load_concrete_strength(X_y=True)
    DATASET_NAME = 'Concrete-Strength'
else:
    # Fail here rather than with a NameError on `X` further down.
    raise ValueError(
        f"Unknown DATASET {DATASET!r}; expected 'boston', 'bike' or 'concrete_strength'."
    )

# Fitness measure and optimisation direction of the problem instance.
FITNESS_FUNCTION = 'rmse'
MINIMIZATION = True

# Expected partition sizes of the nested cross-validation. These are
# floor-division estimates: when the number of instances is not a multiple
# of the number of folds, individual folds can hold one extra instance.
total_instances = X.shape[0]
outer_test_size = total_instances // k_outer
outer_train_size = total_instances - outer_test_size
inner_val_size = outer_train_size // k_inner
inner_train_size = outer_train_size - inner_val_size

print(f'Total Instances:\t{total_instances}\n--')
print(f'Outer Train set:\t{outer_train_size}')
print(f'Test set:\t\t{outer_test_size}\n--')
print(f'Inner Train set:\t{inner_train_size}')
print(f'Validation set:\t\t{inner_val_size}\n')


# Solve settings


In [None]:
# Folder that receives the SLIM log files, and the logging level handed to
# the solver through `fixed_params`.
LOG_DIR = './log/PC4/'
LOG_LEVEL = 2

# exist_ok avoids the check-then-create race and makes the cell re-runnable.
os.makedirs(LOG_DIR, exist_ok=True)


In [None]:
POP_SIZE = 20
SLIM_VERSIONS = ['SLIM+SIG2', 'SLIM+SIG1', 'SLIM+ABS', 'SLIM*SIG2', 'SLIM*SIG1', 'SLIM*ABS']

# Keyword arguments shared by every SLIM run; the data splits and the log
# path are added per fold before the solver is called.
fixed_params = {
    # ---
    # Search Space
    'initializer': 'rhh',
    'init_depth': 2,
    'max_depth': 10,
    # Nine constants drawn uniformly from [0, 1] plus the fixed constant -1.
    'tree_constants': [random.uniform(0, 1) for _ in range(9)] + [-1.],
    'tree_functions': ['add', 'subtract'],
    'prob_const': 0.1,
    # ---
    # Problem Instance
    'dataset_name': DATASET_NAME,
    # Taken from the problem-instance constants so both stay in sync.
    'fitness_function': FITNESS_FUNCTION,
    'minimization': MINIMIZATION,
    # ---
    # GSGP instance
    # 2% of the population for large populations, otherwise a binary tournament.
    'tournament_size': int(POP_SIZE*0.02) if POP_SIZE > 100 else 2,
    'pop_size': POP_SIZE,
    'ms_lower': 0,
    'ms_upper': 0.5,
    'p_inflate': 0.05,
    'copy_parent': True,
    'reconstruct': False,
    # ---
    # Solve settings
    'n_iter': 500,
    'elitism': True,
    'n_elites': 2,
    'test_elite': True,
    'log_level': LOG_LEVEL,
    'verbose': 0,
    'n_jobs': 1,
}

# Hyperparameters explored by the grid search (one run per combination).
param_grid = {
    'slim_version': SLIM_VERSIONS,
}


In [None]:
def call_slim(fixed_params, param_grid, seed, set_max_depth=False):
    """Run ``slim`` once for every combination of the values in ``param_grid``.

    Args:
        fixed_params: Keyword arguments passed to every ``slim`` call.
        param_grid: Mapping from parameter name to an iterable of candidate
            values. The Cartesian product of all values is explored; grid
            values override entries of ``fixed_params`` with the same name.
            An empty grid produces a single run using ``fixed_params`` only.
        seed: Value forwarded to ``slim`` as its ``seed`` argument.
        set_max_depth: When true, ``max_depth`` is replaced by
            ``init_depth + 15`` for every run.

    Returns:
        A list with one dict per combination, holding the keys ``'model'``
        (the object returned by ``slim``), ``'rmse_train'``
        (``model.fitness``), ``'rmse_test'`` (``model.test_fitness``) and
        ``'dynamic_params'`` (the grid values used for that run).
    """
    # Splitting names and values without `zip(*items)` keeps an empty grid
    # valid: `product()` with no arguments yields exactly one empty combo.
    names = list(param_grid)
    candidates = [param_grid[name] for name in names]

    runs = []
    for combo in product(*candidates):
        dynamic_params = dict(zip(names, combo))
        full_params = {**fixed_params, **dynamic_params}
        if set_max_depth:
            full_params['max_depth'] = full_params['init_depth'] + 15
        model = slim(**full_params, seed=seed)
        runs.append({
            'model': model,
            'rmse_train': model.fitness.item(),
            'rmse_test': model.test_fitness.item(),
            'dynamic_params': dynamic_params,
        })
    return runs


In [None]:
# Outer CV loop
# Notice that here we are using only the first iteration of the outer CV loop
learning_ix, test_ix = next(iter(cv_outer.split(X, y)))
data_cv_outer = [learning_ix, test_ix]

# Learning data feeds the inner CV; the test fold is kept aside.
X_learning, y_learning = X[learning_ix], y[learning_ix]
X_test, y_test = X[test_ix], y[test_ix]

print('\n'+'-'*41+'\n')
print(f'Outer CV\nLearning shape: {X_learning.shape}\nTest shape: {X_test.shape}\n')

# Inner CV loop
results = []
data_cv_inner = [[train_ix, val_ix] for train_ix, val_ix in cv_inner.split(X_learning, y_learning)]
for i_inner, (train_ix, val_ix) in enumerate(data_cv_inner):
    print('-----\nInner CV {}'.format(i_inner))
    # ----------
    # Data
    X_train, y_train = X_learning[train_ix], y_learning[train_ix]
    X_val, y_val = X_learning[val_ix], y_learning[val_ix]

    print(f'Training shape: {X_train.shape}\nValidation shape: {X_val.shape}\n')
    # The validation fold is passed in the solver's "test" slot, so the
    # reported test fitness of each run is its validation fitness.
    fixed_params.update({
        'X_train': X_train, 'y_train': y_train,
        'X_test': X_val, 'y_test': y_val
    })

    # Fit
    # Start every fold from a fresh log file so that reruns do not pile up
    # rows from earlier executions.
    LOG_PATH = LOG_DIR+'slim_'+DATASET_NAME+'_'+str(i_inner)+'.csv'
    if os.path.exists(LOG_PATH):
        os.remove(LOG_PATH)
    fixed_params.update({'log_path': LOG_PATH})
    # A different seed per fold, derived from the notebook seed.
    res = call_slim(fixed_params, param_grid, seed=(seed+i_inner))

    # Log
    results.append(res)


In [None]:
# Group the validation RMSE of every inner fold by hyperparameter
# configuration. The key doubles as the (HTML-formatted) box label.
rmse_by_config = defaultdict(list)

for split in results:
    for result in split:
        key = ''.join(
            k+': '+str(v)+' <br /> '
            for k, v in result['dynamic_params'].items()
        )
        rmse_by_config[key].append(result['rmse_test'])

# One box per configuration, with the individual fold scores overlaid.
fig = go.Figure()
for config, rmse_values in rmse_by_config.items():
    fig.add_trace(go.Box(
        y=rmse_values,
        boxpoints='all',
        jitter=0.5,
        pointpos=0,
        line=dict(color='orange'),
        name=config
    ))

fig.update_layout(
    title=DATASET_NAME+' dataset',
    xaxis_title='',
    yaxis_title='Test RMSE',
    height=500, width=1100,
    xaxis_tickangle=-90,
    yaxis_range=[0,None],
    margin=dict(l=50, r=50, t=50, b=20),
    showlegend=False,
    template='plotly_white'
)

fig.show()


In [None]:
# Plot settings
df_log = []
for i_inner in range(k_inner):
    tmp = pd.read_csv(LOG_DIR+'slim_'+DATASET_NAME+'_'+str(i_inner)+'.csv', header=None)
    tmp['cv'] = i_inner
    df_log.append(tmp)
df_log = pd.concat(df_log, ignore_index=True)

n_rows = 2
n_cols = 3


In [None]:
# Train vs. test fitness along the run, drawn from the collected logs.
make_evolution_plots(
    n_rows,
    n_cols,
    SLIM_VERSIONS,
    df_log,
    plot_title=f'SLIM - Train vs Test Fitness ({DATASET_NAME} dataset)',
)
# Trailing expression: echoes the population and tournament sizes used.
[fixed_params[name] for name in ('pop_size', 'tournament_size')]

In [None]:
# Same grid of plots, this time for the logged `size` variable.
make_evolution_plots(
    n_rows,
    n_cols,
    SLIM_VERSIONS,
    df_log,
    var='size',
    plot_title=f'SLIM -Size ({DATASET_NAME} dataset)',
)

<br />
<hr />

# Exercises (not graded)

- Run the nested cross-validation for hyperparameter tuning using the Concrete Strength dataset.

<br />