In [1]:

import os
import csv
import torch
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd

from gpolnel.utils.datasets import load_boston
from torch.utils.data import TensorDataset, DataLoader

from gpolnel.problems.inductive_programming import SML
from gpolnel.utils.utils import train_test_split
from gpolnel.utils.ffunctions import Ffunctions
from gpolnel.utils.inductive_programming import function_map
from gpolnel.algorithms.genetic_algorithm import GeneticAlgorithm
from gpolnel.operators.initializers import grow, prm_grow, ERC
from gpolnel.operators.variators import swap_xo, prm_subtree_mtn
from gpolnel.operators.selectors import prm_tournament, roulette_wheel, rank_selection
from gpolnel.utils.inductive_programming import _execute_tree
seed=1
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
#_evaluate_individual_ffunction

In [2]:
from sklearn.metrics import mean_squared_error

def evaluate_solution( mheuristic, X_test, y_test):
    return mean_squared_error(_execute_tree(mheuristic.best_sol.repr_, X_test), y_test)

In [3]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'cpu'

In [4]:
X_train = torch.tensor(pd.read_csv('datamart/X_train_clipped_scaled.csv').values)
X_test = torch.tensor(pd.read_csv('datamart/X_test_clipped_scaled.csv').values)
y_train = torch.tensor(pd.read_csv('datamart/y_lactose_train.csv')['lactose_percent'].values)
y_test = torch.tensor(pd.read_csv('datamart/y_lactose_test.csv')['lactose_percent'].values)

In [5]:
fset = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]#function_map['sub'],
total_batches = 1
batch_size = X_train.shape[0]
sspace_sml = {
    'n_dims': X_train.shape[1],
    'function_set': fset, 'constant_set': ERC(-5., 5.),
    'p_constants': 0.1,
    'max_init_depth': 3,
    'max_depth': 5, 
    'n_batches': total_batches,
    'device': device
}

ps = 50
selection_pressure = .1
mutation_prob = .7
xo_prob = .9
has_elitism = True
allow_reproduction = False




In [6]:
shuffle= True
total_batches = 1
batch_size = X_train.shape[0]
# Creates training and validatation data sets
ds_train = TensorDataset(X_train, y_train)
ds_test = TensorDataset(X_test, y_test)

# Creates training and test data loaders
dl_train = DataLoader(ds_train, batch_size, shuffle)
dl_test = DataLoader(ds_test, batch_size, shuffle)

In [7]:
def train_model(dl_train, dl_val, ps, sspace_sml, log_path, selection_pressure, mutation_prob, xo_prob, has_elitism, allow_reproduction, seed=1, n_iter=3000, device='cpu'):

    pi_sml = SML(
        sspace=sspace_sml,
        ffunction=Ffunctions('rmse'),
        dl_train=dl_train, dl_test=dl_val,  # For the algorithm, the unseen is our validation!
        n_jobs=8
    )

    mheuristic = GeneticAlgorithm(
        pi=pi_sml,
        initializer=grow,
        selector=prm_tournament(pressure=selection_pressure),  #prm_tournament(pressure=selection_pressure)
        crossover=swap_xo,
        mutator=prm_subtree_mtn(initializer=prm_grow(sspace_sml)),
        pop_size=ps,
        p_m=mutation_prob,
        p_c=xo_prob,
        elitism=has_elitism,
        reproduction=allow_reproduction,  # False = or xo or mutation
        device=device,
        seed=seed
    )
    mheuristic._initialize()

    mheuristic.solve(
        n_iter,
        verbose=0, log=3, log_path=log_path,
        test_elite=True
    )
    return mheuristic

In [8]:
m = train_model(dl_train = dl_train, dl_val = dl_test, ps=ps, sspace_sml = sspace_sml, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=3, device='cpu')
m.best_sol

1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4


<gpolnel.utils.tree.Tree at 0x23f9d7fba90>

In [9]:
def gp_cross_validation(X_train, y_train, kf, ps, sspace_sml, log_path,  selection_pressure, mutation_prob, xo_prob, has_elitism, allow_reproduction, seed, n_iter, device):
    results = []
    
    for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
        ds_train = TensorDataset(X_train[train_index], y_train[train_index])
        ds_val = TensorDataset(X_train[val_index], y_train[val_index])

        # Creates training and test data loaders
        dl_train = DataLoader(ds_train, batch_size, shuffle)
        dl_val = DataLoader(ds_val, batch_size, shuffle)

        m = train_model(dl_train = dl_train, dl_val = dl_val, ps=ps, sspace_sml = sspace_sml, log_path=log_path,  selection_pressure=selection_pressure, mutation_prob=mutation_prob, xo_prob=xo_prob, has_elitism=has_elitism, allow_reproduction=allow_reproduction, seed=seed, n_iter=n_iter, device=device)
        mse = evaluate_solution(m, X_train[val_index], y_train[val_index])
        results.append(mse)
        
        
    avg_mse = np.mean(results)
    print(avg_mse)
    return avg_mse 

In [10]:
kf = RepeatedKFold(n_splits=10, random_state=seed, n_repeats=2)
gp_cross_validation(X_train, y_train, kf, ps=ps, sspace_sml = sspace_sml, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=2, device='cpu')

1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4
1
3
2
4


1.0267212566930435

In [11]:
# fsets = [[function_map['add'],  function_map['mul'], function_map['div'],function_map['sub'], function_map['exp'], function_map['log'], function_map['sin'], function_map['cos'], function_map['tanh'], function_map['lf'], function_map['mean'], function_map['max'], function_map['min']]]



In [12]:
# fset = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]#function_map['sub'],
# total_batches = 1
# batch_size = X_train.shape[0]
# sspace_sml = {
#     'n_dims': X_train.shape[1],
#     'function_set': fset, 'constant_set': ERC(-5., 5.),
#     'p_constants': 0.1,
#     'max_init_depth': 3,
#     'max_depth': 5, 
#     'n_batches': total_batches,
#     'device': device
# }

# ps = 226
# selection_pressure = .1
# mutation_prob = .7
# xo_prob = .9
# has_elitism = True
# allow_reproduction = False




# name = 'logs/NN/' + 'random_search' + '.csv'
# with open(name, 'w', newline='\n') as csvfile:
#     w = csv.writer(csvfile, delimiter=';')
#     w.writerow(['id']+['model']+['optimizer']+['learning_rate']+['l2']+['batch_size']+['n_epochs']+['cv_score'])


# for i in range(1000):

#     loss_fn = nn.MSELoss()
#     batch_size = random.choice(batch_sizes)
#     model = create_random_model()
#     lr = random.choice(learning_rates)
#     optimizer_name = random_optimizer()
#     n_epochs = random.choice(n_epoches)
#     weight_decay = random.choice(weight_decays)
#     score = nn_cross_validation(X_train=X_train, y_train=y_train, model=model, loss=loss_fn, optimizer_name = optimizer_name,lr=lr,  n_epochs=n_epochs, batch_size=batch_size,kf=kf, log=True, id=i)
    
#     name = 'logs/NN/' + 'random_search' + '.csv'
#     with open(name, 'a', newline='\n') as csvfile:
#         w = csv.writer(csvfile, delimiter=';')
#         w.writerow([i]+[f'{model}']+[f'{optimizer_name}']+[lr]+[weight_decay]+[batch_size]+[n_epochs]+[score])
    
#     if score < best_score:
#         best_score = score
#         print('Iteration '+ str(i))
#         print(f'New best validation score: {best_score}')
        
#         nn_train(X_train = X_train, y_train= y_train, model=model, loss_fn = loss_fn, optimizer_name = optimizer_name, lr=lr, weight_decay=weight_decay, n_epochs=n_epochs, batch_size=batch_size, log=True, id=i)
#         model = pickle.load(open('models/NN/' + str(i) + '_NN.pkl', 'rb'))
#         y_test_pred = model(X_test)
#         test_score = loss_fn(y_test_pred, y_test)
#         test_score = float(test_score)
#         print(f'New best test score: {test_score}')

In [13]:

fset = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]#function_map['sub'],

sspace_sml = {
    'n_dims': X_train.shape[1],
    'function_set': fset, 'constant_set': ERC(-5., 5.),
    'p_constants': 0.1,
    'max_init_depth': 3,
    'max_depth': 5, 
    'n_batches': total_batches,
    'device': device
}


In [14]:
pi_sml = SML(
    sspace=sspace_sml,
    ffunction=Ffunctions('rmse'),
    dl_train=dl_train, dl_test=dl_val,  # For the algorithm, the unseen is our validation!
    n_jobs=8
)
ps = 226
selection_pressure = .1
mutation_prob = .7
xo_prob = .9
has_elitism = True
allow_reproduction = False

mheuristic = GeneticAlgorithm(
    pi=pi_sml,
    initializer=grow,
    selector=prm_tournament(pressure=selection_pressure),  #prm_tournament(pressure=selection_pressure)
    crossover=swap_xo,
    mutator=prm_subtree_mtn(initializer=prm_grow(sspace_sml)),
    pop_size=ps,
    p_m=mutation_prob,
    p_c=xo_prob,
    elitism=has_elitism,
    reproduction=allow_reproduction,  # False = or xo or mutation
    device=device,
    seed=seed
)
mheuristic._initialize()

NameError: name 'dl_val' is not defined

In [None]:
rs = RandomSearch(pi_sml, prm_grow, device=device, seed=seed)
rs._initialize()

TypeError: prm_grow() got an unexpected keyword argument 'device'

In [None]:
# Log settings
file_dir = './log/'
file_name = 'intro.log'
log_path = file_dir + file_name
if os.path.exists(file_dir + file_name):
    os.remove(file_dir + file_name)
if not os.path.exists(file_dir):
    os.makedirs(file_dir)

# Learning
n_iter = 3000

mheuristic.solve(
    n_iter,
    verbose=3, log=3, log_path=log_path,
    test_elite=True
)

PermissionError: [WinError 32] Der Prozess kann nicht auf die Datei zugreifen, da sie von einem anderen Prozess verwendet wird: './log/intro.log'

In [None]:
pi_sml._evaluate_sol(mheuristic.best_sol, dl_val)

tensor(1.0317, dtype=torch.float64)

In [None]:
y_pred = _execute_tree(mheuristic.best_sol.repr_, X_test)

tensor([4.5729, 5.3822, 3.6637, 5.7050, 3.6637, 3.6637, 3.6637, 3.6637, 5.7050,
        5.7548, 3.6637, 5.7742, 3.6637, 3.6637, 5.7823, 3.6637, 3.6637, 4.9609,
        3.6637, 5.0766, 5.8608, 6.3908, 5.7080, 3.6637, 5.1885, 5.9177, 5.3403,
        4.8997, 3.6637, 6.1859, 5.8241, 3.6637, 3.6637, 5.9738, 5.7660, 5.3511,
        3.6637, 3.6637, 3.6637, 3.6637, 5.9516, 3.6637, 5.5144, 3.6637, 3.6637,
        5.1677, 5.7110, 5.1161, 4.8997, 3.6637, 3.6637, 5.5543, 4.9207, 3.6637,
        3.6637, 5.1237, 3.6637, 3.6637, 4.9706, 4.8667, 3.6637, 3.6637, 4.4982,
        3.6637, 6.0362, 3.6637, 3.6637, 6.0864, 5.2411, 6.0344, 3.6637, 3.6637,
        6.0540, 3.6637, 3.6637, 5.1461, 6.0627, 3.6637, 5.9658, 5.3872, 3.6637,
        5.7019, 6.3908, 3.6637, 4.8997, 5.0517, 3.6637, 6.1673, 3.6637, 5.8062,
        5.0079, 5.4849, 5.2956, 3.6637, 5.6833, 5.6409, 3.6637, 3.6637],
       dtype=torch.float64)

In [None]:
from sklearn.metrics import mean_squared_error

def evaluate_solution( mheuristic, X_test, y_test):
    return mean_squared_error(_execute_tree(mheuristic.best_sol.repr_, X_test), y_test)

1.0644175028168836