In [1]:

import os
import csv
import torch
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import random
from gpolnel.utils.datasets import load_boston
from torch.utils.data import TensorDataset, DataLoader

from gpolnel.problems.inductive_programming import SML
from gpolnel.utils.utils import train_test_split
from gpolnel.utils.ffunctions import Ffunctions
from gpolnel.utils.inductive_programming import function_map
from gpolnel.algorithms.genetic_algorithm import GeneticAlgorithm
from gpolnel.operators.initializers import grow, prm_grow, ERC
from gpolnel.operators.variators import swap_xo, prm_subtree_mtn
from gpolnel.operators.selectors import prm_tournament, roulette_wheel, rank_selection
from gpolnel.utils.inductive_programming import _execute_tree
seed=1
random.seed(seed)
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
#_evaluate_individual_ffunction

In [2]:
from sklearn.metrics import mean_squared_error

def evaluate_solution( mheuristic, X_test, y_test):
    return mean_squared_error(_execute_tree(mheuristic.best_sol.repr_, X_test), y_test)

In [3]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'cpu'

In [4]:
X_train = torch.tensor(pd.read_csv('datamart/X_train_clipped.csv').values)
X_test = torch.tensor(pd.read_csv('datamart/X_test_clipped.csv').values)
y_train = torch.tensor(pd.read_csv('datamart/y_lactose_train.csv')['lactose_percent'].values)
y_test = torch.tensor(pd.read_csv('datamart/y_lactose_test.csv')['lactose_percent'].values)

In [5]:
fset = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]#function_map['sub'],
total_batches = 1
batch_size = X_train.shape[0]
sspace_sml = {
    'n_dims': X_train.shape[1],
    'function_set': fset, 'constant_set': ERC(-5., 5.),
    'p_constants': 0.1,
    'max_init_depth': 3,
    'max_depth': 5, 
    'n_batches': total_batches,
    'device': device
}

ps = 50
selection_pressure = .1
mutation_prob = .7
xo_prob = .9
has_elitism = True
allow_reproduction = False




In [6]:
shuffle= True
total_batches = 1
batch_size = X_train.shape[0]
# Creates training and validatation data sets
ds_train = TensorDataset(X_train, y_train)
ds_test = TensorDataset(X_test, y_test)

# Creates training and test data loaders
dl_train = DataLoader(ds_train, batch_size, shuffle)
dl_test = DataLoader(ds_test, batch_size, shuffle)

In [7]:
def train_model(dl_train, dl_val, ps, sspace_sml, log_path, selection_pressure, mutation_prob, xo_prob, has_elitism, allow_reproduction, seed=1, n_iter=3000, device='cpu'):

    pi_sml = SML(
        sspace=sspace_sml,
        ffunction=Ffunctions('rmse'),
        dl_train=dl_train, dl_test=dl_val,  # For the algorithm, the unseen is our validation!
        n_jobs=8
    )

    mheuristic = GeneticAlgorithm(
        pi=pi_sml,
        initializer=grow,
        selector=prm_tournament(pressure=selection_pressure),  #prm_tournament(pressure=selection_pressure)
        crossover=swap_xo,
        mutator=prm_subtree_mtn(initializer=prm_grow(sspace_sml)),
        pop_size=ps,
        p_m=mutation_prob,
        p_c=xo_prob,
        elitism=has_elitism,
        reproduction=allow_reproduction,  # False = or xo or mutation
        device=device,
        seed=seed
    )
    mheuristic._initialize()

    mheuristic.solve(
        n_iter,
        verbose=0, log=3, log_path=log_path,
        test_elite=True
    )
    return mheuristic

In [8]:
m = train_model(dl_train = dl_train, dl_val = dl_test, ps=ps, sspace_sml = sspace_sml, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=3, device='cpu')
m.best_sol

<gpolnel.utils.tree.Tree at 0x21c8043f510>

In [9]:
def gp_cross_validation(X_train, y_train, kf, ps, sspace_sml, log_path,  selection_pressure, mutation_prob, xo_prob, has_elitism, allow_reproduction, seed, n_iter, device):
    results = []
    feature_names = pd.read_csv('datamart/X_train_clipped_scaled.csv').columns
    for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
        ds_train = TensorDataset(X_train[train_index], y_train[train_index])
        ds_val = TensorDataset(X_train[val_index], y_train[val_index])

        # Creates training and test data loaders
        dl_train = DataLoader(ds_train, batch_size, shuffle)
        dl_val = DataLoader(ds_val, batch_size, shuffle)

        m = train_model(dl_train = dl_train, dl_val = dl_val, ps=ps, sspace_sml = sspace_sml, log_path=log_path,  selection_pressure=selection_pressure, mutation_prob=mutation_prob, xo_prob=xo_prob, has_elitism=has_elitism, allow_reproduction=allow_reproduction, seed=seed, n_iter=n_iter, device=device)
        mse = evaluate_solution(m, X_train[val_index], y_train[val_index])
        results.append(mse)
        print('tree:', m.pop.individuals[0].printTree(feature_names=feature_names))
        
    avg_mse = np.mean(results)
    print('cv_score:', avg_mse)
    
    return avg_mse 

In [10]:
kf = RepeatedKFold(n_splits=5, random_state=seed, n_repeats=2)
gp_cross_validation(X_train, y_train, kf, ps=ps, sspace_sml = sspace_sml, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=2, device='cpu')

mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
mul( dim, div( errors_by_100_milkings, mul( milkings_day, milkings_day ) ) )
tree: None
cv_score: 0.39562115557761046


0.39562115557761046

In [11]:
fsets = [[function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]] #,  function_map['mean']]
constant_sets = [ERC(-1., 1.), ERC(-5., 5.), ERC(-10., 10.)]
pss = [50, 100, 250, 500, 1000, 5000]
p_constantss = [0.05, 0.1, 0.2, 0.3]
max_init_depths = [2, 3, 4, 5, 6]
max_depths = [5, 6, 7, 8, 9]
selection_pressures = [0.05, 0.1, 0.2, 0.3, 0.4]
mutatuion_probs = [0.05, 0.1, 0.15, 0.2, 0.5]
xo_probs = [0.05, 0.1, 0.15, 0.2, 0.5]
has_elitisms = [True, False]
allow_reproductions = [True, False]
n_iters = [50, 100, 250, 500, 1000, 5000]



In [12]:
# name = 'logs/GP/' + 'random_search' + '.csv'
# with open(name, 'w', newline='\n') as csvfile:
#     w = csv.writer(csvfile, delimiter=';')
#     w.writerow(['id']+['model']+['optimizer']+['learning_rate']+['l2']+['batch_size']+['n_epochs']+['cv_score'])

kf = RepeatedKFold(n_splits=5, random_state=seed, n_repeats=1)
for i in range(1000):

    total_batches = 1
    batch_size = X_train.shape[0]

    fset = random.choice(fsets)
    
    print(fset)
    constant_set = random.choice(constant_sets)
    ps = random.choice(pss)
    p_constants = random.choice(p_constantss)
    max_init_depth = random.choice(max_init_depths)
    max_depth = random.choice(max_depths)
    selection_pressure = random.choice(selection_pressures)
    mutation_prob = random.choice(mutatuion_probs)
    xo_prob = random.choice(xo_probs)
    has_elitism = random.choice(has_elitisms)
    allow_reproduction = random.choice(allow_reproductions)
    n_iter = random.choice(n_iters)


    sspace_sml = {
        'n_dims': X_train.shape[1],
        'function_set': fset, 'constant_set': constant_set,
        'p_constants': p_constants,
        'max_init_depth': max_init_depth,
        'max_depth': max_depth, 
        'n_batches': total_batches,
        'device': device
    }
    score = gp_cross_validation(X_train, y_train, kf, ps=ps, sspace_sml = sspace_sml, log_path='abc.csv',  selection_pressure=selection_pressure, mutation_prob=mutation_prob, xo_prob=xo_prob, has_elitism=has_elitism, allow_reproduction=allow_reproduction, seed=1, n_iter=n_iter, device='cpu')
    print(score)
    # name = 'logs/NN/' + 'random_search' + '.csv'
    # with open(name, 'a', newline='\n') as csvfile:
    #     w = csv.writer(csvfile, delimiter=';')
    #     w.writerow([i]+[f'{model}']+[f'{optimizer_name}']+[lr]+[weight_decay]+[batch_size]+[n_epochs]+[score])
    
    # if score < best_score:
    #     best_score = score
    #     print('Iteration '+ str(i))
    #     print(f'New best validation score: {best_score}')
        
    #     nn_train(X_train = X_train, y_train= y_train, model=model, loss_fn = loss_fn, optimizer_name = optimizer_name, lr=lr, weight_decay=weight_decay, n_epochs=n_epochs, batch_size=batch_size, log=True, id=i)
    #     model = pickle.load(open('models/NN/' + str(i) + '_NN.pkl', 'rb'))
    #     y_test_pred = model(X_test)
    #     test_score = loss_fn(y_test_pred, y_test)
    #     test_score = float(test_score)
    #     print(f'New best test score: {test_score}')

[add, mul, div, sub]
sub( forage_kg_day, -0.1496 )
tree: None
add( 0.2646, forage_kg_day )
tree: None
sub( forage_kg_day, -0.1496 )
tree: None
sub( 0.8964, watery_by_100_milkings )
tree: None
sub( forage_kg_day, -0.1496 )
tree: None
cv_score: 0.36047836953907375
0.36047836953907375
[add, mul, div, sub]
forage_kg_day
tree: None
forage_kg_day
tree: None
forage_kg_day
tree: None
forage_kg_day
tree: None
forage_kg_day
tree: None
cv_score: 0.3956049942947952
0.3956049942947952
[add, mul, div, sub]


KeyboardInterrupt: 

In [None]:
fset = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]#function_map['sub'],

sspace_sml = {
    'n_dims': X_train.shape[1],
    'function_set': fset, 'constant_set': ERC(-5., 5.),
    'p_constants': 0.1,
    'max_init_depth': 3,
    'max_depth': 5, 
    'n_batches': total_batches,
    'device': device
}


In [None]:
pi_sml = SML(
    sspace=sspace_sml,
    ffunction=Ffunctions('rmse'),
    dl_train=dl_train, dl_test=dl_test,  # For the algorithm, the unseen is our validation!
    n_jobs=8
)
ps = 500
selection_pressure = .1
mutation_prob = .7
xo_prob = .9
has_elitism = True
allow_reproduction = False

mheuristic = GeneticAlgorithm(
    pi=pi_sml,
    initializer=grow,
    selector=prm_tournament(pressure=selection_pressure),  #prm_tournament(pressure=selection_pressure)
    crossover=swap_xo,
    mutator=prm_subtree_mtn(initializer=prm_grow(sspace_sml)),
    pop_size=ps,
    p_m=mutation_prob,
    p_c=xo_prob,
    elitism=has_elitism,
    reproduction=allow_reproduction,  # False = or xo or mutation
    device=device,
    seed=seed
)
mheuristic._initialize()

3 tensor([1.7866e+04, 6.2676e+03, 5.5587e+03, 6.3908e+03, 5.8879e+03, 3.5797e+05,
        1.5751e+04, 5.8722e+03, 6.8674e+03, 1.0401e+04, 1.2243e+05, 4.1142e+03,
        5.6648e+03, 3.2551e+05, 5.3308e+05, 1.0468e+04, 3.7554e+02, 1.0392e+04,
        6.0728e+03, 5.8644e+03, 5.8665e+03, 1.1745e+04, 9.2719e+02, 5.4659e+03,
        9.2053e+05, 5.4766e+02, 5.7028e+03, 9.4812e+03, 2.5464e+06, 5.4935e+03,
        2.5289e+06, 5.9415e+03, 7.8909e+04, 5.8547e+03, 5.4989e+03, 5.9656e+03,
        5.4242e+03, 4.7607e+04, 6.2946e+03, 5.9222e+03, 6.0464e+03, 1.0416e+06,
        1.1798e+05, 5.8358e+03, 6.1727e+03, 6.3018e+03, 6.3118e+03, 3.8084e+03,
        5.9332e+04, 5.2051e+03, 1.6951e+05, 6.2374e+03, 4.2517e+04, 6.2453e+03,
        6.9202e+03, 3.0151e+03, 9.5094e+03, 5.8714e+03, 9.0670e+03, 8.0333e+03,
        4.0109e+03, 3.7152e+04, 5.7869e+03, 5.8154e+03, 5.4039e+03, 2.5486e+06,
        4.5402e+04, 3.6496e+03, 3.6013e+06, 6.1619e+05, 5.3170e+03, 5.9481e+03,
        8.4980e+03, 5.6423e+03, 6.6597

In [None]:
rs = RandomSearch(pi_sml, prm_grow, device=device, seed=seed)
rs._initialize()

NameError: name 'RandomSearch' is not defined

In [None]:
# Log settings
file_dir = './log/'
file_name = 'intro.log'
log_path = file_dir + file_name
if os.path.exists(file_dir + file_name):
    os.remove(file_dir + file_name)
if not os.path.exists(file_dir):
    os.makedirs(file_dir)

# Learning
n_iter = 3000

mheuristic.solve(
    n_iter,
    verbose=3, log=3, log_path=log_path,
    test_elite=True
)

PermissionError: [WinError 32] Der Prozess kann nicht auf die Datei zugreifen, da sie von einem anderen Prozess verwendet wird: './log/intro.log'

In [None]:
pi_sml._evaluate_sol(mheuristic.best_sol, dl_val)

tensor(1.0317, dtype=torch.float64)

In [None]:
y_pred = _execute_tree(mheuristic.best_sol.repr_, X_test)

tensor([4.5729, 5.3822, 3.6637, 5.7050, 3.6637, 3.6637, 3.6637, 3.6637, 5.7050,
        5.7548, 3.6637, 5.7742, 3.6637, 3.6637, 5.7823, 3.6637, 3.6637, 4.9609,
        3.6637, 5.0766, 5.8608, 6.3908, 5.7080, 3.6637, 5.1885, 5.9177, 5.3403,
        4.8997, 3.6637, 6.1859, 5.8241, 3.6637, 3.6637, 5.9738, 5.7660, 5.3511,
        3.6637, 3.6637, 3.6637, 3.6637, 5.9516, 3.6637, 5.5144, 3.6637, 3.6637,
        5.1677, 5.7110, 5.1161, 4.8997, 3.6637, 3.6637, 5.5543, 4.9207, 3.6637,
        3.6637, 5.1237, 3.6637, 3.6637, 4.9706, 4.8667, 3.6637, 3.6637, 4.4982,
        3.6637, 6.0362, 3.6637, 3.6637, 6.0864, 5.2411, 6.0344, 3.6637, 3.6637,
        6.0540, 3.6637, 3.6637, 5.1461, 6.0627, 3.6637, 5.9658, 5.3872, 3.6637,
        5.7019, 6.3908, 3.6637, 4.8997, 5.0517, 3.6637, 6.1673, 3.6637, 5.8062,
        5.0079, 5.4849, 5.2956, 3.6637, 5.6833, 5.6409, 3.6637, 3.6637],
       dtype=torch.float64)

In [None]:
from sklearn.metrics import mean_squared_error

def evaluate_solution( mheuristic, X_test, y_test):
    return mean_squared_error(_execute_tree(mheuristic.best_sol.repr_, X_test), y_test)

1.0644175028168836