In [1]:

import os
import csv
import torch
import pandas as pd
import matplotlib.pyplot as plt
import pandas as pd
import random
from gpolnel.utils.datasets import load_boston
from torch.utils.data import TensorDataset, DataLoader
import pickle
from gpolnel.problems.inductive_programming import SML
from gpolnel.utils.utils import train_test_split
from gpolnel.utils.ffunctions import Ffunctions
from gpolnel.utils.inductive_programming import function_map
from gpolnel.algorithms.genetic_algorithm import GeneticAlgorithm
from gpolnel.operators.initializers import grow, prm_grow, ERC
from gpolnel.operators.variators import swap_xo, prm_subtree_mtn
from gpolnel.operators.selectors import prm_tournament, roulette_wheel, rank_selection
from gpolnel.utils.inductive_programming import _execute_tree
seed=1
random.seed(seed)
import numpy as np
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import KFold
from sklearn.metrics import mean_squared_error

#_evaluate_individual_ffunction

In [2]:
from sklearn.metrics import mean_squared_error

def evaluate_solution( mheuristic, X_test, y_test):
    return mean_squared_error(_execute_tree(mheuristic.best_sol.repr_, X_test), y_test)

In [3]:
device = 'mps' if torch.backends.mps.is_available() else 'cpu'
device

'cpu'

In [4]:
X_train = torch.tensor(pd.read_csv('datamart/X_train_clipped.csv').values)
X_test = torch.tensor(pd.read_csv('datamart/X_test_clipped.csv').values)
y_train = torch.tensor(pd.read_csv('datamart/y_lactose_train.csv')['lactose_percent'].values)
y_test = torch.tensor(pd.read_csv('datamart/y_lactose_test.csv')['lactose_percent'].values)

In [5]:
fset = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub']]#function_map['sub'],
total_batches = 1
batch_size = X_train.shape[0]
sspace_sml = {
    'n_dims': X_train.shape[1],
    'function_set': fset, 'constant_set': ERC(-5., 5.),
    'p_constants': 0.1,
    'max_init_depth': 3,
    'max_depth': 5, 
    'n_batches': total_batches,
    'device': device
}

ps = 50
selection_pressure = .1
mutation_prob = .7
xo_prob = .9
has_elitism = True
allow_reproduction = False
ffunction = Ffunctions('rmse')



In [6]:
shuffle= True
total_batches = 1
batch_size = X_train.shape[0]
# Creates training and validatation data sets
ds_train = TensorDataset(X_train, y_train)
ds_test = TensorDataset(X_test, y_test)

# Creates training and test data loaders
dl_train = DataLoader(ds_train, batch_size, shuffle)
dl_test = DataLoader(ds_test, batch_size, shuffle)

In [7]:
def gp_train(dl_train, dl_val, ps, sspace_sml, ffunction, log_path, selection_pressure, mutation_prob, xo_prob, has_elitism, allow_reproduction, seed=1, n_iter=3000, device='cpu',save_model=False, id=None):

    pi_sml = SML(
        sspace=sspace_sml,
        ffunction=ffunction,
        dl_train=dl_train, dl_test=dl_val,  # For the algorithm, the unseen is our validation!
        n_jobs=8
    )

    mheuristic = GeneticAlgorithm(
        pi=pi_sml,
        initializer=grow,
        selector=prm_tournament(pressure=selection_pressure),  #prm_tournament(pressure=selection_pressure)
        crossover=swap_xo,
        mutator=prm_subtree_mtn(initializer=prm_grow(sspace_sml)),
        pop_size=ps,
        p_m=mutation_prob,
        p_c=xo_prob,
        elitism=has_elitism,
        reproduction=allow_reproduction,  # False = or xo or mutation
        device=device,
        seed=seed
    )
    mheuristic._initialize()

    mheuristic.solve(
        n_iter,
        verbose=0, log=0, log_path=log_path,
        test_elite=True
    )
    
    if save_model:    
        with open('models/GP/'+str(id)+'_GP.pkl', 'wb') as f:
            pickle.dump(mheuristic, f)
    
    return mheuristic

In [8]:
m = gp_train(dl_train = dl_train, dl_val = dl_test, ps=ps, sspace_sml = sspace_sml, ffunction=ffunction, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=3, device='cpu')
m.best_sol.printTree()

x_2


In [9]:
def gp_cross_validation(X_train, y_train, kf, ps, sspace_sml, ffunction, log_path,  selection_pressure, mutation_prob, xo_prob, has_elitism, allow_reproduction, seed, n_iter, device, id=None, log=False):
    results = []
    feature_names = pd.read_csv('datamart/X_train_clipped_scaled.csv').columns
    
    if log:
        name = 'logs/GP/' + f'{id}' + '_random_search.csv'
        with open(name, 'w', newline='\n') as csvfile:
            w = csv.writer(csvfile, delimiter=';')
            w.writerow(['id']+['fold']+['score'])
            
    for fold, (train_index, val_index) in enumerate(kf.split(X_train)):
        ds_train = TensorDataset(X_train[train_index], y_train[train_index])
        ds_val = TensorDataset(X_train[val_index], y_train[val_index])

        # Creates training and test data loaders
        dl_train = DataLoader(ds_train, batch_size, shuffle)
        dl_val = DataLoader(ds_val, batch_size, shuffle)

        m = train_model(dl_train = dl_train, dl_val = dl_val, ps=ps, sspace_sml = sspace_sml, ffunction=ffunction, log_path=log_path,  selection_pressure=selection_pressure, mutation_prob=mutation_prob, xo_prob=xo_prob, has_elitism=has_elitism, allow_reproduction=allow_reproduction, seed=seed, n_iter=n_iter, device=device)
        mse = evaluate_solution(m, X_train[val_index], y_train[val_index])
        results.append(mse)
        m.best_sol.printTree(feature_names=feature_names)
        
    avg_mse = np.mean(results)
    print('cv_score:', avg_mse)
    
    return avg_mse 

In [10]:
kf = RepeatedKFold(n_splits=5, random_state=seed, n_repeats=2)
gp_cross_validation(X_train, y_train, kf, ps=ps, sspace_sml = sspace_sml, ffunction = ffunction, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=2, device='cpu')

forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
cv_score: 0.39562115557761046


0.39562115557761046

In [12]:
fsets = [function_map['add'],  function_map['mul'], function_map['div'],function_map['sub'], function_map['mean']] #,  function_map['mean']]
constant_sets = [ERC(-1., 1.), ERC(-5., 5.), ERC(-10., 10.)]
pss = [50, 100, 250, 500, 1000]
p_constantss = [0.05, 0.1, 0.2, 0.3]
max_init_depths = [2, 3, 4, 5, 6]
max_depths = [5, 6, 7, 8, 9]
selection_pressures = [0.05, 0.1, 0.2, 0.3, 0.4]
mutatuion_probs = [0.05, 0.1, 0.15, 0.2, 0.5]
xo_probs = [0.05, 0.1, 0.15, 0.2, 0.5]
has_elitisms = [True, False]
allow_reproductions = [True, False]
n_iters = [50, 100, 250, 500, 1000]
ffunction = Ffunctions('rmse')
kf = RepeatedKFold(n_splits=5, random_state=seed, n_repeats=1)

best_score = np.inf

name = 'logs/GP/' + 'random_search_total' + '.csv'
with open(name, 'w', newline='\n') as csvfile:
    w = csv.writer(csvfile, delimiter=';')
    w.writerow(['id']+['fset']+['constant_set']+['ps']+['p_constants']+['max_init_depth']+['max_depth']+['selection_pressure']+['mutation_prob']+['xo_prob']+['has_elitism']+['allow_reproduction']+['n_iter']+['score'])


for i in range(200):

    total_batches = 1
    batch_size = X_train.shape[0]
    
    fset = random.sample(fsets, random.randint(1, len(fsets)))
    print(fset)
    constant_set = random.choice(constant_sets)
    ps = random.choice(pss)
    p_constants = random.choice(p_constantss)
    max_init_depth = random.choice(max_init_depths)
    max_depth = random.choice(max_depths)
    selection_pressure = random.choice(selection_pressures)
    mutation_prob = random.choice(mutatuion_probs)
    xo_prob = random.choice(xo_probs)
    has_elitism = random.choice(has_elitisms)
    allow_reproduction = random.choice(allow_reproductions)
    n_iter = random.choice(n_iters)


    sspace_sml = {
        'n_dims': X_train.shape[1],
        'function_set': fset, 'constant_set': constant_set,
        'p_constants': p_constants,
        'max_init_depth': max_init_depth,
        'max_depth': max_depth, 
        'n_batches': total_batches,
        'device': device
    }
    score = gp_cross_validation(X_train, y_train, kf, ps=ps, sspace_sml = sspace_sml, ffunction=ffunction, log_path='abc.csv',  selection_pressure=selection_pressure, mutation_prob=mutation_prob, xo_prob=xo_prob, has_elitism=has_elitism, allow_reproduction=allow_reproduction, seed=1, n_iter=n_iter, device='cpu')
    print(score)
    
    
    name = 'logs/GP/' + 'random_search_total' + '.csv'
    with open(name, 'a', newline='\n') as csvfile:
        w = csv.writer(csvfile, delimiter=';')
        w.writerow([i]+[fset]+[constant_set]+[ps]+[p_constants]+[max_init_depth]+[max_depth]+[selection_pressure]+[mutation_prob]+[xo_prob]+[has_elitism]+[allow_reproduction]+[n_iter]+[score])
    
    if score < best_score:
        best_score = score
        print('Iteration '+ str(i))
        print(f'New best validation score: {best_score}')
        
        gp_train(dl_train = dl_train, dl_val = dl_test, ps=ps, sspace_sml = sspace_sml, ffunction=ffunction, log_path='abc.csv',  selection_pressure=0.05, mutation_prob=0.2, xo_prob=0.8, has_elitism=True, allow_reproduction=True, seed=1, n_iter=3, device='cpu', log=True, id=i)
        model = pickle.load(open('models/NN/' + str(i) + '_NN.pkl', 'rb'))
        y_test_pred = model(X_test)
        test_score = loss_fn(y_test_pred, y_test)
        test_score = float(test_score)
        print(f'New best test score: {test_score}')

[add, div, sub]
sub( refusals_by_milking, -4.4250 )
forage_kg_day
forage_kg_day
forage_kg_day
forage_kg_day
cv_score: 0.32519392122003526
0.32519392122003526


NameError: name 'model' is not defined