In [27]:
# importando as bibliotecas
import pandas as pd
import numpy as np
import random
from tqdm.auto import tqdm
import copy
import os
import json
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import v_measure_score
from multiprocessing.dummy import Pool as ThreadPool
import logging

In [28]:
class Node:
    """Node of a binary expression tree representing one GP individual.

    Internal nodes store an operator symbol (a key of ``Node.operators``) and
    have a left and a right child. Leaves store a value whose ``str()`` is a
    valid fragment of the expression built by ``view_expression`` (in this
    notebook, a variable name such as ``'x0'``).

    Attributes:
        value: operator symbol or leaf value.
        left, right: child nodes, or None for a leaf.
        parent: the node this one hangs from, or None for a root.
    """

    def __init__(self, value, left=None, right=None, parent=None):
        self.value = value
        self.left = left
        self.right = right
        # Always define `parent`: depth() walks this attribute, so a node
        # built without it would raise AttributeError.
        self.parent = parent
        # Children passed to the constructor are attached to this node so the
        # parent chain is consistent from the start.
        for child in (left, right):
            if child is not None:
                child.parent = self

    def is_leaf(self):
        """Return True when the node has no children."""
        return self.left is None and self.right is None

    def get_all_nodes(self):
        """Return this node and all its descendants in pre-order (node, left, right)."""
        nodes = [self]
        if self.left is not None:
            nodes.extend(self.left.get_all_nodes())
        if self.right is not None:
            nodes.extend(self.right.get_all_nodes())
        return nodes

    def view_expression(self):
        """Render the subtree as a Python expression string.

        Leaves render as ``str(value)``; internal nodes render as
        ``operators['<op>'](<left>, <right>)``, which is the form later
        evaluated with ``operators`` bound to ``Node.operators``.
        """
        if self.is_leaf():
            return str(self.value)
        left_expr = self.left.view_expression()
        right_expr = self.right.view_expression()
        operator = self.value
        return f"operators['{operator}']({left_expr}, {right_expr})"

    def depth(self):
        """Return the number of edges between this node and the root (root = 0)."""
        depth = 0
        current_node = self
        while current_node.parent is not None:
            current_node = current_node.parent
            depth += 1
        return depth

    def get_subtree_depth(self):
        """Return the height of the subtree rooted here, counted in nodes (leaf = 1)."""
        if self.left is None and self.right is None:
            return 1
        left_depth = self.left.get_subtree_depth() if self.left else 0
        right_depth = self.right.get_subtree_depth() if self.right else 0
        return 1 + max(left_depth, right_depth)

    # Plain function (no `self`): it is only meant to be stored in `operators`
    # below and called through that dictionary or as Node.protected_division.
    def protected_division(x, y):
        """Element-wise ``x / y`` that yields 0 wherever ``y`` is 0."""
        return np.divide(x, y, out=np.zeros_like(x), where=y!=0)

    # Operator symbol -> element-wise implementation used when an expression
    # produced by view_expression() is evaluated.
    operators = {
        '+': np.add,
        '-': np.subtract,
        '*': np.multiply,
        '/': protected_division
    }



In [29]:
# Seed both global generators the notebook draws from: `random` drives tree
# generation, crossover and mutation, while tournament selection uses
# `np.random`. The experiments run in a thread pool that shares these global
# generators, so runs are still not guaranteed to be bit-for-bit repeatable.
random.seed(42)
np.random.seed(42)

# 1. Carregar e tratar os dados

In [30]:
def get_data(file_name, header):
    """Load a dataset's train/test CSV pair and standardize the features.

    Reads ``./data/{file_name}train.csv`` and ``./data/{file_name}test.csv``.
    In both files the last column is used as the label and every other column
    as a feature.

    Args:
        file_name: dataset prefix placed before ``train.csv`` / ``test.csv``.
        header: forwarded to ``pd.read_csv`` as its ``header`` argument.

    Returns:
        ``(train_features, train_labels, test_features, test_labels)``. The
        feature values are the outputs of ``StandardScaler.transform``; the
        labels are the last column of each frame.
    """
    train_frame = pd.read_csv(f'./data/{file_name}train.csv', header=header)
    test_frame = pd.read_csv(f'./data/{file_name}test.csv', header=header)

    train_labels, test_labels = train_frame.iloc[:, -1], test_frame.iloc[:, -1]
    raw_train_features = train_frame.iloc[:, :-1]
    raw_test_features = test_frame.iloc[:, :-1]

    # The scaler is fitted on the training split only and then applied to
    # both splits, so no test statistics enter the normalization.
    scaler = StandardScaler()
    scaler.fit(raw_train_features)
    train_features = scaler.transform(raw_train_features)
    test_features = scaler.transform(raw_test_features)

    return train_features, train_labels, test_features, test_labels

# 2. Modelagem dos Indivíduos e da População

Gera um indivíduo modelado como uma árvore aleatória, usando o método grow.

Inicializa a população gerando uma lista de indivíduos aleatórios.

In [31]:
def generate_random_tree(max_depth: int, current_depth: int, terminals: list, variables: list, parent: Node = None) -> Node:
    """Recursively build a random expression tree.

    A leaf is produced when the depth limit is reached, or with probability
    0.5 once ``current_depth`` is greater than 1; otherwise an operator node
    with two recursively generated children is produced. Nodes at depths 0 and
    1 are therefore always operators unless the depth limit forces a leaf.

    Args:
        max_depth: maximum number of levels; nodes are created at depths
            ``0 .. max_depth - 1``.
        current_depth: depth of the node being created (0 for the root).
        terminals: symbols used for internal nodes. Despite the name, the
            notebook passes the operator symbols here.
        variables: values used for leaves.
        parent: node stored as the new node's ``parent`` (None for a root).

    Returns:
        The root of the generated (sub)tree.
    """
    # `>=` rather than `==`: with `==`, a max_depth below 1 never matches the
    # limit and the recursion has no depth bound. For valid inputs both forms
    # behave identically and draw the same random numbers.
    at_depth_limit = current_depth >= max_depth - 1
    if at_depth_limit or (current_depth > 1 and random.random() > 0.5):
        node = Node(random.choice(variables))
        node.parent = parent
        return node

    node = Node(random.choice(terminals))
    node.parent = parent
    node.left = generate_random_tree(max_depth, current_depth + 1, terminals, variables, parent=node)
    node.right = generate_random_tree(max_depth, current_depth + 1, terminals, variables, parent=node)
    return node

In [32]:
def initialize_population_grow(pop_size:int, max_depth:int, terminals:list, variables:list) -> list:
    """Create the initial population of ``pop_size`` random trees.

    Each individual is built by ``generate_random_tree`` starting at depth 0.
    The individuals are returned wrapped in a NumPy array (note that the
    return annotation says ``list``).
    """
    individuals = []
    for _ in range(pop_size):
        tree = generate_random_tree(max_depth=max_depth, current_depth=0, terminals=terminals, variables=variables)
        individuals.append(tree)
    return np.array(individuals)

# 3. Cálculo do Fitness

TODO: explicar o cálculo da fitness por meio da matriz de distâncias, da V-measure e da clusterização.

In [33]:
def evaluate_fitness(individual, differences, labels):
    """Score an individual by the V-measure of the clustering it induces.

    The individual's expression is evaluated with every key of ``differences``
    available as a variable. The result is passed to agglomerative clustering
    (average linkage) as a precomputed distance matrix, using as many clusters
    as there are distinct values in ``labels``.

    Args:
        individual: tree exposing ``operators`` and ``view_expression()``.
        differences: mapping from variable name to the value bound to it
            during evaluation (see ``compute_differences``).
        labels: ground-truth labels compared with the predicted clusters.

    Returns:
        ``v_measure_score(labels, predicted_cluster_labels)``.
    """
    # Names visible to the evaluated expression: the operator table plus one
    # entry per variable.
    namespace = dict(operators=individual.operators)
    namespace.update(differences)

    # NOTE(review): eval() runs a string built from the tree itself, with
    # builtins disabled. Do not feed it expressions from untrusted sources.
    distances = eval(individual.view_expression(), {"__builtins__": None}, namespace)

    n_classes = len(np.unique(labels))
    model = AgglomerativeClustering(n_clusters=n_classes, metric='precomputed', linkage='average')
    model.fit(distances)

    return v_measure_score(labels, model.labels_)

In [34]:
def compute_differences(features):
    """Build, per feature, the matrix of signed pairwise differences.

    Column ``i`` of ``features`` is named ``'x{i}'``. For each column the
    result holds a square matrix whose entry ``[a, b]`` is the value of row
    ``a`` minus the value of row ``b``.

    Returns:
        dict mapping ``'x0'``, ``'x1'``, ... to the difference matrices.
    """
    names = [f'x{i}' for i in range(features.shape[1])]
    frame = pd.DataFrame(features, columns=names)

    def pairwise(column):
        # Column vector minus row vector broadcasts to the full matrix.
        return column[:, np.newaxis] - column[np.newaxis, :]

    return {name: pairwise(frame[name].values) for name in frame.columns}

In [35]:
def calculate_fitness_population(population, differences, labels):
    """Evaluate every individual and return the scores as a NumPy array.

    The scores are in the same order as ``population``.
    """
    scores = []
    for individual in population:
        scores.append(evaluate_fitness(individual, differences, labels))
    return np.array(scores)

# 4. Operadores Genéticos

In [36]:
def crossover(parent1: Node, parent2: Node, max_depth: int) -> tuple:
    """Subtree crossover that leaves both parents untouched.

    Deep copies of the parents are made, one random node is picked in each
    copy, and the two subtrees rooted at those nodes are exchanged. The swap
    is only performed when both resulting trees respect ``max_depth``;
    otherwise the copies are returned unchanged.

    Args:
        parent1, parent2: root nodes of the parents.
        max_depth: maximum allowed tree height, counted in nodes.

    Returns:
        ``(child1, child2)``, the roots of the two offspring.
    """
    child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)

    crossover_point1 = random.choice(child1.get_all_nodes())
    crossover_point2 = random.choice(child2.get_all_nodes())

    # depth() counts edges from the root and get_subtree_depth() counts
    # nodes, so their sum is the height (in nodes) of the tree after grafting.
    fits_in_child1 = crossover_point1.depth() + crossover_point2.get_subtree_depth() <= max_depth
    fits_in_child2 = crossover_point2.depth() + crossover_point1.get_subtree_depth() <= max_depth

    if fits_in_child1 and fits_in_child2:
        # The subtrees are exchanged by swapping the payload of the two nodes,
        # so the nodes keep their position (and parent) in their own tree.
        crossover_point1.value, crossover_point2.value = crossover_point2.value, crossover_point1.value
        crossover_point1.left, crossover_point2.left = crossover_point2.left, crossover_point1.left
        crossover_point1.right, crossover_point2.right = crossover_point2.right, crossover_point1.right

        # Re-attach the moved children to their new owner. Without this their
        # `parent` still points into the other tree, which makes later depth()
        # calls wrong and makes copy.deepcopy() of one child pull in the other
        # tree through the stale reference.
        for point in (crossover_point1, crossover_point2):
            for moved_child in (point.left, point.right):
                if moved_child is not None:
                    moved_child.parent = point

    return child1, child2


In [37]:
def mutate(individual, terminals, variables, max_depth):
    """Subtree mutation that leaves the input individual untouched.

    A deep copy of ``individual`` is made, one random node is chosen in it and
    the subtree rooted there is replaced by a freshly generated random tree
    sized to keep the whole tree within ``max_depth``.

    Args:
        individual: root node of the tree to mutate.
        terminals: symbols for internal nodes, forwarded to
            ``generate_random_tree``.
        variables: values for leaves, forwarded to ``generate_random_tree``.
        max_depth: maximum allowed tree height.

    Returns:
        The root of the mutated copy.
    """
    mutant = copy.deepcopy(individual)

    mutation_point = random.choice(mutant.get_all_nodes())

    # Levels still available below (and including) the mutation point; at
    # least 1 so a single leaf can always be generated.
    remaining_depth = max(1, max_depth - mutation_point.depth())

    new_subtree = generate_random_tree(max_depth=remaining_depth, current_depth=0, terminals=terminals, variables=variables)

    # Graft by copying the new root's payload into the existing node, which
    # keeps the node's link to its own parent intact.
    mutation_point.value = new_subtree.value
    mutation_point.left = new_subtree.left
    mutation_point.right = new_subtree.right

    # The grafted children were created with `new_subtree` as parent; point
    # them at the node that now owns them so depth() stays correct.
    for grafted_child in (mutation_point.left, mutation_point.right):
        if grafted_child is not None:
            grafted_child.parent = mutation_point

    return mutant


In [38]:
def selection(population, fitness_scores, tournament_size=0):
    """Pick two individuals with two independent tournaments.

    Each tournament draws ``tournament_size`` indices uniformly at random
    (with replacement, via ``np.random.choice``) and its winner is the drawn
    individual with the highest fitness.

    Args:
        population: indexable collection of individuals.
        fitness_scores: fitness of each individual, aligned with
            ``population`` (any array-like).
        tournament_size: number of contestants per tournament; must be at
            least 1. The default of 0 is kept for signature compatibility but
            is not a usable value.

    Returns:
        ``(individual1, fitness1, individual2, fitness2)`` where ``fitnessN``
        is the array with the fitness of every contestant of tournament N
        (the winner's fitness is its maximum).

    Raises:
        ValueError: if ``tournament_size`` is smaller than 1.
    """
    if tournament_size < 1:
        raise ValueError(f"tournament_size must be at least 1, got {tournament_size}")

    # Accept plain lists as well as arrays: indexing with an index array
    # below requires an ndarray.
    fitness_scores = np.asarray(fitness_scores)

    def run_tournament():
        contestants = np.random.choice(len(population), tournament_size)
        contestant_fitness = fitness_scores[contestants]
        winner = contestants[np.argmax(contestant_fitness)]
        return population[winner], contestant_fitness

    individual1, fitness1 = run_tournament()
    individual2, fitness2 = run_tournament()

    return individual1, fitness1, individual2, fitness2

# 5. Algoritmo de GP

TODO: explicar o algoritmo de GP.

In [39]:
def gp_algorithm(pop_size, max_depth, terminals, variables, generations, mutation_rate, crossover_rate, tournament_size, elitism, train_labels, test_labels, train_features, test_features):
    """Run one genetic-programming search and collect its statistics.

    Each generation evaluates the population on the training data, records
    statistics, and builds the next population by tournament selection,
    crossover (probability ``crossover_rate``) and mutation (probability
    ``mutation_rate``, applied to both children at once). With ``elitism`` the
    generation's best individual is carried over unchanged.

    Args:
        pop_size: number of individuals per generation.
        max_depth: maximum tree height.
        terminals: symbols for internal nodes (the operator symbols).
        variables: values for leaves (the feature names).
        generations: number of generations to run.
        mutation_rate, crossover_rate: operator probabilities.
        tournament_size: contestants per selection tournament.
        elitism: whether to keep the generation's best individual.
        train_labels, test_labels: ground-truth labels of each split.
        train_features, test_features: feature matrices of each split.

    Returns:
        dict with:
            ``best_train_v_measure``: highest training fitness seen.
            ``best_test_v_measure``: highest test score obtained by any
                generation's best individual.
            ``best_individual``: expression of the individual that reached
                ``best_train_v_measure`` (None if no generation ran).
            ``count_better`` / ``count_worse``: crossover children whose
                training fitness is >= / < the mean fitness of their parents.
            ``history``: one record of statistics per generation.
    """
    history = []
    best_fitness_train = 0
    best_fitness_test_rep = 0
    count_better = 0
    count_worse = 0
    # Individual that achieved best_fitness_train, so the reported expression
    # matches the reported score even when elitism is off.
    best_overall = None

    population = initialize_population_grow(pop_size, max_depth, terminals, variables)
    train_diff = compute_differences(train_features)
    test_diff = compute_differences(test_features)

    # The progress bar advances once per generation.
    for generation in tqdm(range(generations), desc="Repetitions"):
        population_fitness = calculate_fitness_population(population, train_diff, train_labels)
        best_index = np.argmax(population_fitness)
        best_individual = population[best_index]
        generation_best = population_fitness[best_index]

        if best_overall is None or generation_best > best_fitness_train:
            best_overall = best_individual
        best_fitness_train = max(best_fitness_train, generation_best)

        best_fitness_test = evaluate_fitness(best_individual, test_diff, test_labels)
        best_fitness_test_rep = max(best_fitness_test_rep, best_fitness_test)

        history.append({
            'generation': generation,
            'best_fitness': generation_best,
            'min_fitness': min(population_fitness),
            'average_fitness': np.mean(population_fitness),
            'std_fitness': np.std(population_fitness),
            'test_fitness': best_fitness_test
        })

        new_population = [best_individual] if elitism else []
        while len(new_population) < pop_size:
            parent1, fitness_p1, parent2, fitness_p2 = selection(population, population_fitness, tournament_size)

            if random.random() < crossover_rate:
                child1, child2 = crossover(parent1, parent2, max_depth)

                # selection() returns the fitness of every tournament
                # contestant; the winner (the actual parent) holds the
                # maximum, so compare against the mean of the two maxima.
                parents_mean = np.mean([np.max(fitness_p1), np.max(fitness_p2)])
                for child in (child1, child2):
                    if evaluate_fitness(child, train_diff, train_labels) >= parents_mean:
                        count_better += 1
                    else:
                        count_worse += 1
            else:
                # crossover() already copies its inputs, so a copy is only
                # needed when the parents pass through unchanged.
                child1, child2 = copy.deepcopy(parent1), copy.deepcopy(parent2)

            if random.random() < mutation_rate:
                child1 = mutate(child1, terminals, variables, max_depth)
                child2 = mutate(child2, terminals, variables, max_depth)

            new_population.extend([child1, child2])
        # Children are added in pairs, so trim any overshoot.
        population = new_population[:pop_size]

    return {
        'best_train_v_measure': best_fitness_train,
        'best_test_v_measure': best_fitness_test_rep,
        'best_individual': best_overall.view_expression() if best_overall is not None else None,
        'count_better': count_better,
        'count_worse': count_worse,
        'history': history
    }

# 6. Experimentação

In [40]:
def save_result_as_csv(result, results_dir='./results/other_dataset'):
    """Write one run's result dictionary to a JSON file.

    Despite the function name, the output is JSON. The file is named
    ``history_experiment_{experiment_id}_rep_{repetition}.json`` and any
    existing file with that name is overwritten.

    Args:
        result: JSON-serializable dict containing at least the keys
            ``'experiment_id'`` and ``'repetition'``.
        results_dir: directory to write into; created if missing.
    """
    # exist_ok=True instead of an exists() check followed by makedirs(): this
    # function is called from several threads at once, and the two-step form
    # can raise FileExistsError when two threads race to create the directory.
    os.makedirs(results_dir, exist_ok=True)

    filename = f"history_experiment_{result['experiment_id']}_rep_{result['repetition']}.json"
    filepath = os.path.join(results_dir, filename)

    with open(filepath, 'w') as f:
        json.dump(result, f, indent=4)


In [41]:
def run_experiment(file_name, experiment_id, repetition, pop_size, max_depth, generations, mutation_rate, crossover_rate, tournament_size, elitism, config):
    """Run one repetition of an experiment and save its result.

    Loads the dataset identified by ``file_name`` (CSV files without a header
    row), runs ``gp_algorithm`` with the given parameters, tags the result
    with ``experiment_id``, ``repetition`` and ``config`` and writes it with
    ``save_result_as_csv``.

    Args:
        file_name: dataset prefix passed to ``get_data``.
        experiment_id, repetition: identifiers stored in the result and used
            in the output file name.
        pop_size, max_depth, generations, mutation_rate, crossover_rate,
            tournament_size, elitism: forwarded to ``gp_algorithm``.
        config: stored as-is under the result's ``'config'`` key.
    """
    # Operator symbols used for internal tree nodes (gp_algorithm calls this
    # argument `terminals`).
    terminals = ['+', '-', '*', '/']

    train_features, train_labels, test_features, test_labels = get_data(file_name, header=None)

    # One variable per feature column. shape[1] is used instead of the length
    # of row 1, which needs the dataset to have at least two rows.
    variables = [f'x{i}' for i in range(train_features.shape[1])]

    result = gp_algorithm(pop_size=pop_size, max_depth=max_depth, terminals=terminals, variables=variables, generations=generations, mutation_rate=mutation_rate, crossover_rate=crossover_rate, tournament_size=tournament_size, elitism=elitism, train_labels=train_labels, test_labels=test_labels, train_features=train_features, test_features=test_features)
    result['experiment_id'] = experiment_id
    result['repetition'] = repetition
    result['config'] = config

    save_result_as_csv(result)

In [42]:
def run_experiment_safe(file_name, experiment_id, repetition, population_size, max_depth, num_generation, mutation_prob,
                        crossover_prob, tournament_size, elitism):
    """Run one repetition, logging any failure instead of raising.

    Builds the configuration record stored with the result and delegates to
    ``run_experiment``. Any ``Exception`` is logged together with its
    traceback so that one failing repetition does not abort the others.

    Returns:
        True if the repetition completed, False if it raised.
    """
    # Configuration stored alongside the result; includes every parameter
    # that influences the run.
    run_config = {
        'population_size': population_size,
        'num_generations': num_generation,
        'crossover_prob': crossover_prob,
        'mutation_prob': mutation_prob,
        'tournament_size': tournament_size,
        'experiment_id': experiment_id,
        'repetition': repetition,
        'max_depth': max_depth,
        'elitism': elitism
    }

    try:
        logging.info(f"Starting experiment {experiment_id}, repetition {repetition}")

        run_experiment(file_name, experiment_id, repetition, population_size, max_depth,num_generation, mutation_prob, crossover_prob, tournament_size, elitism, run_config)

        logging.info(f"Completed experiment {experiment_id}, repetition {repetition}")
    except Exception as e:
        # exc_info=True keeps the traceback; the message alone rarely says
        # where the failure happened.
        logging.error(f"Error in experiment {experiment_id}, repetition {repetition}: {e}", exc_info=True)
        return False
    return True

In [43]:
def apply_in_thread_pool(
    num_threads, function, file_name, experiment_id, N_repetition, population_size, max_depth, num_generation, mutation_prob,
                        crossover_prob, tournament_size, elitism
):
    """Run ``function`` once per repetition on a pool of threads.

    ``function`` is called as ``function(file_name, experiment_id,
    repetition, population_size, max_depth, num_generation, mutation_prob,
    crossover_prob, tournament_size, elitism)`` for every ``repetition`` in
    ``range(N_repetition)``.

    Args:
        num_threads: requested number of worker threads; values below 1
            (or None) fall back to a single worker.
        function: callable executed for each repetition.
        N_repetition: number of repetitions to run.
        (remaining arguments are forwarded to ``function`` unchanged)

    Returns:
        List with the return value of each call, ordered by repetition.
    """
    # Callers derive num_threads from os.cpu_count() // 2, which is 0 on a
    # single-core machine; ThreadPool rejects a worker count below 1.
    worker_count = max(1, int(num_threads or 1))

    def run_repetition(repetition):
        return function(file_name, experiment_id, repetition, population_size, max_depth, num_generation, mutation_prob,
                        crossover_prob, tournament_size, elitism)

    with ThreadPool(worker_count) as pool:
        # chunksize=1 hands out one repetition at a time to each worker.
        results = pool.map(run_repetition, range(N_repetition), chunksize=1)

    return results

In [44]:
def test_population(config):
    """Sweep population size and number of generations.

    Iterates over every combination of ``config['population_size']`` and
    ``config['num_generations']`` with fixed operator probabilities and
    tournament size. Each combination gets its own experiment id, starting at
    ``config['experiment_id']`` and increasing by one per combination.

    Args:
        config: dict with the keys ``population_size`` and ``num_generations``
            (iterables), ``max_individual_size``, ``elitism``,
            ``n_repetitions``, ``experiment_id`` and ``file_name``.
    """
    pop_size = config['population_size']
    max_depth = config['max_individual_size']
    num_generations = config['num_generations']
    elitism = config['elitism']
    n_repetitions = config['n_repetitions']
    experiment_id = config['experiment_id']
    file_name = config['file_name']

    # Parameters held fixed during this sweep.
    crossover_prob = 0.9
    mutation_prob = 0.05
    tournament_size = 2

    # Half of the available cores, but never fewer than one worker:
    # os.cpu_count() may return None, and 1 // 2 is 0.
    num_threads = max(1, (os.cpu_count() or 1) // 2)

    for population_size in tqdm(pop_size, desc='Population Size'):
        for num_generation in tqdm(num_generations, desc='Number of Generations'):
            # NOTE(review): only experiment ids 1 and 2 are executed; every
            # other combination is skipped. Confirm whether this filter is
            # still wanted.
            if experiment_id in [1,2]:
                apply_in_thread_pool(num_threads, run_experiment_safe, file_name, experiment_id, n_repetitions, population_size, max_depth, num_generation, mutation_prob, crossover_prob, tournament_size, elitism)
            experiment_id += 1

In [45]:
def test_operators(config):
    """Sweep the crossover/mutation probability pairs.

    Runs one experiment per entry of ``config['operators']`` with a fixed
    tournament size of 2. Experiment ids start at ``config['experiment_id']``
    and increase by one per entry.

    Args:
        config: dict with the keys ``population_size``,
            ``max_individual_size``, ``num_generations``, ``elitism``,
            ``n_repetitions``, ``experiment_id``, ``file_name`` and
            ``operators`` (iterable of dicts with ``'MUTATION'`` and
            ``'CROSSOVER'``).
    """
    pop_size = config['population_size']
    max_depth = config['max_individual_size']
    num_generations = config['num_generations']
    elitism = config['elitism']
    n_repetitions = config['n_repetitions']
    experiment_id = config['experiment_id']
    file_name = config['file_name']

    probabilidades = config['operators']
    tournament_size = 2

    # Half of the available cores, but never fewer than one worker:
    # os.cpu_count() may return None, and 1 // 2 is 0.
    num_threads = max(1, (os.cpu_count() or 1) // 2)

    for prob in tqdm(probabilidades, desc='Operators Prob'):
        apply_in_thread_pool(num_threads, run_experiment_safe, file_name, experiment_id, n_repetitions, pop_size, max_depth, num_generations, prob['MUTATION'], prob['CROSSOVER'], tournament_size, elitism)
        experiment_id += 1

In [46]:
def test_k_selection(config):
    """Sweep the tournament size.

    Runs one experiment per value of ``config['k_torneio']`` with the operator
    probabilities given in ``config['operators']``. Experiment ids start at
    ``config['experiment_id']`` and increase by one per value.

    Args:
        config: dict with the keys ``population_size``,
            ``max_individual_size``, ``num_generations``, ``elitism``,
            ``n_repetitions``, ``experiment_id``, ``file_name``, ``operators``
            (dict with ``'MUTATION'`` and ``'CROSSOVER'``) and ``k_torneio``
            (iterable of tournament sizes).
    """
    pop_size = config['population_size']
    max_depth = config['max_individual_size']
    num_generations = config['num_generations']
    elitism = config['elitism']
    n_repetitions = config['n_repetitions']
    experiment_id = config['experiment_id']
    file_name = config['file_name']

    prob = config['operators']
    tournament_sizes = config['k_torneio']

    # Half of the available cores, but never fewer than one worker:
    # os.cpu_count() may return None, and 1 // 2 is 0.
    num_threads = max(1, (os.cpu_count() or 1) // 2)

    # The progress label previously read 'Operators Prob', copied from
    # test_operators; this loop iterates over tournament sizes.
    for k in tqdm(tournament_sizes, desc='Tournament Size'):
        # NOTE(review): only experiment ids 1 and 2 are executed; every other
        # tournament size is skipped. Confirm whether this filter is still
        # wanted.
        if experiment_id in [1,2]:
            apply_in_thread_pool(num_threads, run_experiment_safe, file_name, experiment_id, n_repetitions, pop_size, max_depth, num_generations, prob['MUTATION'], prob['CROSSOVER'], k, elitism)
        experiment_id += 1

In [47]:
def test_best_params(config):
    """Run a single experiment with one fully specified parameter set.

    Args:
        config: dict with the keys ``population_size``,
            ``max_individual_size``, ``num_generations``, ``elitism``,
            ``n_repetitions``, ``experiment_id``, ``file_name``, ``operators``
            (dict with ``'MUTATION'`` and ``'CROSSOVER'``) and ``k_torneio``
            (the tournament size).
    """
    pop_size = config['population_size']
    max_depth = config['max_individual_size']
    num_generations = config['num_generations']
    elitism = config['elitism']
    n_repetitions = config['n_repetitions']
    experiment_id = config['experiment_id']
    file_name = config['file_name']

    prob = config['operators']
    tournament_size = config['k_torneio']

    # Half of the available cores, but never fewer than one worker:
    # os.cpu_count() may return None, and 1 // 2 is 0.
    num_threads = max(1, (os.cpu_count() or 1) // 2)

    apply_in_thread_pool(num_threads, run_experiment_safe, file_name, experiment_id, n_repetitions, pop_size, max_depth, num_generations, prob['MUTATION'], prob['CROSSOVER'], tournament_size, elitism)


# 7. Rodar Experimentos

In [48]:
# GENETIC ALGORITHM PARAMETERS

# --- Fixed across all experiments ---
TAMANHO_MAXIMO_INDIVIDUO = 7   # maximum tree height of an individual
ELITISMO = True                # carry each generation's best individual over
REPETICOES = 10                # repetitions per experiment
# Operator symbols. Not referenced by the functions above: run_experiment
# builds its own identical list.
TERMINAIS = ['+', '-', '*', '/']

# --- Values swept by the experiments ---
TAMAHO_POPULACAO = [30, 50, 100, 250]
NUMERO_GERACOES = [30, 50, 100, 250]
K_TORNEIO = [2, 3, 5, 7]
PROB_OPERADORES = [
    {'CROSSOVER': 0.9, 'MUTATION': 0.05},
    {'CROSSOVER': 0.6, 'MUTATION': 0.3},
]

# DATASET PARAMETERS
# File-name prefixes; get_data appends 'train.csv' / 'test.csv'.
WINE_FILE_NAME = 'wineRed-'
BREAST_CANCER_FILE_NAME = 'breast_cancer_coimbra_'

### 7.1 Teste População

In [49]:
# experiment_id = 1

# config = {
#     'file_name': BREAST_CANCER_FILE_NAME,
#     'population_size': TAMAHO_POPULACAO,
#     'max_individual_size': TAMANHO_MAXIMO_INDIVIDUO,
#     'num_generations': NUMERO_GERACOES,
#     'elitism': ELITISMO,
#     'n_repetitions': REPETICOES,
#     'experiment_id': experiment_id
# }

# test_population(config)

In [50]:
# experiment_id = 1

# config = {
#     'file_name': BREAST_CANCER_FILE_NAME,
#     'population_size': 250,
#     'max_individual_size': TAMANHO_MAXIMO_INDIVIDUO,
#     'num_generations': 100,
#     'elitism': ELITISMO,
#     'n_repetitions': REPETICOES,
#     'experiment_id': experiment_id,
#     'operators': PROB_OPERADORES
# }

# test_operators(config)

In [51]:
# experiment_id = 1

# config = {
#     'file_name': BREAST_CANCER_FILE_NAME,
#     'population_size': 250,
#     'max_individual_size': TAMANHO_MAXIMO_INDIVIDUO,
#     'num_generations': 100,
#     'elitism': ELITISMO,
#     'n_repetitions': REPETICOES,
#     'experiment_id': experiment_id,
#     'operators': {'CROSSOVER': 0.6, 'MUTATION': 0.3},
#     'k_torneio': K_TORNEIO
# }

# test_k_selection(config)

# Melhores Parametros em outra base de dados

In [52]:
# Final run on the wine dataset with a single parameter set: population 250,
# 100 generations, crossover 0.6 / mutation 0.3 and tournament size 5.
# test_best_params runs REPETICOES repetitions in a thread pool; each
# repetition writes one JSON file under ./results/other_dataset.
experiment_id = 1

config = {
    'file_name': WINE_FILE_NAME,
    'population_size': 250,
    'max_individual_size': TAMANHO_MAXIMO_INDIVIDUO,
    'num_generations': 100,
    'elitism': ELITISMO,
    'n_repetitions': REPETICOES,
    'experiment_id': experiment_id,
    'operators': {'CROSSOVER': 0.6, 'MUTATION': 0.3},
    'k_torneio': 5
}

test_best_params(config)

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]

Repetitions:   0%|          | 0/100 [00:00<?, ?it/s]