In [3]:
# importando as bibliotecas
import numpy as np
import pandas as pd
import random
from tqdm.auto import tqdm
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import AgglomerativeClustering
from sklearn.metrics import v_measure_score

In [5]:
# importando os modulos definidos
from src.Node import Node

# Carregamento de dados

In [6]:
def load_dataset(dataset_name, header=None):
    train_path = f'./data/{dataset_name}train.csv'
    test_path = f'./data/{dataset_name}test.csv'

    train_data = pd.read_csv(filepath_or_buffer=train_path, header=header)
    test_data = pd.read_csv(filepath_or_buffer=test_path, header=header)
    
    return train_data, test_data

In [7]:
breast_cancer_train_data, breast_cancer_test_data = load_dataset('breast_cancer_coimbra_', header=0)
print(breast_cancer_train_data.shape, breast_cancer_test_data.shape)

(92, 10) (24, 10)


In [8]:
breast_cancer_train_data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,44,20.76,86,7.553,1.6,14.09,20.32,7.64,63.61,1
1,46,20.83,88,3.42,0.742368,12.87,18.55,13.56,301.21,2
2,53,36.790166,101,10.175,2.534932,27.1841,20.03,10.26309,695.754,1
3,54,30.483158,90,5.537,1.229214,12.331,9.73138,10.19299,1227.91,1
4,34,24.242424,92,21.699,4.924226,16.7353,21.823745,12.06534,481.949,2


* 0 - fixed acidity (tartaric acid - g / dm^3)
* 1 - volatile acidity (acetic acid - g / dm^3)
* 2 - citric acid (g / dm^3)
* 3 - residual sugar (g / dm^3)
* 4 - chlorides (sodium chloride - g / dm^3)
* 5 - free sulfur dioxide (mg / dm^3)
* 6 - total sulfur dioxide (mg / dm^3)
* 7 - density (g / cm^3)
* 8 - pH
* 9 - sulphates (potassium sulphate - g / dm3)
* 10 - alcohol (% by volume)
* 11 - quality (score between 0 and 10) - output variable

In [9]:
wine_train_data, wine_test_data = load_dataset('wineRed-', header=None)
print(wine_train_data.shape, wine_test_data.shape)

(1279, 12) (320, 12)


In [10]:
wine_train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,7.4,0.64,0.17,5.4,0.168,52.0,98.0,0.99736,3.28,0.5,9.5,5
1,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.999,3.17,0.85,12.0,7
2,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6,5
3,8.9,0.635,0.37,1.7,0.263,5.0,62.0,0.9971,3.0,1.09,9.3,5
4,7.8,0.57,0.09,2.3,0.065,34.0,45.0,0.99417,3.46,0.74,12.7,8


In [11]:
wine_train_data = wine_train_data[:-1]
wine_train_labels = wine_train_data.iloc[:, -1]

wine_test_data = wine_test_data[:-1]
wine_test_labels = wine_test_data.iloc[:, -1]

### Transformação de dados

In [12]:
def normalize(train_data, test_data):
    scaler = StandardScaler()
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)

In [13]:
wine_train_data_normalized,wine_test_data_normalized = normalize(wine_train_data,wine_test_data)
wine_train_data_normalized

array([[-0.5388674 ,  0.64743164, -0.528528  , ..., -0.92014031,
        -0.85311605, -0.78179359],
       [ 1.19074131, -0.48022442,  2.33770613, ...,  1.08316347,
         1.50847219,  1.69744267],
       [ 1.36370218, -0.53660723,  0.59749255, ..., -0.92014031,
        -0.75865252, -0.78179359],
       ...,
       [ 0.61420507, -0.64937283,  1.05813732, ...,  0.73973996,
         1.69739925,  1.69744267],
       [-0.82713551,  0.08360361, -1.19390378, ...,  1.42658697,
        -0.94757958,  0.45782454],
       [ 0.32593696,  0.47828323,  1.10932007, ...,  2.05619674,
        -1.04204311, -0.78179359]])

# Modelagem dos indivíduos

In [14]:
TAMANHO_MAXIMO_INDIVIDUO = 7
TERMINAIS = ['+', '-', '*', '/']

In [15]:
def generate_random_tree(max_depth: int, terminals: list, variables:list, method:str='grow') -> Node:
    if max_depth == 0:
        value = random.choice(variables)
        return Node(value)
    
    #max_depth -= 1
    
    if method == 'full':
        # Sempre gera operadores até a profundidade máxima
        op = random.choice(terminals)
        left_subtree = generate_random_tree(max_depth - 1, terminals, variables, method='full')
        right_subtree = generate_random_tree(max_depth - 1, terminals, variables, method='full')
        return Node(op, left_subtree, right_subtree)
    
    elif method == 'grow':
        # Decide aleatoriamente entre operador e folha
        if random.random() < 0.5:
            # Gera operador
            op = random.choice(terminals)
            left_subtree = generate_random_tree(max_depth - 1, terminals, variables, method='grow')
            right_subtree = generate_random_tree(max_depth - 1, terminals, variables, method='grow')
            return Node(op, left_subtree, right_subtree)
        else:
            value = random.choice(variables)
            return Node(value)

In [16]:
variables = [f'x{i}' for i in range(wine_train_data_normalized.shape[1])]

# Gerando uma árvore aleatória
tree1 = generate_random_tree(max_depth=TAMANHO_MAXIMO_INDIVIDUO, terminals=TERMINAIS, variables=variables, method='full')
tree2 = generate_random_tree(max_depth=TAMANHO_MAXIMO_INDIVIDUO, terminals=TERMINAIS, variables=variables, method='full')

In [17]:
tree1.view_tree()

In [39]:
tree1.depth()

8

### Modelagem da população

In [15]:
def initialize_population_ramped_half_and_half(pop_size:int, min_depth:int, max_depth:int, terminals:list, variables:list) -> list:

    population = []
    num_depths = max_depth - min_depth + 1
    individuals_per_depth = pop_size // num_depths
    remaining = pop_size % num_depths
    
    for depth in range(min_depth, max_depth + 1):
        num_individuals = individuals_per_depth
        if remaining > 0:
            num_individuals += 1
            remaining -= 1

        half_full = num_individuals // 2
        half_grow = num_individuals - half_full

        for _ in range(half_full):
            tree = generate_random_tree(max_depth=depth, terminals=terminals, variables=variables, method='full')
            population.append(tree)

        for _ in range(half_grow):
            tree = generate_random_tree(max_depth=depth, terminals=terminals, variables=variables, method='grow')
            population.append(tree)

    return population

In [16]:
pop_size = 50

population = initialize_population_ramped_half_and_half(pop_size, 1,TAMANHO_MAXIMO_INDIVIDUO, TERMINAIS, variables)

print(f'Tamanho da população inicial: {len(population)}')

Tamanho da população inicial: 50


# Fitness

In [17]:
def evaluate_tree(node, example1, example2):
    if node.is_leaf():
        if isinstance(node.value, str):
            return example1[node.value] - example2[node.value]
        else:
            return float(node.value)
    else:
        func = node.operators[node.value]
        if func is None:
            raise ValueError(f"Operador desconhecido: {node.value}")
        left_val = evaluate_tree(node.left, example1, example2)
        right_val = evaluate_tree(node.right, example1, example2)
        return func(left_val, right_val)

In [18]:
def compute_distance_matrix(tree, X):
    num_examples = X.shape[0]
    distance_matrix = np.zeros((num_examples, num_examples))
    
    if not isinstance(X, pd.DataFrame):
        X = pd.DataFrame(X, columns=[f'x{i}' for i in range(X.shape[1])])
    
    for i in range(num_examples):
        for j in range(i + 1, num_examples):
            example1 = X.iloc[i].to_dict()
            example2 = X.iloc[j].to_dict()
            dist = evaluate_tree(tree, example1, example2)
            # Usar valor absoluto para garantir que a distância seja não negativa
            distance_matrix[i, j] = abs(dist)
            distance_matrix[j, i] = distance_matrix[i, j]  # Matriz simétrica
    return distance_matrix

In [19]:
def evaluate_fitness(individual, X, y_true):
    distance_matrix = compute_distance_matrix(individual, X)
    
    num_clusters = len(np.unique(y_true))
    
    clustering = AgglomerativeClustering(n_clusters=num_clusters, metric='precomputed', linkage='average')
    clustering.fit(distance_matrix)
    
    y_pred = clustering.labels_
    
    fitness = v_measure_score(y_true, y_pred)
    
    return fitness

In [20]:
def calculate_fitness_population(population, X, y_true):
    fitness_scores = []
    for individual in tqdm(population, total=len(population), desc='Calculating fitness'):
        fitness = evaluate_fitness(individual, X, y_true)
        fitness_scores.append(fitness)
    return fitness_scores

In [21]:
evaluate_fitness(population[0], wine_train_data_normalized, wine_train_labels)

np.float64(0.05370246027474579)

In [22]:
#fitness_scores = calculate_fitness_population(population, wine_train_data_normalized, wine_train_labels)

# Operadores

In [18]:
import copy

In [19]:
def crossover(parent1: Node, parent2: Node, method: str='') -> tuple:
    # Faz uma cópia profunda dos pais para não modificar os originais
    child1 = copy.deepcopy(parent1)
    child2 = copy.deepcopy(parent2)
    
    # Obter todos os nós das árvores
    nodes1 = child1.get_all_nodes()
    nodes2 = child2.get_all_nodes()
    
    # Selecionar nós aleatórios para troca
    crossover_point1 = random.choice(nodes1)
    crossover_point2 = random.choice(nodes2)
    
    # Realizar o crossover
    crossover_point1.value, crossover_point2.value = crossover_point2.value, crossover_point1.value
    crossover_point1.left, crossover_point2.left = crossover_point2.left, crossover_point1.left
    crossover_point1.right, crossover_point2.right = crossover_point2.right, crossover_point1.right
    
    return child1, child2

In [24]:
child1,child2 = crossover(tree1, tree2)

print(tree1.view_expression(), tree2.view_expression())
print(child1.view_expression(), child2.view_expression())

(((((((x7 - x7) + (x11 + x0)) * ((x4 * x11) + (x0 / x2))) * (((x0 / x8) + (x0 + x1)) / ((x9 * x7) * (x2 + x9)))) + ((((x7 / x5) + (x8 * x3)) * ((x7 / x8) + (x10 - x7))) + (((x4 / x10) / (x10 * x9)) - ((x8 - x1) + (x6 * x2))))) / (((((x6 + x3) * (x4 - x5)) / ((x3 - x7) + (x11 + x0))) / (((x1 + x5) / (x8 - x2)) - ((x7 / x4) + (x7 - x1)))) / ((((x11 / x6) / (x5 - x5)) - ((x9 - x5) * (x11 + x8))) * (((x1 - x3) + (x0 * x11)) - ((x2 / x6) * (x4 / x0)))))) / ((((((x6 - x8) + (x3 * x1)) - ((x8 - x10) - (x5 + x1))) / (((x0 - x9) - (x9 * x7)) * ((x8 / x2) + (x8 + x7)))) + ((((x1 + x4) * (x8 - x11)) * ((x0 / x6) / (x6 + x0))) * (((x1 - x6) + (x1 * x7)) * ((x8 + x4) / (x4 * x8))))) * (((((x3 / x2) + (x6 + x11)) * ((x11 + x8) - (x11 - x7))) + (((x4 / x9) * (x1 / x7)) + ((x1 / x2) - (x8 - x0)))) * ((((x6 * x1) / (x8 - x6)) + ((x1 + x6) * (x0 / x3))) + (((x9 / x8) + (x5 / x7)) + ((x10 - x11) + (x5 / x1))))))) (((((((x4 * x9) - (x7 / x3)) * ((x5 + x5) + (x1 - x2))) + (((x10 * x4) * (x2 / x7)) * ((x7 /

In [25]:
tree1.depth()

8

In [26]:
child1.depth()

8

In [31]:
def mutate(individual, terminals, variables, max_depth, method):
    mutant = copy.deepcopy(individual)
    
    # Obter todos os nós
    nodes = mutant.get_all_nodes()
    
    # Selecionar um nó aleatório para mutação
    mutation_point = random.choice(nodes)
    
    # Gerar uma nova subárvore
    new_subtree = generate_random_tree(max_depth=max_depth, terminals=terminals, variables=variables, method=method)
    
    # Substituir a subárvore no ponto de mutação
    mutation_point.value = new_subtree.value
    mutation_point.left = new_subtree.left
    mutation_point.right = new_subtree.right
    
    return mutant

In [32]:
tree1 = mutate(tree1, TERMINAIS, variables, TAMANHO_MAXIMO_INDIVIDUO, method='grow')
tree1.view_tree()

In [33]:
tree1.depth()

12

In [30]:
def selection(population, fitness_scores, method, tournament_size=0):
    if method == 'tournament':
        selected = []
        for _ in range(len(population)):
            # Seleciona indivíduos aleatoriamente para o torneio
            tournament = random.sample(list(zip(population, fitness_scores)), tournament_size)
            # Seleciona o melhor do torneio
            winner = max(tournament, key=lambda x: x[1])[0]
            selected.append(winner)
        return selected
    
    elif method == 'roulette':
        total_fitness = sum(fitness_scores)
        probabilities = [fitness / total_fitness for fitness in fitness_scores]
        selected = np.random.choice(population, size=len(population), p=probabilities)
        return selected

# Algoritmo GP