In [11]:
# importando as bibliotecas
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler
import random

In [10]:
# importando os modulos definidos
from src.Node import Node

# Carregamento de dados

In [2]:
def load_dataset(dataset_name, header=None):
    train_path = f'./data/{dataset_name}train.csv'
    test_path = f'./data/{dataset_name}test.csv'

    train_data = pd.read_csv(filepath_or_buffer=train_path, header=header)
    test_data = pd.read_csv(filepath_or_buffer=test_path, header=header)
    
    return train_data, test_data

In [3]:
breast_cancer_train_data, breast_cancer_test_data = load_dataset('breast_cancer_coimbra_', header=0)
print(breast_cancer_train_data.shape, breast_cancer_test_data.shape)

(92, 10) (24, 10)


In [4]:
breast_cancer_train_data.head()

Unnamed: 0,Age,BMI,Glucose,Insulin,HOMA,Leptin,Adiponectin,Resistin,MCP.1,Classification
0,44,20.76,86,7.553,1.6,14.09,20.32,7.64,63.61,1
1,46,20.83,88,3.42,0.742368,12.87,18.55,13.56,301.21,2
2,53,36.790166,101,10.175,2.534932,27.1841,20.03,10.26309,695.754,1
3,54,30.483158,90,5.537,1.229214,12.331,9.73138,10.19299,1227.91,1
4,34,24.242424,92,21.699,4.924226,16.7353,21.823745,12.06534,481.949,2


* 0 - fixed acidity (tartaric acid - g / dm^3)
* 1 - volatile acidity (acetic acid - g / dm^3)
* 2 - citric acid (g / dm^3)
* 3 - residual sugar (g / dm^3)
* 4 - chlorides (sodium chloride - g / dm^3)
* 5 - free sulfur dioxide (mg / dm^3)
* 6 - total sulfur dioxide (mg / dm^3)
* 7 - density (g / cm^3)
* 8 - pH
* 9 - sulphates (potassium sulphate - g / dm3)
* 10 - alcohol (% by volume)
* 11 - quality (score between 0 and 10) - output variable

In [5]:
wine_train_data, wine_test_data = load_dataset('wineRed-', header=None)
print(wine_train_data.shape, wine_test_data.shape)

(1279, 12) (320, 12)


In [6]:
wine_train_data.head()

Unnamed: 0,0,1,2,3,4,5,6,7,8,9,10,11
0,7.4,0.64,0.17,5.4,0.168,52.0,98.0,0.99736,3.28,0.5,9.5,5
1,10.4,0.44,0.73,6.55,0.074,38.0,76.0,0.999,3.17,0.85,12.0,7
2,10.7,0.43,0.39,2.2,0.106,8.0,32.0,0.9986,2.89,0.5,9.6,5
3,8.9,0.635,0.37,1.7,0.263,5.0,62.0,0.9971,3.0,1.09,9.3,5
4,7.8,0.57,0.09,2.3,0.065,34.0,45.0,0.99417,3.46,0.74,12.7,8


### Transformação de dados

In [7]:
def normalize(train_data, test_data):
    scaler = StandardScaler()
    scaler.fit(train_data)
    return scaler.transform(train_data), scaler.transform(test_data)

In [8]:
wine_train_data_normalized,wine_test_data_normalized = normalize(wine_train_data,wine_test_data)
wine_train_data_normalized

array([[-0.53853669,  0.64402546, -0.52757169, ..., -0.91985987,
        -0.85278319, -0.776328  ],
       [ 1.19150792, -0.4811826 ,  2.33832098, ...,  1.08391273,
         1.50932148,  1.69363099],
       [ 1.36451238, -0.537443  ,  0.59831471, ..., -0.91985987,
        -0.75829901, -0.776328  ],
       ...,
       [-0.82687746,  0.08142143, -1.19286821, ...,  1.42741661,
        -0.94726738,  0.45865149],
       [ 0.32648561,  0.47524425,  1.11008126, ...,  2.05717371,
        -1.04175157, -0.776328  ],
       [-0.59620484,  2.55687915, -1.14169155, ..., -0.63360664,
        -0.66381482, -3.24628699]])

# Modelagem dos indivíduos

In [14]:
TAMANHO_MAXIMO_INDIVIDUO = 7
TERMINAIS = ['+', '-', '*', '/']

In [21]:
def generate_random_tree(max_depth, terminals, variables, probability=0.9):
    if max_depth == 0 or (random.random() > probability):
        # Nó folha: variável ou constante
        if random.random() < 0.5:
            # Seleciona uma variável aleatória
            value = random.choice(variables)
        else:
            # Gera uma constante aleatória
            value = random.uniform(-10, 10)
        return Node(value)
    else:
        # Nó interno: operador
        op = random.choice(terminals)
        left_subtree = generate_random_tree(max_depth - 1,terminals, variables, probability)
        right_subtree = generate_random_tree(max_depth - 1,terminals, variables, probability)
        return Node(op, left_subtree, right_subtree)


In [22]:
variables = [f'x{i}' for i in range(wine_train_data_normalized.shape[1])]

# Gerando uma árvore aleatória
tree = generate_random_tree(max_depth=TAMANHO_MAXIMO_INDIVIDUO,terminals=TERMINAIS, variables=variables)
tree

<src.Node.Node at 0x13259976480>

In [26]:
def tree_to_string(node):
    if node.is_leaf():
        return str(node.value)
    else:
        left_str = tree_to_string(node.left)
        right_str = tree_to_string(node.right)
        return f'({left_str} {node.value} {right_str})'

tree_to_string(tree)

'(((((((x6 / x1) / (0.8877350438355229 - x0)) * ((x8 + x4) * 6.1353849471384905)) * (((x4 - x1) * (x6 - 0.0341635255256616)) / ((-9.632355482502422 / 2.57781958571025) / (x7 / x3)))) * ((((-5.312802446905199 + x0) + (-7.551090027072 / x8)) / x6) / (((-0.8173246341381191 / -0.03284348174277696) / (2.5692590813010785 - x7)) + ((2.6714409103750505 * x2) * (x0 + 0.8508485794895151))))) * (((((x10 / 1.3852306688199878) * (x9 - x4)) / ((2.5629053324778006 - 4.210677960910232) - (x2 + x9))) * ((-6.191436665894631 / (x7 / x9)) * ((x11 * x10) - x3))) - ((((0.3216289939735191 - 2.1444837583663308) + (6.70778182038935 + -7.545660998737618)) - ((x4 * x7) - (-0.5206797157167937 * x5))) / (((x9 - x6) * x5) - ((x10 - 3.4536163814429344) - (x3 * -7.706753801235589)))))) + ((((x6 / ((x10 - x11) / (x9 - 3.6271410067858145))) * (((x7 + x1) + x10) - ((-9.761514496844896 * x1) * (x2 - x1)))) - (2.828311208793439 / ((x8 - (x11 - x6)) / ((-4.238247688403078 + x11) / (-5.8352698853428535 - x3))))) + (((((x5 /