In [1]:
import sklearn
from pathlib import Path
import shutil

# %% Configurações de dataset
DATASET_NAME = 'iris'
RANDOM_STATE = 1

# %% Carregamento
X, y = sklearn.datasets.load_iris(return_X_y=True, as_frame=True)

# %% Split
X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(
    X, y, random_state=RANDOM_STATE, stratify=y
)

# %% Diretórios temporários
TMP_ROOT = Path('../results/tmp') / DATASET_NAME
if TMP_ROOT.exists():
    shutil.rmtree(TMP_ROOT)

X_train.shape, X_test.shape, y_train.shape, y_test.shape

((112, 4), (38, 4), (112,), (38,))

In [2]:
from tpot import TPOTClassifier
from tpot import objectives
import time

def run_tpot(early_stop=20, max_time_mins=float('inf'), warm_start=False, search_space="linear"):
    scorer = sklearn.metrics.get_scorer('f1_weighted')

    tpot = TPOTClassifier(
        search_space=search_space,
        scorers=[scorer, objectives.complexity_scorer],
        scorers_weights=[1.0, -1.0],
        cv = 10,
        memory= str(TMP_ROOT / 'memory'),
        preprocessing= True,
        max_time_mins=max_time_mins,
        n_jobs=1,
        validation_strategy='split',
        early_stop=early_stop,
        warm_start=warm_start,
        periodic_checkpoint_folder= str(TMP_ROOT / 'checkpoints'),
        verbose=4,
        random_state=RANDOM_STATE,
    )
    start = time.time()
    tpot.fit(X_train, y_train)
    return tpot, time.time() - start

In [3]:
import os
import sys
from datetime import datetime
from sklearn.base import clone

sys.path.insert(0, os.path.abspath(os.path.join('..', 'src')))

from metrics import evaluate_metrics
from serialize import serialize_tpot

def build_document(tpot, elapsed):
    """
    Gera um dicionário com métricas dos melhores modelos encontrados pelo TPOT.
    Mantém:
      - 'pareto_complexity': métricas do menor 'complexity_scorer' (desempate por maior 'f1_score')
      - 'pareto_f1': métricas do maior 'f1_score' (desempate por menor 'complexity_scorer')
      - 'best_model': métricas do tpot.fitted_pipeline_ 
    """

    best_model = tpot.fitted_pipeline_
    model = serialize_tpot(best_model)
    metrics = evaluate_metrics(best_model, X_test, y_test)
    metrics['complexity_scorer'] = objectives.complexity_scorer(best_model, X_test, y_test)
    model['metrics'] = metrics
    
    # Estrutura base do resultado
    result = {
        'pareto_complexity': None,
        'pareto_f1': None,
        'best_model': model,
        'date': datetime.now().strftime('%Y-%m-%d %H:%M:%S'),
        'dataset': DATASET_NAME,
        'elapsed': elapsed,
    }
    
    # Verifica se existe Pareto front válido
    df = tpot.pareto_front
    if df is None or len(df) == 0:
        return result
    
    # Filtrar linhas válidas
    df_valid = df[
        df["Eval Error"].isna() &
        df["complexity_scorer"].notna() &
        df["f1_score"].notna() &
        df["Instance"].notna()
    ]
    
    if df_valid.empty:
        return result
    
    # Selecionar e treinar modelo de menor complexidade
    best_complexity = df_valid.sort_values(["complexity_scorer", "f1_score"], ascending=[True, False]).iloc[0]
    model = clone(best_complexity["Instance"])
    model.fit(X_train, y_train)
    result['pareto_complexity'] = evaluate_metrics(model, X_test, y_test)
    result['pareto_complexity']['complexity_scorer'] = int(best_complexity["complexity_scorer"])
    
    # Selecionar e treinar modelo de maior F1
    best_f1 = df_valid.sort_values(["f1_score", "complexity_scorer"], ascending=[False, True]).iloc[0]
    model = clone(best_f1["Instance"])
    model.fit(X_train, y_train)
    result['pareto_f1'] = evaluate_metrics(model, X_test, y_test)
    result['pareto_f1']['complexity_scorer'] = int(best_f1["complexity_scorer"])
    
    return result

In [4]:
tpot, elapsed = run_tpot(early_stop=5, max_time_mins=5, warm_start=False, search_space="linear")

Generation: : 0it [00:00, ?it/s]

Generation:  1
Best f1_score score: 0.9771428571428572
Best complexity_scorer score: 7.0


Generation: : 1it [01:02, 62.95s/it]

Generation:  2
Best f1_score score: 0.9885714285714287
Best complexity_scorer score: 6.0


Generation: : 2it [02:04, 61.84s/it]

Generation:  3
Best f1_score score: 0.9885714285714287
Best complexity_scorer score: 6.0


Generation: : 3it [03:15, 66.11s/it]

Generation:  4
Best f1_score score: 0.9885714285714287
Best complexity_scorer score: 6.0


Generation: : 4it [04:34, 71.28s/it]

Generation:  5
Best f1_score score: 0.9885714285714287
Best complexity_scorer score: 6.0


Generation: : 5it [05:18, 63.68s/it]
2025-08-06 00:32:18,368 - distributed.scheduler - ERROR - Removing worker 'tcp://127.0.0.1:58684' caused the cluster to lose scattered data, which can't be recovered: {'Series-625f7f9c3951c6a7bcb963cf53499061', 'Series-323756ed51b9541f5cb01dda1f245c43', 'DataFrame-b88fb2d327ee88d0537059a39137bdbd', 'DataFrame-03506e3c5b1cc0d4fb95e9e1024920d5'} (stimulus_id='handle-worker-cleanup-1754451138.3670158')


In [5]:
document = build_document(tpot, elapsed)
import pprint

pp = pprint.PrettyPrinter(indent=4)
pp.pprint(document)

{   'best_model': {   'metrics': {   'accuracy': 1.0,
                                     'balanced_f1': 1.0,
                                     'complexity_scorer': 8,
                                     'confusion_matrix': [   [1.0, 0.0, 0.0],
                                                             [0.0, 1.0, 0.0],
                                                             [0.0, 0.0, 1.0]],
                                     'precision': 1.0,
                                     'recall': 1.0},
                      'model': 'gASV5iMAAAAAAACMEHNrbGVhcm4ucGlwZWxpbmWUjAhQaXBlbGluZZSTlCmBlH2UKIwFc3RlcHOUXZQojApwaXBlbGluZS0xlGgCKYGUfZQoaAVdlIwOaW1wdXRlX251bWVyaWOUjBx0cG90LmJ1aWx0aW5fbW9kdWxlcy5pbXB1dGVylIwTQ29sdW1uU2ltcGxlSW1wdXRlcpSTlCmBlH2UKIwHY29sdW1uc5SMA2FsbJSMDm1pc3NpbmdfdmFsdWVzlEd/+AAAAAAAAIwIc3RyYXRlZ3mUjARtZWFulIwKZmlsbF92YWx1ZZROjARjb3B5lIiMDWFkZF9pbmRpY2F0b3KUiYwTa2VlcF9lbXB0eV9mZWF0dXJlc5SJjAhjb2x1bW5zX5SMGHBhbmRhcy5jb3JlLmluZGV4ZXMuYmFzZZSMCl9uZXdfSW5kZXiUk5RoG

In [6]:
df = tpot.evaluated_individuals
df = df.sort_values('f1_score', ascending=False)
df = df[['f1_score', 'complexity_scorer', 'Generation', 'Pareto_Front', 'validation_f1_score', 'validation_complexity_scorer']]
df

Unnamed: 0,f1_score,complexity_scorer,Generation,Pareto_Front,validation_f1_score,validation_complexity_scorer
61,0.988571,80.0,1.0,,0.824534,80.0
54,0.988571,10115.0,1.0,,,
245,0.988571,80.0,4.0,,,
198,0.988571,80.0,3.0,,,
190,0.988571,9631.0,3.0,,,
...,...,...,...,...,...,...
230,,,4.0,,,
231,,,4.0,,,
232,,,4.0,,,
233,,,4.0,,,


In [7]:
pareto = tpot.pareto_front
pareto = pareto[['f1_score', 'complexity_scorer', 'Generation', 'Pareto_Front', 'validation_f1_score', 'validation_complexity_scorer']]
pareto

Unnamed: 0,f1_score,complexity_scorer,Generation,Pareto_Front,validation_f1_score,validation_complexity_scorer
90,0.965714,6.0,1.0,1.0,0.869565,6.0
171,0.977778,7.0,3.0,1.0,0.956128,7.0


In [8]:
from tpot.objectives import complexity_scorer
from sklearn.metrics import get_scorer

# supondo que 'scorer' é seu scorer de F1 original:
f1_scorer = get_scorer('f1_weighted')
pipeline = tpot.fitted_pipeline_

f1_value   = f1_scorer(pipeline, X_test, y_test)       # milissegundos
complexity = complexity_scorer(pipeline)             # igualmente rápido

print(f"F1 = {f1_value:.4f}, Complexity = {complexity}")


F1 = 1.0000, Complexity = 8


In [9]:
#achar dentro do df o elemento que tem esses dados
df = tpot.evaluated_individuals
df = df[df['complexity_scorer'] == complexity]
df

Unnamed: 0,f1_score,complexity_scorer,Parents,Variation_Function,Individual,Generation,Submitted Timestamp,Completed Timestamp,Eval Error,Pareto_Front,Instance,validation_f1_score,validation_complexity_scorer,validation_start_times,validation_end_times,validation_eval_errors,Validation_Pareto_Front
93,0.977143,8.0,"(8, 8)",ind_mutate,<tpot.search_spaces.pipelines.sequential.Seque...,1.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (MaxAbsScaler(), Sel...",,,,,,
109,0.977143,8.0,"(93, 93)",ind_mutate,<tpot.search_spaces.pipelines.sequential.Seque...,2.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (MaxAbsScaler(), Sel...",,,,,,
132,0.965714,8.0,"(93, 93)",ind_mutate,<tpot.search_spaces.pipelines.sequential.Seque...,2.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (MaxAbsScaler(), RFE...",,,,,,
138,0.977143,8.0,"(93, 93)",ind_mutate,<tpot.search_spaces.pipelines.sequential.Seque...,2.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (MaxAbsScaler(), Sel...",,,,,,
164,0.977143,8.0,"(109, 77)",ind_crossover,<tpot.search_spaces.pipelines.sequential.Seque...,3.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (Passthrough(), Sele...",,,,,,
181,0.927024,8.0,"(138, 138)",ind_mutate,<tpot.search_spaces.pipelines.sequential.Seque...,3.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (StandardScaler(), S...",,,,,,
183,0.920635,8.0,"(140, 109)","ind_mutate , ind_mutate , ind_crossover",<tpot.search_spaces.pipelines.sequential.Seque...,3.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (MaxAbsScaler(), RFE...",,,,,,
248,0.966349,8.0,"(171, 171)",ind_mutate,<tpot.search_spaces.pipelines.sequential.Seque...,4.0,1754451000.0,1754451000.0,,,"((ColumnSimpleImputer()), (Normalizer(norm=np....",,,,,,
