In [None]:
# Etapa 1: Reinicia os pacotes essenciais para evitar conflitos
!pip install -q numpy==1.24.4 scikit-learn==1.3.2 xgboost==1.7.6 tpot==0.12.0 --force-reinstall

# Etapa 2: Reinicie o runtime (necessário após instalação crítica)
import os
os.kill(os.getpid(), 9)

In [None]:
import tpot
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.model_selection import train_test_split
import time
import os
from score import pipeline_score

In [15]:
# Path of the CSV file where the benchmark results are accumulated.
result_csv_path = "TPOT_results.csv"

# Header of the results file; one row is appended per processed dataset.
RESULT_COLUMNS = ["dataset", "best_params", "test_score", "execution_time"]


def list_dataset_files(directory=".", exclude=()):
    """Return the sorted names of the ``.csv`` files found in ``directory``.

    Args:
        directory: folder to scan (defaults to the current working directory).
        exclude: file names to leave out of the result, e.g. the results file,
            which is itself a ``.csv`` living in the same folder.

    Returns:
        A list of file names (not full paths), sorted so the processing order
        does not depend on the arbitrary order returned by ``os.listdir``.
    """
    excluded = set(exclude)
    return sorted(
        name
        for name in os.listdir(directory)
        if name.endswith(".csv") and name not in excluded
    )


# Every CSV in the working folder is a dataset, except the results file itself:
# without this exclusion a re-run would try to train on "TPOT_results.csv".
datasets = list_dataset_files(exclude=[result_csv_path])

# If the results file does not exist yet, create it with only the header row.
if not os.path.exists(result_csv_path):
    pd.DataFrame(columns=RESULT_COLUMNS).to_csv(result_csv_path, index=False)

In [None]:
def build_tpot_config(n_classes):
    """Build the TPOT ``config_dict`` (search space) used for one dataset.

    Args:
        n_classes: number of distinct labels in the target column. The KNN
            ``n_neighbors`` candidates start at ``n_classes + 1``.

    Returns:
        A dict mapping each estimator's import path to a dict of
        hyper-parameter name -> candidate values.
    """
    return {
        'sklearn.neighbors.KNeighborsClassifier': {
            # NOTE: this range is empty when n_classes >= 100.
            'n_neighbors': range(n_classes + 1, 101),
            'weights': ['uniform', 'distance'],
            'p': [1, 2]
        },
        'sklearn.ensemble.RandomForestClassifier': {
            'n_estimators': range(10, 200, 10),
            'min_samples_split': [2, 5, 10, 15, 20],
            'min_samples_leaf': [1, 2, 4, 8, 16],
            'max_features': ['sqrt', 'log2', None],
            'criterion': ['gini', 'entropy'],
            'class_weight': [None, 'balanced', 'balanced_subsample'],
        },
        'sklearn.tree.DecisionTreeClassifier': {
            'criterion': ['gini', 'entropy'],
            'min_samples_split': [2, 5, 10, 15, 20],
            'min_samples_leaf': [1, 2, 4, 8, 16],
            'max_features': ['sqrt', 'log2'],
            'class_weight': [None, 'balanced'],
            'splitter': ['best', 'random'],
        },
        'sklearn.svm.SVC': {
            'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
            'C': [0.01, 0.1, 1, 10, 100, 1000],
            'class_weight': [None, 'balanced'],
            # The dict previously listed 'gamma' twice; only this (last) entry
            # ever took effect, so the shadowed duplicate was dropped.
            'gamma': ['scale', 'auto', 0.001, 0.01, 0.1, 1, 10],
            'degree': [2, 3, 4, 5],
            'coef0': [-1.0, -0.5, -0.1, 0.0, 0.5, 1.0],
            'max_iter': [4000]
        }
    }


for dataset in datasets:
    print(f"\nProcessando: {dataset}")

    # Load the dataset: every column but the last is used as a feature and
    # the last column is used as the target.
    df = pd.read_csv(dataset)
    X = df.iloc[:, :-1].values
    y = df.iloc[:, -1].values

    # Number of distinct labels; drives the lower bound of the KNN search range.
    qntLabel = len(np.unique(y))

    # Hold out 25% of the rows for the final evaluation (seeded split).
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

    tpot_config = build_tpot_config(qntLabel)

    # random_state makes the evolutionary search repeatable between runs,
    # matching the seeded train/test split above.
    tpotModel = tpot.TPOTClassifier(
        generations=5,
        population_size=50,
        verbosity=3,
        config_dict=tpot_config,
        random_state=42,
    )

    # perf_counter is a monotonic clock, so the measured duration is not
    # affected by system clock adjustments during a long search.
    start_time = time.perf_counter()
    tpotModel.fit(X_train, y_train)
    end_time = time.perf_counter()
    execution_time = end_time - start_time

    # Best pipeline found by the search (stored below as its string repr).
    best_params = tpotModel.fitted_pipeline_

    # Evaluate on the held-out split.
    # NOTE(review): pipeline_score comes from the local `score` module; its
    # metric is not visible here - confirm what "test_score" actually measures.
    y_pred = tpotModel.predict(X_test)
    final_score = pipeline_score(y_test, y_pred)

    # Collect the results for this dataset.
    result = {
        "dataset": dataset,
        # Converted to a string to avoid formatting problems in the CSV.
        "best_params": str(best_params),
        "test_score": final_score,
        "execution_time": execution_time,
    }

    # Append to the CSV immediately so finished datasets survive a later crash.
    pd.DataFrame([result]).to_csv(result_csv_path, mode="a", header=False, index=False)


Processando: CDC Diabetes Health Indicators.csv
4 operators have been imported by TPOT.


Version 0.12.0 of tpot is outdated. Version 1.0.0 was released Wednesday February 26, 2025.


Optimization Progress:   0%|          | 0/300 [00:00<?, ?pipeline/s]

Skipped pipeline #1 due to time out. Continuing to the next pipeline.
Skipped pipeline #9 due to time out. Continuing to the next pipeline.
Skipped pipeline #13 due to time out. Continuing to the next pipeline.
Skipped pipeline #15 due to time out. Continuing to the next pipeline.
Skipped pipeline #18 due to time out. Continuing to the next pipeline.
Skipped pipeline #20 due to time out. Continuing to the next pipeline.
Skipped pipeline #23 due to time out. Continuing to the next pipeline.
Skipped pipeline #26 due to time out. Continuing to the next pipeline.
Skipped pipeline #31 due to time out. Continuing to the next pipeline.
Skipped pipeline #35 due to time out. Continuing to the next pipeline.
Skipped pipeline #39 due to time out. Continuing to the next pipeline.
Skipped pipeline #44 due to time out. Continuing to the next pipeline.
Skipped pipeline #49 due to time out. Continuing to the next pipeline.
Skipped pipeline #51 due to time out. Continuing to the next pipeline.
Skipped 