In [4]:
# Install the third-party packages used by this notebook into the kernel's environment.
%pip install kagglehub scikit-learn scikit-optimize sklearn-genetic-opt



In [None]:
import kagglehub
import pandas as pd
import numpy as np
import os
from typing import Any, cast
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from scipy.stats import loguniform
from time import perf_counter
from sklearn.model_selection import train_test_split, KFold, GridSearchCV, RandomizedSearchCV
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.metrics import (
    accuracy_score, precision_score, recall_score, f1_score, roc_auc_score,
    cohen_kappa_score, make_scorer
)
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# Genética (sklearn-genetic-opt)
from sklearn_genetic import GASearchCV
from sklearn_genetic.space import Continuous, Integer as GInteger, Categorical as GCategorical

## Instalação do Dataset

In [6]:
# Download the latest version of the Pima Indians Diabetes dataset through kagglehub.
# `path` holds the value returned by dataset_download and is printed for reference.
path = kagglehub.dataset_download("uciml/pima-indians-diabetes-database")

print("Path to dataset files:", path)

Path to dataset files: C:\Users\JonasCGN\.cache\kagglehub\datasets\uciml\pima-indians-diabetes-database\versions\1


In [7]:
# Read diabetes.csv from the downloaded dataset location into a DataFrame.
csv_path = os.path.join(path, "diabetes.csv")
df = pd.read_csv(csv_path)

In [8]:
# Preview the first rows of the dataset.
df.head()

Unnamed: 0,Pregnancies,Glucose,BloodPressure,SkinThickness,Insulin,BMI,DiabetesPedigreeFunction,Age,Outcome
0,6,148,72,35,0,33.6,0.627,50,1
1,1,85,66,29,0,26.6,0.351,31,0
2,8,183,64,0,0,23.3,0.672,32,1
3,1,89,66,23,94,28.1,0.167,21,0
4,0,137,40,35,168,43.1,2.288,33,1


In [9]:
# Show column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 768 entries, 0 to 767
Data columns (total 9 columns):
 #   Column                    Non-Null Count  Dtype  
---  ------                    --------------  -----  
 0   Pregnancies               768 non-null    int64  
 1   Glucose                   768 non-null    int64  
 2   BloodPressure             768 non-null    int64  
 3   SkinThickness             768 non-null    int64  
 4   Insulin                   768 non-null    int64  
 5   BMI                       768 non-null    float64
 6   DiabetesPedigreeFunction  768 non-null    float64
 7   Age                       768 non-null    int64  
 8   Outcome                   768 non-null    int64  
dtypes: float64(2), int64(7)
memory usage: 54.1 KB


## Preparação dos Dados

In [10]:
# Separate the target column ("Outcome") from the remaining feature columns.
y = df["Outcome"]
X = df.drop(columns="Outcome")

In [11]:
# Hold out 30% of the rows for the final test evaluation (fixed seed).
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.3,
    random_state=42,
)

# Learn the standardization statistics on the training split only, then apply
# them to both splits. The model pipelines defined in this notebook also
# contain their own StandardScaler step.
scaler = StandardScaler()
scaler.fit(X_train)
X_train = scaler.transform(X_train)
X_test = scaler.transform(X_test)

## Modelos

In [None]:
# 5-fold shuffled splitter with a fixed seed, passed as `cv` to every search below.
cv = KFold(n_splits=5, shuffle=True, random_state=42)

# Metrics evaluated for each candidate. Key order is the order in which
# summarize_cv reports them.
scoring = dict(
    accuracy="accuracy",
    precision=make_scorer(precision_score, zero_division=0),
    recall=make_scorer(recall_score, zero_division=0),
    f1="f1",
    roc_auc="roc_auc",
    kappa=make_scorer(cohen_kappa_score),
)

# Metric used to select and refit the best candidate.
primary_metric = "f1"

def evaluate_on_test(estimator, X_t, y_t):
    """Score a fitted estimator on held-out data.

    Parameters
    ----------
    estimator : fitted object with ``predict`` and, optionally,
        ``predict_proba`` or ``decision_function``.
    X_t : features passed to the estimator's prediction methods.
    y_t : true labels the predictions are compared against.

    Returns
    -------
    dict
        Keys ``accuracy``, ``precision``, ``recall``, ``f1``, ``kappa`` and
        ``roc_auc``. ``roc_auc`` is NaN when the estimator exposes neither
        ``predict_proba`` nor ``decision_function``.
    """
    predictions = estimator.predict(X_t)

    # ROC AUC needs continuous scores: prefer column 1 of predict_proba,
    # otherwise fall back to the decision function when available.
    scores = None
    if hasattr(estimator, "predict_proba"):
        scores = estimator.predict_proba(X_t)[:, 1]
    elif hasattr(estimator, "decision_function"):
        scores = estimator.decision_function(X_t)

    return {
        "accuracy": accuracy_score(y_t, predictions),
        "precision": precision_score(y_t, predictions, zero_division=0),
        "recall": recall_score(y_t, predictions, zero_division=0),
        "f1": f1_score(y_t, predictions),
        "kappa": cohen_kappa_score(y_t, predictions),
        "roc_auc": roc_auc_score(y_t, scores) if scores is not None else np.nan,
    }

def summarize_cv(gs, metrics=None, single_metric=None):
    """Collect the CV mean/std of each metric for a search's best candidate.

    Multi-metric searches store their results under ``mean_test_<metric>`` /
    ``std_test_<metric>``. A search fitted with a single scorer only stores
    the generic ``mean_test_score`` / ``std_test_score`` columns, so looking
    up every metric name unconditionally raises ``KeyError`` for it. This
    function tolerates both layouts.

    Parameters
    ----------
    gs : fitted search object exposing ``best_index_`` and ``cv_results_``.
    metrics : iterable of str, optional
        Metric names to report. Defaults to the keys of the module-level
        ``scoring`` dict.
    single_metric : str, optional
        Metric name under which the generic ``*_test_score`` columns are
        reported when the per-metric columns are absent. Defaults to the
        module-level ``primary_metric``; this assumes a single-scorer search
        optimised that metric.

    Returns
    -------
    dict
        ``cv_<metric>_mean`` and ``cv_<metric>_std`` for every requested
        metric, in the order given. Values the search did not record are NaN.
    """
    if metrics is None:
        metrics = scoring.keys()
    if single_metric is None:
        single_metric = primary_metric

    idx = gs.best_index_
    cv_results = gs.cv_results_
    out = {}
    for m in metrics:
        mean_key, std_key = f"mean_test_{m}", f"std_test_{m}"
        # Single-scorer searches only have the generic "score" columns.
        if mean_key not in cv_results and m == single_metric:
            mean_key, std_key = "mean_test_score", "std_test_score"
        for suffix, key in (("mean", mean_key), ("std", std_key)):
            out[f"cv_{m}_{suffix}"] = (
                cv_results[key][idx] if key in cv_results else np.nan
            )
    return out

# One dict per (model, search strategy) run is appended here.
results = []

# =========================
# SVM (SVC probability=True)
# =========================
# NOTE(review): df.info() reports no null values, so the median imputer looks
# like a no-op unless placeholder zeros are recoded as NaN upstream - confirm.
svm_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", SVC(probability=True, random_state=42)),
    ]
)

# SVM search spaces: one definition per search strategy, all built from the
# same bounds so the strategies explore the same ranges.
SVM_C_BOUNDS = (1e-3, 1e3)
SVM_GAMMA_BOUNDS = (1e-4, 1.0)
SVM_C_GRID = [1e-3, 1e-2, 1e-1, 1, 10, 1e2, 1e3]
SVM_GAMMA_GRID = [1e-4, 1e-3, 1e-2, 1e-1, 1.0]

# Grid search: separate sub-grids so gamma is only enumerated for the rbf kernel.
grid_svm = [
    {
        "clf__kernel": ["linear"],
        "clf__C": list(SVM_C_GRID),
        "clf__class_weight": [None, "balanced"],
    },
    {
        "clf__kernel": ["rbf"],
        "clf__C": list(SVM_C_GRID),
        "clf__gamma": list(SVM_GAMMA_GRID),
        "clf__class_weight": [None, "balanced"],
    },
]

# Randomized search: a single space; gamma is ignored when kernel=linear.
rand_svm = {
    "clf__kernel": ["linear", "rbf"],
    "clf__C": loguniform(*SVM_C_BOUNDS),
    "clf__gamma": loguniform(*SVM_GAMMA_BOUNDS),
    "clf__class_weight": [None, "balanced"],
}

# Bayesian search: one sub-space per kernel, mirroring the grid layout.
bayes_svm = [
    {
        "clf__kernel": Categorical(["linear"]),
        "clf__C": Real(*SVM_C_BOUNDS, prior="log-uniform"),
        "clf__class_weight": Categorical([None, "balanced"]),
    },
    {
        "clf__kernel": Categorical(["rbf"]),
        "clf__C": Real(*SVM_C_BOUNDS, prior="log-uniform"),
        "clf__gamma": Real(*SVM_GAMMA_BOUNDS, prior="log-uniform"),
        "clf__class_weight": Categorical([None, "balanced"]),
    },
]

# Genetic search: a single space; gamma is ignored when kernel=linear.
gen_svm = {
    "clf__kernel": GCategorical(["linear", "rbf"]),
    "clf__C": Continuous(*SVM_C_BOUNDS, distribution="log-uniform"),
    "clf__gamma": Continuous(*SVM_GAMMA_BOUNDS, distribution="log-uniform"),
    "clf__class_weight": GCategorical([None, "balanced"]),
}

# Grid Search - SVM: exhaustive search over grid_svm, timed end to end.
t0 = perf_counter()
gs_svm = GridSearchCV(
    svm_pipe,
    grid_svm,
    scoring=scoring,
    refit=primary_metric,  # type: ignore[arg-type]
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
gs_svm.fit(X_train, y_train)
t_grid = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "SVM",
    "strategy": "GridSearch",
    "time_s": t_grid,
    "best_params": gs_svm.best_params_,
    **summarize_cv(gs_svm),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(gs_svm.best_estimator_, X_test, y_test).items()
    },
}
results.append(res)

# Random Search - SVM: 20 candidates sampled from rand_svm, timed end to end.
t0 = perf_counter()
rs_svm = RandomizedSearchCV(
    svm_pipe,
    rand_svm,
    n_iter=20,
    scoring=scoring,
    refit=primary_metric,  # type: ignore[arg-type]
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
rs_svm.fit(X_train, y_train)
t_rand = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "SVM",
    "strategy": "RandomizedSearch",
    "time_s": t_rand,
    "best_params": rs_svm.best_params_,
    **summarize_cv(rs_svm),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(rs_svm.best_estimator_, X_test, y_test).items()
    },
}
results.append(res)

# Bayesian search (BayesSearchCV) - SVM: 30 iterations over bayes_svm, timed end to end.
t0 = perf_counter()
bs_svm = BayesSearchCV(
    svm_pipe,
    bayes_svm,
    n_iter=30,
    scoring=scoring,
    refit=primary_metric,  # type: ignore[arg-type]
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
bs_svm.fit(X_train, y_train)
t_bayes = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "SVM",
    "strategy": "BayesSearchCV",
    "time_s": t_bayes,
    "best_params": bs_svm.best_params_,  # type: ignore[attr-defined]
    **summarize_cv(bs_svm),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(
            bs_svm.best_estimator_, X_test, y_test  # type: ignore[attr-defined]
        ).items()
    },
}
results.append(res)

# Genetic search (GASearchCV) - SVM, timed end to end.
# Unlike the randomized and Bayesian searches, no random_state is passed here.
# NOTE(review): this search is scored with a single f1 scorer rather than the
# shared multi-metric `scoring` dict, so its cv_results_ may not contain the
# per-metric columns that summarize_cv looks up - confirm.
t0 = perf_counter()
gn_svm = GASearchCV(
    estimator=svm_pipe,
    cv=cv,  # type: ignore[arg-type]
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    param_grid=gen_svm,
    generations=20,
    population_size=30,
    tournament_size=3,
    elitism=True,
    mutation_probability=0.1,
    crossover_probability=0.8,
    verbose=False,
)
gn_svm.fit(X_train, y_train)
t_gen = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "SVM",
    "strategy": "GeneticSearchCV",
    "time_s": t_gen,
    "best_params": gn_svm.best_params_,
    **summarize_cv(gn_svm),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(gn_svm.best_estimator_, X_test, y_test).items()
    },
}
results.append(res)

# =========
# MLP
# =========
# Same preprocessing as the SVM pipeline (median imputation + standardization),
# followed by a seeded MLP classifier.
mlp_pipe = Pipeline(
    [
        ("imputer", SimpleImputer(strategy="median")),
        ("scaler", StandardScaler()),
        ("clf", MLPClassifier(random_state=42)),
    ]
)

# Candidate hidden-layer layouts shared by all four MLP search spaces.
# NOTE(review): these are tuples of different lengths used as categorical
# choices - confirm the Bayesian and genetic space classes accept them.
hidden_options = [(64,), (128,), (64,32), (128,64), (256,128)]

# Shared bounds so all four strategies explore the same ranges.
MLP_ALPHA_BOUNDS = (1e-6, 1e-2)
MLP_LR_BOUNDS = (1e-4, 1e-2)
MLP_MAX_ITER_BOUNDS = (300, 600)

# Grid search: discrete values for every hyperparameter.
grid_mlp = {
    "clf__hidden_layer_sizes": hidden_options,
    "clf__activation": ["relu", "tanh"],
    "clf__solver": ["adam", "lbfgs"],
    "clf__alpha": [1e-6, 1e-5, 1e-4, 1e-3, 1e-2],
    "clf__learning_rate_init": [1e-4, 1e-3, 1e-2],
    "clf__max_iter": list(MLP_MAX_ITER_BOUNDS),
    "clf__early_stopping": [True],
}

# Randomized search: log-uniform sampling for alpha and the learning rate.
rand_mlp = {
    "clf__hidden_layer_sizes": hidden_options,
    "clf__activation": ["relu", "tanh"],
    "clf__solver": ["adam", "lbfgs"],
    "clf__alpha": loguniform(*MLP_ALPHA_BOUNDS),
    "clf__learning_rate_init": loguniform(*MLP_LR_BOUNDS),
    "clf__max_iter": list(MLP_MAX_ITER_BOUNDS),
    "clf__early_stopping": [True],
}

# Bayesian search: max_iter is an integer range here, not two discrete values.
bayes_mlp = {
    "clf__hidden_layer_sizes": Categorical(hidden_options),
    "clf__activation": Categorical(["relu", "tanh"]),
    "clf__solver": Categorical(["adam", "lbfgs"]),
    "clf__alpha": Real(*MLP_ALPHA_BOUNDS, prior="log-uniform"),
    "clf__learning_rate_init": Real(*MLP_LR_BOUNDS, prior="log-uniform"),
    "clf__max_iter": Integer(*MLP_MAX_ITER_BOUNDS),
    "clf__early_stopping": Categorical([True]),
}

# Genetic search: same ranges expressed with the sklearn-genetic space classes.
gen_mlp = {
    "clf__hidden_layer_sizes": GCategorical(hidden_options),
    "clf__activation": GCategorical(["relu", "tanh"]),
    "clf__solver": GCategorical(["adam", "lbfgs"]),
    "clf__alpha": Continuous(*MLP_ALPHA_BOUNDS, distribution="log-uniform"),
    "clf__learning_rate_init": Continuous(*MLP_LR_BOUNDS, distribution="log-uniform"),
    "clf__max_iter": GInteger(*MLP_MAX_ITER_BOUNDS),
    "clf__early_stopping": GCategorical([True]),
}

# Grid Search - MLP: exhaustive search over grid_mlp, timed end to end.
t0 = perf_counter()
gs_mlp = GridSearchCV(
    mlp_pipe,
    grid_mlp,
    scoring=scoring,
    refit=primary_metric,  # type: ignore[arg-type]
    cv=cv,
    n_jobs=-1,
    verbose=0,
)
gs_mlp.fit(X_train, y_train)
t_grid = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "MLP",
    "strategy": "GridSearch",
    "time_s": t_grid,
    "best_params": gs_mlp.best_params_,
    **summarize_cv(gs_mlp),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(gs_mlp.best_estimator_, X_test, y_test).items()
    },
}
results.append(res)

# Random Search - MLP: 20 candidates sampled from rand_mlp, timed end to end.
t0 = perf_counter()
rs_mlp = RandomizedSearchCV(
    mlp_pipe,
    rand_mlp,
    n_iter=20,
    scoring=scoring,
    refit=primary_metric,  # type: ignore[arg-type]
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
rs_mlp.fit(X_train, y_train)
t_rand = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "MLP",
    "strategy": "RandomizedSearch",
    "time_s": t_rand,
    "best_params": rs_mlp.best_params_,
    **summarize_cv(rs_mlp),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(rs_mlp.best_estimator_, X_test, y_test).items()
    },
}
results.append(res)

# Bayesian search (BayesSearchCV) - MLP: 30 iterations over bayes_mlp, timed end to end.
t0 = perf_counter()
bs_mlp = BayesSearchCV(
    mlp_pipe,
    bayes_mlp,
    n_iter=30,
    scoring=scoring,
    refit=primary_metric,  # type: ignore[arg-type]
    cv=cv,
    random_state=42,
    n_jobs=-1,
    verbose=0,
)
bs_mlp.fit(X_train, y_train)
t_bayes = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "MLP",
    "strategy": "BayesSearchCV",
    "time_s": t_bayes,
    "best_params": bs_mlp.best_params_,  # type: ignore[attr-defined]
    **summarize_cv(bs_mlp),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(
            bs_mlp.best_estimator_, X_test, y_test  # type: ignore[attr-defined]
        ).items()
    },
}
results.append(res)

# Genetic search (GASearchCV) - MLP, timed end to end.
# Unlike the randomized and Bayesian searches, no random_state is passed here.
# NOTE(review): this search is scored with a single f1 scorer rather than the
# shared multi-metric `scoring` dict, so its cv_results_ may not contain the
# per-metric columns that summarize_cv looks up - confirm.
t0 = perf_counter()
gn_mlp = GASearchCV(
    estimator=mlp_pipe,
    cv=cv,  # type: ignore[arg-type]
    scoring=make_scorer(f1_score),
    n_jobs=-1,
    param_grid=gen_mlp,
    generations=20,
    population_size=30,
    tournament_size=3,
    elitism=True,
    mutation_probability=0.1,
    crossover_probability=0.8,
    verbose=False,
)
gn_mlp.fit(X_train, y_train)
t_gen = perf_counter() - t0

# Result row: run metadata, CV summary of the best candidate, test-set metrics.
res = {
    "model": "MLP",
    "strategy": "GeneticSearchCV",
    "time_s": t_gen,
    "best_params": gn_mlp.best_params_,
    **summarize_cv(gn_mlp),
    **{
        f"test_{name}": score
        for name, score in evaluate_on_test(gn_mlp.best_estimator_, X_test, y_test).items()
    },
}
results.append(res)

# Final table: one row per (model, search strategy) run collected in `results`.
pd.DataFrame(results)

TypeError: GASearchCV.__init__() got an unexpected keyword argument 'random_state'