In [None]:
# Importando as Bibliotecas Necessárias
import pandas as pd
import numpy as np

# Modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn_lvq import GlvqModel

# Validação e métricas
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Bibliotecas adicionais
import optuna
from optuna.integration.mlflow import MLflowCallback
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Configurando o MLflow
mlflow.set_experiment("Modelagem de Classificação")


<Experiment: artifact_location='file:///c:/Users/PC/Documents/GitHub/diabetes/mlruns/529707021101234128', creation_time=1732319685838, experiment_id='529707021101234128', last_update_time=1732319685838, lifecycle_stage='active', name='Modelagem de Classificação com Optuna', tags={}>

In [4]:
# Carregando os conjuntos de dados
train_data = pd.read_csv('train_data.csv')
validation_data = pd.read_csv('validation_data.csv')
test_data = pd.read_csv('test_data.csv')


In [4]:
# Separando features e target no conjunto de treinamento
X_train = train_data.drop('class', axis=1)
y_train = train_data['class']

# Separando features e target no conjunto de validação
X_val = validation_data.drop('class', axis=1)
y_val = validation_data['class']

# Conjunto de teste
X_test = test_data.drop('class', axis=1)
y_test = test_data['class']


In [None]:
# Definindo uma Função de Avaliação
def evaluate_model(model, X_train, y_train, X_val, y_val):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_val)
        if y_prob.ndim == 1 or y_prob.shape[1] == 2:
            y_prob = y_prob[:, -1]  # Para classes binárias
            roc_auc = roc_auc_score(y_val, y_prob)
        else:
            roc_auc = roc_auc_score(y_val, y_prob, multi_class='ovo', average='weighted')
    else:
        roc_auc = None
    return acc, f1, recall, roc_auc


In [None]:
# Definindo os Modelos
models = {
    'K-NN': KNeighborsClassifier(),
    'LVQ': GlvqModel(),
    'Árvore de Decisão': DecisionTreeClassifier(),
    'SVM': SVC(probability=True),
    'Random Forest': RandomForestClassifier(),
    'Rede Neural MLP': MLPClassifier(max_iter=500),
    'Comitê de Redes Neurais Artificiais': None,  # Será definido posteriormente
    'Comitê Heterogêneo (Stacking)': None,       # Será definido posteriormente
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier()
}


In [7]:
# Comitê de Redes Neurais Artificiais
# Definindo múltiplas redes neurais
nn1 = MLPClassifier(hidden_layer_sizes=(50,), activation='relu', solver='adam', max_iter=500, random_state=1)
nn2 = MLPClassifier(hidden_layer_sizes=(100,), activation='tanh', solver='sgd', max_iter=500, random_state=2)
nn3 = MLPClassifier(hidden_layer_sizes=(50, 50), activation='relu', solver='adam', max_iter=500, random_state=3)

# Criando o Comitê
committee_nn = VotingClassifier(estimators=[
    ('nn1', nn1),
    ('nn2', nn2),
    ('nn3', nn3)
], voting='soft')

models['Comitê de Redes Neurais Artificiais'] = committee_nn


In [8]:
# Comitê Heterogêneo (Stacking)
# Modelos base
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
    ('knn', KNeighborsClassifier())
]

# Modelo meta
from sklearn.linear_model import LogisticRegression
final_estimator = LogisticRegression()

# Criando o StackingClassifier
stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)

models['Comitê Heterogêneo (Stacking)'] = stacking_model


In [9]:
# Executando os Modelos Básicos e Registrando no MLflow
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        acc, f1, recall, roc_auc = evaluate_model(model, X_train, y_train, X_val, y_val)
        mlflow.log_param("model_type", model_name)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("recall", recall)
        if roc_auc is not None:
            mlflow.log_metric("roc_auc", roc_auc)
        # Salvar o modelo
        mlflow.sklearn.log_model(model, model_name)
        print(f"{model_name} - Acurácia: {acc}, F1-Score: {f1}, Recall: {recall}, ROC AUC: {roc_auc}")




K-NN - Acurácia: 0.8749487014129097, F1-Score: 0.8305980818922424, Recall: 0.8749487014129097, ROC AUC: 0.5054859295809839




LVQ - Acurácia: 0.5108166735064783, F1-Score: 0.5965374024166652, Recall: 0.5108166735064783, ROC AUC: None




Árvore de Decisão - Acurácia: 0.797443864688984, F1-Score: 0.8035671899673817, Recall: 0.797443864688984, ROC AUC: 0.5278021874946004




SVM - Acurácia: 0.8869672275312188, F1-Score: 0.8338362757298086, Recall: 0.8869672275312188, ROC AUC: 0.5397813588752223




Random Forest - Acurácia: 0.8869672275312188, F1-Score: 0.8349637410004834, Recall: 0.8869672275312188, ROC AUC: 0.6420513699747867




Rede Neural MLP - Acurácia: 0.8859705692677493, F1-Score: 0.8350063672423627, Recall: 0.8859705692677493, ROC AUC: 0.6406096025083315




Comitê de Redes Neurais Artificiais - Acurácia: 0.8867327197045202, F1-Score: 0.8345094391946535, Recall: 0.8867327197045202, ROC AUC: 0.6561677114848266




Comitê Heterogêneo (Stacking) - Acurácia: 0.8864982118778214, F1-Score: 0.8347251165452486, Recall: 0.8864982118778214, ROC AUC: 0.6438651799229256


Parameters: { "use_label_encoder" } are not used.



XGBoost - Acurácia: 0.8854429266576772, F1-Score: 0.8368679264132897, Recall: 0.8854429266576772, ROC AUC: 0.6455116187509411
[LightGBM] [Info] Number of positive: 7714, number of negative: 60513
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003814 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1852
[LightGBM] [Info] Number of data points in the train set: 68227, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113064 -> initscore=-2.059821
[LightGBM] [Info] Start training from score -2.059821




LightGBM - Acurácia: 0.8870258544878935, F1-Score: 0.835878963507557, Recall: 0.8870258544878935, ROC AUC: 0.6652721930265553


In [10]:
# Busca de Hiperparâmetros com Optuna
# Configurando o Callback do MLflow para o Optuna
mlflc = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name='accuracy')


  mlflc = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name='accuracy')


In [11]:
# K-NN
def objective_knn(trial):
    n_neighbors = trial.suggest_int('n_neighbors', 1, 30)
    weights = trial.suggest_categorical('weights', ['uniform', 'distance'])
    algorithm = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
    
    model = KNeighborsClassifier(n_neighbors=n_neighbors, weights=weights, algorithm=algorithm)
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    return acc


In [12]:
# Árvore de Decisão
def objective_decision_tree(trial):
    max_depth = trial.suggest_int('max_depth', 1, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    criterion = trial.suggest_categorical('criterion', ['gini', 'entropy'])
    
    model = DecisionTreeClassifier(max_depth=max_depth, min_samples_split=min_samples_split, criterion=criterion)
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy').mean()
    return acc


In [13]:
# SVM
def objective_svm(trial):
    C = trial.suggest_float('C', 0.1, 10.0, log=True)
    kernel = trial.suggest_categorical('kernel', ['linear', 'rbf'])
    gamma = trial.suggest_categorical('gamma', ['scale', 'auto'])
    
    model = SVC(C=C, kernel=kernel, gamma=gamma)
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [14]:
# Random Forest
def objective_random_forest(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 2, 20)
    min_samples_split = trial.suggest_int('min_samples_split', 2, 10)
    
    model = RandomForestClassifier(n_estimators=n_estimators, max_depth=max_depth, min_samples_split=min_samples_split)
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [15]:
# Rede Neural MLP
def objective_mlp(trial):
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50,50)])
    activation = trial.suggest_categorical('activation', ['tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['sgd', 'adam'])
    alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
    
    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, max_iter=500)
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [16]:
# Comitê de Redes Neurais Artificiais
def objective_committee_nn(trial):
    # Hiperparâmetros para as redes individuais
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50,50)])
    activation = trial.suggest_categorical('activation', ['tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['sgd', 'adam'])
    
    # Definindo as redes neurais
    nn1 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=500, random_state=1)
    nn2 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=500, random_state=2)
    nn3 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=500, random_state=3)
    
    # Comitê
    committee_nn = VotingClassifier(estimators=[
        ('nn1', nn1),
        ('nn2', nn2),
        ('nn3', nn3)
    ], voting='soft')
    
    acc = cross_val_score(committee_nn, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [28]:
# Comitê Heterogêneo (Stacking)
def objective_stacking(trial):
    # Hiperparâmetros do estimador final
    C = trial.suggest_loguniform('C', 1e-3, 1e3)
    penalty = trial.suggest_categorical('penalty', ['l2'])
    solver = trial.suggest_categorical('solver', ['lbfgs'])
    
    final_estimator = LogisticRegression(C=C, penalty=penalty, solver=solver)
    
    # Modelos base (podemos também ajustar seus hiperparâmetros se desejado)
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('svm', SVC(random_state=42)),
        ('knn', KNeighborsClassifier())
    ]
    
    stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)
    
    acc = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [18]:
# XGBoost
def objective_xgboost(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    
    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc

In [19]:
# LightGBM
def objective_lightgbm(trial):
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    num_leaves = trial.suggest_int('num_leaves', 31, 150)
    learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)
    
    model = LGBMClassifier(
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        subsample=subsample
    )
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [20]:
# Executando as Otimizações com Optuna
n_trials = 5  # Número de iterações

# Dicionário para armazenar os melhores modelos
best_models = {}


In [21]:
# K-NN
study_knn = optuna.create_study(direction='maximize') # , study_name='Modelagem', sampler=optuna.samplers.TPESampler(seed=123)
study_knn.optimize(objective_knn, n_trials=n_trials, callbacks=[mlflc])
best_knn = KNeighborsClassifier(**study_knn.best_params)
best_models['K-NN'] = best_knn


[I 2024-11-22 22:08:27,072] A new study created in memory with name: no-name-dd69eb3b-7ee3-4f61-9bcb-c7420ae17b74
[I 2024-11-22 22:08:59,685] Trial 0 finished with value: 0.8869069434978053 and parameters: {'n_neighbors': 14, 'weights': 'uniform', 'algorithm': 'kd_tree'}. Best is trial 0 with value: 0.8869069434978053.
2024/11/22 22:08:59 INFO mlflow.tracking.fluent: Experiment with name 'no-name-dd69eb3b-7ee3-4f61-9bcb-c7420ae17b74' does not exist. Creating a new experiment.
[I 2024-11-22 22:09:04,387] Trial 1 finished with value: 0.8866577711979835 and parameters: {'n_neighbors': 29, 'weights': 'distance', 'algorithm': 'auto'}. Best is trial 0 with value: 0.8869069434978053.
[I 2024-11-22 22:09:09,046] Trial 2 finished with value: 0.8840341725632488 and parameters: {'n_neighbors': 11, 'weights': 'distance', 'algorithm': 'brute'}. Best is trial 0 with value: 0.8869069434978053.
[I 2024-11-22 22:09:34,516] Trial 3 finished with value: 0.8733785564711258 and parameters: {'n_neighbors': 

In [22]:
# Árvore de Decisão
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective_decision_tree, n_trials=n_trials, callbacks=[mlflc])
best_dt = DecisionTreeClassifier(**study_dt.best_params)
best_models['Árvore de Decisão'] = best_dt


[I 2024-11-22 22:11:16,617] A new study created in memory with name: no-name-e92d4281-bd5f-4f00-bc2a-054ed1bba9b7
[I 2024-11-22 22:11:19,815] Trial 0 finished with value: 0.8558927757498174 and parameters: {'max_depth': 18, 'min_samples_split': 2, 'criterion': 'gini'}. Best is trial 0 with value: 0.8558927757498174.
2024/11/22 22:11:19 INFO mlflow.tracking.fluent: Experiment with name 'no-name-e92d4281-bd5f-4f00-bc2a-054ed1bba9b7' does not exist. Creating a new experiment.
[I 2024-11-22 22:11:22,599] Trial 1 finished with value: 0.86819000269979 and parameters: {'max_depth': 15, 'min_samples_split': 6, 'criterion': 'entropy'}. Best is trial 1 with value: 0.86819000269979.
[I 2024-11-22 22:11:25,350] Trial 2 finished with value: 0.8684977884225036 and parameters: {'max_depth': 15, 'min_samples_split': 6, 'criterion': 'gini'}. Best is trial 2 with value: 0.8684977884225036.
[I 2024-11-22 22:11:27,336] Trial 3 finished with value: 0.8816304454245273 and parameters: {'max_depth': 10, 'min_

In [23]:
# SVM
study_svm = optuna.create_study(direction='maximize')
study_svm.optimize(objective_svm, n_trials=n_trials, callbacks=[mlflc])
best_svm = SVC(probability=True, **study_svm.best_params)
best_models['SVM'] = best_svm


[I 2024-11-22 22:11:30,035] A new study created in memory with name: no-name-e1cff4a1-f303-423a-bce7-a096b5fa74dd
[I 2024-11-22 22:36:54,288] Trial 0 finished with value: 0.8822899976138519 and parameters: {'C': 2.0972890313319916, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 0 with value: 0.8822899976138519.
2024/11/22 22:36:54 INFO mlflow.tracking.fluent: Experiment with name 'no-name-e1cff4a1-f303-423a-bce7-a096b5fa74dd' does not exist. Creating a new experiment.
[I 2024-11-22 23:59:15,159] Trial 1 finished with value: 0.883814350476561 and parameters: {'C': 0.2133654984424116, 'kernel': 'linear', 'gamma': 'scale'}. Best is trial 1 with value: 0.883814350476561.
[I 2024-11-23 00:19:16,308] Trial 2 finished with value: 0.8869362571910037 and parameters: {'C': 0.12453052603146088, 'kernel': 'rbf', 'gamma': 'auto'}. Best is trial 2 with value: 0.8869362571910037.
[I 2024-11-23 00:39:17,990] Trial 3 finished with value: 0.8869362571910037 and parameters: {'C': 0.36787350005040176, '

In [24]:
# Random Forest
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_random_forest, n_trials=n_trials, callbacks=[mlflc])
best_rf = RandomForestClassifier(**study_rf.best_params)
best_models['Random Forest'] = best_rf


[I 2024-11-23 00:45:11,246] A new study created in memory with name: no-name-aa1d3845-dad8-4525-aa4e-e57a1dfa9c17
[I 2024-11-23 00:45:22,302] Trial 0 finished with value: 0.8871707667365898 and parameters: {'n_estimators': 204, 'max_depth': 15, 'min_samples_split': 3}. Best is trial 0 with value: 0.8871707667365898.
2024/11/23 00:45:22 INFO mlflow.tracking.fluent: Experiment with name 'no-name-aa1d3845-dad8-4525-aa4e-e57a1dfa9c17' does not exist. Creating a new experiment.
[I 2024-11-23 00:45:27,701] Trial 1 finished with value: 0.8869362571910037 and parameters: {'n_estimators': 136, 'max_depth': 8, 'min_samples_split': 7}. Best is trial 0 with value: 0.8871707667365898.
[I 2024-11-23 00:45:30,398] Trial 2 finished with value: 0.8869362571910037 and parameters: {'n_estimators': 60, 'max_depth': 6, 'min_samples_split': 4}. Best is trial 0 with value: 0.8871707667365898.
[I 2024-11-23 00:45:36,271] Trial 3 finished with value: 0.8871267988820819 and parameters: {'n_estimators': 93, 'max

In [25]:
# Rede Neural MLP
study_mlp = optuna.create_study(direction='maximize')
study_mlp.optimize(objective_mlp, n_trials=n_trials, callbacks=[mlflc])
best_mlp = MLPClassifier(**study_mlp.best_params, max_iter=500)
best_models['Rede Neural MLP'] = best_mlp


[I 2024-11-23 00:45:53,126] A new study created in memory with name: no-name-ed2e9296-3a78-4e47-9675-eee1ebef3640
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-11-23 00:46:03,519] Trial 0 finished with value: 0.8869362571910037 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'tanh', 'solver': 'sgd', 'alpha': 1.5445127581631266e-05}. Best is trial 0 with value: 0.8869362571910037.
2024/11/23 00:46:03 INFO mlflow.tracking.fluent: Experiment with name 'no-name-ed2e9296-3a78-4e47-9675-eee1ebef3640' does not exist. Creating a new experiment.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-11-23 00:46:11,489] Trial 1 finished with value: 0.8853093209026632 and parameters: {'hidden_layer_sizes': (50,), 'activation': 'relu', 'solver': 'adam', 'alpha': 1.3326492563789275e-05}. Best is trial 0 with value: 0.8869362571910037.
  alpha = trial.suggest_loguniform('alpha', 1e-5, 1e-1)
[I 2024-11-23 00:46:28,367] Trial 2 finished with value: 0.886936257

In [26]:
# Comitê de Redes Neurais Artificiais
study_committee_nn = optuna.create_study(direction='maximize')
study_committee_nn.optimize(objective_committee_nn, n_trials=n_trials, callbacks=[mlflc])

# Usando os melhores hiperparâmetros
hidden_layer_sizes = study_committee_nn.best_params['hidden_layer_sizes']
activation = study_committee_nn.best_params['activation']
solver = study_committee_nn.best_params['solver']

# Definindo as redes neurais
nn1 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=500, random_state=1)
nn2 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=500, random_state=2)
nn3 = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, max_iter=500, random_state=3)

best_committee_nn = VotingClassifier(estimators=[
    ('nn1', nn1),
    ('nn2', nn2),
    ('nn3', nn3)
], voting='soft')

best_models['Comitê de Redes Neurais Artificiais'] = best_committee_nn


[I 2024-11-23 00:46:58,379] A new study created in memory with name: no-name-8be35d54-a96e-4810-836c-575da9691cfc
[I 2024-11-23 00:47:36,948] Trial 0 finished with value: 0.8869509145746607 and parameters: {'hidden_layer_sizes': (50,), 'activation': 'tanh', 'solver': 'adam'}. Best is trial 0 with value: 0.8869509145746607.
2024/11/23 00:47:36 INFO mlflow.tracking.fluent: Experiment with name 'no-name-8be35d54-a96e-4810-836c-575da9691cfc' does not exist. Creating a new experiment.
[I 2024-11-23 00:47:54,805] Trial 1 finished with value: 0.8862767082240264 and parameters: {'hidden_layer_sizes': (50,), 'activation': 'relu', 'solver': 'adam'}. Best is trial 0 with value: 0.8869509145746607.
[I 2024-11-23 00:48:50,708] Trial 2 finished with value: 0.8869216008814623 and parameters: {'hidden_layer_sizes': (50, 50), 'activation': 'tanh', 'solver': 'adam'}. Best is trial 0 with value: 0.8869509145746607.
[I 2024-11-23 00:49:28,343] Trial 3 finished with value: 0.8869362571910037 and parameters

In [29]:
# Comitê Heterogêneo (Stacking)
study_stacking = optuna.create_study(direction='maximize')
study_stacking.optimize(objective_stacking, n_trials=n_trials, callbacks=[mlflc])

# Modelo meta com os melhores hiperparâmetros
final_estimator = LogisticRegression(
    C=study_stacking.best_params['C'],
    penalty=study_stacking.best_params['penalty'],
    solver=study_stacking.best_params['solver']
)

# Modelos base (mantidos os mesmos)
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
    ('knn', KNeighborsClassifier())
]

best_stacking = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)

best_models['Comitê Heterogêneo (Stacking)'] = best_stacking


[I 2024-11-23 01:21:54,562] A new study created in memory with name: no-name-6af6dc78-0797-4a3d-984a-987abad46745
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
[I 2024-11-23 01:32:22,119] Trial 0 finished with value: 0.8869362571910037 and parameters: {'C': 0.004278975664230019, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 0 with value: 0.8869362571910037.
2024/11/23 01:32:22 INFO mlflow.tracking.fluent: Experiment with name 'no-name-6af6dc78-0797-4a3d-984a-987abad46745' does not exist. Creating a new experiment.
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
[I 2024-11-23 01:42:49,039] Trial 1 finished with value: 0.8870095419610571 and parameters: {'C': 0.37052388460658653, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 1 with value: 0.8870095419610571.
  C = trial.suggest_loguniform('C', 1e-3, 1e3)
[I 2024-11-23 01:53:05,145] Trial 2 finished with value: 0.8871267999561976 and parameters: {'C': 2.9545486572561135, 'penalty': 'l2', 'solver': 'lbfgs'}. Best is trial 2 with 

In [30]:
# XGBoost
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgboost, n_trials=n_trials, callbacks=[mlflc])
best_xgb = XGBClassifier(**study_xgb.best_params, use_label_encoder=False, eval_metric='logloss')
best_models['XGBoost'] = best_xgb


[I 2024-11-23 02:14:07,720] A new study created in memory with name: no-name-539beff9-06d1-4fa9-86fe-7286d842a7f3
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
[I 2024-11-23 02:14:11,765] Trial 0 finished with value: 0.8833746021139565 and parameters: {'n_estimators': 221, 'max_depth': 7, 'learning_rate': 0.18607460441958731, 'subsample': 0.7424712152172469}. Best is trial 0 with value: 0.8833746021139565.
2024/11/23 02:14:11 INFO mlflow.tracking.fluent: Experiment with name 'no-name-539beff9-06d1-4fa9-86fe-7286d842a7f3' does not exist. Creating a new experiment.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
[I 2024-11-23 02:14:17,216] Trial 1 finished with value: 0.8850454998121103 and parameters: {'n_estimators': 138, 'max_depth': 12, 'learning_rate': 0.1368135740832945, 'subsample': 0.8357521356164341}. Best is trial 1 with value: 0.8850454998121103.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
[I 2024-11-23

In [31]:
# LightGBM
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lightgbm, n_trials=n_trials, callbacks=[mlflc])
best_lgbm = LGBMClassifier(**study_lgbm.best_params)
best_models['LightGBM'] = best_lgbm


[I 2024-11-23 02:14:38,695] A new study created in memory with name: no-name-2a2c3ffa-ad17-4cb5-869b-0c2e95540871
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
[I 2024-11-23 02:14:40,367] Trial 0 finished with value: 0.8865405153510745 and parameters: {'n_estimators': 223, 'num_leaves': 45, 'learning_rate': 0.0925386820994413, 'subsample': 0.8236354621291435}. Best is trial 0 with value: 0.8865405153510745.
2024/11/23 02:14:40 INFO mlflow.tracking.fluent: Experiment with name 'no-name-2a2c3ffa-ad17-4cb5-869b-0c2e95540871' does not exist. Creating a new experiment.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
[I 2024-11-23 02:14:42,056] Trial 1 finished with value: 0.8862766899640585 and parameters: {'n_estimators': 99, 'num_leaves': 140, 'learning_rate': 0.10132320465083965, 'subsample': 0.7502716813083699}. Best is trial 0 with value: 0.8865405153510745.
  learning_rate = trial.suggest_loguniform('learning_rate', 0.01, 0.3)
[I 2024-11

In [32]:
# Avaliação dos Melhores Modelos e Registro no MLflow
for model_name, model in best_models.items():
    with mlflow.start_run(run_name=f"{model_name} - Optuna HPO"):
        acc, f1, recall, roc_auc = evaluate_model(model, X_train, y_train, X_val, y_val)
        # Registrando os melhores hiperparâmetros
        if model_name == 'K-NN':
            mlflow.log_params(study_knn.best_params)
        elif model_name == 'Árvore de Decisão':
            mlflow.log_params(study_dt.best_params)
        elif model_name == 'SVM':
            mlflow.log_params(study_svm.best_params)
        elif model_name == 'Random Forest':
            mlflow.log_params(study_rf.best_params)
        elif model_name == 'Rede Neural MLP':
            mlflow.log_params(study_mlp.best_params)
        elif model_name == 'Comitê de Redes Neurais Artificiais':
            mlflow.log_params(study_committee_nn.best_params)
        elif model_name == 'Comitê Heterogêneo (Stacking)':
            mlflow.log_params(study_stacking.best_params)
        elif model_name == 'XGBoost':
            mlflow.log_params(study_xgb.best_params)
        elif model_name == 'LightGBM':
            mlflow.log_params(study_lgbm.best_params)
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("recall", recall)
        if roc_auc is not None:
            mlflow.log_metric("roc_auc", roc_auc)
        # Salvar o modelo
        mlflow.sklearn.log_model(model, model_name)
        print(f"{model_name} com Optuna HPO - Acurácia: {acc}, F1-Score: {f1}, Recall: {recall}, ROC AUC: {roc_auc}")




K-NN com Optuna HPO - Acurácia: 0.8869086005745441, F1-Score: 0.8338070665274058, Recall: 0.8869086005745441, ROC AUC: 0.5123098681902718




Árvore de Decisão com Optuna HPO - Acurácia: 0.8820425631705459, F1-Score: 0.8397023897834482, Recall: 0.8820425631705459, ROC AUC: 0.624586508996352




SVM com Optuna HPO - Acurácia: 0.8869672275312188, F1-Score: 0.8338362757298086, Recall: 0.8869672275312188, ROC AUC: 0.5193812294488698




Random Forest com Optuna HPO - Acurácia: 0.8870844814445682, F1-Score: 0.8346872954458089, Recall: 0.8870844814445682, ROC AUC: 0.6617177337141249




Rede Neural MLP com Optuna HPO - Acurácia: 0.8869672275312188, F1-Score: 0.8338362757298086, Recall: 0.8869672275312188, ROC AUC: 0.5449184043505246




Comitê de Redes Neurais Artificiais com Optuna HPO - Acurácia: 0.8869672275312188, F1-Score: 0.8338362757298086, Recall: 0.8869672275312188, ROC AUC: 0.6512032481927896




Comitê Heterogêneo (Stacking) com Optuna HPO - Acurácia: 0.8864982118778214, F1-Score: 0.8347251165452486, Recall: 0.8864982118778214, ROC AUC: 0.6442010192290972


Parameters: { "use_label_encoder" } are not used.



XGBoost com Optuna HPO - Acurácia: 0.8870258544878935, F1-Score: 0.8371794797071358, Recall: 0.8870258544878935, ROC AUC: 0.6540133654170263
[LightGBM] [Info] Number of positive: 7714, number of negative: 60513
[LightGBM] [Info] Auto-choosing row-wise multi-threading, the overhead of testing was 0.003723 seconds.
You can set `force_row_wise=true` to remove the overhead.
And if memory is not enough, you can set `force_col_wise=true`.
[LightGBM] [Info] Total Bins 1852
[LightGBM] [Info] Number of data points in the train set: 68227, number of used features: 83
[LightGBM] [Info] [binary:BoostFromScore]: pavg=0.113064 -> initscore=-2.059821
[LightGBM] [Info] Start training from score -2.059821




LightGBM com Optuna HPO - Acurácia: 0.8869086005745441, F1-Score: 0.8339208346576724, Recall: 0.8869086005745441, ROC AUC: 0.669157177732085


In [None]:
# Selecionando o Melhor Modelo = Random Forest
best_model_name = 'Random Forest'
best_model = best_models[best_model_name]


In [34]:
# Treinando o Melhor Modelo no Conjunto Combinado e Avaliando no Conjunto de Teste
# Combinando os conjuntos de treinamento e validação
X_combined = pd.concat([X_train, X_val])
y_combined = pd.concat([y_train, y_val])

# Treinando o melhor modelo
best_model.fit(X_combined, y_combined)

# Avaliando no conjunto de teste
y_pred_test = best_model.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test, average='weighted')
recall_test = recall_score(y_test, y_pred_test, average='weighted')
y_prob_test = best_model.predict_proba(X_test)[:, 1] if hasattr(best_model, "predict_proba") else None
roc_auc_test = roc_auc_score(y_test, y_prob_test) if y_prob_test is not None else None

print(f"Desempenho no conjunto de teste - Acurácia: {acc_test}, F1-Score: {f1_test}, Recall: {recall_test}, ROC AUC: {roc_auc_test}")

# Registrando no MLflow
with mlflow.start_run(run_name="Melhor Modelo - Teste"):
    mlflow.log_param("model_type", best_model_name)
    mlflow.log_metric("accuracy_test", acc_test)
    mlflow.log_metric("f1_score_test", f1_test)
    mlflow.log_metric("recall_test", recall_test)
    if roc_auc_test is not None:
        mlflow.log_metric("roc_auc_test", roc_auc_test)
    mlflow.sklearn.log_model(best_model, "Melhor_Modelo")


Parameters: { "use_label_encoder" } are not used.



ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, the experimental DMatrix parameter`enable_categorical` must be set to `True`.  Invalid columns:race: object, gender: object, age: object, weight: object, payer_code: object, medical_specialty: object, diag_1: object, diag_2: object, diag_3: object, max_glu_serum: object, A1Cresult: object, metformin: object, repaglinide: object, nateglinide: object, chlorpropamide: object, glimepiride: object, acetohexamide: object, glipizide: object, glyburide: object, tolbutamide: object, pioglitazone: object, rosiglitazone: object, acarbose: object, miglitol: object, troglitazone: object, tolazamide: object, examide: object, citoglipton: object, insulin: object, glyburide.metformin: object, glipizide.metformin: object, glimepiride.pioglitazone: object, metformin.rosiglitazone: object, metformin.pioglitazone: object, change: object, diabetesMed: object

In [5]:
test_data.dtypes

admission_type_id            int64
discharge_disposition_id     int64
admission_source_id          int64
time_in_hospital             int64
num_lab_procedures           int64
num_procedures               int64
num_medications              int64
number_outpatient            int64
number_emergency             int64
number_inpatient             int64
number_diagnoses             int64
race                        object
gender                      object
age                         object
weight                      object
payer_code                  object
medical_specialty           object
diag_1                      object
diag_2                      object
diag_3                      object
max_glu_serum               object
A1Cresult                   object
metformin                   object
repaglinide                 object
nateglinide                 object
chlorpropamide              object
glimepiride                 object
acetohexamide               object
glipizide           

In [None]:
# Visualizações Adicionais
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Matriz de Confusão
conf_matrix = confusion_matrix(y_test, y_pred_test)
plt.figure(figsize=(8,6))
plt.title("Matriz de Confusão - Conjunto de Teste")
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues')
plt.xlabel('Predito')
plt.ylabel('Real')
plt.show()


In [None]:
# Curva ROC
if roc_auc_test is not None:
    fpr, tpr, thresholds = roc_curve(y_test, y_prob_test)
    plt.figure(figsize=(8,6))
    plt.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc_test:.2f})')
    plt.plot([0, 1], [0, 1], 'k--')  # Linha diagonal
    plt.xlabel('False Positive Rate')
    plt.ylabel('True Positive Rate')
    plt.title('Curva ROC - Conjunto de Teste')
    plt.legend(loc='lower right')
    plt.show()


In [None]:
# Comparação dos Modelos
results = []

for model_name, model in best_models.items():
    acc, f1, recall, roc_auc = evaluate_model(model, X_train, y_train, X_val, y_val)
    results.append({
        'Modelo': model_name,
        'Acurácia': acc,
        'F1-Score': f1,
        'Recall': recall,
        'ROC AUC': roc_auc
    })

results_df = pd.DataFrame(results)
print(results_df)


In [None]:
# Plotando as métricas
results_df_melted = results_df.melt(id_vars='Modelo', value_vars=['Acurácia', 'F1-Score', 'Recall', 'ROC AUC'], var_name='Métrica', value_name='Valor')

plt.figure(figsize=(12,8))
sns.barplot(x='Modelo', y='Valor', hue='Métrica', data=results_df_melted)
plt.xticks(rotation=45)
plt.title('Comparação das Métricas dos Modelos')
plt.legend(loc='lower right')
plt.show()


In [None]:
# Implementação da Metodologia de Janez Demsar
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# Supondo que você tenha as acurácias de cada modelo em diferentes folds
# Exemplo simplificado
model_scores = {
    'K-NN': [0.8, 0.82, 0.81, 0.79, 0.8],
    'Árvore de Decisão': [0.85, 0.83, 0.84, 0.86, 0.85],
    # Adicione os demais modelos
}

# Convertendo para DataFrame
scores_df = pd.DataFrame(model_scores)

# Teste de Friedman
stat, p = friedmanchisquare(*[scores_df[model] for model in scores_df.columns])
print(f'Estatística: {stat}, p-valor: {p}')

# Se p-valor < 0.05, há diferença significativa
if p < 0.05:
    print('Diferença significativa entre os modelos. Realizando teste de Nemenyi.')
    nemenyi = sp.posthoc_nemenyi_friedman(scores_df.values)
    print(nemenyi)
else:
    print('Não há diferença significativa entre os modelos.')
