In [None]:
# Importing the required libraries
import pandas as pd
import numpy as np

# Models
from sklearn.base import clone
from sklearn.neighbors import KNeighborsClassifier, NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Validation and metrics
from sklearn.model_selection import train_test_split, cross_val_score, KFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, roc_auc_score, roc_curve
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline

# Additional libraries
import optuna
from optuna.integration.mlflow import MLflowCallback
import mlflow
import mlflow.sklearn
import matplotlib.pyplot as plt
import seaborn as sns


In [None]:
# Configure MLflow: select the experiment used for the runs logged by this notebook
mlflow.set_experiment(experiment_name="Modelagem de Classificação com Optuna")


In [None]:
# Load the three pre-split datasets (train / validation / test) from CSV files
train_data, validation_data, test_data = (
    pd.read_csv(csv_path)
    for csv_path in ('train_data.csv', 'validation_data.csv', 'test_data.csv')
)


In [None]:
# Split every dataset into its feature matrix (all columns except 'class')
# and its target vector (the 'class' column)
X_train, y_train = train_data.drop(columns='class'), train_data['class']
X_val, y_val = validation_data.drop(columns='class'), validation_data['class']
X_test, y_test = test_data.drop(columns='class'), test_data['class']


In [None]:
# Definindo uma Função de Avaliação
def evaluate_model(model, X_train, y_train, X_val, y_val):
    """Fit ``model`` on the training split and score it on the validation split.

    Args:
        model: Estimator exposing ``fit`` and ``predict``. It is fitted in
            place. ``predict_proba`` is used for ROC AUC when available.
        X_train, y_train: Training features and labels.
        X_val, y_val: Validation features and labels.

    Returns:
        Tuple ``(accuracy, f1, recall, roc_auc)``. F1 and recall are
        weighted averages over the classes. ``roc_auc`` is ``None`` when the
        model does not expose ``predict_proba``.
    """
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)

    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred, average='weighted')
    recall = recall_score(y_val, y_pred, average='weighted')

    roc_auc = None
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_val)
        if y_prob.shape[1] == 2:
            # Binary target: score with the probability of the second class,
            # exactly as before.
            roc_auc = roc_auc_score(y_val, y_prob[:, 1])
        else:
            # Multiclass target: a single probability column is not a valid
            # score, so use the full matrix with one-vs-rest averaging
            # (weighted, consistent with the F1/recall averaging above).
            roc_auc = roc_auc_score(
                y_val,
                y_prob,
                multi_class='ovr',
                average='weighted',
                labels=getattr(model, "classes_", None),
            )
    return acc, f1, recall, roc_auc


In [None]:
# Baseline models, keyed by the display name used for MLflow runs and reports
# NOTE(review): the 'LVQ' entry is backed by NearestCentroid, which is not a
# Learning Vector Quantization implementation — confirm this stand-in is intended.
models = dict([
    ('K-NN', KNeighborsClassifier()),
    ('LVQ', NearestCentroid()),
    ('Árvore de Decisão', DecisionTreeClassifier()),
    ('SVM', SVC(probability=True)),
    ('Random Forest', RandomForestClassifier()),
    ('Rede Neural MLP', MLPClassifier(max_iter=500)),
    ('Comitê de Redes Neurais Artificiais', None),  # filled in by a later cell
    ('Comitê Heterogêneo (Stacking)', None),        # filled in by a later cell
    ('XGBoost', XGBClassifier(use_label_encoder=False, eval_metric='logloss')),
    ('LightGBM', LGBMClassifier()),
])


In [None]:
# Committee of artificial neural networks
# Three MLPs that differ in layout, activation, solver and seed
nn1 = MLPClassifier(
    hidden_layer_sizes=(50,), activation='relu', solver='adam',
    max_iter=500, random_state=1,
)
nn2 = MLPClassifier(
    hidden_layer_sizes=(100,), activation='tanh', solver='sgd',
    max_iter=500, random_state=2,
)
nn3 = MLPClassifier(
    hidden_layer_sizes=(50, 50), activation='relu', solver='adam',
    max_iter=500, random_state=3,
)

# Soft-voting ensemble over the three networks, stored in the baseline model table
committee_nn = VotingClassifier(
    estimators=list(zip(('nn1', 'nn2', 'nn3'), (nn1, nn2, nn3))),
    voting='soft',
)
models['Comitê de Redes Neurais Artificiais'] = committee_nn


In [None]:
# Heterogeneous committee (stacking)
from sklearn.linear_model import LogisticRegression

# Base learners whose predictions feed the meta-model
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
    ('knn', KNeighborsClassifier()),
]

# Meta-model that combines the base learners' outputs
final_estimator = LogisticRegression()

# Stacking ensemble (cv=5), stored in the baseline model table
stacking_model = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)
models['Comitê Heterogêneo (Stacking)'] = stacking_model


In [None]:
# Run every baseline model and record it in MLflow (one run per model)
for model_name, model in models.items():
    with mlflow.start_run(run_name=model_name):
        acc, f1, recall, roc_auc = evaluate_model(model, X_train, y_train, X_val, y_val)
        mlflow.log_param("model_type", model_name)

        # ROC AUC is logged only for models that produced one
        logged_metrics = {"accuracy": acc, "f1_score": f1, "recall": recall}
        if roc_auc is not None:
            logged_metrics["roc_auc"] = roc_auc
        for metric_name, metric_value in logged_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted model as a run artifact
        mlflow.sklearn.log_model(model, model_name)
        print(f"{model_name} - Acurácia: {acc}, F1-Score: {f1}, Recall: {recall}, ROC AUC: {roc_auc}")


In [None]:
# Hyperparameter search with Optuna
# MLflow callback shared by all studies; it records each trial's objective value
# under the metric name 'accuracy'.
# create_experiment=False keeps those runs in the experiment selected earlier with
# mlflow.set_experiment(). By default the callback switches the active experiment
# to one named after each study, which also redirects every later
# mlflow.start_run() call in this notebook away from the configured experiment.
mlflc = MLflowCallback(
    tracking_uri=mlflow.get_tracking_uri(),
    metric_name='accuracy',
    create_experiment=False,
)


In [None]:
# K-NN
def objective_knn(trial):
    """Optuna objective for K-NN: mean 5-fold CV accuracy on the training data.

    Searches the number of neighbours, the weighting scheme and the
    neighbour-search algorithm.
    """
    knn_params = {
        'n_neighbors': trial.suggest_int('n_neighbors', 1, 30),
        'weights': trial.suggest_categorical('weights', ['uniform', 'distance']),
        'algorithm': trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute']),
    }
    candidate = KNeighborsClassifier(**knn_params)
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    return fold_scores.mean()


In [None]:
# Árvore de Decisão
def objective_decision_tree(trial):
    """Optuna objective for a decision tree: mean 5-fold CV accuracy on the training data.

    Searches the maximum depth, the minimum samples required to split a node
    and the split criterion.
    """
    candidate = DecisionTreeClassifier(
        max_depth=trial.suggest_int('max_depth', 1, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
        criterion=trial.suggest_categorical('criterion', ['gini', 'entropy']),
    )
    fold_scores = cross_val_score(candidate, X_train, y_train, cv=5, scoring='accuracy')
    return fold_scores.mean()


In [None]:
# SVM
def objective_svm(trial):
    """Optuna objective for an SVM: mean 5-fold CV accuracy on the training data.

    Searches ``C`` on a log scale, the kernel and the ``gamma`` setting.
    """
    svm_params = dict(
        C=trial.suggest_float('C', 0.1, 10.0, log=True),
        kernel=trial.suggest_categorical('kernel', ['linear', 'rbf']),
        gamma=trial.suggest_categorical('gamma', ['scale', 'auto']),
    )
    candidate = SVC(**svm_params)
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()


In [None]:
# Random Forest
def objective_random_forest(trial):
    """Optuna objective for a random forest: mean 5-fold CV accuracy on the training data.

    Searches the number of trees, the maximum depth and the minimum samples
    required to split a node.
    """
    candidate = RandomForestClassifier(
        n_estimators=trial.suggest_int('n_estimators', 50, 300),
        max_depth=trial.suggest_int('max_depth', 2, 20),
        min_samples_split=trial.suggest_int('min_samples_split', 2, 10),
    )
    fold_scores = cross_val_score(
        candidate, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()


In [None]:
# Rede Neural MLP
def objective_mlp(trial):
    """Optuna objective for a single MLP: mean 5-fold CV accuracy on the training data.

    Searches the hidden-layer layout, the activation, the solver and the
    regularisation strength ``alpha`` (log scale).
    """
    hidden_layer_sizes = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50,50)])
    activation = trial.suggest_categorical('activation', ['tanh', 'relu'])
    solver = trial.suggest_categorical('solver', ['sgd', 'adam'])
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches the call style already used by objective_svm; the parameter name
    # and range are unchanged.
    alpha = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)

    model = MLPClassifier(hidden_layer_sizes=hidden_layer_sizes, activation=activation, solver=solver, alpha=alpha, max_iter=500)
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [None]:
# Comitê de Redes Neurais Artificiais
def objective_committee_nn(trial):
    """Optuna objective for the MLP committee: mean 5-fold CV accuracy on the training data.

    A single layout/activation/solver combination is sampled and shared by the
    three member networks, which differ only in their random seed (1, 2, 3).
    """
    # Hyperparameters common to every member network
    shared_params = dict(
        hidden_layer_sizes=trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50,50)]),
        activation=trial.suggest_categorical('activation', ['tanh', 'relu']),
        solver=trial.suggest_categorical('solver', ['sgd', 'adam']),
        max_iter=500,
    )

    # Members nn1..nn3, one per seed
    members = [
        (f'nn{seed}', MLPClassifier(random_state=seed, **shared_params))
        for seed in (1, 2, 3)
    ]

    # Soft-voting committee
    ensemble = VotingClassifier(estimators=members, voting='soft')

    fold_scores = cross_val_score(
        ensemble, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1
    )
    return fold_scores.mean()


In [None]:
# Comitê Heterogêneo (Stacking)
def objective_stacking(trial):
    """Optuna objective for the stacking ensemble: mean 5-fold CV accuracy on the training data.

    Only the logistic-regression meta-model is tuned (``C`` on a log scale;
    ``penalty`` and ``solver`` each have a single choice). The base learners
    use fixed settings.
    """
    # Hyperparameters of the final estimator.
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches the call style already used by objective_svm.
    C = trial.suggest_float('C', 1e-3, 1e3, log=True)
    penalty = trial.suggest_categorical('penalty', ['l2'])
    solver = trial.suggest_categorical('solver', ['lbfgs'])

    final_estimator = LogisticRegression(C=C, penalty=penalty, solver=solver)

    # Base models (their hyperparameters could be tuned here as well if desired)
    estimators = [
        ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
        ('svm', SVC(probability=True, random_state=42)),
        ('knn', KNeighborsClassifier())
    ]

    stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)

    acc = cross_val_score(stacking_model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [None]:
# XGBoost
def objective_xgboost(trial):
    """Optuna objective for XGBoost: mean 5-fold CV accuracy on the training data.

    Searches the number of trees, the maximum depth, the learning rate
    (log scale) and the row subsampling ratio.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    max_depth = trial.suggest_int('max_depth', 3, 15)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches the call style already used by objective_svm.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    subsample = trial.suggest_float('subsample', 0.5, 1.0)

    model = XGBClassifier(
        n_estimators=n_estimators,
        max_depth=max_depth,
        learning_rate=learning_rate,
        subsample=subsample,
        use_label_encoder=False,
        eval_metric='logloss'
    )
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc

In [None]:
# LightGBM
def objective_lightgbm(trial):
    """Optuna objective for LightGBM: mean 5-fold CV accuracy on the training data.

    Searches the number of trees, the number of leaves, the learning rate
    (log scale) and the row subsampling ratio.
    """
    n_estimators = trial.suggest_int('n_estimators', 50, 300)
    num_leaves = trial.suggest_int('num_leaves', 31, 150)
    # suggest_float(..., log=True) replaces the deprecated suggest_loguniform and
    # matches the call style already used by objective_svm.
    learning_rate = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
    # NOTE(review): LightGBM applies `subsample` only when `subsample_freq` > 0;
    # it is not set here, so this parameter is likely inert — confirm, and set
    # `subsample_freq` both here and where the best model is rebuilt if row
    # subsampling is intended.
    subsample = trial.suggest_float('subsample', 0.5, 1.0)

    model = LGBMClassifier(
        n_estimators=n_estimators,
        num_leaves=num_leaves,
        learning_rate=learning_rate,
        subsample=subsample
    )
    acc = cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy', n_jobs=-1).mean()
    return acc


In [None]:
# Running the Optuna optimisations
# Number of trials each study runs
n_trials = 5

# Best estimator found for each model, keyed by display name
best_models = dict()


In [None]:
# K-NN: maximise the CV accuracy, then rebuild an estimator from the best parameters.
# (For a reproducible search, create_study also accepts e.g.
#  study_name='Modelagem' and sampler=optuna.samplers.TPESampler(seed=123).)
study_knn = optuna.create_study(direction='maximize')
study_knn.optimize(objective_knn, n_trials=n_trials, callbacks=[mlflc])
best_models['K-NN'] = best_knn = KNeighborsClassifier(**study_knn.best_params)


In [None]:
# Decision tree: maximise the CV accuracy, then rebuild an estimator from the best parameters
study_dt = optuna.create_study(direction='maximize')
study_dt.optimize(objective_decision_tree, n_trials=n_trials, callbacks=[mlflc])
best_models['Árvore de Decisão'] = best_dt = DecisionTreeClassifier(**study_dt.best_params)


In [None]:
# SVM: maximise the CV accuracy, then rebuild an estimator from the best parameters.
# probability=True is added so the rebuilt model exposes predict_proba.
study_svm = optuna.create_study(direction='maximize')
study_svm.optimize(objective_svm, n_trials=n_trials, callbacks=[mlflc])
best_models['SVM'] = best_svm = SVC(**study_svm.best_params, probability=True)


In [None]:
# Random forest: maximise the CV accuracy, then rebuild an estimator from the best parameters
study_rf = optuna.create_study(direction='maximize')
study_rf.optimize(objective_random_forest, n_trials=n_trials, callbacks=[mlflc])
best_models['Random Forest'] = best_rf = RandomForestClassifier(**study_rf.best_params)


In [None]:
# MLP: maximise the CV accuracy, then rebuild an estimator from the best parameters
# (max_iter=500 matches the value used inside the objective)
study_mlp = optuna.create_study(direction='maximize')
study_mlp.optimize(objective_mlp, n_trials=n_trials, callbacks=[mlflc])
best_models['Rede Neural MLP'] = best_mlp = MLPClassifier(max_iter=500, **study_mlp.best_params)


In [None]:
# Committee of artificial neural networks
study_committee_nn = optuna.create_study(direction='maximize')
study_committee_nn.optimize(objective_committee_nn, n_trials=n_trials, callbacks=[mlflc])

# Pull the winning shared hyperparameters out of the study
hidden_layer_sizes, activation, solver = (
    study_committee_nn.best_params[param_name]
    for param_name in ('hidden_layer_sizes', 'activation', 'solver')
)

# Rebuild the three member networks; they differ only in their random seed
nn1, nn2, nn3 = (
    MLPClassifier(
        hidden_layer_sizes=hidden_layer_sizes,
        activation=activation,
        solver=solver,
        max_iter=500,
        random_state=seed,
    )
    for seed in (1, 2, 3)
)

# Soft-voting committee built from the tuned members
best_committee_nn = VotingClassifier(
    estimators=[('nn1', nn1), ('nn2', nn2), ('nn3', nn3)],
    voting='soft',
)
best_models['Comitê de Redes Neurais Artificiais'] = best_committee_nn


In [None]:
# Heterogeneous committee (stacking)
study_stacking = optuna.create_study(direction='maximize')
study_stacking.optimize(objective_stacking, n_trials=n_trials, callbacks=[mlflc])

# Meta-model rebuilt from the best hyperparameters found by the study
final_estimator = LogisticRegression(
    **{
        param_name: study_stacking.best_params[param_name]
        for param_name in ('C', 'penalty', 'solver')
    }
)

# Base models (same fixed settings as in the objective)
estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
    ('knn', KNeighborsClassifier()),
]

best_stacking = StackingClassifier(
    estimators=estimators,
    final_estimator=final_estimator,
    cv=5,
)
best_models['Comitê Heterogêneo (Stacking)'] = best_stacking


In [None]:
# XGBoost: maximise the CV accuracy, then rebuild an estimator from the best parameters
# (the fixed keyword arguments match those used inside the objective)
study_xgb = optuna.create_study(direction='maximize')
study_xgb.optimize(objective_xgboost, n_trials=n_trials, callbacks=[mlflc])
best_models['XGBoost'] = best_xgb = XGBClassifier(
    use_label_encoder=False,
    eval_metric='logloss',
    **study_xgb.best_params,
)


In [None]:
# LightGBM: maximise the CV accuracy, then rebuild an estimator from the best parameters
study_lgbm = optuna.create_study(direction='maximize')
study_lgbm.optimize(objective_lightgbm, n_trials=n_trials, callbacks=[mlflc])
best_models['LightGBM'] = best_lgbm = LGBMClassifier(**study_lgbm.best_params)


In [None]:
# Evaluate the tuned models on the validation set and record each one in MLflow
# Lookup table: model display name -> the Optuna study that tuned it
tuned_studies = {
    'K-NN': study_knn,
    'Árvore de Decisão': study_dt,
    'SVM': study_svm,
    'Random Forest': study_rf,
    'Rede Neural MLP': study_mlp,
    'Comitê de Redes Neurais Artificiais': study_committee_nn,
    'Comitê Heterogêneo (Stacking)': study_stacking,
    'XGBoost': study_xgb,
    'LightGBM': study_lgbm,
}

for model_name, model in best_models.items():
    with mlflow.start_run(run_name=f"{model_name} - Optuna HPO"):
        acc, f1, recall, roc_auc = evaluate_model(model, X_train, y_train, X_val, y_val)

        # Log the best hyperparameters (models without a matching study log none)
        tuned_study = tuned_studies.get(model_name)
        if tuned_study is not None:
            mlflow.log_params(tuned_study.best_params)

        # ROC AUC is logged only for models that produced one
        logged_metrics = {"accuracy": acc, "f1_score": f1, "recall": recall}
        if roc_auc is not None:
            logged_metrics["roc_auc"] = roc_auc
        for metric_name, metric_value in logged_metrics.items():
            mlflow.log_metric(metric_name, metric_value)

        # Store the fitted model as a run artifact
        mlflow.sklearn.log_model(model, model_name)
        print(f"{model_name} com Optuna HPO - Acurácia: {acc}, F1-Score: {f1}, Recall: {recall}, ROC AUC: {roc_auc}")


In [None]:
# Selecting the best model
# Pick the winner by validation accuracy instead of assuming it is 'XGBoost'.
# The tuned-model evaluation loop already fitted every entry of best_models on
# the training split, so predicting on the validation set needs no refit.
val_accuracy = {
    candidate_name: accuracy_score(y_val, candidate.predict(X_val))
    for candidate_name, candidate in best_models.items()
}
# Ties resolve to the first model in insertion order
best_model_name = max(val_accuracy, key=val_accuracy.get)
best_model = best_models[best_model_name]
print(f"Melhor modelo na validação: {best_model_name} (Acurácia: {val_accuracy[best_model_name]})")


In [None]:
# Retrain the selected model on train + validation and evaluate it on the test set
X_combined = pd.concat((X_train, X_val), axis=0)
y_combined = pd.concat((y_train, y_val), axis=0)

best_model.fit(X_combined, y_combined)

# Test-set predictions and label-based metrics
y_pred_test = best_model.predict(X_test)
acc_test = accuracy_score(y_test, y_pred_test)
f1_test = f1_score(y_test, y_pred_test, average='weighted')
recall_test = recall_score(y_test, y_pred_test, average='weighted')

# ROC AUC needs class probabilities; both values stay None without predict_proba.
# NOTE(review): taking column 1 and calling roc_auc_score without `multi_class`
# assumes a binary target — confirm for this dataset.
if hasattr(best_model, "predict_proba"):
    y_prob_test = best_model.predict_proba(X_test)[:, 1]
    roc_auc_test = roc_auc_score(y_test, y_prob_test)
else:
    y_prob_test = None
    roc_auc_test = None

print(f"Desempenho no conjunto de teste - Acurácia: {acc_test}, F1-Score: {f1_test}, Recall: {recall_test}, ROC AUC: {roc_auc_test}")

# Record the test results and the final model in MLflow
with mlflow.start_run(run_name="Melhor Modelo - Teste"):
    mlflow.log_param("model_type", best_model_name)
    test_metrics = {
        "accuracy_test": acc_test,
        "f1_score_test": f1_test,
        "recall_test": recall_test,
    }
    if roc_auc_test is not None:
        test_metrics["roc_auc_test"] = roc_auc_test
    for metric_name, metric_value in test_metrics.items():
        mlflow.log_metric(metric_name, metric_value)
    mlflow.sklearn.log_model(best_model, "Melhor_Modelo")


In [None]:
# Additional visualisations
from sklearn.metrics import confusion_matrix
import seaborn as sns

# Confusion matrix of the test-set predictions, drawn as an annotated heatmap
conf_matrix = confusion_matrix(y_test, y_pred_test)
fig, ax = plt.subplots(figsize=(8, 6))
sns.heatmap(conf_matrix, annot=True, fmt='d', cmap='Blues', ax=ax)
ax.set_title("Matriz de Confusão - Conjunto de Teste")
ax.set_xlabel('Predito')
ax.set_ylabel('Real')
plt.show()


In [None]:
# ROC curve on the test set (drawn only when a ROC AUC value was computed)
if roc_auc_test is not None:
    fpr, tpr, thresholds = roc_curve(y_test, y_prob_test)
    fig, ax = plt.subplots(figsize=(8, 6))
    ax.plot(fpr, tpr, label=f'ROC curve (area = {roc_auc_test:.2f})')
    ax.plot([0, 1], [0, 1], 'k--')  # diagonal reference line
    ax.set_xlabel('False Positive Rate')
    ax.set_ylabel('True Positive Rate')
    ax.set_title('Curva ROC - Conjunto de Teste')
    ax.legend(loc='lower right')
    plt.show()


In [None]:
# Model comparison on the validation set
results = []

for model_name, model in best_models.items():
    # evaluate_model fits whatever estimator it receives. Pass an unfitted
    # clone so the objects stored in best_models keep their fitted state —
    # in particular the selected best model, which was already retrained on
    # train + validation and would otherwise be refitted on train only.
    acc, f1, recall, roc_auc = evaluate_model(clone(model), X_train, y_train, X_val, y_val)
    results.append({
        'Modelo': model_name,
        'Acurácia': acc,
        'F1-Score': f1,
        'Recall': recall,
        'ROC AUC': roc_auc
    })

# One row per model, one column per metric
results_df = pd.DataFrame(results)
print(results_df)


In [None]:
# Plot the metrics: reshape to long format (one row per model/metric pair)
# and draw a grouped bar chart
results_df_melted = pd.melt(
    results_df,
    id_vars='Modelo',
    value_vars=['Acurácia', 'F1-Score', 'Recall', 'ROC AUC'],
    var_name='Métrica',
    value_name='Valor',
)

fig, ax = plt.subplots(figsize=(12, 8))
sns.barplot(data=results_df_melted, x='Modelo', y='Valor', hue='Métrica', ax=ax)
plt.setp(ax.get_xticklabels(), rotation=45)  # tilt the model names
ax.set_title('Comparação das Métricas dos Modelos')
ax.legend(loc='lower right')
plt.show()


In [None]:
# Statistical comparison following Janez Demšar's methodology
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# Per-fold accuracies of every tuned model, replacing the hard-coded example
# scores. An integer `cv` yields deterministic, unshuffled folds, so all models
# are scored on the same splits and each fold is a matched block for the test.
# cross_val_score fits clones, so the objects in best_models are not refitted.
model_scores = {
    model_name: cross_val_score(model, X_train, y_train, cv=5, scoring='accuracy')
    for model_name, model in best_models.items()
}

# One column per model, one row per fold
scores_df = pd.DataFrame(model_scores)

if scores_df.shape[1] < 3:
    # scipy's friedmanchisquare raises ValueError with fewer than three groups
    print('O teste de Friedman requer pelo menos 3 modelos; comparação estatística não realizada.')
else:
    # Friedman test across the models
    stat, p = friedmanchisquare(*(scores_df[model_name] for model_name in scores_df.columns))
    print(f'Estatística: {stat}, p-valor: {p}')

    # A p-value below 0.05 is treated as a significant difference
    if p < 0.05:
        print('Diferença significativa entre os modelos. Realizando teste de Nemenyi.')
        nemenyi = sp.posthoc_nemenyi_friedman(scores_df.values)
        # The array input yields positional labels; restore the model names
        nemenyi.index = scores_df.columns
        nemenyi.columns = scores_df.columns
        print(nemenyi)
    else:
        print('Não há diferença significativa entre os modelos.')
