# Pós-Graduação Strictu Sensu em Ciências da Computação
----
**Universidade Federal de Pernambuco**

**Disciplina**: Aprendizado de Máquina

**Discentes**: Jose Adeljan Marinho da Silva

Matheus Hopper Jansen Costa

Heitor Leite Ramos

Jefferson Medeiros Norberto

Karoline Juliana Costa da Silva

**Docentes**: Leandro Maciel Almeida

Francisco de Assis Tenorio de Carvalho

----

# Sumário:

0. [Importação das bibliotecas](#importação-das-bibliotecas-utilizadas)
1. [Introdução](#introdução)
2. [Análise Exploratória dos Dados](#análise-exploratória-dos-dados)
3. [Preparação dos dados](#preparação-dos-dados)
4. [Modelagem e Otimização](#modelagem-e-otimização)
5. [Avaliação dos Modelos](#avaliação-dos-modelos)
6. [Análise de Custo-Benefício](#análise-de-custo-benefício)
7. [Teste de Estresse dos Modelos](#teste-de-estresse-dos-modelos)
8. [Discussão sobre Limitações e Futuras Melhorias](#discussão-sobre-limitações-e-futuras-melhorias)
9. [Conclusão](#conclusão)
10. [Referências](#referências)


# Importação das bibliotecas utilizadas

In [1]:
!pip install openml



In [2]:
import openml
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import re
import warnings
from sklearn.model_selection import train_test_split
from sklearn.neighbors import LocalOutlierFactor
from imblearn.under_sampling import ClusterCentroids
from collections import Counter
from imblearn.over_sampling import ADASYN

# Modelos
from sklearn.neighbors import KNeighborsClassifier
from sklearn.neighbors import NearestCentroid
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier, StackingClassifier, VotingClassifier
from sklearn.neural_network import MLPClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier

# Validação e métricas
from sklearn.model_selection import cross_val_score, StratifiedKFold
from sklearn.metrics import accuracy_score, f1_score, recall_score, confusion_matrix, classification_report, roc_auc_score, roc_curve
from sklearn.preprocessing import StandardScaler
from scipy.stats import friedmanchisquare
import scikit_posthocs as sp

# Bibliotecas adicionais
import optuna
from optuna.integration.mlflow import MLflowCallback
import mlflow
import mlflow.sklearn
import time
import sys

warnings.filterwarnings('ignore')

  from .autonotebook import tqdm as notebook_tqdm


# Introdução

----

# Análise Exploratória dos Dados

In [3]:
dataset = openml.datasets.get_dataset(45069)

# Carregar o dataset
X, y, _, _ = dataset.get_data(target=dataset.default_target_attribute)

# Mostrar as primeiras linhas do DataFrame
df = pd.concat([X, y], axis=1)
df_bruto = df.copy(deep=True)

----

# Preparação dos dados

In [4]:
# Transformação do target em binário
df['class'] = [1 if each=='<30' else 0 for each in df['class']]

In [5]:
# Transformação das colunas categóricas em string
categorical_cols = df.select_dtypes(include='category').columns
df[categorical_cols] = df[categorical_cols].astype('object')

In [6]:
df["race"].fillna(df["race"].mode()[0], inplace = True)

In [7]:
data = df.loc[~df.discharge_disposition_id.isin([11,18,19,20,21,7,25,26])]

In [8]:
diag_list = ['diag_1','diag_2','diag_3']

for col in diag_list:
    df[col].fillna('NaN', inplace=True)

In [9]:
def transformFunc(value):
    value = re.sub("V[0-9]*", "0", value) # V
    value = re.sub("E[0-9]*", "0", value) # E
    value = re.sub('NaN', "-1", value) # Nan
    return value

def transformCategory(value):
    if value>=390 and value<=459 or value==785:
        category = 'Circulatory'
    elif value>=460 and value<=519 or value==786:
        category = 'Respiratory'
    elif value>=520 and value<=579 or value==787:
        category = 'Digestive'
    elif value==250:
        category = 'Diabetes'
    elif value>=800 and value<=999:
        category = 'Injury'
    elif value>=710 and value<=739:
        category = 'Musculoskeletal'
    elif value>=580 and value<=629 or value==788:
        category = 'Genitourinary'
    elif value>=140 and value<=239 :
        category = 'Neoplasms'
    elif value==-1:
        category = 'NAN'
    else :
        category = 'Other'

    return category

In [10]:
for col in diag_list:
    df[col] = df[col].apply(transformFunc)
    df[col] = df[col].astype(float)

In [11]:
for col in diag_list:
    df[col] = df[col].apply(transformCategory)

In [12]:
numerical_columns = df.select_dtypes(include=['uint8', 'int64']).columns.tolist()
numerical_columns.remove('class')

In [13]:
clf = LocalOutlierFactor(n_neighbors = 2 , contamination = 0.1)
clf.fit_predict(df[numerical_columns])

array([1, 1, 1, ..., 1, 1, 1])

In [14]:
df_scores = clf.negative_outlier_factor_
df_scores[0:30]

array([-1.11237244e+00, -1.00000000e+00, -1.05618622e+00, -1.00000000e+00,
       -9.64101615e-01, -9.76153125e-01, -1.20628580e+00, -1.00000000e+00,
       -9.54124145e-01, -1.00000000e+00, -1.35838165e+00, -1.06769362e+00,
       -9.73606798e-01, -9.65428645e-01, -1.18383741e+00, -1.16313671e+00,
       -1.21086536e+00, -5.00000000e+09, -1.41421356e+00, -1.13567449e+00,
       -1.00000000e+00, -1.17157288e+00, -1.28656609e+00, -1.22474487e+00,
       -1.10355339e+00, -1.31392974e+00, -1.02160050e+00, -1.19841474e+00,
       -1.10102051e+00, -1.17157288e+00])

In [15]:
np.sort(df_scores)[0:30]

array([-3.31662479e+10, -2.64575131e+10, -2.23606798e+10, -2.23606798e+10,
       -2.00000000e+10, -1.73205081e+10, -1.73205081e+10, -1.73205081e+10,
       -1.73205081e+10, -1.73205081e+10, -1.73205081e+10, -1.73205081e+10,
       -1.73205081e+10, -1.73205081e+10, -1.73205081e+10, -1.49767620e+10,
       -1.41421356e+10, -1.41421356e+10, -1.41421356e+10, -1.41421356e+10,
       -1.41421356e+10, -1.41421356e+10, -1.41421356e+10, -1.41421356e+10,
       -1.41421356e+10, -1.41421356e+10, -1.41421356e+10, -1.41421356e+10,
       -1.41421356e+10, -1.41421356e+10])

In [16]:
threshold_value = np.sort(df_scores)[2]

In [17]:
outlier_tf = df_scores > threshold_value

In [18]:
df = df[df_scores > threshold_value]

In [19]:
# A1Cresult and max_glu_serum
df['A1Cresult'] = df['A1Cresult'].replace(['>7','>8','Norm','None'],[1,1,0,-99])
df['max_glu_serum'] = df['max_glu_serum'].replace(['>200','>300','Norm','None'],[1,1,0,-99])

In [20]:
# One hot Encoding Race and Id's
one_hot_data = pd.get_dummies(df, columns=['race'], prefix=["enc"], drop_first = True)

columns_ids = ['admission_type_id', 'discharge_disposition_id', 'admission_source_id']

one_hot_data[columns_ids] = one_hot_data[columns_ids].astype('str')
one_hot_data = pd.get_dummies(one_hot_data, columns=columns_ids)

In [21]:
df = one_hot_data.copy()

In [22]:
diag_cols = ['diag_1', 'diag_2', 'diag_3']

df = pd.get_dummies(df, columns=diag_cols, prefix=["encdiag_1", "encdiag_2", "encdiag_3"], drop_first=True)

In [23]:
columns_to_drop = ['encdiag_1_NAN', 'encdiag_2_NAN', 'encdiag_3_NAN']

df = df.drop(columns=columns_to_drop)

In [24]:
# eliminando colunas com grande número de valores ausentes
df = df.drop(['weight','payer_code','medical_specialty'], axis = 1)

In [25]:
# Removendo as linhas Unknown/Invalid da coluna gender
df = df.loc[~df.gender.isin(['Unknown/Invalid'])]

In [26]:
df = df.drop(['citoglipton', 'examide'], axis = 1)

In [27]:
keys = ['metformin', 'repaglinide', 'nateglinide', 'chlorpropamide', 'glimepiride',
        'glipizide', 'glyburide', 'pioglitazone', 'rosiglitazone', 'acarbose',
        'miglitol', 'insulin', 'glyburide.metformin', 'tolazamide', 'metformin.pioglitazone',
        'metformin.rosiglitazone', 'glimepiride.pioglitazone', 'glipizide.metformin',
        'troglitazone', 'tolbutamide', 'acetohexamide']
for col in keys:
    colname = str(col) + 'temp'
    df[colname] = df[col].apply(lambda x: 0 if (x == 'No' or x == 'Steady') else 1)
df['numchange'] = 0
for col in keys:
    colname = str(col) + 'temp'
    df['numchange'] = df['numchange'] + df[colname]
    del df[colname]

df['numchange'].value_counts()

numchange
0    74056
1    26272
2     1318
3      108
4        5
Name: count, dtype: int64

In [28]:
df['change'] = df['change'].replace('Ch', 1)
df['change'] = df['change'].replace('No', 0)
df['gender'] = df['gender'].replace('Male', 1)
df['gender'] = df['gender'].replace('Female', 0)
df['diabetesMed'] = df['diabetesMed'].replace('Yes', 1)
df['diabetesMed'] = df['diabetesMed'].replace('No', 0)

for col in keys:
    df[col] = df[col].replace('No', 0)
    df[col] = df[col].replace('Steady', 1)
    df[col] = df[col].replace('Up', 1)
    df[col] = df[col].replace('Down', 1)

In [29]:
df['nummed'] = 0

for col in keys:
    df['nummed'] = df['nummed'] + df[col]
df['nummed'].value_counts()

nummed
1    47312
0    23401
2    21871
3     7777
4     1335
5       58
6        5
Name: count, dtype: int64

In [30]:
# transformando os intervalos das idade em 1 - 10
for i in range(0,10):
    df['age'] = df['age'].replace('['+str(10*i)+'-'+str(10*(i+1))+')', i+1)
df['age'].value_counts()

age
8     26066
7     22480
6     17256
9     17196
5      9685
4      3775
10     2792
3      1657
2       691
1       161
Name: count, dtype: int64

In [31]:
df['age'] = df['age'].astype('int64')
print(df.age.value_counts())
# convert age categories to mid-point values
age_dict = {1:5, 2:15, 3:25, 4:35, 5:45, 6:55, 7:65, 8:75, 9:85, 10:95}
df['age'] = df.age.map(age_dict)
print(df.age.value_counts())

age
8     26066
7     22480
6     17256
9     17196
5      9685
4      3775
10     2792
3      1657
2       691
1       161
Name: count, dtype: int64
age
75    26066
65    22480
55    17256
85    17196
45     9685
35     3775
95     2792
25     1657
15      691
5       161
Name: count, dtype: int64


In [32]:
interactionterms = [('num_medications','time_in_hospital'),
('num_medications','num_procedures'),
('time_in_hospital','num_lab_procedures'),
('num_medications','num_lab_procedures'),
('num_medications','number_diagnoses'),
('age','number_diagnoses'),
('change','num_medications'),
('number_diagnoses','time_in_hospital'),
('num_medications','numchange')]

In [33]:
for inter in interactionterms:
    name = inter[0] + '|' + inter[1]
    df[name] = df[inter[0]] * df[inter[1]]

In [34]:
df[['num_medications','time_in_hospital', 'num_medications|time_in_hospital']].head()

Unnamed: 0,num_medications,time_in_hospital,num_medications|time_in_hospital
0,17,4,68
1,10,3,30
2,8,2,16
3,12,1,12
4,23,3,69


In [35]:
bool_cols = df.select_dtypes(include=['bool']).columns
df[bool_cols] = df[bool_cols].astype(int)

In [36]:
# Colunas numéricas
numeric_cols = [
    'time_in_hospital',
    'num_lab_procedures',
    'num_procedures',
    'num_medications',
    'number_outpatient',
    'number_emergency',
    'number_inpatient',
    'number_diagnoses',
    'nummed',
    'num_medications|time_in_hospital',
    'num_medications|num_procedures',
    'time_in_hospital|num_lab_procedures',
    'num_medications|num_lab_procedures',
    'num_medications|number_diagnoses',
    'age|number_diagnoses',
    'change|num_medications',
    'number_diagnoses|time_in_hospital',
    'num_medications|numchange'
]

scaler = StandardScaler()

df[numeric_cols] = scaler.fit_transform(df[numeric_cols])

print(df[numeric_cols].describe())

       time_in_hospital  num_lab_procedures  num_procedures  num_medications  \
count      1.017590e+05        1.017590e+05    1.017590e+05     1.017590e+05   
mean       5.586083e-18        8.281368e-17    1.927199e-17    -1.094872e-16   
std        1.000005e+00        1.000005e+00    1.000005e+00     1.000005e+00   
min       -1.137692e+00       -2.139670e+00   -7.853987e-01    -1.848262e+00   
25%       -8.026939e-01       -6.148364e-01   -7.853987e-01    -7.409253e-01   
50%       -1.326970e-01        4.592477e-02   -1.991639e-01    -1.257385e-01   
75%        5.372998e-01        7.066860e-01    3.870708e-01     4.894483e-01   
max        3.217287e+00        4.518770e+00    2.732010e+00     7.994727e+00   

       number_outpatient  number_emergency  number_inpatient  \
count       1.017590e+05      1.017590e+05      1.017590e+05   
mean        5.250918e-17     -6.703299e-18      2.122711e-17   
std         1.000005e+00      1.000005e+00      1.000005e+00   
min        -2.914724e-0

In [44]:
# Selecionar as features e a variável target
feature_set = [col for col in df.columns if col != 'class']
X = df[feature_set]
y = df['class']

# Divisão treino (60%) e teste (20%) e validação (20%)
X_train_val, X_test, y_train_val, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)

X_train, X_val, y_train, y_val = train_test_split(X_train_val, y_train_val, test_size=0.25, random_state=42, stratify=y_train_val)

# Exibir as distribuições das classes em cada conjunto
print("Distribuição no conjunto de treino:")
print(y_train.value_counts(normalize=True))

print("\nDistribuição no conjunto de validação:")
print(y_val.value_counts(normalize=True))

print("\nDistribuição no conjunto de teste:")
print(y_test.value_counts(normalize=True))

# Exibir os tamanhos de cada conjunto
print(f"\nTamanho do conjunto de treino: X_train={X_train.shape}, y_train={y_train.shape}")
print(f"Tamanho do conjunto de validação: X_val={X_val.shape}, y_val={y_val.shape}")
print(f"Tamanho do conjunto de teste: X_test={X_test.shape}, y_test={y_test.shape}")

print(f"Tamanho treino: {len(X_train) / len(X):.2%}")
print(f"Tamanho validação: {len(X_val) / len(X):.2%}")
print(f"Tamanho teste: {len(X_test) / len(X):.2%}")

# Salvar os conjuntos em arquivos CSV
# Concatenar X_train com y_train
train_data = pd.concat([X_train, y_train], axis=1)
train_data.to_csv("train_data.csv", index=False)

# Concatenar X_val com y_val
validation_data = pd.concat([X_val, y_val], axis=1)
validation_data.to_csv("validation_data.csv", index=False)

# Concatenar X_test com y_test
test_data = pd.concat([X_test, y_test], axis=1)
test_data.to_csv("test_data.csv", index=False)

print("\nArquivos CSV salvos com sucesso:")
print("- train_data.csv")
print("- validation_data.csv")
print("- test_data.csv")

Distribuição no conjunto de treino:
class
0    0.888396
1    0.111604
Name: proportion, dtype: float64

Distribuição no conjunto de validação:
class
0    0.888365
1    0.111635
Name: proportion, dtype: float64

Distribuição no conjunto de teste:
class
0    0.888414
1    0.111586
Name: proportion, dtype: float64

Tamanho do conjunto de treino: X_train=(61055, 125), y_train=(61055,)
Tamanho do conjunto de validação: X_val=(20352, 125), y_val=(20352,)
Tamanho do conjunto de teste: X_test=(20352, 125), y_test=(20352,)
Tamanho treino: 60.00%
Tamanho validação: 20.00%
Tamanho teste: 20.00%

Arquivos CSV salvos com sucesso:
- train_data.csv
- validation_data.csv
- test_data.csv


----

# Modelagem e Otimização

In [45]:
# Carregando os conjuntos de dados de treinamento, validação e teste
train_data = pd.read_csv('train_data.csv')
validation_data = pd.read_csv('validation_data.csv')
test_data = pd.read_csv('test_data.csv')

In [46]:
# Separando features e target no conjunto de treinamento
X_train = train_data.drop('class', axis=1)
y_train = train_data['class']

# Separando features e target no conjunto de validação
X_val = validation_data.drop('class', axis=1)
y_val = validation_data['class']

# Separando features e target no conjunto de teste
X_test = test_data.drop('class', axis=1)
y_test = test_data['class']

In [47]:
# Aplicando o ADASYN no conjunto de treinamento
adasyn = ADASYN(random_state=42)
X_train, y_train = adasyn.fit_resample(X_train, y_train)

print('Distribuição das classes no conjunto de treinamento após o ADASYN:')
print(Counter(y_train))

Distribuição das classes no conjunto de treinamento após o ADASYN:
Counter({0: 54241, 1: 53008})


In [48]:
# Função para avaliar o modelo utilizando validação cruzada estratificada com k=10
def evaluate_model(model, X_train, y_train, X_val, y_val, model_name):
    model.fit(X_train, y_train)
    y_pred = model.predict(X_val)
    acc = accuracy_score(y_val, y_pred)
    f1 = f1_score(y_val, y_pred)
    recall = recall_score(y_val, y_pred)
    # Cálculo da ACSA
    cm = confusion_matrix(y_val, y_pred)
    acsa = np.mean(np.diag(cm) / np.sum(cm, axis=1))
    print(f"{model_name} - Acurácia: {acc:.4f}, F1-Score: {f1:.4f}, Recall: {recall:.4f}, ACSA: {acsa:.4f}")
    
    # Plotando a matriz de confusão
    plt.figure(figsize=(6,4))
    sns.heatmap(cm, annot=True, fmt='d', cmap='Blues')
    plt.title(f'Matriz de Confusão - {model_name}')
    plt.xlabel('Predito')
    plt.ylabel('Real')
    plt.show()
    
    # Imprimindo o relatório de classificação
    print(f"Relatório de Classificação para {model_name}:\n", classification_report(y_val, y_pred))
    
    # Plotando a curva ROC, se possível
    if hasattr(model, "predict_proba"):
        y_prob = model.predict_proba(X_val)[:,1]
        fpr, tpr, thresholds = roc_curve(y_val, y_prob)
        roc_auc = roc_auc_score(y_val, y_prob)
        plt.figure(figsize=(6,4))
        plt.plot(fpr, tpr, label=f'AUC = {roc_auc:.2f}')
        plt.plot([0,1], [0,1], 'k--')
        plt.title(f'Curva ROC - {model_name}')
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.legend(loc='lower right')
        plt.show()
    else:
        print(f"{model_name} não suporta predict_proba, curva ROC não será plotada.")
    
    return acc, f1, recall, acsa

In [49]:
# Definindo os modelos
# Dicionário para armazenar os modelos
models = {
    'K-NN': KNeighborsClassifier(),
    'LVQ': NearestCentroid(),
    'Árvore de Decisão': DecisionTreeClassifier(),
    'SVM': SVC(probability=True),
    'Random Forest': RandomForestClassifier(),
    'Rede Neural MLP': MLPClassifier(max_iter=500),
    'XGBoost': XGBClassifier(use_label_encoder=False, eval_metric='logloss'),
    'LightGBM': LGBMClassifier()
}

# Comitê de Redes Neurais Artificiais
nn1 = MLPClassifier(hidden_layer_sizes=(50,), activation='relu', solver='adam', max_iter=500, random_state=1)
nn2 = MLPClassifier(hidden_layer_sizes=(100,), activation='tanh', solver='sgd', max_iter=500, random_state=2)
nn3 = MLPClassifier(hidden_layer_sizes=(50, 50), activation='relu', solver='adam', max_iter=500, random_state=3)

committee_nn = VotingClassifier(estimators=[
    ('nn1', nn1),
    ('nn2', nn2),
    ('nn3', nn3)
], voting='soft')

models['Comitê de Redes Neurais Artificiais'] = committee_nn

# Comitê Heterogêneo (Stacking)
from sklearn.linear_model import LogisticRegression

estimators = [
    ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
    ('svm', SVC(probability=True, random_state=42)),
    ('knn', KNeighborsClassifier())
]

final_estimator = LogisticRegression()

stacking_model = StackingClassifier(estimators=estimators, final_estimator=final_estimator, cv=5)

models['Comitê Heterogêneo (Stacking)'] = stacking_model


In [None]:
# Otimização de hiperparâmetros com Optuna
# Configurando o Callback do MLflow para o Optuna
mlflc = MLflowCallback(tracking_uri=mlflow.get_tracking_uri(), metric_name='accuracy')

# Definindo funções auxiliares para construção e otimização dos modelos
def build_classifier(model_name, params):
    if model_name == 'K-NN':
        classifier = KNeighborsClassifier(
            n_neighbors=params['n_neighbors'],
            weights=params['weights'],
            algorithm=params['algorithm']
        )
    elif model_name == 'LVQ':
        classifier = NearestCentroid(
            metric=params['metric'],
            shrink_threshold=params['shrink_threshold']
        )
    elif model_name == 'Árvore de Decisão':
        classifier = DecisionTreeClassifier(
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split'],
            criterion=params['criterion']
        )
    elif model_name == 'SVM':
        classifier = SVC(
            C=params['C'],
            kernel=params['kernel'],
            gamma=params['gamma']
        )
    elif model_name == 'Random Forest':
        classifier = RandomForestClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            min_samples_split=params['min_samples_split']
        )
    elif model_name == 'Rede Neural MLP':
        classifier = MLPClassifier(
            hidden_layer_sizes=params['hidden_layer_sizes'],
            activation=params['activation'],
            solver=params['solver'],
            alpha=params['alpha'],
            max_iter=500
        )
    elif model_name == 'Comitê de Redes Neurais Artificiais':
        nn1 = MLPClassifier(
            hidden_layer_sizes=params['hidden_layer_sizes'],
            activation=params['activation'],
            solver=params['solver'],
            max_iter=500,
            random_state=1
        )
        nn2 = MLPClassifier(
            hidden_layer_sizes=params['hidden_layer_sizes'],
            activation=params['activation'],
            solver=params['solver'],
            max_iter=500,
            random_state=2
        )
        nn3 = MLPClassifier(
            hidden_layer_sizes=params['hidden_layer_sizes'],
            activation=params['activation'],
            solver=params['solver'],
            max_iter=500,
            random_state=3
        )
        classifier = VotingClassifier(
            estimators=[
                ('nn1', nn1),
                ('nn2', nn2),
                ('nn3', nn3)
            ],
            voting='soft'
        )
    elif model_name == 'Comitê Heterogêneo (Stacking)':
        final_estimator = LogisticRegression(
            C=params['C'],
            penalty=params['penalty'],
            solver=params['solver']
        )
        estimators = [
            ('rf', RandomForestClassifier(n_estimators=100, random_state=42)),
            ('svm', SVC(random_state=42)),
            ('knn', KNeighborsClassifier())
        ]
        classifier = StackingClassifier(
            estimators=estimators,
            final_estimator=final_estimator,
            cv=5
        )
    elif model_name == 'XGBoost':
        classifier = XGBClassifier(
            n_estimators=params['n_estimators'],
            max_depth=params['max_depth'],
            learning_rate=params['learning_rate'],
            subsample=params['subsample'],
            use_label_encoder=False,
            eval_metric='logloss'
        )
    elif model_name == 'LightGBM':
        classifier = LGBMClassifier(
            n_estimators=params['n_estimators'],
            num_leaves=params['num_leaves'],
            learning_rate=params['learning_rate'],
            subsample=params['subsample']
        )
    else:
        raise ValueError(f"Modelo {model_name} não reconhecido.")
    return classifier

def objective_factory(model_name):
    def objective(trial):
        params = {}
        # Definir os hiperparâmetros para cada modelo
        if model_name == 'K-NN':
            params['n_neighbors'] = trial.suggest_int('n_neighbors', 1, 30)
            params['weights'] = trial.suggest_categorical('weights', ['uniform', 'distance'])
            params['algorithm'] = trial.suggest_categorical('algorithm', ['auto', 'ball_tree', 'kd_tree', 'brute'])
        
        elif model_name == 'LVQ':
            params['metric'] = trial.suggest_categorical('metric', ['euclidean', 'manhattan'])
            use_shrinkage = trial.suggest_categorical('use_shrinkage', [True, False])
            params['shrink_threshold'] = trial.suggest_float('shrink_threshold', 0.0, 1.0) if use_shrinkage else None
        
        elif model_name == 'Árvore de Decisão':
            params['max_depth'] = trial.suggest_int('max_depth', 1, 20)
            params['min_samples_split'] = trial.suggest_int('min_samples_split', 2, 10)
            params['criterion'] = trial.suggest_categorical('criterion', ['gini', 'entropy'])
        
        elif model_name == 'SVM':
            params['C'] = trial.suggest_float('C', 0.1, 10.0, log=True)
            params['kernel'] = trial.suggest_categorical('kernel', ['linear', 'rbf'])
            params['gamma'] = trial.suggest_categorical('gamma', ['scale', 'auto'])
        
        elif model_name == 'Random Forest':
            params['n_estimators'] = trial.suggest_int('n_estimators', 50, 300)
            params['max_depth'] = trial.suggest_int('max_depth', 2, 20)
            params['min_samples_split'] = trial.suggest_int('min_samples_split', 2, 10)
        
        elif model_name == 'Rede Neural MLP':
            params['hidden_layer_sizes'] = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50,50)])
            params['activation'] = trial.suggest_categorical('activation', ['tanh', 'relu'])
            params['solver'] = trial.suggest_categorical('solver', ['sgd', 'adam'])
            params['alpha'] = trial.suggest_float('alpha', 1e-5, 1e-1, log=True)
        
        elif model_name == 'Comitê de Redes Neurais Artificiais':
            params['hidden_layer_sizes'] = trial.suggest_categorical('hidden_layer_sizes', [(50,), (100,), (50,50)])
            params['activation'] = trial.suggest_categorical('activation', ['tanh', 'relu'])
            params['solver'] = trial.suggest_categorical('solver', ['sgd', 'adam'])
        
        elif model_name == 'Comitê Heterogêneo (Stacking)':
            params['C'] = trial.suggest_float('C', 0.1, 10.0, log=True)
            params['penalty'] = trial.suggest_categorical('penalty', ['l2'])
            params['solver'] = trial.suggest_categorical('solver', ['lbfgs'])
        
        elif model_name == 'XGBoost':
            params['n_estimators'] = trial.suggest_int('n_estimators', 50, 300)
            params['max_depth'] = trial.suggest_int('max_depth', 3, 15)
            params['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
            params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
        
        elif model_name == 'LightGBM':
            params['n_estimators'] = trial.suggest_int('n_estimators', 50, 300)
            params['num_leaves'] = trial.suggest_int('num_leaves', 31, 150)
            params['learning_rate'] = trial.suggest_float('learning_rate', 0.01, 0.3, log=True)
            params['subsample'] = trial.suggest_float('subsample', 0.5, 1.0)
        
        else:
            raise ValueError(f"Modelo {model_name} não reconhecido.")
        
        # Treinando no conjunto de treinamento balanceado
        # classifier.fit(X_train, y_train)
        # # Avaliando no conjunto de validação
        # y_pred_val = classifier.predict(X_val)
        # acc = accuracy_score(y_val, y_pred_val)
        classifier = build_classifier(model_name, params)
        
        acc = cross_val_score(classifier, X_train, y_train, scoring='accuracy', n_jobs=-1).mean()
        return acc
    return objective

# Executando as Otimizações com Optuna para SVM e Random Forest
n_trials = 1  # Número de iterações

best_models = {}
best_params = {}
cv_results = {}
# models_to_optimize = ['K-NN', 'Random Forest']

for model_name in models.keys():
    print(f"Otimização para o modelo: {model_name}")
    study = optuna.create_study(direction='maximize', study_name=model_name, sampler=optuna.samplers.TPESampler(seed=42))
    study.optimize(objective_factory(model_name), n_trials=n_trials, callbacks=[mlflc])
    
    # Armazenando os melhores hiperparâmetros
    best_params[model_name] = study.best_params
    
    # Criando o classificador com os melhores hiperparâmetros
    classifier = build_classifier(model_name, best_params[model_name])
    
    # Armazenando o classificador otimizado
    best_models[model_name] = classifier
    
    # Armazenando os scores dos folds
    acc_scores = cross_val_score(classifier, X_train, y_train, cv=10, scoring='accuracy')
    cv_results[model_name] = acc_scores


[I 2024-11-30 11:14:32,287] A new study created in memory with name: K-NN


Otimização para o modelo: K-NN


[I 2024-11-30 11:14:50,479] Trial 0 finished with value: 0.6984866944639428 and parameters: {'n_neighbors': 12, 'weights': 'uniform', 'algorithm': 'auto'}. Best is trial 0 with value: 0.6984866944639428.
[I 2024-11-30 11:15:08,977] A new study created in memory with name: LVQ


Otimização para o modelo: LVQ


[I 2024-11-30 11:15:11,655] Trial 0 finished with value: 0.6058612498819502 and parameters: {'metric': 'manhattan', 'use_shrinkage': True, 'shrink_threshold': 0.15601864044243652}. Best is trial 0 with value: 0.6058612498819502.
[I 2024-11-30 11:15:15,362] A new study created in memory with name: Árvore de Decisão


Otimização para o modelo: Árvore de Decisão


[I 2024-11-30 11:15:18,588] Trial 0 finished with value: 0.8017233767832863 and parameters: {'max_depth': 8, 'min_samples_split': 10, 'criterion': 'gini'}. Best is trial 0 with value: 0.8017233767832863.
[I 2024-11-30 11:15:27,162] A new study created in memory with name: SVM


Otimização para o modelo: SVM


In [None]:
# Avaliando os modelos otimizados
# results = []

# for model_name, model in best_models.items():
#     with mlflow.start_run(run_name=f"{model_name} - Optuna HPO"):
#         acc, f1, recall, acsa = evaluate_model(model, X_train, y_train, model_name)
#         mlflow.log_params(best_params[model_name])
#         mlflow.log_metric("accuracy", acc)
#         mlflow.log_metric("f1_score", f1)
#         mlflow.log_metric("recall", recall)
#         mlflow.log_metric("acsa", acsa)
#         # Salvar o modelo
#         mlflow.sklearn.log_model(model, model_name)
#         results.append({
#             'Modelo': model_name,
#             'Acurácia': acc,
#             'F1-Score': f1,
#             'Recall': recall,
#             'ACSA': acsa
#         })

# # Converte os resultados em DataFrame
# optimized_results_df = pd.DataFrame(results)

# # Exibindo os resultados dos modelos otimizados
# print(optimized_results_df)


K-NN - Acurácia Média: 0.7505, F1-Score Médio: 0.7975, Recall Médio: 0.9941, ACSA Média: 0.7533




Random Forest - Acurácia Média: 0.9067, F1-Score Médio: 0.8701, Recall Médio: 0.8581, ACSA Média: 0.9061




          Modelo  Acurácia  F1-Score    Recall      ACSA
0           K-NN  0.750477  0.797460  0.994097  0.753309
1  Random Forest  0.906664  0.870065  0.858084  0.906096


----

# Avaliação dos Modelos

In [None]:
# Avaliando todos os modelos
# final_results = []

# for model_name, model in models.items():
#     if model_name in best_models:
#         # Usar o modelo otimizado
#         model = best_models[model_name]
#     else:
#         # Usar o modelo padrão
#         model = model
#     acc, f1, recall, acsa = evaluate_model(model, X_train, y_train.values, model_name)
#     final_results.append({
#         'Modelo': model_name,
#         'Acurácia': acc,
#         'F1-Score': f1,
#         'Recall': recall,
#         'ACSA': acsa
#     })

# # Convertendo para DataFrame
# final_results_df = pd.DataFrame(final_results)

# # Exibindo os resultados finais
# print(final_results_df)

final_results = []

for model_name, model in models.items():
    with mlflow.start_run(run_name=f"{model_name} - Avaliação Final"):
        acc, f1, recall, acsa = evaluate_model(model, X_train, y_train, X_val, y_val, model_name)
        mlflow.log_params(best_params[model_name])
        mlflow.log_metric("accuracy", acc)
        mlflow.log_metric("f1_score", f1)
        mlflow.log_metric("recall", recall)
        mlflow.log_metric("acsa", acsa)
        mlflow.sklearn.log_model(model, model_name)
        final_results.append({
            'Modelo': model_name,
            'Acurácia': acc,
            'F1-Score': f1,
            'Recall': recall,
            'ACSA': acsa
        })

# Convertendo para DataFrame
final_results_df = pd.DataFrame(final_results)

# Exibindo os resultados finais
print(final_results_df)


K-NN - Acurácia Média: 0.7505, F1-Score Médio: 0.7975, Recall Médio: 0.9941, ACSA Média: 0.7533
LVQ - Acurácia Média: 0.5075, F1-Score Médio: 0.6171, Recall Médio: 0.8032, ACSA Média: 0.5109
Árvore de Decisão - Acurácia Média: 0.8518, F1-Score Médio: 0.8291, Recall Médio: 0.8499, ACSA Média: 0.8518


In [None]:
# Plotando as métricas
final_results_melted = final_results_df.melt(id_vars='Modelo', value_vars=['Acurácia', 'F1-Score', 'Recall', 'ACSA'], var_name='Métrica', value_name='Valor')

plt.figure(figsize=(12,8))
sns.barplot(x='Modelo', y='Valor', hue='Métrica', data=final_results_melted)
plt.xticks(rotation=45)
plt.title('Comparação das Métricas dos Modelos')
plt.legend(loc='lower right')
plt.show()


In [None]:
# # Usando os scores de validação cruzada
# model_scores = {}

# for model_name, model in models.items():
#     if model_name in best_models:
#         model = best_models[model_name]
#     else:
#         model = model
#     skf = StratifiedKFold(n_splits=10)
#     scores = cross_val_score(model, X_train, y_train.values, cv=skf, scoring='accuracy')
#     model_scores[model_name] = scores

# # Criando DataFrame com os scores
# scores_df = pd.DataFrame(model_scores)

# # Teste de Friedman
# stat, p = friedmanchisquare(*[scores_df[model] for model in scores_df.columns])
# print(f'Estatística de Friedman: {stat}, p-valor: {p}')

# # Se p-valor < 0.05, há diferença significativa
# if p < 0.05:
#     print('Diferença significativa entre os modelos. Realizando teste de Nemenyi.')
#     nemenyi = sp.posthoc_nemenyi_friedman(scores_df.values)
#     nemenyi_df = pd.DataFrame(nemenyi, index=scores_df.columns, columns=scores_df.columns)
#     print(nemenyi_df)
# else:
#     print('Não há diferença significativa entre os modelos.')

cv_results_df = pd.DataFrame(cv_results)

metricas = ['Acurácia', 'F1-Score', 'Recall', 'ACSA']

for metrica in metricas:
    print(f"\nAnálise Estatística para a Métrica: {metrica}")
    # Pivotando o DataFrame
    pivot_df = cv_results_df.pivot(index='Fold', columns='Modelo', values=metrica)
    
    # Removendo modelos que não possuem valores (caso haja)
    pivot_df = pivot_df.dropna(axis=1, how='any')
    
    # Aplicando o teste de Friedman
    stat, p = friedmanchisquare(*[pivot_df[model] for model in pivot_df.columns])
    print(f'Estatística de Friedman: {stat}, p-valor: {p}')
    
    # Se p-valor < 0.05, há diferença significativa
    if p < 0.05:
        print('Diferença significativa entre os modelos. Realizando teste de Nemenyi.')
        nemenyi = sp.posthoc_nemenyi_friedman(pivot_df.values)
        nemenyi_df = pd.DataFrame(nemenyi, index=pivot_df.columns, columns=pivot_df.columns)
        print(nemenyi_df)
    else:
        print(f'Não há diferença significativa entre os modelos para a métrica {metrica}.')


----

# Análise de Custo-Benefício

In [None]:
cost_benefit_results = []

for model_name, model in models.items():
    if model_name in best_models:
        model = best_models[model_name]
    else:
        model = model
    start_time = time.time()
    model.fit(X_train, y_train.values)
    end_time = time.time()
    training_time = end_time - start_time
    mem_usage = sys.getsizeof(model)
    cost_benefit_results.append({
        'Modelo': model_name,
        'Tempo de Treinamento (s)': training_time,
        'Uso de Memória (bytes)': mem_usage
    })

# Convertendo para DataFrame
cost_benefit_df = pd.DataFrame(cost_benefit_results)

# Exibindo os resultados
print(cost_benefit_df)


In [None]:
# Plotando o tempo de treinamento
plt.figure(figsize=(12,6))
sns.barplot(x='Modelo', y='Tempo de Treinamento (s)', data=cost_benefit_df)
plt.xticks(rotation=45)
plt.title('Tempo de Treinamento dos Modelos')
plt.show()

# Plotando o uso de memória
plt.figure(figsize=(12,6))
sns.barplot(x='Modelo', y='Uso de Memória (bytes)', data=cost_benefit_df)
plt.xticks(rotation=45)
plt.title('Uso de Memória dos Modelos')
plt.show()


----

# Teste de Estresse dos Modelos

In [None]:
# Utilizando o conjunto de teste para o teste de estresse
stress_test_results = []

for model_name, model in models.items():
    if model_name in best_models:
        model = best_models[model_name]
    else:
        model = model
    # Avaliando no conjunto de teste
    y_pred_test = model.predict(X_test)
    acc_test = accuracy_score(y_test, y_pred_test)
    f1_test = f1_score(y_test, y_pred_test)
    recall_test = recall_score(y_test, y_pred_test)
    # Cálculo da ACSA
    cm = confusion_matrix(y_test, y_pred_test)
    acsa_test = np.mean(np.diag(cm) / np.sum(cm, axis=1))
    stress_test_results.append({
        'Modelo': model_name,
        'Acurácia': acc_test,
        'F1-Score': f1_test,
        'Recall': recall_test,
        'ACSA': acsa_test
    })

# Convertendo para DataFrame
stress_test_df = pd.DataFrame(stress_test_results)

# Exibindo os resultados
print(stress_test_df)


----

# Discussão sobre Limitações e Futuras Melhorias

----

# Conclusão

----

# Referências