<a href="https://colab.research.google.com/github/MatheusABomfim/Mest_MBL2_ML_HTLV/blob/main/MEST_HTLV%2BMBL2%2BCLINICAL_Hyperparameters_tuning.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# **Loading data and packages**

**Libraries import**

In [1]:
# Import necessary libraries and packages
import os
import warnings
import pandas as pd
import numpy as np


# sklearn imports
from sklearn.model_selection import (
    train_test_split, cross_val_score, LeaveOneOut, cross_validate,
    learning_curve
)

from sklearn.ensemble import (
    ExtraTreesClassifier, RandomForestClassifier,
    AdaBoostClassifier, GradientBoostingClassifier
)
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis, QuadraticDiscriminantAnalysis
from sklearn.metrics import (
    confusion_matrix, ConfusionMatrixDisplay, classification_report,
    recall_score, accuracy_score, roc_curve, auc, make_scorer,
    precision_score, f1_score
)
from sklearn.preprocessing import (
    LabelEncoder, MinMaxScaler, MaxAbsScaler, QuantileTransformer,
    RobustScaler, StandardScaler
)
from sklearn.datasets import make_classification
from sklearn.feature_selection import SelectKBest, f_classif
from sklearn.neural_network import MLPClassifier

# imbalanced-learn imports
from imblearn.metrics import geometric_mean_score
from imblearn.over_sampling import RandomOverSampler, ADASYN, SMOTENC
from imblearn.under_sampling import RandomUnderSampler
from imblearn.pipeline import Pipeline
from imblearn.combine import SMOTEENN

# joblib imports
from joblib import dump, load

# scikit-optimize imports
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer

# statsmodels imports
from statsmodels.stats.outliers_influence import variance_inflation_factor
from statsmodels.tools.tools import add_constant

# openpyxl imports
from openpyxl import Workbook
from openpyxl.utils.dataframe import dataframe_to_rows

# ipywidgets imports
from ipywidgets import interact, widgets

# Ignore warnings
warnings.filterwarnings('ignore')
warnings.simplefilter(action='ignore', category=FutureWarning)

from sklearn.model_selection import LeaveOneOut
from sklearn.preprocessing import MinMaxScaler
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler
from skopt import BayesSearchCV
from skopt.space import Integer, Real, Categorical
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import GradientBoostingClassifier, RandomForestClassifier, AdaBoostClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC, LinearSVC
from sklearn.neighbors import KNeighborsClassifier
from imblearn.metrics import geometric_mean_score
from sklearn.metrics import make_scorer
import warnings
import os
from joblib import dump
from imblearn.over_sampling import RandomOverSampler, ADASYN
from imblearn.under_sampling import RandomUnderSampler, TomekLinks

**Data import**

In [2]:
# Module-level state shared by the loading and balancing helpers below
df = pd.DataFrame()  # currently loaded dataset (empty until a file is loaded)
dataset_name = None  # file name of the currently loaded dataset
X = y = None  # features and target, assigned by load_and_process_data
balancing_method_selected = None  # name of the balancing method picked by the user

# List the CSV files found in a directory
def list_csv_files(directory):
    """Return the names of the entries in *directory* that end with '.csv'.

    Only the entry names are returned (not full paths), in the order given by
    ``os.listdir``.
    """
    csv_names = []
    for entry_name in os.listdir(directory):
        if entry_name.endswith('.csv'):
            csv_names.append(entry_name)
    return csv_names

# Load and preprocess the selected dataset
def load_and_process_data(selected_file):
    """Load a CSV from ``../datasets/feature_selection`` and publish it globally.

    On success the globals ``df``, ``dataset_name``, ``X`` and ``y`` are
    replaced together. On failure none of them is touched, so the previously
    loaded dataset (if any) stays consistent.

    Preprocessing applied when the columns exist:
      * 'Gênero': 'M' -> 0, 'F' -> 1.
      * 'Idade_clinica': min-max scaled to [0, 1] over the whole file.

    Args:
        selected_file: File name (not a path) of the CSV to load.

    Returns:
        The first rows of the processed DataFrame (``df.head()``), or ``None``
        when no file was selected or the file does not exist.

    Raises:
        KeyError: If the target column 'HAM/TSP' is missing from the file.
    """
    global df, dataset_name, X, y

    # An empty dropdown hands us None/''; os.path.join would raise on None.
    if not selected_file:
        print("Erro: nenhum arquivo foi selecionado.")
        return None

    file_path = os.path.join('..', 'datasets', 'feature_selection', selected_file)

    # Validate before touching any global so a failed load cannot leave
    # dataset_name pointing at a file that was never actually loaded.
    if not os.path.exists(file_path):
        print(f"Erro: O arquivo '{file_path}' não foi encontrado.")
        return None

    loaded = pd.read_csv(file_path)

    if 'HAM/TSP' not in loaded.columns:
        raise KeyError(f"Coluna alvo 'HAM/TSP' não encontrada em '{file_path}'.")

    # Encode 'Gênero' as numeric values
    if 'Gênero' in loaded.columns:
        loaded['Gênero'] = loaded['Gênero'].replace({'M': 0, 'F': 1})

    # Scale 'Idade_clinica' to the [0, 1] range
    if 'Idade_clinica' in loaded.columns:
        scaler = MinMaxScaler()
        loaded['Idade_clinica'] = scaler.fit_transform(loaded[['Idade_clinica']])

    # Everything succeeded: commit the new state in one go.
    df = loaded
    dataset_name = selected_file
    X = df.drop(columns=['HAM/TSP'])  # features
    y = df['HAM/TSP']  # target

    print(f"Data Loaded and Processed Successfully! Dataset: {dataset_name}")
    return df.head()

# Record the balancing method picked by the user
def set_balancing_method(method):
    """Store *method* in the global ``balancing_method_selected`` and echo it."""
    global balancing_method_selected
    balancing_method_selected = method
    confirmation = f"Método de balanceamento selecionado: {balancing_method_selected}"
    print(confirmation)

# Apply the currently selected balancing method
def apply_balancing(X, y):
    """Resample ``(X, y)`` with the method named by ``balancing_method_selected``.

    Args:
        X: Feature table. For 'SMOTENC' it must expose ``.columns``; every
            column except 'Idade_clinica' is treated as categorical.
        y: Target values aligned with ``X``.

    Returns:
        ``(X_res, y_res)`` as produced by the sampler's ``fit_resample``. The
        inputs are returned unchanged when no method is selected or the
        selected name is not recognised.
    """
    if balancing_method_selected is None:
        print("Nenhum método de balanceamento foi selecionado.")
        return X, y

    # Same seed the SMOTENC branch already used; the random samplers are now
    # seeded as well so repeated calls give the same resampled data.
    seed = 42

    if balancing_method_selected == 'Random Over Sampler':
        balancer = RandomOverSampler(random_state=seed)
    elif balancing_method_selected == 'SMOTENC':
        # Positional indices of the categorical columns
        categorical_features = [i for i, col in enumerate(X.columns) if col != 'Idade_clinica']
        balancer = SMOTENC(categorical_features=categorical_features, random_state=seed)
    elif balancing_method_selected == 'Random Under Sampler':
        balancer = RandomUnderSampler(random_state=seed)
    elif balancing_method_selected == 'Tomek Links':
        # TomekLinks is constructed without a seed, as before.
        balancer = TomekLinks()
    else:
        print(f"Método de balanceamento desconhecido: {balancing_method_selected}")
        return X, y

    # Run the chosen sampler
    X_res, y_res = balancer.fit_resample(X, y)
    return X_res, y_res

# Directory holding the candidate CSV datasets. Built with os.path.join so it
# resolves on any OS (the previous literal used Windows-only backslashes) and
# matches the path load_and_process_data reads from.
directory = os.path.join('..', 'datasets', 'feature_selection')

# CSV files available in that directory
csv_files = list_csv_files(directory)

# Dropdown to choose which dataset to load
file_selector = widgets.Dropdown(
    options=csv_files,
    description='Select File:'
)

# Dropdown to choose the balancing method.
# NOTE: the name `balancing_methods` is rebound to a dict of sampler
# instances in the hyperparameter-tuning cell.
balancing_methods = ['Random Over Sampler', 'SMOTENC', 'Random Under Sampler', 'Tomek Links']
method_selector = widgets.Dropdown(
    options=balancing_methods,
    description='Balancing Method:'
)

# Interactive callback bound to the file dropdown
@interact(selected_file=file_selector)
def load_and_process_selected_file(selected_file):
    """Load the chosen CSV and return whatever load_and_process_data returns."""
    preview = load_and_process_data(selected_file)
    return preview

# Interactive callback bound to the balancing-method dropdown
@interact(selected_method=method_selector)
def apply_selected_balancing_method(selected_method):
    """Record the chosen balancing method and print the resampled shapes.

    The resampled data is only used for this shape preview; it is not stored.
    """
    set_balancing_method(selected_method)

    # Without a loaded dataset the sampler would fail deep inside
    # fit_resample; report the real cause instead.
    if X is None or y is None:
        print("Nenhum dataset carregado; não é possível aplicar o balanceamento.")
        return

    X_res, y_res = apply_balancing(X, y)  # resample the loaded data
    print(f"Shape dos dados balanceados: {X_res.shape}, {y_res.shape}")


interactive(children=(Dropdown(description='Select File:', options=('df_Clinical_data_TL.csv',), value='df_Cli…

interactive(children=(Dropdown(description='Balancing Method:', options=('Random Over Sampler', 'SMOTENC', 'Ra…

# Hyperparameter tuning by Bayesian optimization

**Search for the best hyperparameters and save the resulting models**

In [3]:
# Seed passed to the estimators, samplers and search that accept one
RANDOM_STATE = 42

# Silence every warning raised from here on
warnings.filterwarnings('ignore')

# Settings shared by all model searches
common_settings = dict(
    cv=LeaveOneOut(),
    n_iter=10,
    n_jobs=-1,
    verbose=0,
)

# Bayesian search factory
def bayes_search(model, space):
    """Build an unfitted ``BayesSearchCV`` for *model* over *space*.

    Every entry of ``common_settings`` (``cv``, ``n_iter``, ``n_jobs``,
    ``verbose``) is forwarded to the search; previously ``n_jobs`` and
    ``verbose`` were declared there but silently dropped.

    Args:
        model: Estimator (here, an imblearn ``Pipeline``) to tune.
        space: Mapping of parameter name to skopt dimension.

    Returns:
        The configured ``BayesSearchCV`` instance; the caller must ``fit`` it.

    NOTE(review): with ``LeaveOneOut`` each test fold holds a single sample,
    so the per-fold G-mean is presumably 0 or 1 and the averaged score would
    behave like plain accuracy - confirm this is the intended metric.
    """
    return BayesSearchCV(
        model,
        space,
        random_state=RANDOM_STATE,
        scoring=make_scorer(geometric_mean_score),
        **common_settings,
    )

# Candidate models and their hyperparameter search spaces.
# Keys are prefixed with 'classifier__' because each model is tuned as the
# 'classifier' step of the pipeline built in the tuning loop.
# A 'space' of None means the model is fitted as-is, without a search.
model_space = {
    'DT': {
        'model': DecisionTreeClassifier(random_state=RANDOM_STATE),
        'space': {
            'classifier__max_depth': Integer(3, 5),
            'classifier__min_samples_split': Integer(5, 10),
            'classifier__min_samples_leaf': Integer(1, 5),
            'classifier__criterion': Categorical(['gini', 'entropy']),
            'classifier__ccp_alpha': Real(0.0, 0.1),
        }
    },
    'GB': {
        'model': GradientBoostingClassifier(random_state=RANDOM_STATE),
        'space': {
            'classifier__n_estimators': Integer(50, 500),
            'classifier__learning_rate': Real(0.01, 1.0, 'log-uniform'),
            'classifier__max_depth': Integer(1, 10),
            'classifier__min_samples_split': Integer(2, 20),
            'classifier__min_samples_leaf': Integer(1, 20),
            'classifier__subsample': Real(0.5, 1.0, 'log-uniform'),
            'classifier__loss': Categorical(['exponential', 'log_loss']),
        }
    },
    'LR': {
        'model': LogisticRegression(random_state=RANDOM_STATE),
        'space': {
            'classifier__C': Real(0.0001, 1000, 'log-uniform'),
            'classifier__max_iter': Integer(200, 2000),
            'classifier__solver': Categorical(['liblinear', 'sag', 'saga']),
            'classifier__class_weight': Categorical(['balanced', None]),
            'classifier__tol': Real(0.0001, 0.001),
        }
    },
    'MLP': {
        'model': MLPClassifier(random_state=RANDOM_STATE),
        'space': {
            'classifier__hidden_layer_sizes': Integer(2, 16),
            'classifier__activation': Categorical(['logistic', 'tanh', 'relu']),
            'classifier__learning_rate': Categorical(['constant', 'adaptive']),
            'classifier__learning_rate_init': Real(0.001, 0.1, 'log-uniform'),
            'classifier__max_iter': Integer(200, 2000),
            'classifier__solver': Categorical(['sgd', 'adam']),
            'classifier__momentum': Real(0.1, 0.9, 'log-uniform'),
        }
    },
    'NB': {
        'model': GaussianNB(),
        'space': None,  # no hyperparameters are tuned for GaussianNB here
    },
    'RF': {
        'model': RandomForestClassifier(random_state=RANDOM_STATE),
        'space': {
            'classifier__n_estimators': Integer(50, 500),
            'classifier__max_depth': Integer(3, 10),
            'classifier__min_samples_split': Integer(2, 10),
            'classifier__min_samples_leaf': Integer(1, 5),
            'classifier__criterion': Categorical(['gini', 'entropy']),
            'classifier__max_samples': Real(0.5, 1.0, 'log-uniform'),
            'classifier__class_weight': Categorical(['balanced', 'balanced_subsample']),
        }
    },
    'SVM': {
        'model': SVC(probability=True, random_state=RANDOM_STATE),
        'space': {
            'classifier__kernel': Categorical(['linear', 'poly', 'rbf', 'sigmoid']),
            # gamma and C span six orders of magnitude; a uniform prior would
            # draw ~99.9% of its samples above 1, so they are sampled on a
            # log scale like C for LR and LSVM.
            'classifier__gamma': Real(0.001, 1000, 'log-uniform'),
            'classifier__degree': Integer(2, 5),
            'classifier__coef0': Real(0.0001, 1, 'log-uniform'),
            'classifier__C': Real(0.001, 1000, 'log-uniform'),
            'classifier__tol': Real(0.00001, 0.1, 'log-uniform'),
            'classifier__max_iter': Integer(200, 2000),
            'classifier__class_weight': Categorical([None, 'balanced']),
        }
    },
    'KNN': {
        'model': KNeighborsClassifier(),
        'space': {
            'classifier__n_neighbors': Integer(1, 20),
            'classifier__weights': Categorical(['uniform', 'distance']),
            'classifier__algorithm': Categorical(['auto', 'ball_tree', 'kd_tree', 'brute']),
            'classifier__leaf_size': Integer(20, 50),
            'classifier__p': Integer(1, 2),
        }
    },
    'AB': {
        'model': AdaBoostClassifier(random_state=RANDOM_STATE),
        'space': {
            'classifier__n_estimators': Integer(50, 500),
            'classifier__learning_rate': Real(0.01, 1.0, 'log-uniform'),
        }
    },
    'LSVM': {
        'model': LinearSVC(random_state=RANDOM_STATE),
        'space': {
            'classifier__C': Real(0.0001, 1000, 'log-uniform'),
            'classifier__tol': Real(0.00001, 0.1, 'log-uniform'),
            'classifier__max_iter': Integer(200, 2000),
            'classifier__class_weight': Categorical([None, 'balanced']),
        }
    },
}


# Directory where the tuned models are saved. Built with os.path.join so the
# path resolves on any OS (the previous literal used Windows-only backslashes).
save_dir = os.path.join('..', 'models')

# Sampler instance for each selectable balancing method.
# 'SMOTENC' is a placeholder: it needs the categorical column indices of the
# loaded dataset, so it is built just before the pipelines are assembled.
balancing_methods = {
    'Random Over Sampler': RandomOverSampler(random_state=RANDOM_STATE),
    'SMOTENC': None,
    'Random Under Sampler': RandomUnderSampler(random_state=RANDOM_STATE),
    'Tomek Links': TomekLinks()
}

# Columns that SMOTENC must treat as continuous. The data-loading code names
# the age column 'Idade_clinica'; 'Age' is still accepted so datasets using
# the English name keep working. Previously only 'Age' was excluded, which
# flags every column as categorical when the dataset uses 'Idade_clinica'.
continuous_columns = {'Idade_clinica', 'Age'}

# SMOTENC depends only on the loaded dataset, not on the model, so it is
# built once here instead of once per model.
if balancing_method_selected == 'SMOTENC':
    categorical_columns = [col for col in X.columns if col not in continuous_columns]
    categorical_features = [X.columns.get_loc(col) for col in categorical_columns]
    balancing_methods['SMOTENC'] = SMOTENC(categorical_features=categorical_features, random_state=RANDOM_STATE)

# Make sure the output directory exists before any model is dumped.
os.makedirs(save_dir, exist_ok=True)

# Tune (or directly fit) every model in the search space and save the result
for model_name in model_space:
    model = model_space[model_name]['model']
    space = model_space[model_name]['space']

    # Echo the balancing method in use
    print(f"Método de balanceamento selecionado: {balancing_method_selected}")

    # Pipeline: scale -> balance -> classify
    pipeline = Pipeline([
        ('normalization', MinMaxScaler()),
        ('balancing', balancing_methods[balancing_method_selected]),
        ('classifier', model)
    ])

    if space:
        # Bayesian search over the model's hyperparameter space
        bs = bayes_search(pipeline, space)
        bs.fit(X, y)

        print(f'Model: {model_name}')
        print(f'Melhor valor de G-mean: {bs.best_score_}')
        print(f'Melhores parâmetros: {bs.best_params_}')

        fitted_model = bs.best_estimator_
    else:
        # No search space: fit the pipeline with the model's defaults
        pipeline.fit(X, y)
        print(f'Model: {model_name}')
        print('Modelo treinado diretamente sem busca Bayesiana.')

        fitted_model = pipeline

    # Save the fitted pipeline for this model
    model_path = os.path.join(save_dir, f'best_{model_name}_model.joblib')
    dump(fitted_model, model_path)


Método de balanceamento selecionado: Tomek Links
Model: DT
Melhor valor de G-mean: 0.8099173553719008
Melhores parâmetros: OrderedDict([('classifier__ccp_alpha', 0.08373883555532845), ('classifier__criterion', 'entropy'), ('classifier__max_depth', 4), ('classifier__min_samples_leaf', 5), ('classifier__min_samples_split', 9)])
Método de balanceamento selecionado: Tomek Links
Model: GB
Melhor valor de G-mean: 0.7933884297520661
Melhores parâmetros: OrderedDict([('classifier__learning_rate', 0.06610098295419149), ('classifier__loss', 'log_loss'), ('classifier__max_depth', 9), ('classifier__min_samples_leaf', 7), ('classifier__min_samples_split', 14), ('classifier__n_estimators', 236), ('classifier__subsample', 0.6376918441602301)])
Método de balanceamento selecionado: Tomek Links
Model: LR
Melhor valor de G-mean: 0.7851239669421488
Melhores parâmetros: OrderedDict([('classifier__C', 0.6365338060906855), ('classifier__class_weight', None), ('classifier__max_iter', 1094), ('classifier__solv