In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import LeaveOneGroupOut

In [2]:
def load_data(filepath):
    df = pd.read_csv(filepath, index_col=0)
    return df

In [3]:
SIGNAL_CATEGORIES = {
    'ECG': ['HR_std', 'NN50', 'pNN50', 'TINN', 'rmsHRV', 'LF', 'HF', 'LF_HF', 'sum_f', 'rel_f', 'LF_norm', 'HF_norm'],
    'PPG': ['PPG_HR_mean', 'PPG_HR_std', 'PPG_RMSSD', 'PPG_SDNN', 'PPG_LF', 'PPG_HF', 'PPG_LF_HF', 'PPG_num_beats', 
            'PPG_RiseTime', 'PPG_DecayTime', 'PPG_PAV_mean', 'PPG_PAV_std', 'PPG_RI'],
    'EDA': ['EDA_mean', 'EDA_std', 'EDA_min', 'EDA_max', 'EDA_range', 'EDA_slope', 'scl_mean', 'scl_std', 
            'scr_mean', 'scr_std', 'corr_scl_t', 'scr_count', 'scr_amp', 'scr_sum', 'scr_area'],
    'RESP': ['Resp_mean', 'Resp_std', 'Resp_I_mean', 'Resp_I_std', 'Resp_E_mean', 'Resp_E_std', 'Resp_IE_ratio', 
             'Resp_range', 'Resp_insp_vol', 'Resp_rate', 'Resp_duration'],
}
random = 13577

In [4]:
def calculate_metrics(y_true, y_pred, stage):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, average='weighted', zero_division=1)
    recall = recall_score(y_true, y_pred, average='weighted', zero_division=1)
    f1 = f1_score(y_true, y_pred, average='weighted', zero_division=1)

    metrics_table = pd.DataFrame({
        "Métrica": ["Accuracy", "Precision", "Recall", "F1 Score"],
        "Valor": [accuracy, precision, recall, f1],
        "Etapa": stage
    })
    
    print(f"{stage} Metrics")
    print(metrics_table)
    return metrics_table

In [5]:
def plot_confusion_matrix(y_true, y_pred, title):
    conf_mat = confusion_matrix(y_true, y_pred)
    print(f"{title} - Confusion Matrix")
    print(conf_mat)
    
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_mat, annot=True, cmap="Greens", xticklabels=set(y_true), yticklabels=set(y_true), fmt='g')
    plt.xlabel("Etiquetas predichas")
    plt.ylabel("Etiquetas verdaderas")
    plt.title(f'Matriz de confusión - {title}')
    plt.show()

In [None]:
def train_and_evaluate(X, y, model, param_grid, model_name, signal_name):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=random, stratify=y)
    
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=10, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f'[{signal_name} - {model_name}] Best Params: {grid_search.best_params_}')
    print(f'[{signal_name} - {model_name}] Accuracy: {acc:.4f}')
    print(classification_report(y_test, y_pred))
    return acc


In [4]:
def train_and_evaluate_loso(X, y, subjects, model, param_grid, model_name, signal_name):
    logo = LeaveOneGroupOut()
    accuracies = []

    for train_idx, test_idx in logo.split(X, y, groups=subjects):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Entrenar modelo con búsqueda de hiperparámetros
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=10, verbose=1)
        grid_search.fit(X_train, y_train)

        # Evaluar el mejor modelo
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)

        print(f'[{signal_name} - {model_name}] Best Params: {grid_search.best_params_}')
        print(f'[{signal_name} - {model_name}] Accuracy: {acc:.4f}')
        print(classification_report(y_test, y_pred))

    final_acc = np.mean(accuracies)
    print(f'[{signal_name} - {model_name}] Final LOSO Accuracy: {final_acc:.4f}')
    return final_acc

In [7]:
models = {
    "RandomForest": (RandomForestClassifier(class_weight='balanced'), {"n_estimators": [50, 100, 200], "max_depth": [10, 20, None]}),
    "DecisionTree": (DecisionTreeClassifier(class_weight='balanced'), {"max_depth": [5, 10, 20, None]}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]})
}

results = {}

In [8]:
file = r'C:\Users\IA Lab\gusgus\WESAD_tests\features_30_075\features.csv'

df = load_data(file)

In [9]:
# Entrenamiento por cada tipo de señal
for signal, features in tqdm(SIGNAL_CATEGORIES.items(), desc="Training by signal"):
    if all(f in df.columns for f in features):  # Verificar que todas las columnas existen
        X = df[features]
        y = df['label']
        subjects = df['subject']  # Se agrega el identificador de sujetos para LOSO
        
        for model_name, (model, param_grid) in models.items():
            print(f'Training {model_name} model for {signal}...')
            acc = train_and_evaluate(X, y, model, param_grid, model_name, signal)
            results[(signal, model_name)] = acc

# Entrenamiento con todas las señales combinadas
X_all = df.drop(columns=['label', 'subject'])  # También se excluye 'subject'
y_all = df['label']
subjects_all = df['subject']

for model_name, (model, param_grid) in tqdm(models.items(), desc="Training with ALL signals"):
    print(f'Training {model_name} model for ALL signals using LOSO...')
    acc = train_and_evaluate_loso(X_all, y_all, subjects_all, model, param_grid, model_name, "ALL")
    results[("ALL", model_name)] = acc

# Mostrar resultados finales
print("\nFinal Comparison:")
for (signal, model_name), acc in results.items():
    print(f'{signal} - {model_name}: {acc:.4f}')


Training by signal:   0%|          | 0/4 [00:00<?, ?it/s]

Training RandomForest model for ECG...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[ECG - RandomForest] Best Params: {'max_depth': None, 'n_estimators': 200}
[ECG - RandomForest] Accuracy: 0.9098
              precision    recall  f1-score   support

           1       0.89      0.95      0.92      4389
           2       0.93      0.90      0.91      3550
           3       0.93      0.85      0.89      2729

    accuracy                           0.91     10668
   macro avg       0.91      0.90      0.91     10668
weighted avg       0.91      0.91      0.91     10668

Training DecisionTree model for ECG...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[ECG - DecisionTree] Best Params: {'max_depth': None}
[ECG - DecisionTree] Accuracy: 0.7988
              precision    recall  f1-score   support

           1       0.82      0.82      0.82      4389
           2       0.80      0.80      0.80      3550
           3       0.76      0.76      0.76      2729

Training by signal:  25%|██▌       | 1/4 [03:33<10:40, 213.40s/it]

[ECG - KNN] Best Params: {'n_neighbors': 7, 'weights': 'distance'}
[ECG - KNN] Accuracy: 0.6316
              precision    recall  f1-score   support

           1       0.66      0.72      0.69      4389
           2       0.63      0.61      0.62      3550
           3       0.58      0.52      0.55      2729

    accuracy                           0.63     10668
   macro avg       0.62      0.62      0.62     10668
weighted avg       0.63      0.63      0.63     10668

Training RandomForest model for PPG...
Fitting 5 folds for each of 9 candidates, totalling 45 fits


Training by signal:  25%|██▌       | 1/4 [03:33<10:41, 213.69s/it]


ValueError: 
All the 45 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
45 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\model_selection\_validation.py", line 729, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\base.py", line 1152, in wrapper
    return fit_method(estimator, *args, **kwargs)
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\ensemble\_forest.py", line 348, in fit
    X, y = self._validate_data(
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\base.py", line 622, in _validate_data
    X, y = check_X_y(X, y, **check_params)
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\utils\validation.py", line 1146, in check_X_y
    X = check_array(
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\utils\validation.py", line 957, in check_array
    _assert_all_finite(
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\utils\validation.py", line 122, in _assert_all_finite
    _assert_all_finite_element_wise(
  File "c:\Users\IA Lab\anaconda3\envs\ML\lib\site-packages\sklearn\utils\validation.py", line 171, in _assert_all_finite_element_wise
    raise ValueError(msg_err)
ValueError: Input X contains NaN.
RandomForestClassifier does not accept missing values encoded as NaN natively. For supervised learning, you might want to consider sklearn.ensemble.HistGradientBoostingClassifier and Regressor which accept missing values encoded as NaNs natively. Alternatively, it is possible to preprocess the data, for instance by using an imputer transformer in a pipeline or drop samples with missing values. See https://scikit-learn.org/stable/modules/impute.html You can find a list of all estimators that handle NaN values at the following page: https://scikit-learn.org/stable/modules/impute.html#estimators-that-handle-nan-values


*RANDOM FOREST*

In [None]:
param_gridrRD = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

randomfor_model = RandomForestClassifier(random_state=random)

print("Sin balanceo")
train_and_evaluate(randomfor_model, param_gridrRD, X_train, y_train, X_test, y_test)

*ADA BOOST*

In [None]:
param_gridADA = {
    'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 58],
    'estimator__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30]
}

base_estimator = DecisionTreeClassifier(random_state=random)
adaboost_model = AdaBoostClassifier(estimator=base_estimator, random_state=random)

train_and_evaluate(adaboost_model, param_gridADA, X_train, y_train, X_test, y_test)

*KNN*

In [None]:
param_gridKNN = {
    'n_neighbors': [2, 3, 4, 5, 8, 10, 12, 15, 18, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree']
}

knn_model = KNeighborsClassifier()

train_and_evaluate(knn_model, param_gridKNN, X_train, y_train, X_test, y_test)

*DECISION TREE*

In [None]:
param_gridDT = {
    'criterion': ['gini', 'entropy'],  # Función para medir la calidad de una división
    'max_depth': [None, 3, 5, 10, 20],  # Profundidad máxima del árbol
    'min_samples_split': [2, 5, 10],  # Número mínimo de muestras para dividir un nodo
    'min_samples_leaf': [1, 2, 4, 8],  # Muestras mínimas en una hoja
    'max_features': ['sqrt', 'log2', None]  # Número de características a considerar en cada división
}

decision_tree_model = DecisionTreeClassifier(random_state=random)

train_and_evaluate(decision_tree_model, param_gridDT, X_train, y_train, X_test, y_test)

*LDA*

In [None]:
param_gridLDA = {
    'solver': ['lsqr', 'eigen'],  # Excluye 'svd'
    'shrinkage': ['auto', 0.1, 0.5, 1.0]  # Compatible con 'lsqr' y 'eigen'
}

lda_model = LinearDiscriminantAnalysis()

train_and_evaluate(lda_model, param_gridLDA, X_train, y_train, X_test, y_test)