In [1]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
#import random
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.svm import SVC
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import train_test_split, RepeatedStratifiedKFold, GridSearchCV, cross_val_predict, cross_val_score
from sklearn.preprocessing import StandardScaler
from sklearn.inspection import permutation_importance
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from tqdm import tqdm
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import LeaveOneGroupOut

In [2]:
def load_data(filepath):
    df = pd.read_csv(filepath, index_col=0)
    df['Temp_slope'] = df['Temp_slope'].str.replace(r'(\d)(?=-\d)', r'\1e', regex=True)
    df['Temp_slope'] = df['Temp_slope'].str.replace(r'[^\d.e+-]', '', regex=True).astype(float)

    return df

In [3]:
SIGNAL_CATEGORIES = {
    'ECG_BVP': ['HR_std', 'NN50', 'pNN50', 'TINN', 'rmsHRV', 'LF', 'HF', 'LF_HF', 'sum_f', 'rel_f', 'LF_norm', 'HF_norm'],
    'EDA': ['EDA_mean', 'EDA_std', 'EDA_min', 'EDA_max', 'EDA_range', 'EDA_slope', 'scl_mean', 'scl_std', 'scr_mean', 'scr_std', 'corr_scl_t', 'scr_count', 'scr_amp', 'scr_sum', 'scr_area'],
    'EMG': ['EMG_mean', 'EMG_std', 'EMG_median', 'EMG_p10', 'EMG_p90', 'EMG_range', 'EMG_sum', 'EMG_f_peak', 'EMG_PSD_bands', 'EMG_peak_count', 'EMG_peak_amp_mean', 'EMG_peak_amp_std', 'EMG_peak_amp_sum', 'EMG_peak_amp_norm'],
    'RESP': ['Resp_mean', 'Resp_std', 'Resp_I_mean', 'Resp_I_std', 'Resp_E_mean', 'Resp_E_std', 'Resp_IE_ratio', 'Resp_range', 'Resp_insp_vol', 'Resp_rate', 'Resp_duration'],
    'TEMP': ['Temp_mean', 'Temp_std', 'Temp_min', 'Temp_max', 'Temp_range', 'Temp_slope']
}
random = 13577

In [None]:
def calculate_metrics(y_true, y_pred, stage):
    accuracy = accuracy_score(y_true, y_pred)
    precision = precision_score(y_true, y_pred, pos_label=1, average='weighted')
    recall = recall_score(y_true, y_pred, pos_label=1, average='weighted')
    f1 = f1_score(y_true, y_pred, pos_label=1, average='weighted')

    metrics_table = pd.DataFrame({
        "Métrica": ["Accuracy", "Precision", "Recall", "F1 Score"],
        "Valor": [accuracy, precision, recall, f1]
    })
    
    print(f"{stage} Metrics")
    print(metrics_table)
    return metrics_table

In [None]:
def plot_confusion_matrix(y_true, y_pred, stage):
    conf_mat = confusion_matrix(y_true, y_pred)
    print(f"{stage} Confusion Matrix")
    print(conf_mat)
    
    plt.figure(figsize=(8, 6))
    sns.heatmap(conf_mat, annot=True, cmap="BuPu", fmt="d")
    plt.xlabel("Etiquetas predichas")
    plt.ylabel("Etiquetas verdaderas")
    if stage == 'Validation':
        plt.title("Matriz de confusión para el conjunto de validación")
    else:
        plt.title("Matriz de confusión para el conjunto de prueba")
    plt.show()

In [27]:
def train_and_evaluate(X, y, model, param_grid, model_name, signal_name):
    
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42, stratify=y)
    
    grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=10, verbose=1)
    grid_search.fit(X_train, y_train)
    
    best_model = grid_search.best_estimator_
    y_pred = best_model.predict(X_test)
    acc = accuracy_score(y_test, y_pred)
    
    print(f'[{signal_name} - {model_name}] Best Params: {grid_search.best_params_}')
    print(f'[{signal_name} - {model_name}] Accuracy: {acc:.4f}')
    print(classification_report(y_test, y_pred))
    return acc


In [4]:
def train_and_evaluate_loso(X, y, subjects, model, param_grid, model_name, signal_name):
    logo = LeaveOneGroupOut()
    accuracies = []

    for train_idx, test_idx in logo.split(X, y, groups=subjects):
        X_train, X_test = X.iloc[train_idx], X.iloc[test_idx]
        y_train, y_test = y.iloc[train_idx], y.iloc[test_idx]

        # Entrenar modelo con búsqueda de hiperparámetros
        grid_search = GridSearchCV(model, param_grid, cv=5, n_jobs=10, verbose=1)
        grid_search.fit(X_train, y_train)

        # Evaluar el mejor modelo
        best_model = grid_search.best_estimator_
        y_pred = best_model.predict(X_test)
        acc = accuracy_score(y_test, y_pred)
        accuracies.append(acc)

        print(f'[{signal_name} - {model_name}] Best Params: {grid_search.best_params_}')
        print(f'[{signal_name} - {model_name}] Accuracy: {acc:.4f}')
        print(classification_report(y_test, y_pred))

    final_acc = np.mean(accuracies)
    print(f'[{signal_name} - {model_name}] Final LOSO Accuracy: {final_acc:.4f}')
    return final_acc

In [5]:
models = {
    "RandomForest": (RandomForestClassifier(class_weight='balanced'), {"n_estimators": [50, 100, 200], "max_depth": [10, 20, None]}),
    "DecisionTree": (DecisionTreeClassifier(class_weight='balanced'), {"max_depth": [5, 10, 20, None]}),
    "KNN": (KNeighborsClassifier(), {"n_neighbors": [3, 5, 7], "weights": ["uniform", "distance"]})
}

results = {}

In [6]:
file = r'C:\Users\IALAB\Downloads\WESAD_TEST\data_Complete_60_025\may14_feats4.csv'

df = load_data(file)

In [7]:
# Entrenamiento por cada tipo de señal
for signal, features in tqdm(SIGNAL_CATEGORIES.items(), desc="Training by signal"):
    if all(f in df.columns for f in features):  # Verificar que todas las columnas existen
        X = df[features]
        y = df['label']
        subjects = df['subject']  # Se agrega el identificador de sujetos para LOSO
        
        for model_name, (model, param_grid) in models.items():
            print(f'Training {model_name} model for {signal} using LOSO...')
            acc = train_and_evaluate_loso(X, y, subjects, model, param_grid, model_name, signal)
            results[(signal, model_name)] = acc

# Entrenamiento con todas las señales combinadas
X_all = df.drop(columns=['label', 'subject'])  # También se excluye 'subject'
y_all = df['label']
subjects_all = df['subject']

for model_name, (model, param_grid) in tqdm(models.items(), desc="Training with ALL signals"):
    print(f'Training {model_name} model for ALL signals using LOSO...')
    acc = train_and_evaluate_loso(X_all, y_all, subjects_all, model, param_grid, model_name, "ALL")
    results[("ALL", model_name)] = acc

# Mostrar resultados finales
print("\nFinal Comparison:")
for (signal, model_name), acc in results.items():
    print(f'{signal} - {model_name}: {acc:.4f}')


Training by signal:   0%|          | 0/5 [00:00<?, ?it/s]

Training RandomForest model for ECG_BVP using LOSO...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[ECG_BVP - RandomForest] Best Params: {'max_depth': None, 'n_estimators': 100}
[ECG_BVP - RandomForest] Accuracy: 0.4882
              precision    recall  f1-score   support

           1       0.57      0.82      0.67      4457
           2       0.67      0.01      0.01      2460
           3       0.19      0.26      0.22      1328

    accuracy                           0.49      8245
   macro avg       0.48      0.36      0.30      8245
weighted avg       0.54      0.49      0.40      8245

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[ECG_BVP - RandomForest] Best Params: {'max_depth': 20, 'n_estimators': 50}
[ECG_BVP - RandomForest] Accuracy: 0.6417
              precision    recall  f1-score   support

           1       0.65      0.93      0.76      4441
           2       1.00      0.48      0.65      2560
           3       0.01      0.01      0.01

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[ECG_BVP - RandomForest] Best Params: {'max_depth': 20, 'n_estimators': 100}
[ECG_BVP - RandomForest] Accuracy: 0.7760
              precision    recall  f1-score   support

           1       0.78      0.87      0.82      4601
           2       0.89      1.00      0.94      2720
           3       0.02      0.01      0.01      1352

    accuracy                           0.78      8673
   macro avg       0.56      0.63      0.59      8673
weighted avg       0.70      0.78      0.73      8673

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[ECG_BVP - RandomForest] Best Params: {'max_depth': 20, 'n_estimators': 200}
[ECG_BVP - RandomForest] Accuracy: 0.4163
              precision    recall  f1-score   support

           1       0.50      0.77      0.61      4601
           2       0.00      0.00      0.00      2536
           3       0.04      0.04      0.04      1528

    accuracy                           0.42      8665
   macro avg       0.18      0.27      0.22      

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[ECG_BVP - RandomForest] Best Params: {'max_depth': None, 'n_estimators': 50}
[ECG_BVP - RandomForest] Accuracy: 0.2195
              precision    recall  f1-score   support

           1       0.06      0.03      0.04      4581
           2       0.26      0.65      0.37      2624
           3       0.65      0.04      0.08      1488

    accuracy                           0.22      8693
   macro avg       0.33      0.24      0.16      8693
weighted avg       0.22      0.22      0.15      8693

Fitting 5 folds for each of 9 candidates, totalling 45 fits
[ECG_BVP - RandomForest] Best Params: {'max_depth': None, 'n_estimators': 100}
[ECG_BVP - RandomForest] Accuracy: 0.8434
              precision    recall  f1-score   support

           1       0.91      1.00      0.95      4601
           2       0.75      1.00      0.86      2692
           3       0.00      0.00      0.00      1352

    accuracy                           0.84      8645
   macro avg       0.55      0.67      0.60   

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[ECG_BVP - RandomForest] Best Params: {'max_depth': 20, 'n_estimators': 50}
[ECG_BVP - RandomForest] Accuracy: 0.7250
              precision    recall  f1-score   support

           1       0.74      0.81      0.78      4605
           2       0.74      0.97      0.84      2772
           3       0.01      0.00      0.00      1488

    accuracy                           0.72      8865
   macro avg       0.50      0.59      0.54      8865
weighted avg       0.62      0.72      0.67      8865

[ECG_BVP - RandomForest] Final LOSO Accuracy: 0.5343
Training DecisionTree model for ECG_BVP using LOSO...
Fitting 5 folds for each of 4 candidates, totalling 20 fits
[ECG_BVP - DecisionTree] Best Params: {'max_depth': None}
[ECG_BVP - DecisionTree] Accuracy: 0.2489
              precision    recall  f1-score   support

           1       0.37      0.15      0.22      4457
           2       0.89      0.07      0.12      2460
           3       0.19      0.90      0.32      1328

    accuracy    

  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


[ECG_BVP - DecisionTree] Best Params: {'max_depth': 20}
[ECG_BVP - DecisionTree] Accuracy: 0.6958
              precision    recall  f1-score   support

           1       0.81      0.74      0.77      4605
           2       0.65      0.98      0.78      2772
           3       0.11      0.04      0.06      1488

    accuracy                           0.70      8865
   macro avg       0.52      0.59      0.54      8865
weighted avg       0.64      0.70      0.66      8865

[ECG_BVP - DecisionTree] Final LOSO Accuracy: 0.5046
Training KNN model for ECG_BVP using LOSO...
Fitting 5 folds for each of 6 candidates, totalling 30 fits
[ECG_BVP - KNN] Best Params: {'n_neighbors': 7, 'weights': 'uniform'}
[ECG_BVP - KNN] Accuracy: 0.3478
              precision    recall  f1-score   support

           1       0.54      0.48      0.51      4457
           2       0.62      0.15      0.24      2460
           3       0.10      0.26      0.14      1328

    accuracy                           0.3

Training by signal:  20%|██        | 1/5 [1:14:18<4:57:13, 4458.41s/it]

[ECG_BVP - KNN] Best Params: {'n_neighbors': 7, 'weights': 'uniform'}
[ECG_BVP - KNN] Accuracy: 0.2759
              precision    recall  f1-score   support

           1       0.38      0.36      0.37      4605
           2       0.18      0.24      0.21      2772
           3       0.13      0.06      0.08      1488

    accuracy                           0.28      8865
   macro avg       0.23      0.22      0.22      8865
weighted avg       0.28      0.28      0.27      8865

[ECG_BVP - KNN] Final LOSO Accuracy: 0.3944
Training RandomForest model for EDA using LOSO...
Fitting 5 folds for each of 9 candidates, totalling 45 fits
[EDA - RandomForest] Best Params: {'max_depth': None, 'n_estimators': 100}
[EDA - RandomForest] Accuracy: 0.4412
              precision    recall  f1-score   support

           1       0.55      0.58      0.56      4457
           2       0.36      0.35      0.35      2460
           3       0.19      0.16      0.18      1328

    accuracy                   

Training by signal:  20%|██        | 1/5 [1:36:39<6:26:38, 5799.63s/it]


KeyboardInterrupt: 

*RANDOM FOREST*

In [None]:
param_gridrRD = {
    'n_estimators': [100, 200, 300, 500],
    'max_features': ['sqrt', 'log2'],
    'min_samples_split': [2, 5, 10],
    'min_samples_leaf': [1, 2, 4, 8],
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy'],
}

randomfor_model = RandomForestClassifier(random_state=random)

print("Sin balanceo")
train_and_evaluate(randomfor_model, param_gridrRD, X_train, y_train, X_test, y_test)

*ADA BOOST*

In [None]:
param_gridADA = {
    'n_estimators': [2, 3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30, 35, 40, 45, 50, 55, 58],
    'estimator__max_depth': [3, 4, 5, 6, 7, 8, 9, 10, 15, 20, 25, 30]
}

base_estimator = DecisionTreeClassifier(random_state=random)
adaboost_model = AdaBoostClassifier(estimator=base_estimator, random_state=random)

train_and_evaluate(adaboost_model, param_gridADA, X_train, y_train, X_test, y_test)

*KNN*

In [None]:
param_gridKNN = {
    'n_neighbors': [2, 3, 4, 5, 8, 10, 12, 15, 18, 20],
    'weights': ['uniform', 'distance'],
    'algorithm': ['ball_tree', 'kd_tree']
}

knn_model = KNeighborsClassifier()

train_and_evaluate(knn_model, param_gridKNN, X_train, y_train, X_test, y_test)

*DECISION TREE*

In [None]:
param_gridDT = {
    'criterion': ['gini', 'entropy'],  # Función para medir la calidad de una división
    'max_depth': [None, 3, 5, 10, 20],  # Profundidad máxima del árbol
    'min_samples_split': [2, 5, 10],  # Número mínimo de muestras para dividir un nodo
    'min_samples_leaf': [1, 2, 4, 8],  # Muestras mínimas en una hoja
    'max_features': ['sqrt', 'log2', None]  # Número de características a considerar en cada división
}

decision_tree_model = DecisionTreeClassifier(random_state=random)

train_and_evaluate(decision_tree_model, param_gridDT, X_train, y_train, X_test, y_test)

*LDA*

In [None]:
param_gridLDA = {
    'solver': ['lsqr', 'eigen'],  # Excluye 'svd'
    'shrinkage': ['auto', 0.1, 0.5, 1.0]  # Compatible con 'lsqr' y 'eigen'
}

lda_model = LinearDiscriminantAnalysis()

train_and_evaluate(lda_model, param_gridLDA, X_train, y_train, X_test, y_test)