In [12]:
import pandas as pd
import numpy as np
import mne
import os
import re

def load_and_label_eeg(patient, file_numbers, pre_seizure_duration=100):
    """
    Carrega e processa arquivos EEG EDF para um paciente específico.

    Args:
        patient (str): Número do paciente (ex: "08").
        file_numbers (list): Lista de números dos arquivos a carregar (ex: ["02", "05"]).
        pre_seizure_duration (int): Tempo antes da crise para marcar como 1 (ajustável).

    Returns:
        pd.DataFrame: DataFrame com os dados EEG e labels aplicados.
    """

    # Lista para armazenar DataFrames
    dfs = []

    # Carregar arquivos EDF
    for file_num in file_numbers:
        file_name = f"chb{patient}_{file_num}"  # Nome do arquivo sem extensão
        file_path = f"data/{file_name}.edf"  # Caminho completo

        if os.path.exists(file_path):  
            print(f"Loading {file_path}...")
            
            raw = mne.io.read_raw_edf(file_path, preload=True)
            data, times = raw[:]
            
            df_temp = pd.DataFrame(data.T, columns=raw.ch_names)
            df_temp["time"] = times  
            df_temp["file"] = file_name  # Adiciona sem a extensão .edf
            
            dfs.append(df_temp)
        else:
            print(f"File {file_path} not found. Skipping...")

    # Criar DataFrame final
    if not dfs:
        print("Nenhum arquivo carregado. Retornando DataFrame vazio.")
        return pd.DataFrame()

    df = pd.concat(dfs, ignore_index=True)
    
    # Inicializar coluna de labels
    df["label"] = 0  

    # Ler resumo
    summary_file = f"data/chb{patient}-summary.txt"

    if not os.path.exists(summary_file):
        print(f"Summary file {summary_file} not found. Returning DataFrame without labels.")
        return df

    with open(summary_file, "r") as f:
        summary_text = f.read()

    # Encontrar arquivos e crises no resumo
    file_seizures = {}
    seizure_pattern = re.compile(rf"File Name: (chb{patient}_\d+)\.edf.*?Number of Seizures in File: (\d+)(.*?)\n\n", re.DOTALL)

    for file, num_seizures, details in seizure_pattern.findall(summary_text):
        num_seizures = int(num_seizures)
        
        if num_seizures > 0:  
            seizure_times = re.findall(r"Seizure \d+ Start Time: (\d+) seconds\nSeizure \d+ End Time: (\d+) seconds", details)
            file_seizures[file] = [(int(start), int(end)) for start, end in seizure_times]

    # Aplicar labels se houver crises
    for file, seizures in file_seizures.items():
        for start, end in seizures:
            df.loc[(df["file"] == file) & (df["time"] >= start - pre_seizure_duration) & (df["time"] < start), "label"] = 1
            df.loc[(df["file"] == file) & (df["time"] >= start) & (df["time"] <= end), "label"] = 2

    return df


In [13]:
# Parâmetros desejados
patient = "08"
file_numbers = ["02", "05", "11", "13", "21"]
pre_seizure_duration = 100  # Ajustável

# Chamar a função
df = load_and_label_eeg(patient, file_numbers, pre_seizure_duration)

df


Loading data/chb08_02.edf...
Extracting EDF parameters from c:\Users\ASUS.LAPTOP-SFDPA4G4\Desktop\Faculdade\Mestrado\2semestre\BDA\BDA_project\data\chb08_02.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_05.edf...
Extracting EDF parameters from c:\Users\ASUS.LAPTOP-SFDPA4G4\Desktop\Faculdade\Mestrado\2semestre\BDA\BDA_project\data\chb08_05.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_11.edf...
Extracting EDF parameters from c:\Users\ASUS.LAPTOP-SFDPA4G4\Desktop\Faculdade\Mestrado\2semestre\BDA\BDA_project\data\chb08_11.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_13.edf...
Extracting EDF parameters from c:\Users\ASUS.LAPTOP-SFDPA4G4\Desktop\Faculdade\Mestrado\2semestre\BDA\BDA_project\data\chb08_13.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_21.edf...
Extracting EDF parameters from c:\Users\ASUS.LAPTOP-SFDPA4G4\Desktop\Faculdade\Mestrado\2semestre\BDA\BDA_project\data\chb08_21.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Unnamed: 0,FP1-F7,F7-T7,T7-P7,P7-O1,FP1-F3,F3-C3,C3-P3,P3-O1,FP2-F4,F4-C4,...,FZ-CZ,CZ-PZ,P7-T7,T7-FT9,FT9-FT10,FT10-T8,T8-P8-1,time,file,label
0,-1.611722e-04,2.637363e-05,-1.953602e-07,7.873016e-05,-4.161172e-05,7.833944e-05,-3.936508e-04,3.006593e-04,1.799267e-04,8.029304e-05,...,1.113553e-05,7.755800e-05,5.860806e-07,5.411477e-05,2.676435e-05,5.528694e-05,5.958486e-05,0.000000,chb08_02,0
1,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,...,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,0.003906,chb08_02,0
2,1.953602e-07,-1.953602e-07,1.953602e-07,9.768010e-07,1.953602e-07,1.953602e-07,1.953602e-07,5.860806e-07,1.953602e-07,1.953602e-07,...,1.953602e-07,1.953602e-07,1.953602e-07,1.953602e-07,1.758242e-06,-9.768010e-07,1.953602e-07,0.007812,chb08_02,0
3,-9.768010e-07,1.953602e-07,-1.367521e-06,3.321123e-06,-9.768010e-07,1.953602e-07,-1.953602e-07,2.148962e-06,-9.768010e-07,-1.758242e-06,...,1.953602e-07,-5.860806e-07,1.758242e-06,-2.539683e-06,2.930403e-06,-1.953602e-07,5.860806e-07,0.011719,chb08_02,0
4,-4.102564e-06,5.274725e-06,1.953602e-07,-2.930403e-06,-9.768010e-07,2.148962e-06,-1.953602e-07,-2.539683e-06,5.860806e-07,-2.930403e-06,...,1.953602e-07,9.768010e-07,1.953602e-07,-2.539683e-06,-1.269841e-05,1.660562e-05,-1.953602e-07,0.015625,chb08_02,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4607995,-1.699634e-05,2.520147e-05,-6.466422e-05,1.230769e-05,-8.302808e-05,5.802198e-05,2.539683e-06,-2.324786e-05,-1.347985e-05,4.903541e-05,...,2.324786e-05,-6.837607e-06,6.505495e-05,-5.372405e-05,3.770452e-05,6.935287e-05,-6.114774e-05,3599.980469,chb08_21,0
4607996,-1.738706e-05,-2.871795e-05,-3.418803e-05,3.223443e-05,-8.302808e-05,5.997558e-05,5.274725e-06,-3.223443e-05,-1.894994e-05,5.411477e-05,...,2.598291e-05,-9.963370e-06,3.457875e-05,1.758242e-06,3.770452e-05,2.520147e-05,-2.168498e-05,3599.984375,chb08_21,0
4607997,-5.860806e-07,-9.787546e-05,6.837607e-06,5.684982e-05,-6.388278e-05,5.724054e-05,-4.102564e-06,-2.442002e-05,-3.457875e-05,5.059829e-05,...,1.934066e-05,-9.963370e-06,-6.446886e-06,6.935287e-05,3.184371e-05,-5.274725e-06,-2.148962e-06,3599.988281,chb08_21,0
4607998,5.528694e-05,-7.013431e-05,9.045177e-05,-5.450549e-05,4.200244e-05,8.009768e-06,-7.619048e-06,-2.207570e-05,-4.473748e-05,5.137973e-05,...,1.816850e-05,-7.228327e-06,-9.006105e-05,4.903541e-05,2.559219e-05,-2.285714e-05,1.269841e-05,3599.992188,chb08_21,0


In [14]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def scale_and_select_features(X_train, X_val, X_test, y_train, y_val):
    """
    Scale the features and perform feature selection (RFE) once for all models.
    
    Args:
        X_train (pd.DataFrame): Training features.
        X_val (pd.DataFrame): Validation features.
        X_test (pd.DataFrame): Test features.
        y_train (pd.Series): Training labels.
        y_val (pd.Series): Validation labels.
    
    Returns:
        X_train_rfe, X_val_rfe, X_test_rfe: Feature-selected and scaled data.
    """
    # Step 1: Scale the data
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)
    
    # Step 2: Feature Selection using RFE with Logistic Regression
    model = LogisticRegression(max_iter=1000, random_state=42)
    rfe = RFE(estimator=model, n_features_to_select=5, verbose=2)  # Selecting top 5 features
    X_train_rfe = rfe.fit_transform(X_train_scaled, y_train)
    X_val_rfe = rfe.transform(X_val_scaled)
    X_test_rfe = rfe.transform(X_test_scaled)
    
    return X_train_rfe, X_val_rfe, X_test_rfe


In [16]:
X = df.drop(columns=["label", "time", "file"])
y = df["label"]

from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.15, val_size=0.15, random_state=42):
    """
    Split the data into training, validation, and test sets.
    
    Args:
        X (pd.DataFrame): Features (input data).
        y (pd.Series): Labels (output data).
        test_size (float): Proportion of the data to be used for the test set.
        val_size (float): Proportion of the data to be used for the validation set.
        random_state (int): Seed for reproducibility.
    
    Returns:
        X_train, X_val, X_test, y_train, y_val: Split data.
    """
    # First, split the data into training set (70%) and temporary set (30%)
    X_train, X_temp, y_train, y_temp = train_test_split(X, y, test_size=(test_size + val_size), random_state=random_state, stratify=y)
    
    # Now split the temporary set into validation (50% of the temp set) and test (50% of the temp set)
    X_val, X_test, y_val, y_test = train_test_split(X_temp, y_temp, test_size=(test_size / (test_size + val_size)), random_state=random_state, stratify=y_temp)
    
    return X_train, X_val, X_test, y_train, y_val, y_test

# Assuming you have your features X and labels y defined
X_train, X_val, X_test, y_train, y_val, y_test = split_data(X, y)

# Now you can use these sets for training, validation, and testing


# Step 1: Scale the features and perform feature selection (RFE)
X_train_rfe, X_val_rfe, X_test_rfe = scale_and_select_features(X_train, X_val, X_test, y_train, y_val)




Fitting estimator with 23 features.
Fitting estimator with 22 features.
Fitting estimator with 21 features.
Fitting estimator with 20 features.
Fitting estimator with 19 features.
Fitting estimator with 18 features.
Fitting estimator with 17 features.
Fitting estimator with 16 features.
Fitting estimator with 15 features.
Fitting estimator with 14 features.
Fitting estimator with 13 features.
Fitting estimator with 12 features.
Fitting estimator with 11 features.
Fitting estimator with 10 features.
Fitting estimator with 9 features.
Fitting estimator with 8 features.
Fitting estimator with 7 features.
Fitting estimator with 6 features.


In [29]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

def train_logistic_model_with_grid_search(X_train_rfe, y_train, X_val_rfe, y_val):
    """
    Train Logistic Regression model with GridSearchCV and calculate performance metrics.
    
    Args:
        X_train_rfe (pd.DataFrame): Training data with selected features.
        y_train (pd.Series): Labels for training data.
        X_val_rfe (pd.DataFrame): Validation data with selected features.
        y_val (pd.Series): Labels for validation data.
    
    Returns:
        dict: Results with best parameters, accuracy, and confusion matrix for Logistic Regression.
    """
    # Define model and parameter grid
    model = LogisticRegression(max_iter=1000, random_state=42)
    param_grid = {
    "C": [0.01, 0.1, 1],
    "penalty": ['l2', 'elasticnet', 'l1'],  # Regularization strength with a range of values
    "solver": ["lbfgs"],  # Using lbfgs for faster convergence
    "class_weight": ["balanced"],  # Adjust for class imbalance
}

    
    # Step 1: Grid Search for Logistic Regression
    grid_search = GridSearchCV(model, param_grid, cv=3, scoring="accuracy", n_jobs=-1, verbose=2)
    grid_search.fit(X_train_rfe, y_train)
    
    # Best model found
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Make predictions on the validation set
    y_val_pred = best_model.predict(X_val_rfe)
    
    # Calculate validation accuracy
    val_accuracy = accuracy_score(y_val, y_val_pred)
    
    # Generate confusion matrix
    val_confusion = confusion_matrix(y_val, y_val_pred)

    # Print results
    print(f"\n🔹 Best Hyperparameters: {best_params}")
    print(f"🔹 Validation Accuracy: {val_accuracy}")
    print(f"🔹 Validation Confusion Matrix:\n{val_confusion}")

    # Return results
    return {
        "best_model": best_model,
        "best_params": best_params,
        "val_accuracy": val_accuracy,
        "val_confusion_matrix": val_confusion
    }


In [30]:
# Step 2: Train Logistic Regression model and get results
results = train_logistic_model_with_grid_search(X_train_rfe, y_train, X_val_rfe, y_val)

# The results will contain the best hyperparameters, accuracy, and confusion matrix

Fitting 3 folds for each of 9 candidates, totalling 27 fits


18 fits failed out of a total of 27.
The score on these train-test partitions for these parameters will be set to nan.
If these failures are not expected, you can try to debug them by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
9 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ASUS.LAPTOP-SFDPA4G4\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\model_selection\_validation.py", line 888, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ASUS.LAPTOP-SFDPA4G4\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\base.py", line 1473, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ASUS.LAPTOP-SFDPA4G4\AppData\Local\Programs\Python\Python312\Lib\site-packages\sklearn\linear_model\_logistic.py", line


🔹 Best Hyperparameters: {'C': 1, 'class_weight': 'balanced', 'penalty': 'l2', 'solver': 'lbfgs'}
🔹 Validation Accuracy: 0.31831886574074075
🔹 Validation Confusion Matrix:
[[200298 177054 259357]
 [  5984   5453   7763]
 [ 11394   9626  14271]]
