In [1]:
import pandas as pd
import numpy as np
import mne
import os
import re

def load_and_label_eeg(patient, file_numbers, pre_seizure_duration=100, data_dir="data"):
    """
    Load EDF recordings for one patient and label every sample by seizure phase.

    Each requested file ``{data_dir}/chb{patient}_{num}.edf`` is read with MNE and
    turned into one row per sample: one column per channel plus ``time`` and
    ``file`` (the file name without the ``.edf`` extension). Seizure intervals are
    then parsed from ``{data_dir}/chb{patient}-summary.txt`` and written to a
    ``label`` column:

        0 -> sample is outside every seizure-related window
        1 -> sample lies in ``[start - pre_seizure_duration, start)`` of a seizure
        2 -> sample lies in ``[start, end]`` of a seizure

    Label 2 always wins over label 1, so a pre-seizure window that reaches back
    into an earlier seizure never downgrades that seizure's samples.

    Args:
        patient (str): Patient number as it appears in the file names (e.g. "08").
        file_numbers (list): File numbers to load (e.g. ["02", "05"]).
        pre_seizure_duration (int): Length of the window before each seizure start
            that is labelled 1, in the same unit as the summary times (seconds).
        data_dir (str): Directory holding the EDF files and the summary file.
            Defaults to "data", the location the function always used.

    Returns:
        pd.DataFrame: Concatenated samples of all files that were found, with a
        ``label`` column. An empty DataFrame is returned when no file was found;
        if the summary file is missing, all labels stay 0.
    """
    frames = []

    for file_num in file_numbers:
        file_name = f"chb{patient}_{file_num}"  # file name without extension
        file_path = f"{data_dir}/{file_name}.edf"

        # Missing files are reported and skipped rather than treated as an error.
        if not os.path.exists(file_path):
            print(f"File {file_path} not found. Skipping...")
            continue

        print(f"Loading {file_path}...")
        raw = mne.io.read_raw_edf(file_path, preload=True)
        data, times = raw[:]

        # data is indexed (channel, sample); transpose to one row per sample.
        df_temp = pd.DataFrame(data.T, columns=raw.ch_names)
        df_temp["time"] = times
        df_temp["file"] = file_name
        frames.append(df_temp)

    if not frames:
        print("Nenhum arquivo carregado. Retornando DataFrame vazio.")
        return pd.DataFrame()

    df = pd.concat(frames, ignore_index=True)
    df["label"] = 0

    summary_file = f"{data_dir}/chb{patient}-summary.txt"
    if not os.path.exists(summary_file):
        print(f"Summary file {summary_file} not found. Returning DataFrame without labels.")
        return df

    with open(summary_file, "r") as f:
        summary_text = f.read()

    file_seizures = _parse_seizure_intervals(summary_text, patient)
    _apply_seizure_labels(df, file_seizures, pre_seizure_duration)

    return df


def _parse_seizure_intervals(summary_text, patient):
    """Map each file stem with at least one seizure to its [(start, end), ...] list."""
    # An entry runs from its "File Name" line to the next blank line, or to the
    # end of the text so the last entry is not lost when no blank line follows it.
    entry_pattern = re.compile(
        rf"File Name: (chb{re.escape(str(patient))}_\d+)\.edf.*?"
        r"Number of Seizures in File: (\d+)(.*?)(?:\n\s*\n|\Z)",
        re.DOTALL,
    )
    # Tolerates an optional seizure index ("Seizure 1 Start Time" as well as
    # "Seizure Start Time") and variable whitespace around the values.
    time_pattern = re.compile(
        r"Seizure(?: \d+)? Start Time:\s*(\d+) seconds\s*\n"
        r"\s*Seizure(?: \d+)? End Time:\s*(\d+) seconds"
    )

    file_seizures = {}
    for file, num_seizures, details in entry_pattern.findall(summary_text):
        if int(num_seizures) > 0:
            file_seizures[file] = [
                (int(start), int(end)) for start, end in time_pattern.findall(details)
            ]
    return file_seizures


def _apply_seizure_labels(df, file_seizures, pre_seizure_duration):
    """Write labels 1 (pre-seizure) and 2 (seizure) into df["label"] in place."""
    time = df["time"]
    for file, seizures in file_seizures.items():
        in_file = df["file"] == file  # computed once per file, reused for every window
        if not in_file.any():
            continue

        # Pass 1: every pre-seizure window of this file.
        for start, _ in seizures:
            window = (time >= start - pre_seizure_duration) & (time < start)
            df.loc[in_file & window, "label"] = 1

        # Pass 2: seizure intervals last, so they override any overlapping
        # pre-seizure window of a later seizure.
        for start, end in seizures:
            df.loc[in_file & (time >= start) & (time <= end), "label"] = 2


In [3]:
# Recording selection: patient id and the EDF file numbers to load.
patient, file_numbers = "08", ["02", "05", "11", "13", "21"]
# Length of the window before each seizure start that gets label 1 (tunable).
pre_seizure_duration = 100

# Build the labelled sample table, then persist it as CSV without the index column.
df = load_and_label_eeg(
    patient=patient,
    file_numbers=file_numbers,
    pre_seizure_duration=pre_seizure_duration,
)
df.to_csv("data/converted_df.csv", index=False)


Loading data/chb08_02.edf...
Extracting EDF parameters from /Users/sofiagomes/Documents/NOVA IMS/Big Data Analytics/BigDataAnalysis/data/chb08_02.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_05.edf...
Extracting EDF parameters from /Users/sofiagomes/Documents/NOVA IMS/Big Data Analytics/BigDataAnalysis/data/chb08_05.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_11.edf...
Extracting EDF parameters from /Users/sofiagomes/Documents/NOVA IMS/Big Data Analytics/BigDataAnalysis/data/chb08_11.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_13.edf...
Extracting EDF parameters from /Users/sofiagomes/Documents/NOVA IMS/Big Data Analytics/BigDataAnalysis/data/chb08_13.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


Loading data/chb08_21.edf...
Extracting EDF parameters from /Users/sofiagomes/Documents/NOVA IMS/Big Data Analytics/BigDataAnalysis/data/chb08_21.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 921599  =      0.000 ...  3599.996 secs...


  raw = mne.io.read_raw_edf(file_path, preload=True)


In [5]:
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

def scale_and_select_features(X_train, X_val, X_test, y_train, y_val,
                              n_features_to_select=5, verbose=2):
    """
    Standardize the features and keep the ones chosen by recursive feature elimination.

    The scaler and the selector are fitted on the training split only and then
    applied unchanged to the validation and test splits, so no statistics from
    those splits influence the fit.

    Args:
        X_train (pd.DataFrame): Training features.
        X_val (pd.DataFrame): Validation features.
        X_test (pd.DataFrame): Test features.
        y_train (pd.Series): Training labels, used to fit the selector.
        y_val (pd.Series): Validation labels. Not used by this function; kept so
            existing callers do not have to change.
        n_features_to_select (int): Number of features RFE keeps. Defaults to 5,
            the value that was previously hard-coded.
        verbose (int): Verbosity passed to RFE. Defaults to 2, as before.

    Returns:
        X_train_rfe, X_val_rfe, X_test_rfe: The three splits after scaling and
        feature selection, in that order.
    """
    # Step 1: fit the scaler on the training split, reuse it for the other splits.
    scaler = StandardScaler()
    X_train_scaled = scaler.fit_transform(X_train)
    X_val_scaled = scaler.transform(X_val)
    X_test_scaled = scaler.transform(X_test)

    # Step 2: recursive feature elimination driven by a logistic regression.
    selector = RFE(
        estimator=LogisticRegression(max_iter=1000, random_state=42),
        n_features_to_select=n_features_to_select,
        verbose=verbose,
    )
    X_train_rfe = selector.fit_transform(X_train_scaled, y_train)
    X_val_rfe = selector.transform(X_val_scaled)
    X_test_rfe = selector.transform(X_test_scaled)

    return X_train_rfe, X_val_rfe, X_test_rfe


KeyboardInterrupt: 

In [None]:
# Features: every column except the target and the time/file bookkeeping columns.
X = df.drop(["label", "time", "file"], axis=1)
# Target: the per-sample label column.
y = df.loc[:, "label"]

from sklearn.model_selection import train_test_split

def split_data(X, y, test_size=0.15, val_size=0.15, random_state=42):
    """
    Split the data into stratified training, validation, and test sets.

    The split is done in two stages: first a hold-out of size
    ``test_size + val_size`` is separated from the training data, then that
    hold-out is divided into validation and test parts in the ratio
    ``val_size : test_size``. Both stages stratify on the labels.

    Args:
        X (pd.DataFrame): Features (input data).
        y (pd.Series): Labels (output data).
        test_size (float): Proportion of the data to be used for the test set.
        val_size (float): Proportion of the data to be used for the validation set.
        random_state (int): Seed for reproducibility.

    Returns:
        X_train, X_val, X_test, y_train, y_val, y_test: The six split parts,
        in that order.

    Raises:
        ValueError: If either size is not positive, or if the two proportions
            together leave nothing for the training set.
    """
    # Fail fast with a clear message instead of an error raised deep inside
    # the second split.
    if test_size <= 0 or val_size <= 0:
        raise ValueError(
            f"test_size and val_size must both be positive, got "
            f"test_size={test_size}, val_size={val_size}"
        )
    holdout_size = test_size + val_size
    if isinstance(holdout_size, float) and holdout_size >= 1:
        raise ValueError(
            f"test_size + val_size must be below 1 to leave training data, got {holdout_size}"
        )

    # Stage 1: training set vs. combined validation+test hold-out.
    X_train, X_temp, y_train, y_temp = train_test_split(
        X, y, test_size=holdout_size, random_state=random_state, stratify=y
    )

    # Stage 2: divide the hold-out; the test share is expressed relative to the hold-out.
    X_val, X_test, y_val, y_test = train_test_split(
        X_temp,
        y_temp,
        test_size=(test_size / holdout_size),
        random_state=random_state,
        stratify=y_temp,
    )

    return X_train, X_val, X_test, y_train, y_val, y_test

# Partition the features X and labels y into train / validation / test subsets.
(X_train, X_val, X_test,
 y_train, y_val, y_test) = split_data(X=X, y=y)

# Step 1: scale the features and run feature selection (RFE) on the three subsets.
X_train_rfe, X_val_rfe, X_test_rfe = scale_and_select_features(
    X_train=X_train,
    X_val=X_val,
    X_test=X_test,
    y_train=y_train,
    y_val=y_val,
)




In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, confusion_matrix

def train_logistic_model_with_grid_search(X_train_rfe, y_train, X_val_rfe, y_val,
                                          cv=3, scoring="accuracy"):
    """
    Tune a Logistic Regression with GridSearchCV and evaluate it on the validation set.

    The search covers three regularization strengths (``C``) for each of the
    l2, l1 and elasticnet penalties. Every penalty is paired with a solver that
    supports it, so no candidate fails to fit merely because of an invalid
    solver/penalty combination.

    Args:
        X_train_rfe (pd.DataFrame): Training data with selected features.
        y_train (pd.Series): Labels for training data.
        X_val_rfe (pd.DataFrame): Validation data with selected features.
        y_val (pd.Series): Labels for validation data.
        cv (int): Number of cross-validation folds. Defaults to 3, as before.
        scoring (str): Metric used to rank candidates. Defaults to "accuracy",
            as before.

    Returns:
        dict: ``best_model`` (refitted estimator), ``best_params``,
        ``val_accuracy`` and ``val_confusion_matrix``.
    """
    model = LogisticRegression(max_iter=1000, random_state=42)

    # Settings shared by every candidate: regularization strengths to try, and
    # class weighting to compensate for class imbalance.
    shared = {
        "C": [0.01, 0.1, 1],
        "class_weight": ["balanced"],
    }
    # One sub-grid per solver/penalty pairing. lbfgs only handles l2; l1 and
    # elasticnet need saga, and elasticnet additionally requires l1_ratio.
    param_grid = [
        {**shared, "solver": ["lbfgs"], "penalty": ["l2"]},
        {**shared, "solver": ["saga"], "penalty": ["l1"]},
        # 0.5 is an even mix of l1 and l2; extend the list to tune the mix.
        {**shared, "solver": ["saga"], "penalty": ["elasticnet"], "l1_ratio": [0.5]},
    ]

    # Step 1: Grid Search for Logistic Regression
    grid_search = GridSearchCV(model, param_grid, cv=cv, scoring=scoring, n_jobs=-1, verbose=2)
    grid_search.fit(X_train_rfe, y_train)

    # Best model found
    best_model = grid_search.best_estimator_
    best_params = grid_search.best_params_

    # Evaluate the selected model on the held-out validation set.
    y_val_pred = best_model.predict(X_val_rfe)
    val_accuracy = accuracy_score(y_val, y_val_pred)
    val_confusion = confusion_matrix(y_val, y_val_pred)

    # Print results
    print(f"\n🔹 Best Hyperparameters: {best_params}")
    print(f"🔹 Validation Accuracy: {val_accuracy}")
    print(f"🔹 Validation Confusion Matrix:\n{val_confusion}")

    return {
        "best_model": best_model,
        "best_params": best_params,
        "val_accuracy": val_accuracy,
        "val_confusion_matrix": val_confusion
    }


In [None]:
# Step 2: tune and fit the Logistic Regression on the selected features.
# The returned dict holds the best model, its hyperparameters, the validation
# accuracy, and the validation confusion matrix.
results = train_logistic_model_with_grid_search(
    X_train_rfe=X_train_rfe,
    y_train=y_train,
    X_val_rfe=X_val_rfe,
    y_val=y_val,
)