In [None]:
import os
import numpy as np
#import cupy as cp
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import scipy.signal as scisig
import scipy.stats
from scipy.interpolate import interp1d 
from collections import Counter
import scipy.interpolate as interp
#import cupyx.scipy.signal as cusig
import neurokit2 as nk
from scipy.signal import find_peaks, welch, butter, filtfilt, convolve
from scipy.optimize import curve_fit
#import cupyx.scipy.signal as cpysig
import gc
from scipy.linalg import toeplitz
from scipy import linalg

In [108]:
import faulthandler
faulthandler.enable()

In [109]:
fs_dict = {'PPG': 1000, 'EDA': 1000, 'RESP': 1000, 'ECG': 1000, 'label': 1000}
WINDOW_IN_SECONDS = 30
STRIDE_IN_SECONDS = 0.75
label_dict = {'baseline': 1, 'cognitive load': 2, 'stress': 3, 'stress + cognitive load': 4}
int_to_label = {1: 'baseline', 2: 'cognitive load', 3: 'stress', 4: 'stress + cognitive load'}
feat_names = None
DATA_PATH = r'dataProcessed/'
SAVE_PATH = r'features_30_075/'

In [110]:
if not os.path.exists(SAVE_PATH):
    os.makedirs(SAVE_PATH)

In [111]:
#def eda_stats(y):
#    Fs = fs_dict['EDA']
#    yn = (y - y.mean()) / y.std()
#    print(yn)
#    print("calculating eda stats")
#    [r, p, t, l, d, e, obj] = cvxEDA.cvxEDA(yn, 1. / Fs)
#    return [r, p, t, l, d, e, obj]

In [None]:
def butter_filter(data, cutoff, fs, order=4, filter_type='low'):
    """
    Aplica un filtro Butterworth a los datos.

    Parámetros:
    - data: array de la señal a filtrar.
    - cutoff: frecuencia de corte (o tupla en caso de bandpass).
    - fs: frecuencia de muestreo.
    - order: orden del filtro.
    - filter_type: 'low', 'high' o 'band'.

    Retorna:
    - Señal filtrada.
    """
    nyq = 0.5 * fs  # Frecuencia de Nyquist

    # Normalizar la frecuencia de corte
    if isinstance(cutoff, (list, tuple)):  # Band-pass o Band-stop
        normal_cutoff = [c / nyq for c in cutoff]
    else:  # Low-pass o High-pass
        normal_cutoff = cutoff / nyq

    # Crear el filtro Butterworth
    b, a = scisig.butter(order, normal_cutoff, btype=filter_type, analog=False)

    # Aplicar el filtro con filtfilt para evitar desfases
    return scisig.filtfilt(b, a, np.array(data))


def get_slope(signal):
    if len(signal) < 2:  # Evita divisiones por 0 o valores no válidos
        return 0
    
    # Convertir a pandas.Series si es necesario
    if not isinstance(signal, pd.Series):
        signal = pd.Series(signal)

    resu = (signal.iloc[-1] - signal.iloc[0]) / (len(signal) - 1)  # Evita dividir entre len(signal)

    # Convertir a escalar si es un array
    if isinstance(resu, np.ndarray):
        resu = resu.flatten()[0]
    
    return resu

def get_peak_freq(x, fs):
    f, Pxx = scisig.periodogram(x, fs=fs)
    psd_dict = {amp: freq for amp, freq in zip(Pxx, f)}
    return psd_dict[max(psd_dict.keys())]

def get_window_stats(data, label=-1):
    mean_features = np.mean(data)
    std_features = np.std(data)
    min_features = np.amin(data)
    max_features = np.amax(data)

    features = {'mean': mean_features, 'std': std_features, 'min': min_features, 'max': max_features,
                'label': label}
    return features


def get_net_accel(data):
    return (data['ACC_x'] ** 2 + data['ACC_y'] ** 2 + data['ACC_z'] ** 2).apply(lambda x: np.sqrt(x))


def get_peak_freq(x):
    f, Pxx = scisig.periodogram(x, fs=8)
    psd_dict = {amp: freq for amp, freq in zip(Pxx, f)}
    peak_freq = psd_dict[max(psd_dict.keys())]
    return peak_freq


# https://github.com/MITMediaLabAffectiveComputing/eda-explorer/blob/master/AccelerometerFeatureExtractionScript.py
def filterSignalFIR(eda, cutoff=0.4, numtaps=64):
    f = cutoff / (fs_dict['ACC'] / 2.0)
    FIR_coeff = scisig.firwin(numtaps, f)

    return scisig.lfilter(FIR_coeff, 1, eda.flatten())

def triangle(x, a, b, c):
    return np.maximum(0, a - np.abs(x - b) / c)

def eda_deconvolution(eda, fs):
    """Deconvolución EDA inspirada en Choi et al. (2012)"""
    n = len(eda)
    
    # Crear IRF doble exponencial (respuesta impulsiva fisiológica)
    t = np.arange(0, 10 * fs) / fs
    tau1 = 0.75  # tiempo de subida
    tau2 = 2.0   # tiempo de caída
    irf = (np.exp(-t / tau2) - np.exp(-t / tau1))
    irf = irf / np.max(irf)  # normalizar
    
    # Crear matriz de convolución (Toeplitz)
    col = np.r_[irf[0], np.zeros(n - 1)]
    row = irf
    X = toeplitz(col, row)
    
    # Resolver sistema regularizado: ||Xh - eda||^2 + λ||h||^2
    signal_energy = np.sum(eda ** 2)
    lamb = 0.01 * signal_energy / len(eda)
    XtX = X.T @ X + lamb * np.eye(X.shape[1])
    Xty = X.T @ eda
    scr = linalg.solve(XtX, Xty)

    # Reconstrucción de componente tónica (SCL)
    scl = eda - convolve(scr, irf, mode='full')[:n]

    return scl, scr

In [None]:
def compute_eda_metrics(eda_signal, fs, show_plots=False):
    eda_signal = np.array(eda_signal).flatten()
    
    if len(eda_signal) < 2:
        return [np.nan] * 16

    # --------- Señal total (EDA) ----------
    eda_mean = np.mean(eda_signal)
    eda_std = np.std(eda_signal)
    eda_min = np.min(eda_signal)
    eda_max = np.max(eda_signal)
    eda_range = eda_max - eda_min
    eda_slope = (eda_signal[-1] - eda_signal[0]) / len(eda_signal)

    # --------- Separación con deconvolución ----------
    scl, scr = eda_deconvolution(eda_signal, fs)

    # --------- Métricas SCL ----------
    scl_mean = np.mean(scl)
    scl_std = np.std(scl)

    time = np.arange(len(scl)) / fs
    corr_SCL_t = np.corrcoef(time, scl)[0, 1] if len(scl) > 1 else 0
    corr_SCL_t = np.nan_to_num(corr_SCL_t)

    # --------- Detección de picos SCR ----------
    scr_peaks, _ = find_peaks(scr, height=0.01, distance=int(1.0 * fs))
    scr_values = scr[scr_peaks] if len(scr_peaks) > 0 else np.array([np.nan])

    scr_count = len(scr_peaks)
    scr_mean = np.nanmean(scr_values) if scr_count > 0 else 0
    scr_std = np.nanstd(scr_values) if scr_count > 0 else 0
    scr_amp = np.nanmax(scr_values) if scr_count > 0 else 0
    scr_sum = np.nansum(scr_values) if scr_count > 0 else 0
    scr_area = np.trapz(scr_values, dx=1/fs) if scr_count > 1 else 0

    # --------- Visualización ----------
    if show_plots:
        scr_norm = (scr - np.mean(scr)) / np.std(scr)
        start_sec = 5
        end_sec = 25
        zoom_start = int(start_sec * fs)
        zoom_end = int(end_sec * fs)

        fig, axs = plt.subplots(3, 1, figsize=(14, 10))
        fig.suptitle("Análisis de EDA con deconvolución", fontsize=16)

        axs[0].plot(eda_signal, label='EDA Original', linewidth=1)
        axs[0].plot(scl, label='SCL (Tónica)', linewidth=1)
        axs[0].plot(scr, label='SCR (Fásica)', linewidth=1)
        axs[0].scatter(scr_peaks, scr[scr_peaks], c='red', label='SCR Peaks')
        axs[0].set_title("Señal completa con componentes")
        axs[0].legend()
        axs[0].grid(True)

        axs[1].plot(scr_norm, color='green', label='SCR Normalizada')
        axs[1].scatter(scr_peaks, scr_norm[scr_peaks], c='red', label='Picos detectados')
        axs[1].axhline(0, color='gray', linestyle='--', linewidth=0.5)
        axs[1].set_title("SCR Normalizada")
        axs[1].legend()
        axs[1].grid(True)

        zoom_peaks = [p for p in scr_peaks if zoom_start <= p < zoom_end]
        axs[2].plot(scr[zoom_start:zoom_end], label='SCR')
        axs[2].scatter(
            [p - zoom_start for p in zoom_peaks],
            [scr[p] for p in zoom_peaks],
            c='red', label='SCR Peaks'
        )
        axs[2].set_title(f"Zoom: muestras {zoom_start} a {zoom_end}")
        axs[2].legend()
        axs[2].grid(True)

        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

    return eda_mean, eda_std, eda_min, eda_max, eda_range, eda_slope, scl_mean, scl_std, scr_mean, scr_std, corr_SCL_t, scr_count, scr_amp, scr_sum, scr_area

In [114]:
def compute_hrv_metrics(ecg_signal, fs, show_plots=False):
    ecg_signal = np.array(ecg_signal).flatten()
    
    # Detección de picos R mejorada
    peaks, _ = scisig.find_peaks(ecg_signal, distance=fs*0.4, height=np.mean(ecg_signal) + np.std(ecg_signal))

    if len(peaks) < 2:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    # Cálculo de intervalos RR
    rr_intervals = np.diff(peaks) * (1000 / fs)  # Convertir a milisegundos
    rr_intervals = rr_intervals[(rr_intervals > 400) & (rr_intervals < 1500)]  # Filtro fisiológico

    if len(rr_intervals) < 2:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    hr_mean = 60000 / np.mean(rr_intervals) if len(rr_intervals) > 0 else np.nan
    hr_std = np.std(60000 / rr_intervals) if len(rr_intervals) > 0 else np.nan

    # Features temporales
    nn50 = np.sum(np.abs(np.diff(rr_intervals)) > 50)
    pnn50 = (nn50 / len(rr_intervals)) * 100 if len(rr_intervals) > 0 else np.nan
    rms_hrv = np.sqrt(np.mean(np.square(np.diff(rr_intervals))))

    # Cálculo de TINN
    hist, bin_edges = np.histogram(rr_intervals, bins='auto')
    tinn = bin_edges[np.argmax(hist)] if len(hist) > 0 else np.nan

    # Interpolación mejorada
    time_rr = np.cumsum(rr_intervals) / 1000
    time_rr = time_rr - time_rr[0]  # Normalizar inicio en 0

    fs_resample = 4.0  # Frecuencia de resampleo
    time_resampled = np.arange(0, time_rr[-1], 1 / fs_resample)
    interp_func = interp1d(time_rr, rr_intervals, kind="linear", fill_value="extrapolate")
    rr_resampled = interp_func(time_resampled)

    # Análisis espectral (PSD)
    fxx, pxx = scisig.welch(rr_resampled, fs=fs_resample, nperseg=min(len(rr_resampled), 256))
    lf_band, hf_band = (0.04, 0.15), (0.15, 0.4)

    lf_mask = (fxx >= lf_band[0]) & (fxx <= lf_band[1])
    hf_mask = (fxx >= hf_band[0]) & (fxx <= hf_band[1])

    lf = np.trapz(pxx[lf_mask], fxx[lf_mask]) if np.any(lf_mask) else np.nan
    hf = np.trapz(pxx[hf_mask], fxx[hf_mask]) if np.any(hf_mask) else np.nan
    lf_hf_ratio = lf / hf if (hf > 0 and not np.isnan(hf)) else np.nan

    sum_f = np.trapz(pxx, fxx)
    rel_f = (lf + hf) / sum_f if sum_f > 0 else np.nan
    lf_norm = (lf / (lf + hf)) * 100 if (lf + hf) > 0 else np.nan
    hf_norm = (hf / (lf + hf)) * 100 if (lf + hf) > 0 else np.nan

    if show_plots:
        plt.figure(figsize=(10, 4))
        plt.plot(ecg_signal, label="ECG", color='gray')
        plt.scatter(peaks, ecg_signal[peaks], color="red", label="Picos R")
        plt.legend()
        plt.title("Detección de Picos R en ECG")
        plt.xlabel("Tiempo (ms)")
        plt.ylabel("Amplitud")
        plt.show()

        plt.figure()
        plt.plot(time_rr, rr_intervals, "o-", label="Original RR")
        plt.plot(time_resampled, rr_resampled, "x-", label="Interpolado")
        plt.legend()
        plt.title("Interpolación de RR intervals")
        plt.show()

        plt.figure()
        plt.semilogy(fxx, pxx)  # Usa escala logarítmica
        plt.axvspan(0.04, 0.15, color="blue", alpha=0.3, label="LF Band")
        plt.axvspan(0.15, 0.4, color="red", alpha=0.3, label="HF Band")
        plt.legend()
        plt.title("Espectro HRV")
        plt.xlabel("Frecuencia (Hz)")
        plt.ylabel("PSD")
        plt.show()

    return hr_mean, hr_std, nn50, pnn50, tinn, rms_hrv, lf, hf, lf_hf_ratio, sum_f, rel_f, lf_norm, hf_norm

In [None]:
def compute_respiration_metrics(resp_signal, fs, show_plots=False):
    resp_signal = np.array(resp_signal).flatten()
    peaks, _ = scisig.find_peaks(resp_signal, distance=fs * 0.3)
    troughs, _ = scisig.find_peaks(-resp_signal, distance=fs * 0.3)
    
    if len(peaks) < 2 or len(troughs) < 2:
        return 0, 0, 0, 0, 0, 0, 0, 0, 0

    inspiration_durations = np.diff(peaks) / fs
    expiration_durations = np.diff(troughs) / fs

    I_mean = np.mean(inspiration_durations)
    I_std = np.std(inspiration_durations)
    E_mean = np.mean(expiration_durations)
    E_std = np.std(expiration_durations)

    ie_ratio = I_mean / E_mean if E_mean > 0 else 0
    resp_range = np.max(resp_signal) - np.min(resp_signal)
    insp_vol = np.mean(resp_signal[peaks]) - np.mean(resp_signal[troughs])
    resp_rate = len(peaks) / (len(resp_signal) / fs)
    resp_duration = len(resp_signal) / fs

    if show_plots:
        plt.plot(resp_signal, label="Señal Respiratoria")
        plt.scatter(peaks, resp_signal[peaks], color='red', label="Picos")
        plt.scatter(troughs, resp_signal[troughs], color='blue', label="Valles")
        plt.legend()
        plt.show()

    return I_mean, I_std, E_mean, E_std, ie_ratio, resp_range, insp_vol, resp_rate, resp_duration

In [116]:
def compute_ppg_features(ppg_signal, fs, show_plots=False):
    ppg_signal = np.array(ppg_signal).flatten()

    if len(ppg_signal) < 2:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    # Detección de picos en PPG
    peak_indices, _ = scisig.find_peaks(ppg_signal, distance=fs*0.4, prominence=(np.std(ppg_signal) * 0.5, None))

    if len(peak_indices) < 2:
        return np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan, np.nan

    ibi = np.diff(peak_indices) / fs
    hr = 60 / ibi

    hr_mean = np.mean(hr)
    hr_std = np.std(hr)
    rmssd = np.sqrt(np.mean(np.diff(ibi) ** 2))
    sdnn = np.std(ibi)

    # Frecuencia de muestreo para IBI
    fs_ibi = max(1 / np.mean(ibi), 0.5)  # No menor a 0.5 Hz

    # PSD con Welch si hay suficientes puntos
    if len(ibi) > 4:  
        fxx, pxx = scisig.welch(ibi, fs=fs_ibi, nperseg=min(128, len(ibi)))
    else:
        fxx, pxx = np.array([]), np.array([])

    # Cálculo de LF y HF solo si hay valores en la PSD
    if np.sum(pxx) > 0:
        lf_band = (fxx >= 0.04) & (fxx < 0.15)
        hf_band = (fxx >= 0.15) & (fxx < 0.4)

        lf = np.trapz(pxx[lf_band]) if np.any(lf_band) else np.nan
        hf = np.trapz(pxx[hf_band]) if np.any(hf_band) else np.nan
        lf_hf_ratio = lf / hf if hf > 0 else np.nan
    else:
        lf, hf, lf_hf_ratio = np.nan, np.nan, np.nan

    # Características adicionales
    amplitudes = ppg_signal[peak_indices]
    pav_mean = np.mean(amplitudes)
    pav_std = np.std(amplitudes)

    rise_times = []
    for peak in peak_indices:
        # Buscar valle anterior
        start = max(peak - int(0.5 * fs), 0)
        valley_region = ppg_signal[start:peak]
        if len(valley_region) == 0:
            continue
        valley_idx = np.argmin(valley_region) + start
        rise_times.append((peak - valley_idx) / fs)

    decay_times = []
    for peak in peak_indices:
        end = min(peak + int(0.5 * fs), len(ppg_signal))
        decay_region = ppg_signal[peak:end]
        if len(decay_region) == 0:
            continue
        valley_idx = np.argmin(decay_region) + peak
        decay_times.append((valley_idx - peak) / fs)


    ri_list = []
    for peak in peak_indices:
        end = min(peak + int(0.5 * fs), len(ppg_signal))
        secondary_peaks, _ = scisig.find_peaks(ppg_signal[peak:end], prominence=0.1)
        if len(secondary_peaks) > 0:
            sec_peak_amp = ppg_signal[peak + secondary_peaks[0]]
            ri_list.append(sec_peak_amp / ppg_signal[peak])
        else:
            # Alternativa: usar mínimo como valle reflejado
            min_idx = np.argmin(ppg_signal[peak:end])
            ri_list.append(ppg_signal[peak + min_idx] / ppg_signal[peak])

    rise_time = np.mean(rise_times)

    decay_time = np.mean(decay_times)

    ri = np.mean(ri_list)

    if show_plots:
        plt.figure(figsize=(10, 4))
        plt.plot(ppg_signal, label="ECG", color='gray')
        plt.scatter(peak_indices, ppg_signal[peak_indices], color="red", label="Picos R")
        plt.legend()
        plt.title("Detección de Picos R en ECG")
        plt.xlabel("Tiempo (ms)")
        plt.ylabel("Amplitud")
        plt.show()

    return hr_mean, hr_std, rmssd, sdnn, lf, hf, lf_hf_ratio, len(peak_indices), rise_time, decay_time, pav_mean, pav_std, ri

In [117]:
class SubjectData:
    def __init__(self, main_path, subject_number, part=1, total_parts=1):
        self.name = f'{subject_number:03d}'
        files = sorted([f for f in os.listdir(main_path) if f.startswith(self.name)])
        if not files:
            raise FileNotFoundError(f"No se encontró archivo para el sujeto {self.name}")

        # Cargar todo el CSV
        full_data = pd.read_csv(os.path.join(main_path, files[0]))

        # Calcular índices para recorte
        total_len = len(full_data)
        start = (part - 1) * (total_len // total_parts)
        end = part * (total_len // total_parts) if part < total_parts else total_len

        # Guardar solo la parte seleccionada
        self.data = full_data.iloc[start:end].reset_index(drop=True)
        self.labels = self.data['label']

    def get_data(self):
        return self.data.drop(columns=['time', 'label'])

    def get_labels(self):
        return self.labels

In [118]:
"""class SubjectData:
    def __init__(self, main_path, subject_number):
        self.name = f'{subject_number:03d}'  # Asegurar que el número tenga formato 000
        file_path = os.path.join(main_path, f"{self.name}*.csv")  # Ruta de los archivos CSV

        files = sorted([f for f in os.listdir(os.path.join(main_path)) if f.startswith(self.name)])
        if not files:
            raise FileNotFoundError(f"No se encontró archivo para el sujeto {self.name}")

        self.data = pd.read_csv(os.path.join(main_path, files[0]))

        self.labels = self.data['label']

    def get_data(self):
        # Retorna los datos fisiológicos excepto el tiempo y la etiqueta
        return self.data.drop(columns=['time', 'label'])"""

'class SubjectData:\n    def __init__(self, main_path, subject_number):\n        self.name = f\'{subject_number:03d}\'  # Asegurar que el número tenga formato 000\n        file_path = os.path.join(main_path, f"{self.name}*.csv")  # Ruta de los archivos CSV\n\n        files = sorted([f for f in os.listdir(os.path.join(main_path)) if f.startswith(self.name)])\n        if not files:\n            raise FileNotFoundError(f"No se encontró archivo para el sujeto {self.name}")\n\n        self.data = pd.read_csv(os.path.join(main_path, files[0]))\n\n        self.labels = self.data[\'label\']\n\n    def get_data(self):\n        # Retorna los datos fisiológicos excepto el tiempo y la etiqueta\n        return self.data.drop(columns=[\'time\', \'label\'])'

In [119]:
def compute_features(data_dict, fs_dict):
    feature_dict = {}
    # ECG y BVP

    ecg_signal = data_dict['ECG']
    eda_signal = data_dict['EDA']
    ppg_signal = data_dict['PPG']
    resp_signal = data_dict['RESP']


    hr_mean, hr_std, nn50, pNN50, tinn, rmsHRV, lf, hf, lf_hf, sum_f, rel_f, lf_norm, hf_norm = compute_hrv_metrics(ecg_signal, fs_dict['ECG'])

    feature_dict.update({
        'HR_mean': hr_mean, 'HR_std' : hr_std,
        'NN50': nn50, 'pNN50': pNN50, 'TINN': tinn, 
        'rmsHRV': rmsHRV, 'LF': lf, 'HF': hf, 'LF_HF': lf_hf,
        'sum_f': sum_f, 'rel_f': rel_f, 'LF_norm': lf_norm, 'HF_norm': hf_norm
    })


    hr_mean, hr_std, rmssd, sdnn, lf, hf, lf_hf, num_beats, rise_time, decay_time, pav_mean, pav_std, ri = compute_ppg_features(ppg_signal, fs_dict['PPG'])

    feature_dict.update({
        'PPG_HR_mean': hr_mean, 'PPG_HR_std': hr_std, 'PPG_RMSSD': rmssd, 'PPG_SDNN': sdnn,
        'PPG_LF': lf, 'PPG_HF': hf, 'PPG_LF_HF': lf_hf, 'PPG_num_beats': num_beats,
        'PPG_RiseTime': rise_time, 'PPG_DecayTime': decay_time, 
        'PPG_PAV_mean': pav_mean, 'PPG_PAV_std': pav_std, 'PPG_RI': ri
    })

    eda_mean, eda_std, eda_min, eda_max, eda_range, eda_slope, scl_mean, scl_std, scr_mean, scr_std, corr_scl_t, scr_count, scr_amp, scr_sum, scr_area = compute_eda_metrics(eda_signal, fs_dict['EDA'])

    feature_dict.update({
        'EDA_mean': eda_mean, 'EDA_std': eda_std,
        'EDA_min': eda_min, 'EDA_max': eda_max,
        'EDA_range': eda_range, 'EDA_slope': eda_slope,
        'scl_mean': scl_mean, 'scl_std': scl_std, 'scr_mean': scr_mean, 'scr_std': scr_std,
        'corr_scl_t': corr_scl_t, 'scr_count': scr_count, 'scr_amp': scr_amp,
        'scr_sum': scr_sum, 'scr_area': scr_area
    })

    feature_dict.update({
        'Resp_mean': np.mean(resp_signal), 'Resp_std': np.std(resp_signal),
    })

    I_mean, I_std, E_mean, E_std, ie_ratio, resp_range, insp_vol, resp_rate, resp_duration = compute_respiration_metrics(resp_signal, fs_dict['RESP'])
              
    feature_dict.update({
        'Resp_I_mean': I_mean, 'Resp_I_std': I_std, 'Resp_E_mean': E_mean, 'Resp_E_std': E_std,
        'Resp_IE_ratio': ie_ratio, 'Resp_range': resp_range, 'Resp_insp_vol': insp_vol,
        'Resp_rate': resp_rate, 'Resp_duration': resp_duration
    })

    # Convertir a DataFrame con solo una fila
    df = pd.DataFrame([feature_dict])

    df["EDA_slope"] = df["EDA_slope"].apply(lambda x: x[0] if isinstance(x, list) else x)

    df.fillna(0, inplace=True)

    return df

In [120]:
def get_samples(data_dict, labels, fs_dict, stride_seconds):
    global feat_names
    global WINDOW_IN_SECONDS

    samples = []
    all_samples = []
    
    # Convertir tiempo a muestras
    window_len = int(fs_dict['ECG'] * WINDOW_IN_SECONDS)  # Se toma una señal como referencia
    stride_len = int(fs_dict['ECG'] * stride_seconds)  

    num_ventanas = (len(labels) - window_len) // stride_len + 1
    print(f"El número de ventanas esperadas es: {num_ventanas}")

    last_progress = -10
    processed = 0

    all_samples = pd.DataFrame()

    for start in range(0, len(labels) - window_len + 1, stride_len):
        end = start + window_len

        processed += 1  # Incrementar contador manualmente
        progress = processed / num_ventanas * 100

        if processed % 500 == 0:
            gc.collect()

        if processed % 50 == 0:

            print(f"\rProgreso: {progress:.4f}% completado", end="", flush=True)

        # Extraer ventana de cada señal
        #print(f"\nVentana {start} - {end} - Numero {processed}", end="", flush= True)
        window_data = {key: val[start:end] for key, val in data_dict.items()}
        window_labels = labels[start:end]  # Extraer etiquetas de la ventana

        # Aplicar hard labeling: etiqueta más frecuente en la ventana
        label_counts = Counter(window_labels)
        most_common_labels = label_counts.most_common()  # [(label1, count1), (label2, count2), ...]
        
        # Si hay empate, tomar la primera que aparece en la ventana original
        max_count = most_common_labels[0][1]
        candidate_labels = [label for label, count in most_common_labels if count == max_count]
        chosen_label = next(label for label in window_labels if label in candidate_labels)

        features_df = compute_features(window_data, fs_dict)

        # Agregar la etiqueta al DataFrame de features
        features_df['label'] = chosen_label

        all_samples = pd.concat([all_samples, features_df], ignore_index=True)

    if all_samples.empty:
        print("Advertencia: No se generaron muestras en get_samples(), devolviendo DataFrame vacío.")

    print("\n Procesamiento de ventanas completado.")  
    
    return all_samples

In [121]:
def make_patient_data(subject_id):
    global SAVE_PATH
    global WINDOW_IN_SECONDS

    # Crear objeto SubjectData
    subject = SubjectData(main_path=DATA_PATH, subject_number=subject_id)

    # Obtener datos del pecho (ahora en DataFrame)
    data_df = subject.get_data()

    print("Columnas disponibles:", data_df.columns)
    print("Etiquetas:", subject.labels.value_counts())

    # Definir clases de interés
    valid_labels = {1, 2, 3, 4}
    
    # Filtrar datos válidos
    mask = subject.labels.isin(valid_labels)
    filtered_labels = subject.labels[mask].reset_index(drop=True)
    filtered_data = data_df[mask].reset_index(drop=True)

    filtered_data = filtered_data.copy()

    filtered_data['ECG'] = butter_filter(filtered_data['ECG'], cutoff=(0.5, 40), fs=fs_dict['ECG'], order=4, filter_type='band')
    filtered_data['PPG'] = butter_filter(filtered_data['PPG'], cutoff=(0.5, 8), fs=fs_dict['PPG'], order=2, filter_type='band')
    filtered_data['EDA'] = butter_filter(filtered_data['EDA'], cutoff=1, fs=fs_dict['EDA'], order=2, filter_type='low')
    filtered_data['RESP'] = butter_filter(filtered_data['RESP'], cutoff=0.5, fs=fs_dict['RESP'], order=4, filter_type='low')

    print("Data lista para procesar: " + str(len(filtered_labels)))

    # Calcular características
    samples = get_samples(filtered_data, filtered_labels, fs_dict=fs_dict, stride_seconds=STRIDE_IN_SECONDS)

    print("Características calculadas")

    if not isinstance(samples, pd.DataFrame):
        all_samples = pd.concat(samples, ignore_index=True)
    else:
        all_samples = samples.copy()

    all_samples['label'] = all_samples['label'].astype(int)
    all_samples = pd.concat([all_samples.drop('label', axis=1), pd.get_dummies(all_samples['label'])], axis=1)

    # Guardar como CSV
    all_samples.to_csv(f'{SAVE_PATH}/{subject_id}_features.csv', index=False)

    # **Liberar memoria**
    del subject, all_samples, samples, data_df, filtered_data, filtered_labels
    gc.collect()  # Forzar recolección de basura

In [122]:
def combine_files(subjects):
    df_list = []
    for s in subjects:
        df = pd.read_csv(f'{SAVE_PATH}/{s}_features.csv')
        df['subject'] = s
        df_list.append(df)

    df = pd.concat(df_list)

    print(df.head(10))
    print(df.columns)

    df[['1', '2', '3', '4']] = df[['1', '2', '3', '4']].fillna(0).astype(int)
    df[['1', '2', '3', '4']] = df[['1', '2', '3', '4']].astype(int)
    df['label'] = df[['1', '2', '3', '4']].idxmax(axis=1).astype(int)
    df.drop(['1', '2', '3', '4'], axis=1, inplace=True)

    df.reset_index(drop=True, inplace=True)

    df.to_csv(f'{SAVE_PATH}/features.csv')

    counts = df['label'].value_counts()

    print("Índices en counts:", counts.index.tolist())
    print("Claves en int_to_label:", int_to_label.keys())

    print('Number of samples per class:')
    for label, number in zip(counts.index, counts.values):
        print(f'{int_to_label[label]}: {number}')

In [123]:
#subject_ids = [4, 6, 7, 8, 9, 10, 11, 13, 15, 19, 21]
subject_ids = [4, 6, 7, 8, 9, 10, 11, 13, 14, 15, 18, 19, 20, 21, 22, 29, 30, 35, 37, 38, 39, 40, 41, 42, 43, 44]

for patient in subject_ids:
    print(f'Processing data for {patient}...')
    make_patient_data(patient)


Processing data for 4...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
3    1506462
0    1505087
1    1503834
4     954719
2     601913
Name: count, dtype: int64
Data lista para procesar: 4566928
El número de ventanas esperadas es: 6050
Progreso: 100.0000% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 6...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
1    1501180
0    1498629
3    1464787
4     917460
2     600581
Name: count, dtype: int64
Data lista para procesar: 4484008
El número de ventanas esperadas es: 5939
Progreso: 99.3433% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 7...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
0    1498972
1    1498311
3    1485776
4     918503
2     604285
Name: count, dtype: int64
Data lista para procesar: 4506875

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Progreso: 99.8350% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 40...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
0    1549810
3    1541245
1    1500659
4     980675
2     600854
Name: count, dtype: int64
Data lista para procesar: 4623433
El número de ventanas esperadas es: 6125
Progreso: 99.5918% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 41...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
0    1590416
3    1541626
1    1511648
4     939817
2     626288
Name: count, dtype: int64
Data lista para procesar: 4619379
El número de ventanas esperadas es: 6120
Progreso: 99.6732% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 42...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
0    1575144
3    1525032


  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

Progreso: 49.1562% completado

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Progreso: 99.9508% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 43...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
3    1526183
0    1507906
1    1502948
4     965664
2     611827
Name: count, dtype: int64
Data lista para procesar: 4606622
El número de ventanas esperadas es: 6103
Progreso: 63.0837% completado

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

Progreso: 63.9030% completado

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)
  return _methods._mean(a, axis=

Progreso: 99.9508% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for 44...
Columnas disponibles: Index(['RESP', 'PPG', 'ECG', 'EDA'], dtype='object')
Etiquetas: label
0    1508188
3    1505979
1    1502093
4     933912
2     600754
Name: count, dtype: int64
Data lista para procesar: 4542738
El número de ventanas esperadas es: 6017
Progreso: 68.9712% completado

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


Progreso: 99.7175% completado
 Procesamiento de ventanas completado.
Características calculadas


In [124]:
subject_ids = [4, 6, 7, 8, 9, 10, 11, 13, 15, 19, 21]

combine_files(subject_ids)
print('Processing complete.')

     HR_mean    HR_std  NN50      pNN50        TINN     rmsHRV          LF  \
0  64.799331  3.230688     9  29.032258  914.000000  39.770592  246.312945   
1  65.235118  5.630104    10  31.250000  908.090909  60.664067  275.814200   
2  64.987815  5.129443    10  31.250000  897.333333  57.220287  317.871631   
3  64.328073  3.313657     9  28.125000  914.000000  39.608976  354.793217   
4  64.290899  3.361635     8  25.806452  914.000000  38.530940  371.603869   
5  64.089312  3.182478     7  22.580645  914.000000  38.275754  402.452944   
6  63.921919  3.201196     7  22.580645  914.000000  38.783158  429.329815   
7  64.306528  5.160580     7  21.875000  963.625000  43.966630  440.258436   
8  64.277569  6.968145     6  19.354839  963.000000  63.117879  445.182266   
9  63.528930  2.906676     5  16.129032  914.000000  35.052342  447.987485   

           HF     LF_HF        sum_f  ...  Resp_IE_ratio  Resp_range  \
0  232.861720  1.057765  1416.924553  ...       0.980761   10.256197 

In [125]:
print('holi')

holi
