In [453]:
import os
import numpy as np
#import cupy as cp
import pandas as pd
import matplotlib.pyplot as plt
from scipy import stats
import scipy.signal as scisig
import scipy.stats
from scipy.interpolate import interp1d 
from collections import Counter
import scipy.interpolate as interp
#import cupyx.scipy.signal as cusig
import neurokit2 as nk
from scipy.signal import find_peaks, welch, butter, filtfilt, convolve
from scipy.optimize import curve_fit
#import cupyx.scipy.signal as cpysig
import gc
import pickle

In [454]:
import faulthandler
# Have the interpreter dump Python tracebacks on fatal errors (e.g. a crash
# inside a native extension) instead of the kernel dying without a trace.
faulthandler.enable()

In [455]:
# Sampling rate (samples per second) for every channel used below, labels included.
fs_dict = dict.fromkeys(('ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp', 'label'), 700)

# Sliding-window geometry, in seconds (consumed by get_samples / make_patient_data).
WINDOW_IN_SECONDS = 60
STRIDE_IN_SECONDS = 0.25

# Class name <-> integer code; the second dict is the inverse of the first.
label_dict = {'baseline': 1, 'stress': 2, 'amusement': 3}
int_to_label = {code: name for name, code in label_dict.items()}

feat_names = None

# Input dataset root and output directory for the extracted feature CSVs.
DATA_PATH = r'WESAD/'
SAVE_PATH = r'WESAD_DATA_60_025/'


In [456]:
# Create the output directory up front. exist_ok=True replaces the
# check-then-create pair (which can race) and keeps the cell safe to re-run.
os.makedirs(SAVE_PATH, exist_ok=True)

In [457]:
#def eda_stats(y):
#    Fs = fs_dict['EDA']
#    yn = (y - y.mean()) / y.std()
#    print(yn)
#    print("calculating eda stats")
#    [r, p, t, l, d, e, obj] = cvxEDA.cvxEDA(yn, 1. / Fs)
#    return [r, p, t, l, d, e, obj]

In [458]:
def compute_eda_metrics(eda_signal, fs, show_plots=False):
    """Summarise one EDA window as 15 scalar features.

    The signal is split into a slow component (SCL, a 10-second moving
    average) and a fast residual (SCR = signal - SCL); peaks are then
    detected on the residual.

    Parameters
    ----------
    eda_signal : array-like
        EDA samples; flattened to 1-D before use.
    fs : float
        Sampling rate in samples per second.
    show_plots : bool, optional
        When True, draw three diagnostic matplotlib panels.

    Returns
    -------
    15 values, in this order:
        eda_mean, eda_std, eda_min, eda_max, eda_range, eda_slope,
        scl_mean, scl_std, scr_mean, scr_std, corr_SCL_t,
        scr_count, scr_amp, scr_sum, scr_area.
        A list of 15 NaNs is returned when the signal has fewer than 2 samples.
    """
    eda_signal = np.array(eda_signal).flatten()

    if len(eda_signal) < 2:
        # One NaN per feature. This must match the 15 values returned below,
        # otherwise callers that unpack the result fail with ValueError.
        return [np.nan] * 15

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # --------- Whole-signal statistics ----------
    eda_mean = np.mean(eda_signal)
    eda_std = np.std(eda_signal)
    eda_min = np.min(eda_signal)
    eda_max = np.max(eda_signal)
    eda_range = eda_max - eda_min
    # NOTE(review): divides by len(), whereas get_slope() divides by len() - 1.
    eda_slope = (eda_signal[-1] - eda_signal[0]) / len(eda_signal)

    # --------- Component estimation ----------
    # Tonic component (SCL): moving average over a 10-second window.
    # max(1, ...) keeps the kernel non-empty for very small fs.
    win_scl = max(1, int(fs * 10))
    kernel = np.ones(win_scl) / win_scl
    scl = convolve(eda_signal, kernel, mode='same', method='auto')

    # Phasic component (SCR): what is left after removing the SCL.
    scr = eda_signal - scl

    # --------- SCL metrics ----------
    scl_mean = np.mean(scl)
    scl_std = np.std(scl)

    # Correlation of SCL with time; a constant SCL gives NaN, mapped to 0.
    time = np.arange(len(scl)) / fs
    corr_SCL_t = np.nan_to_num(np.corrcoef(time, scl)[0, 1])

    # --------- SCR peak detection ----------
    # Thresholds are tunable; peaks must be at least one second apart.
    scr_peaks, _ = find_peaks(scr, height=0.01, prominence=0.05, distance=int(1.0 * fs))
    scr_count = len(scr_peaks)

    if scr_count > 0:
        scr_values = scr[scr_peaks]
        scr_mean = np.nanmean(scr_values)
        scr_std = np.nanstd(scr_values)
        scr_amp = np.nanmax(scr_values)
        scr_sum = np.nansum(scr_values)
    else:
        scr_mean = scr_std = scr_amp = scr_sum = 0

    # NOTE(review): this integrates the peak heights as if they were 1/fs
    # apart, not the SCR curve itself -- confirm this is the intended "area".
    scr_area = integrate(scr[scr_peaks], dx=1/fs) if scr_count > 1 else 0

    if show_plots:
        scr_norm = (scr - np.mean(scr)) / np.std(scr)
        start_sec = 5
        end_sec = 60
        zoom_start = int(start_sec * fs)
        zoom_end = int(end_sec * fs)

        fig, axs = plt.subplots(3, 1, figsize=(14, 10))
        fig.suptitle("Análisis de EDA y detección de picos SCR", fontsize=16)

        # 1. Full signal with both components
        axs[0].plot(eda_signal, label='EDA Filtrada', linewidth=1)
        axs[0].plot(scl, label='SCL (Tónica)', linewidth=1)
        axs[0].plot(scr, label='SCR (Fásica)', linewidth=1)
        axs[0].scatter(scr_peaks, scr[scr_peaks], c='red', label='SCR Peaks')
        axs[0].set_title("Señal completa con componentes")
        axs[0].legend()
        axs[0].grid(True)

        # 2. Z-scored SCR
        axs[1].plot(scr_norm, color='green', label='SCR Normalizada')
        axs[1].scatter(scr_peaks, scr_norm[scr_peaks], c='red', label='Picos detectados')
        axs[1].axhline(0, color='gray', linestyle='--', linewidth=0.5)
        axs[1].set_title("SCR Normalizada")
        axs[1].legend()
        axs[1].grid(True)

        # 3. Zoom on a sub-range of samples
        zoom_peaks = [p for p in scr_peaks if zoom_start <= p < zoom_end]
        axs[2].plot(scr[zoom_start:zoom_end], label='SCR')
        axs[2].scatter(
            [p - zoom_start for p in zoom_peaks],
            [scr[p] for p in zoom_peaks],
            c='red', label='SCR Peaks'
        )
        axs[2].set_title(f"Zoom: muestras {zoom_start} a {zoom_end}")
        axs[2].legend()
        axs[2].grid(True)

        plt.tight_layout(rect=[0, 0, 1, 0.96])
        plt.show()

    return eda_mean, eda_std, eda_min, eda_max, eda_range, eda_slope, scl_mean, scl_std, scr_mean, scr_std, corr_SCL_t, scr_count, scr_amp, scr_sum, scr_area

In [459]:
def compute_emg_features(emg_signal, fs, show_plots=False):
    """Summarise one EMG window as 20 scalar features.

    Parameters
    ----------
    emg_signal : array-like
        EMG samples; flattened to 1-D before use.
    fs : float
        Sampling rate in samples per second.
    show_plots : bool, optional
        When True, plot the signal with the detected peaks and the threshold.

    Returns
    -------
    tuple of 20 values, in this order:
        emg_mean, emg_std, emg_median, emg_p10, emg_p90, emg_range, emg_sum,
        f_peak, seven relative band powers (0-10, 10-20, 20-50, 50-100,
        100-150, 150-250, 250-350 Hz), peak_count, peak_amp_mean,
        peak_amp_std, peak_amp_sum, peak_amp_norm.
        All NaN when the signal is empty.
    """
    emg_signal = np.array(emg_signal).flatten()

    if emg_signal.size == 0:
        # Nothing to measure; keep the 20-value shape so callers can index it.
        return (np.nan,) * 20

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # --- Statistics of the signal as given ---
    emg_mean = np.mean(emg_signal)
    emg_std = np.std(emg_signal)
    emg_median = np.median(emg_signal)
    emg_p10 = np.percentile(emg_signal, 10)
    emg_p90 = np.percentile(emg_signal, 90)
    emg_range = np.max(emg_signal) - np.min(emg_signal)
    emg_sum = np.sum(emg_signal)

    # --- Power spectrum ---
    # Clamp nperseg to the window length (welch would do the same, with a warning).
    fxx, pxx = scisig.welch(emg_signal, fs=fs, nperseg=min(1024, len(emg_signal)))
    f_peak = fxx[np.argmax(pxx)]

    # Half-open [lo, hi) bands in Hz. Integration uses unit spacing, which
    # cancels out in the relative powers computed below.
    band_edges = [(0, 10), (10, 20), (20, 50), (50, 100), (100, 150), (150, 250), (250, 350)]
    psd_bands = [integrate(pxx[(fxx >= lo) & (fxx < hi)]) for lo, hi in band_edges]
    psd_total = sum(psd_bands)
    psd_rel = [band / psd_total if psd_total > 0 else 0 for band in psd_bands]

    # --- Peak detection: local maxima above mean + 2 * std ---
    threshold = emg_mean + 2 * emg_std
    peak_indices, _ = scisig.find_peaks(emg_signal, height=threshold)
    # With no peaks, a single 0 makes mean/std/sum all come out as 0.
    peak_values = emg_signal[peak_indices] if len(peak_indices) > 0 else np.array([0])

    peak_count = len(peak_indices)
    peak_amp_mean = np.mean(peak_values)
    peak_amp_std = np.std(peak_values)
    peak_amp_sum = np.sum(peak_values)
    peak_amp_norm = peak_amp_sum / len(emg_signal)

    # --- Optional plot ---
    if show_plots:
        time = np.arange(len(emg_signal)) / fs
        plt.figure(figsize=(12, 4))
        plt.plot(time, emg_signal, label='EMG Signal')
        plt.plot(time[peak_indices], emg_signal[peak_indices], 'rx', label='Detected Peaks')
        # Legend text now matches the threshold actually drawn (mean + 2 * std).
        plt.axhline(y = threshold, color='gray', linestyle='--', label='Mean + 2*STD')
        plt.xlabel('Time (s)')
        plt.ylabel('Amplitude')
        plt.title('EMG Signal with Detected Peaks')
        plt.legend()
        plt.grid(True)
        plt.tight_layout()
        plt.show()

    return emg_mean, emg_std, emg_median, emg_p10, emg_p90, emg_range, emg_sum, f_peak, *psd_rel, peak_count, peak_amp_mean, peak_amp_std, peak_amp_sum, peak_amp_norm

In [460]:
def compute_hrv_metrics(ecg_signal, fs, show_plots=False):
    """Derive 13 heart-rate / HRV features from one ECG window.

    Peaks are located with a simple threshold (mean + std) and a minimum
    spacing of 0.4 s; the intervals between them are kept when they fall
    strictly between 400 and 1500 ms.

    Parameters
    ----------
    ecg_signal : array-like
        ECG samples; flattened to 1-D before use.
    fs : float
        Sampling rate in samples per second.
    show_plots : bool, optional
        When True, plot the detected peaks, the interpolation and the spectrum.

    Returns
    -------
    tuple of 13 values, in this order:
        hr_mean, hr_std, nn50, pnn50, tinn, rms_hrv, lf, hf, lf_hf_ratio,
        sum_f, rel_f, lf_norm, hf_norm.
        Thirteen zeros when fewer than 2 peaks or fewer than 2 valid
        intervals are found. Spectral values may be NaN when a band
        contains no frequency bin.
    """
    ecg_signal = np.array(ecg_signal).flatten()
    no_result = (0,) * 13

    # np.trapz is deprecated since NumPy 2.0 in favour of np.trapezoid.
    integrate = getattr(np, "trapezoid", None) or np.trapz

    # Peak detection
    peaks, _ = scisig.find_peaks(ecg_signal, distance=fs*0.4, height=np.mean(ecg_signal) + np.std(ecg_signal))

    if len(peaks) < 2:
        return no_result

    # Inter-peak intervals, converted from samples to milliseconds
    rr_intervals = np.diff(peaks) * (1000 / fs)
    rr_intervals = rr_intervals[(rr_intervals > 400) & (rr_intervals < 1500)]  # plausibility filter

    if len(rr_intervals) < 2:
        return no_result

    # From here on there are at least 2 intervals, so no emptiness checks are needed.
    hr_mean = 60000 / np.mean(rr_intervals)
    hr_std = np.std(60000 / rr_intervals)

    # Time-domain features
    rr_diffs = np.diff(rr_intervals)
    nn50 = np.sum(np.abs(rr_diffs) > 50)
    pnn50 = (nn50 / len(rr_intervals)) * 100
    rms_hrv = np.sqrt(np.mean(np.square(rr_diffs)))

    # NOTE(review): this is the left edge of the most populated histogram bin,
    # which is not the usual triangular-interpolation definition of TINN -- confirm.
    hist, bin_edges = np.histogram(rr_intervals, bins='auto')
    tinn = bin_edges[np.argmax(hist)] if len(hist) > 0 else np.nan

    # Put the intervals on a uniform 4 Hz grid so Welch can be applied
    time_rr = np.cumsum(rr_intervals) / 1000
    time_rr = time_rr - time_rr[0]  # start the time axis at 0

    fs_resample = 4.0
    time_resampled = np.arange(0, time_rr[-1], 1 / fs_resample)
    interp_func = interp1d(time_rr, rr_intervals, kind="linear", fill_value="extrapolate")
    rr_resampled = interp_func(time_resampled)

    # Spectral analysis (PSD)
    fxx, pxx = scisig.welch(rr_resampled, fs=fs_resample, nperseg=min(len(rr_resampled), 256))
    lf_band, hf_band = (0.04, 0.15), (0.15, 0.4)

    lf_mask = (fxx >= lf_band[0]) & (fxx <= lf_band[1])
    hf_mask = (fxx >= hf_band[0]) & (fxx <= hf_band[1])

    lf = integrate(pxx[lf_mask], fxx[lf_mask]) if np.any(lf_mask) else np.nan
    hf = integrate(pxx[hf_mask], fxx[hf_mask]) if np.any(hf_mask) else np.nan
    # Comparisons with NaN are False, so NaN inputs fall through to NaN outputs.
    lf_hf_ratio = lf / hf if (hf > 0 and not np.isnan(hf)) else np.nan

    sum_f = integrate(pxx, fxx)
    rel_f = (lf + hf) / sum_f if sum_f > 0 else np.nan
    lf_norm = (lf / (lf + hf)) * 100 if (lf + hf) > 0 else np.nan
    hf_norm = (hf / (lf + hf)) * 100 if (lf + hf) > 0 else np.nan

    if show_plots:
        plt.figure(figsize=(10, 4))
        plt.plot(ecg_signal, label="ECG", color='gray')
        plt.scatter(peaks, ecg_signal[peaks], color="red", label="Picos R")
        plt.legend()
        plt.title("Detección de Picos R en ECG")
        # The x axis is the sample index, not milliseconds.
        plt.xlabel("Muestra")
        plt.ylabel("Amplitud")
        plt.show()

        plt.figure()
        plt.plot(time_rr, rr_intervals, "o-", label="Original RR")
        plt.plot(time_resampled, rr_resampled, "x-", label="Interpolado")
        plt.legend()
        plt.title("Interpolación de RR intervals")
        plt.show()

        plt.figure()
        plt.semilogy(fxx, pxx)  # log scale on the PSD axis
        plt.axvspan(0.04, 0.15, color="blue", alpha=0.3, label="LF Band")
        plt.axvspan(0.15, 0.4, color="red", alpha=0.3, label="HF Band")
        plt.legend()
        plt.title("Espectro HRV")
        plt.xlabel("Frecuencia (Hz)")
        plt.ylabel("PSD")
        plt.show()

    return hr_mean, hr_std, nn50, pnn50, tinn, rms_hrv, lf, hf, lf_hf_ratio, sum_f, rel_f, lf_norm, hf_norm

In [461]:
def compute_emg_peaks(emg_signal, threshold=0.05):
    """Count peaks in a min-max normalised EMG signal.

    Parameters
    ----------
    emg_signal : array-like
        EMG samples (list, Series or ndarray of any shape); flattened to 1-D.
    threshold : float, optional
        Minimum peak height on the normalised 0..1 scale.

    Returns
    -------
    (int, float)
        Number of peaks and their mean normalised amplitude. ``(0, 0.0)`` is
        returned when there are no peaks, or when the signal is empty,
        constant or contains NaN (so it cannot be normalised).
    """
    # np.asarray lets lists and pandas Series through, matching the other
    # feature functions in this notebook.
    emg_signal = np.asarray(emg_signal, dtype=float).flatten()
    if emg_signal.size == 0:
        return 0, 0.0

    lowest = np.min(emg_signal)
    span = np.max(emg_signal) - lowest
    # A zero (constant signal) or NaN span would make the division meaningless.
    if not span > 0:
        return 0, 0.0

    normalised = (emg_signal - lowest) / span

    # Local maxima at or above the threshold
    peaks, _ = scisig.find_peaks(normalised, height=threshold)
    if len(peaks) == 0:
        return 0, 0.0

    return len(peaks), np.mean(normalised[peaks])

In [462]:
def compute_respiration_metrics(resp_signal, fs, show_plots=False):
    """Derive nine breathing features from one respiration window.

    Peaks and troughs are located with a minimum spacing of 0.3 s. The
    "I" statistics come from peak-to-peak gaps and the "E" statistics from
    trough-to-trough gaps, both in seconds.

    Returns
    -------
    tuple of 9 values, in this order:
        I_mean, I_std, E_mean, E_std, ie_ratio, resp_range, insp_vol,
        resp_rate (peaks per second), resp_duration (seconds).
        Nine zeros when fewer than 2 peaks or fewer than 2 troughs are found.
    """
    resp_signal = np.array(resp_signal).flatten()

    min_gap = fs * 0.3
    peaks = scisig.find_peaks(resp_signal, distance=min_gap)[0]
    troughs = scisig.find_peaks(-resp_signal, distance=min_gap)[0]

    # Need at least two of each to measure a gap.
    if min(len(peaks), len(troughs)) < 2:
        return 0, 0, 0, 0, 0, 0, 0, 0, 0

    peak_gaps = np.diff(peaks) / fs
    trough_gaps = np.diff(troughs) / fs

    I_mean, I_std = np.mean(peak_gaps), np.std(peak_gaps)
    E_mean, E_std = np.mean(trough_gaps), np.std(trough_gaps)

    if E_mean > 0:
        ie_ratio = I_mean / E_mean
    else:
        ie_ratio = 0

    resp_range = np.max(resp_signal) - np.min(resp_signal)
    # Mean peak level minus mean trough level
    insp_vol = np.mean(resp_signal[peaks]) - np.mean(resp_signal[troughs])
    resp_duration = len(resp_signal) / fs
    resp_rate = len(peaks) / resp_duration

    if show_plots:
        plt.plot(resp_signal, label="Señal Respiratoria")
        plt.scatter(peaks, resp_signal[peaks], color='red', label="Picos")
        plt.scatter(troughs, resp_signal[troughs], color='blue', label="Valles")
        plt.legend()
        plt.show()

    return I_mean, I_std, E_mean, E_std, ie_ratio, resp_range, insp_vol, resp_rate, resp_duration


In [463]:
class SubjectData:
    """One subject's recording, loaded from ``<main_path>/S<n>/S<n>.pkl``.

    After construction ``self.data`` holds the unpickled dictionary (expected
    to contain ``'signal'`` -> ``'chest'`` / ``'wrist'`` and ``'label'``) and
    ``self.labels`` is a one-column DataFrame named ``"label"``.
    """

    def __init__(self, main_path, subject_number):
        """Load the pickle for subject ``S<subject_number>`` under ``main_path``."""
        self.name = f'S{subject_number}'
        self.subject_keys = ['signal', 'label', 'subject']
        self.signal_keys = ['chest', 'wrist']
        self.chest_keys = ['ACC', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp']
        self.wrist_keys = ['ACC', 'BVP', 'EDA', 'TEMP']
        pkl_path = os.path.join(main_path, self.name, self.name + '.pkl')
        # SECURITY: unpickling runs arbitrary code from the file; only load
        # dataset files obtained from a trusted source.
        with open(pkl_path, 'rb') as file:
            self.data = pickle.load(file, encoding='latin1')
        self.labels = pd.DataFrame(self.data['label'], columns=["label"])

    def get_wrist_data(self):
        """Return the wrist signals plus the chest 'Resp' channel.

        A new dict is returned, so the loaded ``self.data`` is left untouched
        (the arrays themselves are shared, not copied).
        """
        wrist = dict(self.data['signal']['wrist'])
        wrist['Resp'] = self.data['signal']['chest']['Resp']
        return wrist

    def get_chest_data(self):
        """Return the chest signals as one DataFrame.

        Columns: ACC_X, ACC_Y, ACC_Z, ECG, EMG, EDA, Temp, Resp.
        """
        tmp = self.data['signal']['chest']
        acc = tmp["ACC"]
        tmp_acc = pd.DataFrame(acc, columns=["ACC_X", "ACC_Y", "ACC_Z"])

        # ravel() turns each single-channel array into a flat column.
        tmp_other = pd.DataFrame({
            "ECG": tmp["ECG"].ravel(),
            "EMG": tmp["EMG"].ravel(),
            "EDA": tmp["EDA"].ravel(),
            "Temp": tmp["Temp"].ravel(),
            "Resp": tmp["Resp"].ravel()
        })

        df = pd.concat([tmp_acc, tmp_other], axis=1)

        return df

    def extract_features(self):
        """Apply ``get_statistics`` to every chest column.

        Returns a dict keyed by column name. The frame is built once and the
        real column names are used: iterating ``self.chest_keys`` would look
        up an 'ACC' column that ``get_chest_data`` never creates.

        NOTE(review): ``get_statistics`` is not defined in the visible part of
        this notebook; this method needs it to be provided elsewhere.
        """
        chest_df = self.get_chest_data()
        results = \
            {
                column: get_statistics(chest_df[column].to_numpy(), self.labels, column)
                for column in chest_df.columns
            }
        return results

In [464]:
def butter_filter(data, cutoff, fs, order=4, filter_type='low'):
    """
    Zero-phase Butterworth filtering of a 1-D signal.

    Parameters:
    - data: samples to filter (array-like).
    - cutoff: corner frequency, or a list/tuple of two for band filters.
    - fs: sampling rate, in the same units as cutoff.
    - order: filter order.
    - filter_type: passed to scipy as btype ('low', 'high' or 'band').

    Returns:
    - The filtered signal as a NumPy array.
    """
    nyquist = fs / 2.0

    # scipy expects corner frequencies as a fraction of Nyquist.
    is_band = isinstance(cutoff, (list, tuple))
    wn = [edge / nyquist for edge in cutoff] if is_band else cutoff / nyquist

    coeffs = scisig.butter(order, wn, btype=filter_type, analog=False)

    # Forward-backward filtering, so the output has no phase shift.
    samples = np.array(data)
    return scisig.filtfilt(coeffs[0], coeffs[1], samples)


def get_slope(signal):
    """Return (last - first) / (n - 1) for a 1-D signal, or 0 if n < 2."""
    n_samples = len(signal)
    if n_samples < 2:
        # A single sample (or none) has no slope.
        return 0

    # Go through a Series so .iloc gives positional access whatever the index.
    series = signal if isinstance(signal, pd.Series) else pd.Series(signal)
    slope = (series.iloc[-1] - series.iloc[0]) / (n_samples - 1)

    # Unwrap to a scalar if the elements were themselves arrays.
    if isinstance(slope, np.ndarray):
        return slope.flatten()[0]
    return slope

def get_peak_freq(x, fs):
    """Return the frequency at which the periodogram of ``x`` is largest.

    ``fs`` is the sampling rate; the result is in the same units.

    NOTE: another ``get_peak_freq`` is defined further down in this cell and
    rebinds the name, so keep the two in sync or remove one of them.
    """
    f, Pxx = scisig.periodogram(x, fs=fs)
    # argmax is direct and well defined; keying a dict by amplitude would
    # collapse equal amplitudes and misbehave on NaN.
    return f[np.argmax(Pxx)]

def get_window_stats(data, label=-1):
    """Return mean, std, min and max of ``data`` plus the given label, as a dict."""
    return {
        'mean': np.mean(data),
        'std': np.std(data),
        'min': np.min(data),
        'max': np.max(data),
        'label': label,
    }


def get_net_accel(data):
    """Return the per-sample Euclidean norm of the three acceleration axes.

    ``data`` must provide 'ACC_x', 'ACC_y' and 'ACC_z' (DataFrame columns or
    a mapping of arrays). np.sqrt is applied to the whole column at once
    instead of element by element.

    NOTE(review): SubjectData.get_chest_data names its columns 'ACC_X',
    'ACC_Y', 'ACC_Z' (upper case), so its output cannot be passed here as is.
    """
    return np.sqrt(data['ACC_x'] ** 2 + data['ACC_y'] ** 2 + data['ACC_z'] ** 2)


def get_peak_freq(x, fs=8):
    """Return the frequency at which the periodogram of ``x`` is largest.

    ``fs`` is the sampling rate and defaults to 8, the value this function
    previously hard-coded, so existing ``get_peak_freq(x)`` calls behave the
    same while ``get_peak_freq(x, fs)`` is also accepted.
    """
    f, Pxx = scisig.periodogram(x, fs=fs)
    # argmax is direct and well defined; keying a dict by amplitude would
    # collapse equal amplitudes and misbehave on NaN.
    peak_freq = f[np.argmax(Pxx)]
    return peak_freq


# https://github.com/MITMediaLabAffectiveComputing/eda-explorer/blob/master/AccelerometerFeatureExtractionScript.py
def filterSignalFIR(eda, cutoff=0.4, numtaps=64):
    """Low-pass ``eda`` with a windowed FIR filter (causal, single pass).

    The cutoff is normalised with the rate stored under fs_dict['ACC'].
    ``eda`` must be a NumPy array; it is flattened before filtering.
    """
    nyquist = fs_dict['ACC'] / 2.0
    taps = scisig.firwin(numtaps, cutoff / nyquist)
    flat_signal = eda.flatten()
    return scisig.lfilter(taps, 1, flat_signal)


def triangle(x, a, b, c):
    """Triangular bump: height ``a`` at ``x == b``, falling by ``1/c`` per unit, floored at 0."""
    height = a - np.abs(x - b) / c
    return np.maximum(0, height)

In [465]:
def compute_features(data_dict, fs_dict):
    """Build the one-row feature table for a single window.

    ``data_dict`` must provide 'ECG', 'EDA', 'EMG', 'Resp' and 'Temp';
    ``fs_dict`` gives the sampling rate per channel. Returns a DataFrame with
    one row, columns ordered HRV, EDA, EMG, respiration, temperature, and
    every NaN replaced by 0.
    """
    ecg = data_dict['ECG']
    eda = data_dict['EDA']
    emg = data_dict['EMG']
    resp = data_dict['Resp']
    temp = data_dict['Temp']

    # --- Heart rate / HRV (13 values) ---
    (hr_mean, hr_std, nn50, pnn50, tinn, rms_hrv,
     lf, hf, lf_hf, sum_f, rel_f, lf_norm, hf_norm) = compute_hrv_metrics(ecg, fs_dict['ECG'])

    row = {
        'HR_mean': hr_mean, 'HR_std': hr_std,
        'NN50': nn50, 'pNN50': pnn50, 'TINN': tinn,
        'rmsHRV': rms_hrv, 'LF': lf, 'HF': hf, 'LF_HF': lf_hf,
        'sum_f': sum_f, 'rel_f': rel_f, 'LF_norm': lf_norm, 'HF_norm': hf_norm,
    }

    # --- EDA (15 values) ---
    (eda_mean, eda_std, eda_min, eda_max, eda_range, eda_slope,
     scl_mean, scl_std, scr_mean, scr_std, corr_scl_t,
     scr_count, scr_amp, scr_sum, scr_area) = compute_eda_metrics(eda, fs_dict['EDA'])

    row.update({
        'EDA_mean': eda_mean, 'EDA_std': eda_std,
        'EDA_min': eda_min, 'EDA_max': eda_max,
        'EDA_range': eda_range, 'EDA_slope': eda_slope,
        'scl_mean': scl_mean, 'scl_std': scl_std,
        'scr_mean': scr_mean, 'scr_std': scr_std,
        'corr_scl_t': corr_scl_t, 'scr_count': scr_count, 'scr_amp': scr_amp,
        'scr_sum': scr_sum, 'scr_area': scr_area,
    })

    # --- EMG: column i takes element i of the returned tuple ---
    emg_columns = (
        'EMG_mean', 'EMG_std', 'EMG_median', 'EMG_p10', 'EMG_p90',
        'EMG_range', 'EMG_sum', 'EMG_f_peak',
        'EMG_psd_0_10Hz', 'EMG_psd_10_20Hz', 'EMG_psd_20_50Hz',
        'EMG_psd_50_100Hz', 'EMG_psd_100_150Hz', 'EMG_psd_150_250Hz',
        'EMG_psd_250_350Hz',
        'EMG_peak_count', 'EMG_peak_amp_mean', 'EMG_peak_amp_std',
        'EMG_peak_amp_sum', 'EMG_peak_amp_norm',
    )
    emg_values = compute_emg_features(emg, fs_dict['EMG'])
    for position, column in enumerate(emg_columns):
        row[column] = emg_values[position]

    # --- Respiration ---
    row['Resp_mean'] = np.mean(resp)
    row['Resp_std'] = np.std(resp)

    (i_mean, i_std, e_mean, e_std, ie_ratio,
     resp_range, insp_vol, resp_rate, resp_duration) = compute_respiration_metrics(resp, fs_dict['Resp'])

    row.update({
        'Resp_I_mean': i_mean, 'Resp_I_std': i_std,
        'Resp_E_mean': e_mean, 'Resp_E_std': e_std,
        'Resp_IE_ratio': ie_ratio, 'Resp_range': resp_range, 'Resp_insp_vol': insp_vol,
        'Resp_rate': resp_rate, 'Resp_duration': resp_duration,
    })

    # --- Temperature ---
    row.update({
        'Temp_mean': np.mean(temp), 'Temp_std': np.std(temp),
        'Temp_min': np.min(temp), 'Temp_max': np.max(temp),
        'Temp_range': np.max(temp) - np.min(temp), 'Temp_slope': get_slope(temp),
    })

    # Single-row table; unwrap a slope if it arrived wrapped in a list.
    df = pd.DataFrame([row])
    for slope_column in ("EDA_slope", "Temp_slope"):
        df[slope_column] = df[slope_column].apply(lambda x: x[0] if isinstance(x, list) else x)

    df.fillna(0, inplace=True)
    return df

In [466]:
def get_samples(data_dict, labels_df, fs_dict, stride_seconds):
    """Slide a fixed-length window over the signals and featurise each one.

    Parameters
    ----------
    data_dict : DataFrame or mapping
        Anything whose ``.items()`` yields (channel name, sliceable signal).
    labels_df : DataFrame
        Must have a 'label' column aligned sample-by-sample with the signals.
    fs_dict : dict
        Sampling rates; fs_dict['ECG'] converts seconds to samples.
    stride_seconds : float
        Step between consecutive window starts, in seconds.

    The window length comes from the module-level WINDOW_IN_SECONDS.

    Returns
    -------
    DataFrame with one row per window (compute_features columns plus 'label',
    the most frequent label in the window). Empty when the input is shorter
    than one window.

    Raises
    ------
    ValueError
        If ``stride_seconds`` is shorter than one sample.
    """
    labels = labels_df['label']

    # Seconds -> samples, using one channel's rate as the reference
    window_len = int(fs_dict['ECG'] * WINDOW_IN_SECONDS)
    stride_len = int(fs_dict['ECG'] * stride_seconds)
    if stride_len < 1:
        raise ValueError(
            f"stride_seconds={stride_seconds} is shorter than one sample at {fs_dict['ECG']} Hz"
        )

    num_ventanas = max(0, (len(labels) - window_len) // stride_len + 1)
    print(f"El número de ventanas esperadas es: {num_ventanas}")

    # Collect the per-window rows and concatenate once at the end; growing a
    # DataFrame with concat inside the loop copies every previous row each time.
    window_frames = []

    window_starts = range(0, len(labels) - window_len + 1, stride_len)
    for processed, start in enumerate(window_starts, start=1):
        end = start + window_len

        if processed % 500 == 0:
            gc.collect()

        if processed % 50 == 0:
            progress = processed / num_ventanas * 100
            print(f"\rProgreso: {progress:.4f}% completado", end="", flush=True)

        # Same sample range from every channel and from the labels
        window_data = {key: val[start:end] for key, val in data_dict.items()}
        window_labels = labels[start:end]

        # Hard labelling: most frequent label in the window. Counter orders
        # equal counts by first appearance, so a tie goes to the label that
        # occurs first in the window.
        chosen_label = Counter(window_labels).most_common(1)[0][0]

        features_df = compute_features(window_data, fs_dict)
        features_df['label'] = chosen_label
        window_frames.append(features_df)

    if window_frames:
        all_samples = pd.concat(window_frames, ignore_index=True)
    else:
        print("Advertencia: No se generaron muestras en get_samples(), devolviendo DataFrame vacío.")
        all_samples = pd.DataFrame()

    print("\n Procesamiento de ventanas completado.")

    return all_samples

In [467]:
def make_patient_data(subject_id):
    """Extract windowed features for one subject and save them as CSV.

    Loads the subject's chest signals, keeps only samples labelled 1, 2 or 3,
    filters EDA, EMG and Resp, runs get_samples() and writes
    ``<SAVE_PATH>/<subject_id>_features.csv`` with the label one-hot encoded
    into columns named 1, 2 and 3.

    Raises
    ------
    ValueError
        If no window could be extracted for this subject.
    """
    subject = SubjectData(main_path=DATA_PATH, subject_number=subject_id)

    # Chest signals as a DataFrame
    data_df = subject.get_chest_data()

    print("Columnas disponibles:", data_df.columns)
    print("Etiquetas:", subject.labels.value_counts())

    # Classes of interest
    valid_labels = {1, 2, 3}

    # Keep only samples with a valid label
    mask = subject.labels['label'].isin(valid_labels)
    filtered_labels = subject.labels[mask].reset_index(drop=True)
    filtered_data = data_df[mask].reset_index(drop=True)

    # Explicit copy so the column assignments below never write into a view.
    filtered_data = filtered_data.copy()

    # NOTE(review): the filters run after the masked-out samples were dropped,
    # so they see a jump wherever two kept segments are joined. Filtering the
    # full recording before masking would avoid that -- confirm what is intended.
    filtered_data['EDA'] = butter_filter(filtered_data['EDA'], cutoff=5, fs=fs_dict['EDA'], order=4, filter_type='low')
    filtered_data['EMG'] = butter_filter(filtered_data['EMG'], cutoff=50, fs=fs_dict['EMG'], order=4, filter_type='low')
    filtered_data['Resp'] = butter_filter(filtered_data['Resp'], cutoff=(0.1, 0.35), fs=fs_dict['Resp'], order=2, filter_type='band')

    print("Data lista para procesar: " + str(len(filtered_labels)))

    samples = get_samples(filtered_data, filtered_labels, fs_dict=fs_dict, stride_seconds=STRIDE_IN_SECONDS)

    print("Características calculadas")

    # Accept either a finished DataFrame or a list of per-window frames.
    if isinstance(samples, pd.DataFrame):
        all_samples = samples.copy()
    else:
        all_samples = pd.concat(samples, ignore_index=True)

    if all_samples.empty:
        # Without this check the 'label' lookup below fails with a bare KeyError.
        raise ValueError(f"No feature windows were produced for subject {subject_id}")

    # One-hot encode the label; the new columns are named after the integer codes.
    all_samples['label'] = all_samples['label'].astype(int)
    all_samples = pd.concat([all_samples.drop('label', axis=1), pd.get_dummies(all_samples['label'])], axis=1)

    # Save as CSV (make sure the target directory exists first)
    os.makedirs(SAVE_PATH, exist_ok=True)
    all_samples.to_csv(os.path.join(SAVE_PATH, f'{subject_id}_features.csv'), index=False)

    # Drop the large objects and collect before returning to the caller's loop.
    del subject, all_samples, samples, data_df, filtered_data, filtered_labels
    gc.collect()

In [468]:
def combine_files(subjects):
    """Merge the per-subject feature CSVs into ``<SAVE_PATH>/features.csv``.

    Each ``<SAVE_PATH>/<s>_features.csv`` is read, tagged with a 'subject'
    column and concatenated. The one-hot columns '1', '2', '3' are collapsed
    back into a single integer 'label' column, and the per-class sample
    counts are printed.
    """
    df_list = []
    for s in subjects:
        df = pd.read_csv(os.path.join(SAVE_PATH, f'{s}_features.csv'))
        df['subject'] = s
        df_list.append(df)

    # ignore_index gives unique row labels, so the multi-column assignment
    # below never has to align on a duplicated index.
    df = pd.concat(df_list, ignore_index=True)

    print(df.head(10))
    print(df.columns)

    class_cols = ['1', '2', '3']
    # A class missing from every subject has no column at all; add it as zeros.
    for col in class_cols:
        if col not in df.columns:
            df[col] = 0

    # NaN appears where one subject lacked a class that another subject had.
    df[class_cols] = df[class_cols].fillna(0).astype(int)
    df['label'] = df[class_cols].idxmax(axis=1).astype(int)
    df.drop(class_cols, axis=1, inplace=True)

    df.reset_index(drop=True, inplace=True)

    df.to_csv(os.path.join(SAVE_PATH, 'features.csv'))

    counts = df['label'].value_counts()

    print("Índices en counts:", counts.index.tolist())
    print("Claves en int_to_label:", int_to_label.keys())

    print('Number of samples per class:')
    for label, number in zip(counts.index, counts.values):
        print(f'{int_to_label[label]}: {number}')

In [469]:
# Subject numbers to process (the list runs from 2 to 17 without 12).
subject_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]

# Extract features and write one CSV per subject. This is the slow step:
# the run captured below reports over 8000 windows for each of S2 and S3.
for patient in subject_ids:
    print(f'Processing data for S{patient}...')
    make_patient_data(patient)


Processing data for S2...
Columnas disponibles: Index(['ACC_X', 'ACC_Y', 'ACC_Z', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'], dtype='object')
Etiquetas: label
0        2142701
1         800800
4         537599
2         430500
3         253400
6          45500
7          44800
Name: count, dtype: int64
Data lista para procesar: 1484700
El número de ventanas esperadas es: 8245
Progreso: 99.4542% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for S3...
Columnas disponibles: Index(['ACC_X', 'ACC_Y', 'ACC_Z', 'ECG', 'EMG', 'EDA', 'Temp', 'Resp'], dtype='object')
Etiquetas: label
0        2345699
1         798000
4         546001
2         448000
3         262500
5          51100
6          46900
7          46900
Name: count, dtype: int64
Data lista para procesar: 1508500
El número de ventanas esperadas es: 8381
Progreso: 99.6301% completado
 Procesamiento de ventanas completado.
Características calculadas
Processing data for S4...
Columnas disponible

In [470]:
# Same subject list as in the extraction cell, redefined here.
subject_ids = [2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 13, 14, 15, 16, 17]

# Merge the per-subject CSVs into one features.csv and print the class counts.
combine_files(subject_ids)
print('Processing complete.')

     HR_mean     HR_std  NN50      pNN50        TINN     rmsHRV           LF  \
0  80.696549   9.931009    17  21.518987  655.178571  61.766682  6836.774620   
1  80.594867   9.915636    18  22.500000  655.178571  61.677656  6858.923359   
2  80.594867   9.915636    18  22.500000  655.178571  61.677656  6858.923359   
3  80.727963   9.906418    18  22.784810  655.178571  62.070924  6994.721115   
4  80.529192   9.976642    19  23.750000  655.178571  62.198173  7015.619134   
5  80.529192   9.976642    19  23.750000  655.178571  62.198173  7015.619134   
6  80.657316   9.972152    19  24.050633  655.178571  62.407231  7123.762842   
7  80.657316   9.972152    19  24.050633  655.178571  62.407231  7123.762842   
8  80.394315  10.109708    19  23.750000  655.178571  62.251314  7137.272136   
9  80.461721  10.148366    18  22.784810  655.178571  62.297080  7240.430642   

            HF     LF_HF        sum_f  ...  Temp_mean  Temp_std   Temp_min  \
0   998.941320  6.844020  8791.542465  ..

In [471]:
# NOTE(review): only prints a marker string; looks like a leftover scratch cell -- consider deleting.
print('holi')

holi
