# Imports

In [None]:
import os, re, json, numpy as np, pandas as pd
from pathlib import Path
import h5py
import pywt
from collections import defaultdict
import mne
import pyedflib
from scipy.signal import welch
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import classification_report, confusion_matrix
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import sys, time, shutil, subprocess, requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from tqdm.auto import tqdm
from multiprocessing import Pool
from datetime import datetime
import pickle
from collections import defaultdict
import warnings
import matplotlib.pyplot as plt
from scipy import signal, stats
warnings.filterwarnings("ignore", category=RuntimeWarning)

# 1. DWT

In [None]:
class PSG_DWT_BatchProcessor:
    """
    Batch processor that applies a discrete wavelet transform (DWT) to PSG files.

    Every ``*.edf`` file in ``data_path`` whose name contains ``PSG`` is
    loaded, its EOG/EEG channels are cut into fixed-length epochs, and each
    epoch is decomposed into approximation (A1..An) and detail (D1..Dn)
    coefficients, where n is ``self.level`` (4 by default).
    """

    def __init__(self, data_path, output_path=None):
        """
        Set up the processor and create the output folder.

        Parameters
        ----------
        data_path : str or Path
            Folder containing the PSG ``.edf`` files.
        output_path : str or Path, optional
            Folder where results are written. Defaults to
            ``<data_path>/dwt_results``.
        """
        self.data_path = Path(data_path)
        self.wavelet = 'db4'  # Daubechies-4
        self.level = 4  # number of decomposition levels
        self.epoch_duration = 30  # seconds

        if output_path is None:
            self.output_path = self.data_path / "dwt_results"
        else:
            self.output_path = Path(output_path)

        self.output_path.mkdir(parents=True, exist_ok=True)

        # subject_id -> result dict produced by process_single_file
        self.all_subjects_data = {}

    def _coeff_names(self):
        """Return the coefficient keys in storage order: A1..An, then D1..Dn."""
        levels = range(1, self.level + 1)
        return [f'A{i}' for i in levels] + [f'D{i}' for i in levels]

    def load_psg_file(self, filepath):
        """Read one EDF file with MNE; return the Raw object, or None on failure."""
        try:
            return mne.io.read_raw_edf(filepath, preload=True, verbose=False)
        except Exception as e:
            print(f" Error cargando {filepath.name}: {e}")
            return None

    def extract_channels(self, raw):
        """
        Collect the EOG and EEG channels of a recording.

        Returns
        -------
        dict
            ``{channel_name: {'data': 1-D array, 'sfreq': float, 'type': 'EOG'|'EEG'}}``.
            EOG channels are inserted first, then EEG channels.
        """
        ch_names = raw.ch_names
        sfreq = raw.info['sfreq']

        eog_channels = [ch for ch in ch_names if 'EOG' in ch.upper()]
        # EEG channels are matched by the 'EEG' tag or by the Fpz / Pz names
        eeg_channels = [ch for ch in ch_names if 'EEG' in ch.upper() or
                        'Fpz' in ch or 'Pz' in ch]

        channels_data = {}
        for ch_type, names in (('EOG', eog_channels), ('EEG', eeg_channels)):
            for ch in names:
                data, _times = raw[ch, :]
                channels_data[ch] = {
                    'data': data[0],
                    'sfreq': sfreq,
                    'type': ch_type
                }

        return channels_data

    def apply_dwt(self, signal):
        """
        Decompose one signal with the configured wavelet.

        The decomposition is a single cascade of ``pywt.dwt`` calls: level k
        is obtained by transforming the level k-1 approximation. This yields
        the same coefficients as calling ``pywt.wavedec`` once per level, but
        does the work only once.

        Returns
        -------
        dict
            Keys ``A1..An`` followed by ``D1..Dn`` with n = ``self.level``
            (A1, A2, A3, A4, D1, D2, D3, D4 for the default level of 4).
        """
        approximations = {}
        details = {}
        approx = signal
        for lvl in range(1, self.level + 1):
            approx, detail = pywt.dwt(approx, self.wavelet)
            approximations[f'A{lvl}'] = approx
            details[f'D{lvl}'] = detail

        return {**approximations, **details}

    def segment_into_epochs(self, signal, sfreq):
        """
        Split a signal into consecutive, non-overlapping epochs.

        Each epoch spans ``self.epoch_duration`` seconds; trailing samples
        that do not fill a whole epoch are discarded.
        """
        samples_per_epoch = int(self.epoch_duration * sfreq)
        n_epochs = len(signal) // samples_per_epoch

        return [
            signal[i * samples_per_epoch:(i + 1) * samples_per_epoch]
            for i in range(n_epochs)
        ]

    def process_single_file(self, filepath, subject_id):
        """
        Process one PSG file end to end.

        Returns
        -------
        dict or None
            ``{'subject_id', 'filename', 'channels'}`` where ``channels`` maps
            each channel name to its type, sampling rate, epoch count and the
            list of per-epoch DWT dicts. None when the file cannot be loaded
            or has no EOG/EEG channel.
        """
        print(f"\n Procesando: {filepath.name}")

        raw = self.load_psg_file(filepath)
        if raw is None:
            return None

        channels_data = self.extract_channels(raw)

        if not channels_data:
            print(f"   ‚ö†Ô∏è  No se encontraron canales EOG/EEG")
            return None

        subject_results = {
            'subject_id': subject_id,
            'filename': filepath.name,
            'channels': {}
        }

        for ch_name, ch_data in channels_data.items():
            sfreq = ch_data['sfreq']

            epochs = self.segment_into_epochs(ch_data['data'], sfreq)
            epochs_dwt = [self.apply_dwt(epoch) for epoch in epochs]

            subject_results['channels'][ch_name] = {
                'type': ch_data['type'],
                'sfreq': sfreq,
                'n_epochs': len(epochs),
                'dwt_coeffs': epochs_dwt
            }

        return subject_results

    def process_all_files(self):
        """
        Process every PSG file found in ``self.data_path``.

        Returns
        -------
        dict or None
            ``self.all_subjects_data`` after processing, or None when the
            folder contains no PSG file.
        """
        print("\n" + "="*80)
        print(" PROCESAMIENTO BATCH DE TODOS LOS ARCHIVOS SLEEP-EDF")
        print("="*80)

        edf_files = sorted(self.data_path.glob("*.edf"))

        # Hypnogram files share the .edf extension; keep only the PSG ones
        psg_files = [f for f in edf_files if 'PSG' in f.name]

        print(f"\n Archivos PSG encontrados: {len(psg_files)}")
        print(f" Archivos totales EDF: {len(edf_files)}")

        if not psg_files:
            print(" No se encontraron archivos PSG")
            return

        print(f"\n  Iniciando procesamiento...")
        print("="*80)

        successful = 0
        failed = 0

        for i, filepath in enumerate(psg_files, 1):
            # The subject id is the file stem up to the first '-', e.g. SC4001E0
            subject_id = filepath.stem.split('-')[0]

            print(f"\n[{i}/{len(psg_files)}] Sujeto: {subject_id}")

            results = self.process_single_file(filepath, subject_id)

            if results:
                self.all_subjects_data[subject_id] = results
                successful += 1
            else:
                failed += 1

        # Report the outcome so skipped files do not go unnoticed
        print(f"\n Procesados correctamente: {successful} | Fallidos: {failed}")
        return self.all_subjects_data

    def save_results(self, format='pickle'):
        """
        Write the results to ``self.output_path``.

        A summary CSV is always written in addition to the chosen format.

        Parameters
        ----------
        format : str
            'pickle', 'numpy' or 'both'.

        Raises
        ------
        ValueError
            If ``format`` is not one of the accepted values.
        """
        if format not in ('pickle', 'numpy', 'both'):
            raise ValueError(
                f"format must be 'pickle', 'numpy' or 'both', got {format!r}"
            )

        print(f"\n Guardando resultados en: {self.output_path}")

        timestamp = datetime.now().strftime("%Y%m%d_%H%M%S")

        if format in ('pickle', 'both'):
            # Pickle keeps the full nested structure
            pickle_file = self.output_path / f"dwt_all_subjects_{timestamp}.pkl"
            with open(pickle_file, 'wb') as f:
                pickle.dump(self.all_subjects_data, f)
            print(f"   ‚úì Pickle guardado: {pickle_file.name}")

        if format in ('numpy', 'both'):
            # One folder per subject, one .npz per channel
            numpy_dir = self.output_path / f"dwt_numpy_{timestamp}"
            numpy_dir.mkdir(parents=True, exist_ok=True)
            coeff_names = self._coeff_names()

            for subject_id, subject_data in self.all_subjects_data.items():
                subject_dir = numpy_dir / subject_id
                subject_dir.mkdir(exist_ok=True)

                for ch_name, ch_data in subject_data['channels'].items():
                    epochs_dwt = ch_data['dwt_coeffs']

                    # Stack the per-epoch dicts into one array per coefficient
                    coeff_arrays = {
                        name: np.array([epoch[name] for epoch in epochs_dwt])
                        for name in coeff_names
                    }

                    np.savez(
                        subject_dir / f"{ch_name}_dwt.npz",
                        **coeff_arrays,
                        sfreq=ch_data['sfreq'],
                        n_epochs=len(epochs_dwt)
                    )

            print(f"   ‚úì Arrays numpy guardados en: {numpy_dir.name}/")

        self.save_summary_csv(timestamp)

    def save_summary_csv(self, timestamp):
        """Write one CSV row per (subject, channel) and return the DataFrame."""
        summary_data = [
            {
                'subject_id': subject_id,
                'filename': subject_data['filename'],
                'channel': ch_name,
                'type': ch_data['type'],
                'sfreq': ch_data['sfreq'],
                'n_epochs': ch_data['n_epochs']
            }
            for subject_id, subject_data in self.all_subjects_data.items()
            for ch_name, ch_data in subject_data['channels'].items()
        ]

        df = pd.DataFrame(summary_data)
        csv_file = self.output_path / f"dwt_summary_{timestamp}.csv"
        df.to_csv(csv_file, index=False)
        print(f"   ‚úì Resumen CSV guardado: {csv_file.name}")

        return df

    def get_statistics(self):
        """Print per-subject and overall epoch/channel counts."""
        print("\n" + "="*80)
        print(" ESTAD√çSTICAS DETALLADAS")
        print("="*80)

        total_epochs = 0
        channels_count = {'EOG': 0, 'EEG': 0}

        for subject_id, subject_data in self.all_subjects_data.items():
            subject_epochs = 0
            print(f"\nüîπ Sujeto: {subject_id}")

            for ch_name, ch_data in subject_data['channels'].items():
                n_epochs = ch_data['n_epochs']
                ch_type = ch_data['type']

                print(f"   {ch_name} ({ch_type}): {n_epochs} √©pocas")

                subject_epochs += n_epochs
                channels_count[ch_type] += 1

            total_epochs += subject_epochs
            print(f"   Total √©pocas: {subject_epochs}")

        print("\n" + "-"*80)
        print(f" TOTALES:")
        print(f"   ‚Ä¢ Total sujetos: {len(self.all_subjects_data)}")
        print(f"   ‚Ä¢ Total canales EOG: {channels_count['EOG']}")
        print(f"   ‚Ä¢ Total canales EEG: {channels_count['EEG']}")
        print(f"   ‚Ä¢ Total √©pocas procesadas: {total_epochs}")
        print(f"   ‚Ä¢ Total coeficientes DWT: {total_epochs * len(self._coeff_names())}")

    def visualize_sample(self, subject_id=None, channel=None, epoch_idx=0):
        """
        Plot every DWT coefficient of one epoch.

        Defaults to the first stored subject and its first channel. Prints a
        message and returns without plotting when nothing matches the request.
        """
        if not self.all_subjects_data:
            print(" No hay resultados para visualizar")
            return

        if subject_id is None:
            subject_id = next(iter(self.all_subjects_data))

        if subject_id not in self.all_subjects_data:
            print(f" Sujeto {subject_id} no encontrado")
            return

        subject_data = self.all_subjects_data[subject_id]

        if channel is None:
            channel = next(iter(subject_data['channels']))

        if channel not in subject_data['channels']:
            print(f" Canal {channel} no encontrado")
            return

        epochs_dwt = subject_data['channels'][channel]['dwt_coeffs']
        n_available = len(epochs_dwt)
        # Negative indices keep working as ordinary Python indexing
        if not -n_available <= epoch_idx < n_available:
            print(f" Epoca {epoch_idx} fuera de rango ({n_available} disponibles)")
            return

        epoch_data = epochs_dwt[epoch_idx]

        coeff_names = self._coeff_names()
        colors = ['steelblue'] * self.level + ['darkorange'] * self.level

        fig, axes = plt.subplots(len(coeff_names), 1, figsize=(15, 12))
        fig.suptitle(f'DWT - Sujeto: {subject_id} - Canal: {channel} - √âpoca: {epoch_idx}',
                     fontsize=16, fontweight='bold')

        for ax, coeff_name, color in zip(np.atleast_1d(axes), coeff_names, colors):
            ax.plot(epoch_data[coeff_name], linewidth=0.8, color=color)
            ax.set_ylabel(coeff_name, fontweight='bold')
            ax.grid(True, alpha=0.3)
            ax.set_title(f'{coeff_name} ({len(epoch_data[coeff_name])} muestras)')

        np.atleast_1d(axes)[-1].set_xlabel('Muestras', fontweight='bold')
        plt.tight_layout()
        plt.show()


# ============================================================================
# FUNCIÓN PRINCIPAL
# ============================================================================

def main(data_path=None):
    """
    Run the DWT batch pipeline: process, report, save and plot a sample.

    Parameters
    ----------
    data_path : str or Path, optional
        Folder containing the Sleep-EDF PSG ``.edf`` files. When omitted, the
        original hard-coded location is used.

    Returns
    -------
    tuple or None
        ``(processor, results)`` on success. None when ``data_path`` does not
        exist or no file could be processed.
    """
    print("\n" + "="*80)
    print(" PROCESAMIENTO BATCH DWT - SLEEP-EDF DATABASE")
    print("   Extracci√≥n de 8 coeficientes (A1-A4, D1-D4)")
    print("   Canales: EOG + EEG")
    print("="*80)

    # Fall back to the original machine-specific location when no path is given
    if data_path is None:
        data_path = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette"

    if not Path(data_path).exists():
        print(f" ERROR: La ruta no existe: {data_path}")
        return None

    processor = PSG_DWT_BatchProcessor(data_path)

    results = processor.process_all_files()

    if not results:
        print(" No se procesaron archivos")
        return None

    processor.get_statistics()

    print("\n" + "="*80)
    # Writes the per-channel .npz arrays (plus the summary CSV); no pickle is produced
    processor.save_results(format='numpy')

    print("\n" + "="*80)
    print(" VISUALIZACI√ìN DE MUESTRA")
    print("="*80)
    processor.visualize_sample()

    print("\n" + "="*80)
    print(" PROCESAMIENTO COMPLETADO EXITOSAMENTE")
    print("="*80)
    print(f"\n Resultados guardados en: {processor.output_path}")

    return processor, results


# ============================================================================
# EJECUTAR
# ============================================================================

if __name__ == "__main__":
    # main() returns None when the data folder is missing or nothing was
    # processed; unpacking that directly would raise TypeError.
    outcome = main()
    processor, results = outcome if outcome is not None else (None, None)

# 2. Extracción de features (8 subbandas x 13 features = 104 features por época y por canal (3))

In [None]:
"""
Extracci√≥n de Features - VERSI√ìN THREADING
Soluci√≥n definitiva para Windows usando ThreadPoolExecutor
"""

import numpy as np
import pandas as pd
from pathlib import Path
from scipy import signal, stats
from tqdm import tqdm
from concurrent.futures import ThreadPoolExecutor, as_completed
import warnings
import gc

warnings.filterwarnings('ignore')

# ===================== CONFIGURACIÓN =====================

# Channel names whose "<channel>_dwt.npz" files are processed; files for any
# other channel are skipped by the extraction code below.
ALLOWED_CHANNELS = ['EEG Fpz-Cz', 'EEG Pz-Oz', 'EOG horizontal']

# ===================== FUNCIONES DE FEATURES =====================

def compute_psd_cached(signal_data):
    """
    Return the normalised Welch power spectrum of ``signal_data``.

    The PSD is estimated with segments of at most 256 samples and 50 %
    overlap, divided by its sum so the bins add up to (almost) 1, and bins
    not larger than 1e-10 are dropped. Despite the name, nothing is cached.

    Returns
    -------
    numpy.ndarray
        1-D array of normalised PSD values, or ``array([1e-10])`` when the
        spectrum cannot be computed.
    """
    try:
        nperseg = min(256, len(signal_data))
        _freqs, psd = signal.welch(signal_data, nperseg=nperseg, noverlap=nperseg // 2)
        psd_norm = psd / (np.sum(psd) + 1e-10)
        return psd_norm[psd_norm > 1e-10]
    except Exception:
        # Best-effort fallback; `Exception` (not a bare except) lets
        # KeyboardInterrupt/SystemExit stop a long batch run.
        return np.array([1e-10])

def compute_all_entropies(signal_data):
    """
    Compute five entropy-style measures from the normalised PSD of a signal.

    With ``p`` the output of ``compute_psd_cached`` and ``p2`` the values of
    ``p ** 2`` above 1e-10:

    - ``shannon_entropy``    : ``-sum(p2 * log2(p2 + 1e-10))``
    - ``log_energy_entropy`` : ``-sum(log2(p2 + 1e-10))``
    - ``norm_entropy``       : ``-sum(p ** 2)``
    - ``threshold_entropy``  : number of bins with ``p > mean(p)``
    - ``sure_entropy``       : ``N - #{p <= mean(p)} + sum(min(p**2, mean(p)**2))``

    Returns
    -------
    dict
        The five values keyed by the names above; all 0.0 on failure.
    """
    try:
        p = compute_psd_cached(signal_data)
        p_squared = p ** 2
        p_squared = p_squared[p_squared > 1e-10]

        shannon = -np.sum(p_squared * np.log2(p_squared + 1e-10))
        log_energy = -np.sum(np.log2(p_squared + 1e-10))
        norm = -np.sum(p ** 2)

        # The mean bin value serves as threshold for the last two measures
        threshold = np.mean(p)
        thresh_ent = np.sum(p > threshold)

        N = len(p)
        count_below = np.sum(p <= threshold)
        min_sum = np.sum(np.minimum(p ** 2, threshold ** 2))
        sure = N - count_below + min_sum

        return {
            'shannon_entropy': shannon,
            'log_energy_entropy': log_energy,
            'norm_entropy': norm,
            'threshold_entropy': thresh_ent,
            'sure_entropy': sure
        }
    except Exception:
        # Best-effort fallback that still lets Ctrl-C / SystemExit through
        return {k: 0.0 for k in ['shannon_entropy', 'log_energy_entropy',
                                  'norm_entropy', 'threshold_entropy', 'sure_entropy']}

def compute_statistical_features(signal_data):
    """
    Return variance, skewness and kurtosis of ``signal_data``.

    Variance is the population variance (``ddof=0``); skewness and excess
    kurtosis use the bias-corrected estimators (``bias=False``).

    Returns
    -------
    dict
        Keys ``variance``, ``skewness``, ``kurtosis``; all 0.0 on failure.
    """
    try:
        return {
            'variance': np.var(signal_data, ddof=0),
            'skewness': stats.skew(signal_data, bias=False),
            'kurtosis': stats.kurtosis(signal_data, bias=False)
        }
    except Exception:
        # Best-effort fallback that still lets Ctrl-C / SystemExit through
        return {'variance': 0.0, 'skewness': 0.0, 'kurtosis': 0.0}

def dispersion_entropy_fast(signal_data, m=3, c=6, d=1):
    """
    Dispersion entropy of a 1-D signal.

    The samples are mapped through the normal CDF (fitted with the signal's
    mean and standard deviation) into ``c`` integer classes, then the
    relative frequencies of all length-``m`` class patterns with delay ``d``
    are turned into a Shannon entropy (natural log).

    Parameters
    ----------
    signal_data : array-like
        Input samples.
    m : int
        Pattern (embedding) length.
    c : int
        Number of classes.
    d : int
        Delay between the samples of a pattern.

    Returns
    -------
    float
        The entropy; 0.0 for a (near-)constant signal, a signal too short to
        form one pattern, or on any computation error.
    """
    from collections import Counter

    try:
        N = len(signal_data)
        mu = np.mean(signal_data)
        sigma = np.std(signal_data)

        if sigma < 1e-10:
            return 0.0

        y = stats.norm.cdf(signal_data, loc=mu, scale=sigma)
        z = np.clip(np.round(c * y + 0.5).astype(np.int32), 1, c)

        max_idx = N - (m - 1) * d
        if max_idx < 1:
            return 0.0

        # Zipping the m delayed copies of the class sequence yields every
        # pattern once, in order, without a per-sample Python loop body.
        classes = z.tolist()
        delayed = (classes[j * d:j * d + max_idx] for j in range(m))
        pattern_counts = Counter(zip(*delayed))

        probabilities = np.array(list(pattern_counts.values())) / max_idx
        probabilities = probabilities[probabilities > 1e-10]

        return -np.sum(probabilities * np.log(probabilities + 1e-10))
    except Exception:
        # Best-effort fallback that still lets Ctrl-C / SystemExit through
        return 0.0

def rcmde_fast(signal_data, m=3, c=6, d=1, tau=4):
    """
    Multiscale dispersion entropy at scale ``tau``, averaged over shifts.

    The signal is cut into ``len // tau`` segments of ``tau`` samples. For
    each shift ``k`` in ``0 .. min(tau, 4) - 1`` a coarse-grained series is
    built from the mean of samples ``k .. tau-1`` of every segment, its
    dispersion entropy is computed, and the entropies are averaged. When the
    signal has fewer than ``m`` segments, the plain dispersion entropy of the
    whole signal is returned instead.

    Returns
    -------
    float
        The averaged entropy, or 0.0 on failure.
    """
    # NOTE(review): the published RCMDE definition shifts the start of the
    # coarse-graining window and averages pattern probabilities before taking
    # the entropy; here entropies are averaged over within-segment suffixes.
    # Numerics are left unchanged - confirm this simplification is intended.
    try:
        # Accept lists as well as arrays (reshape below needs an ndarray)
        signal_data = np.asarray(signal_data)
        N = len(signal_data)
        n_segments = N // tau

        if n_segments < m:
            return dispersion_entropy_fast(signal_data, m, c, d)

        truncated = signal_data[:n_segments * tau]
        reshaped = truncated.reshape(n_segments, tau)

        entropies = []
        for k in range(min(tau, 4)):
            coarse_grained = reshaped[:, k:].mean(axis=1)

            if len(coarse_grained) >= m:
                entropies.append(dispersion_entropy_fast(coarse_grained, m, c, d))

        return np.mean(entropies) if entropies else 0.0
    except Exception:
        # Best-effort fallback that still lets Ctrl-C / SystemExit through
        return 0.0

def ar_coefficients_fast(signal_data, order=4):
    """
    Estimate autoregressive coefficients with the Yule-Walker equations.

    The autocorrelation of the mean-removed signal is computed for lags
    ``0 .. order``, normalised by lag 0, and the Toeplitz system
    ``R a = r[1:]`` is solved for the ``order`` coefficients.

    The previous implementation sliced the full autocorrelation as
    ``r[len(r)//2 : order+1]``, which is empty for any signal longer than a
    few samples, so it always fell through to the all-zero fallback. Only the
    needed lags are computed here, which also avoids the O(N^2) full
    correlation.

    Returns
    -------
    numpy.ndarray
        Array of length ``order``; all zeros when the signal is too short,
        has zero variance, or the system cannot be solved.
    """
    try:
        from scipy.linalg import toeplitz, solve

        centered = np.asarray(signal_data, dtype=float)
        centered = centered - centered.mean()
        n = len(centered)

        # Need at least one sample pair at the largest lag
        if n <= order:
            return np.zeros(order)

        r = np.array([
            np.dot(centered[:n - lag], centered[lag:])
            for lag in range(order + 1)
        ])

        # A constant signal has no autocorrelation structure to fit
        if r[0] <= 0.0:
            return np.zeros(order)

        r = r / (r[0] + 1e-10)

        R = toeplitz(r[:order])
        return solve(R, r[1:], assume_a='pos', check_finite=False)
    except Exception:
        # Best-effort fallback that still lets Ctrl-C / SystemExit through
        return np.zeros(order)

def extract_features_from_subband(subband_data, subband_name, channel_name):
    """
    Compute the 13 features of one DWT subband.

    The features are five entropies, three statistics, the multiscale
    dispersion entropy and four AR coefficients. Every key is prefixed with
    ``"<channel_name>_<subband_name>_"``.

    Returns
    -------
    dict
        Feature name -> value. If the subband has fewer than 10 samples, or
        anything fails, all 13 features are set to 0.0.
    """
    features = {}
    prefix = f"{channel_name}_{subband_name}_"

    try:
        if len(subband_data) < 10:
            raise ValueError("Insufficient data")

        for key, value in compute_all_entropies(subband_data).items():
            features[prefix + key] = value

        for key, value in compute_statistical_features(subband_data).items():
            features[prefix + key] = value

        features[prefix + 'rcmde'] = rcmde_fast(subband_data)

        ar_coeffs = ar_coefficients_fast(subband_data, order=4)
        for i, coeff in enumerate(ar_coeffs):
            features[prefix + f'ar_coeff_{i+1}'] = coeff

    except Exception:
        # Zero-fill every feature so all rows share the same columns;
        # `Exception` (not a bare except) keeps Ctrl-C working.
        for feat in ['shannon_entropy', 'log_energy_entropy', 'norm_entropy',
                     'threshold_entropy', 'sure_entropy', 'variance',
                     'skewness', 'kurtosis', 'rcmde']:
            features[prefix + feat] = 0.0
        for i in range(4):
            features[prefix + f'ar_coeff_{i+1}'] = 0.0

    return features

def extract_features_from_epoch(dwt_coeffs_epoch, channel_name):
    """
    Build the feature dict of one epoch.

    Runs ``extract_features_from_subband`` on the eight subbands (details
    D1-D4 first, then approximations A1-A4) and merges the results.
    """
    subband_order = ('D1', 'D2', 'D3', 'D4', 'A1', 'A2', 'A3', 'A4')

    merged = {}
    for band in subband_order:
        band_features = extract_features_from_subband(
            dwt_coeffs_epoch[band], band, channel_name
        )
        merged.update(band_features)

    return merged

# ===================== PROCESAMIENTO CON THREADING =====================

class FileProcessor:
    """
    Turn one per-channel DWT ``.npz`` file into a list of feature rows.

    Designed to be called from worker threads: instances hold configuration
    only and ``process_file`` keeps all working data in local variables.
    """

    # Subband arrays read from every .npz file
    SUBBANDS = ('D1', 'D2', 'D3', 'D4', 'A1', 'A2', 'A3', 'A4')

    def __init__(self, max_epochs=1000):
        """
        Parameters
        ----------
        max_epochs : int or None
            Upper bound on the epochs processed per file; the default of 1000
            keeps the original behaviour. ``None`` processes every epoch.
            NOTE(review): recordings longer than 1000 epochs (8 h 20 min at
            30 s per epoch) are silently truncated - confirm this is intended.
        """
        self.max_epochs = max_epochs

    def process_file(self, npz_path, subject_id):
        """
        Extract the features of every epoch stored in ``npz_path``.

        Returns
        -------
        list of dict
            One feature dict per epoch, with ``subject_id``, ``channel_name``,
            ``channel_type``, ``epoch_num`` and ``sfreq`` added. Empty when
            the channel is not in ``ALLOWED_CHANNELS`` or the file is
            unreadable.
        """
        try:
            channel_name = npz_path.stem.replace("_dwt", "")

            if channel_name not in ALLOWED_CHANNELS:
                return []

            with np.load(npz_path) as data:
                sfreq = float(data['sfreq'])
                n_epochs = int(data['n_epochs'])
                # An NpzFile reads the whole array from disk on every key
                # access, so load each subband once instead of once per epoch.
                subbands = {name: data[name] for name in self.SUBBANDS}
        except Exception as exc:
            # Report the failure instead of hiding it, then skip the file
            tqdm.write(f"    No se pudo leer {getattr(npz_path, 'name', npz_path)}: {exc}")
            return []

        ch_type = 'eog' if 'EOG' in channel_name else 'eeg'
        if self.max_epochs is None:
            limit = n_epochs
        else:
            limit = min(n_epochs, self.max_epochs)

        channel_features = []
        skipped = 0

        for epoch_idx in range(limit):
            try:
                dwt_coeffs = {name: arr[epoch_idx] for name, arr in subbands.items()}
                features = extract_features_from_epoch(dwt_coeffs, channel_name)
            except Exception:
                # Skip the bad epoch but keep going; counted and reported below
                skipped += 1
                continue

            features['subject_id'] = subject_id
            features['channel_name'] = channel_name
            features['channel_type'] = ch_type
            features['epoch_num'] = epoch_idx
            features['sfreq'] = sfreq

            channel_features.append(features)

        if skipped:
            tqdm.write(f"    {npz_path.name}: {skipped} epocas omitidas por error")

        return channel_features

def extract_features_threading(numpy_base_dir, output_dir, n_workers=12, save_every=10):
    """
    Extract features from every subject folder using a thread pool.

    Parameters
    ----------
    numpy_base_dir : str or Path
        Folder containing one sub-folder per subject with ``*_dwt.npz`` files.
    output_dir : str or Path
        Folder for checkpoint pickles (created if missing).
    n_workers : int
        Number of worker threads.
    save_every : int
        Write a checkpoint after this many processed subjects; 0 disables
        checkpoints.

    Returns
    -------
    pandas.DataFrame
        One row per (subject, channel, epoch) with the metadata columns
        first; an empty DataFrame when nothing was extracted.
    """
    print("="*70)
    print("EXTRACCI√ìN CON THREADING - 100% COMPATIBLE WINDOWS")
    print("="*70)

    numpy_base_dir = Path(numpy_base_dir)
    output_dir = Path(output_dir)
    output_dir.mkdir(parents=True, exist_ok=True)

    subject_dirs = sorted([d for d in numpy_base_dir.iterdir() if d.is_dir()])
    n_subjects = len(subject_dirs)

    print(f"\n Configuraci√≥n:")
    print(f"   - Sujetos: {n_subjects}")
    print(f"   - Workers (threads): {n_workers}")
    print(f"   - Checkpoint cada: {save_every} sujetos")
    print(f"   - Canales: {', '.join(ALLOWED_CHANNELS)}")

    processor = FileProcessor()
    all_features = []
    processed_subjects = 0
    failed_count = 0

    print(f"\n Iniciando...\n")

    # One pool for the whole run instead of creating and tearing down a new
    # pool for every subject.
    with ThreadPoolExecutor(max_workers=n_workers) as executor:
        for subject_dir in tqdm(subject_dirs, desc="Extrayendo", unit="sujeto"):
            subject_id = subject_dir.name

            npz_files = [f for f in subject_dir.glob("*.npz")
                         if f.stem.replace("_dwt", "") in ALLOWED_CHANNELS]

            if not npz_files:
                continue

            futures = {
                executor.submit(processor.process_file, npz_file, subject_id): npz_file
                for npz_file in npz_files
            }

            subject_features = []
            for future in as_completed(futures):
                try:
                    # as_completed only yields finished futures, so no timeout is needed
                    subject_features.extend(future.result())
                except Exception as exc:
                    failed_count += 1
                    tqdm.write(f"    {futures[future].name}: {exc}")

            all_features.extend(subject_features)
            processed_subjects += 1

            # Checkpoint: dump everything collected so far
            if save_every and processed_subjects % save_every == 0 and all_features:
                df_temp = pd.DataFrame(all_features)
                checkpoint_path = output_dir / f"checkpoint_{processed_subjects}.pkl"
                df_temp.to_pickle(checkpoint_path)
                tqdm.write(f"    Checkpoint: {len(df_temp)} √©pocas guardadas")

            if processed_subjects % 5 == 0:
                gc.collect()

    print(f"\n{'='*70}")
    print("COMPLETADO")
    print(f"{'='*70}")

    if failed_count > 0:
        print(f"\n {failed_count} archivos fallaron (se omitieron)")

    if not all_features:
        print("\n No se extrajeron features")
        return pd.DataFrame()

    df_features = pd.DataFrame(all_features)

    # Put the metadata columns first, followed by the feature columns
    metadata_cols = ['subject_id', 'channel_name', 'channel_type', 'epoch_num', 'sfreq']
    feature_cols = [col for col in df_features.columns if col not in metadata_cols]
    df_features = df_features[metadata_cols + feature_cols]

    print(f"\n Extracci√≥n exitosa:")
    print(f"  - √âpocas: {len(df_features):,}")
    print(f"  - Features: {len(feature_cols)}")
    print(f"  - Sujetos: {df_features['subject_id'].nunique()}")
    print(f"  - Canales: {df_features['channel_name'].nunique()}")

    return df_features

# ===================== AUXILIARES =====================

def find_latest_numpy_dir(base_path):
    """
    Return the most recently modified ``dwt_numpy_*`` directory in ``base_path``.

    Only directories are considered, so a stray file whose name matches the
    pattern (an archive, a note) is never returned.

    Raises
    ------
    FileNotFoundError
        If no matching directory exists.
    """
    base_path = Path(base_path)
    numpy_dirs = [p for p in base_path.glob("dwt_numpy_*") if p.is_dir()]

    if not numpy_dirs:
        raise FileNotFoundError(f" No hay carpetas 'dwt_numpy_*' en: {base_path}")

    latest = max(numpy_dirs, key=lambda p: p.stat().st_mtime)
    print(f"\n {latest.name}")
    return latest

def _epoch_labels_from_annotations(annotations, stage_map, epoch_duration):
    """Expand hypnogram annotations (onset, duration, description) into one label per epoch."""
    onsets = np.asarray(annotations.onset, dtype=float)
    durations = np.asarray(annotations.duration, dtype=float)
    if onsets.size == 0:
        return np.empty(0, dtype=int)

    starts = np.maximum(np.rint(onsets / epoch_duration).astype(int), 0)
    stops = np.maximum(np.rint((onsets + durations) / epoch_duration).astype(int), 0)

    # Epochs not covered by any annotation stay at -1 (unlabelled)
    labels = np.full(int(stops.max()), -1, dtype=int)
    for start, stop, description in zip(starts, stops, annotations.description):
        labels[start:stop] = stage_map.get(str(description), -1)
    return labels


def add_sleep_stages(df_features, hypnogram_dir, epoch_duration=30):
    """
    Attach a ``sleep_stage`` label to every feature row and drop unlabelled rows.

    Each hypnogram annotation covers a time span (onset + duration), so it is
    expanded into one label per ``epoch_duration``-second epoch before being
    matched against ``epoch_num``. Indexing the annotation list directly by
    epoch number, as was done before, assigns the wrong stage to almost every
    epoch. Unknown and movement epochs keep their position in time and are
    labelled -1.

    Hypnogram files are looked up as ``<subject_id>*-Hypnogram.edf`` and, if
    that finds nothing, with the last character of the id replaced by a
    wildcard, because Sleep-EDF hypnogram names differ from the PSG name in
    that character (``SC4001E0-PSG.edf`` vs ``SC4001EC-Hypnogram.edf``).

    Parameters
    ----------
    df_features : pandas.DataFrame
        Must contain ``subject_id`` and ``epoch_num``. A ``sleep_stage``
        column is added to this frame in place.
    hypnogram_dir : str or Path
        Folder containing the ``*-Hypnogram.edf`` files.
    epoch_duration : float
        Epoch length in seconds (default 30).

    Returns
    -------
    pandas.DataFrame
        Copy of the rows whose stage is known (0=W, 1=N1, 2=N2, 3=N3/N4, 4=REM).
    """
    print(f"\n{'='*70}")
    print("AGREGANDO ETAPAS DE SUE√ëO")
    print(f"{'='*70}")

    import mne
    hypnogram_dir = Path(hypnogram_dir)

    stage_map = {
        'Sleep stage W': 0, 'Sleep stage 1': 1, 'Sleep stage 2': 2,
        'Sleep stage 3': 3, 'Sleep stage 4': 3, 'Sleep stage R': 4,
        'Sleep stage ?': -1, 'Movement time': -1
    }

    df_features['sleep_stage'] = -1

    for subject_id in tqdm(df_features['subject_id'].unique(), desc="Hypnogramas"):
        subject_key = str(subject_id)
        hyp_files = sorted(hypnogram_dir.glob(f"{subject_key}*-Hypnogram.edf"))
        if not hyp_files and len(subject_key) > 1:
            # The hypnogram's last id character differs from the PSG's
            hyp_files = sorted(hypnogram_dir.glob(f"{subject_key[:-1]}*-Hypnogram.edf"))

        if not hyp_files:
            continue

        try:
            annotations = mne.read_annotations(hyp_files[0])
            labels = _epoch_labels_from_annotations(annotations, stage_map, epoch_duration)
        except Exception as exc:
            # Skip this subject, but say why instead of failing silently
            tqdm.write(f"    No se pudo leer {hyp_files[0].name}: {exc}")
            continue

        # Vectorised assignment: one mask per subject instead of one per epoch
        subject_mask = (df_features['subject_id'] == subject_id).to_numpy()
        epoch_nums = df_features.loc[subject_mask, 'epoch_num'].to_numpy().astype(int)

        stage_values = np.full(len(epoch_nums), -1, dtype=int)
        in_range = (epoch_nums >= 0) & (epoch_nums < len(labels))
        stage_values[in_range] = labels[epoch_nums[in_range]]
        df_features.loc[subject_mask, 'sleep_stage'] = stage_values

    n_before = len(df_features)
    df_features = df_features[df_features['sleep_stage'] != -1].copy()
    n_after = len(df_features)

    print(f"\n √âpocas etiquetadas: {n_after}/{n_before}")

    stage_names = {0: 'Wake', 1: 'S1', 2: 'S2', 3: 'S3/S4', 4: 'REM'}
    stage_counts = df_features['sleep_stage'].value_counts().sort_index()

    total = len(df_features)
    print("\nDistribuci√≥n:")
    for stage, count in stage_counts.items():
        percentage = (count / total) * 100
        bar = '‚ñà' * int(percentage / 2)
        print(f"{stage_names.get(stage, f'Stage {stage}'):6} | {bar} {count:6d} ({percentage:5.1f}%)")

    return df_features

def save_features(df_features, output_dir):
    """Persist the feature table as Parquet and gzip pickle, plus a text summary.

    Parameters
    ----------
    df_features : pandas.DataFrame
        Feature table. ``subject_id`` and ``channel_name`` columns are read
        for the summary; ``sleep_stage`` is optional.
    output_dir : str or Path
        Destination folder. It is created when it does not exist yet.

    Writes ``features_complete.parquet``, ``features_complete.pkl`` and
    ``features_info.txt`` inside ``output_dir``. Returns ``None``.
    """
    print(f"\n{'='*70}")
    print("GUARDANDO FEATURES")
    print(f"{'='*70}")

    output_dir = Path(output_dir)
    # The writers below do not create missing parent folders themselves.
    output_dir.mkdir(parents=True, exist_ok=True)

    # 1. Parquet: compressed, fast and readable from other tools.
    parquet_path = output_dir / "features_complete.parquet"
    df_features.to_parquet(parquet_path, engine='pyarrow', compression='snappy', index=False)
    size_mb = parquet_path.stat().st_size / (1024**2)
    print(f" Parquet: {parquet_path.name} ({size_mb:.1f} MB)")

    # 2. Pickle: Python-only backup.
    pkl_path = output_dir / "features_complete.pkl"
    df_features.to_pickle(pkl_path, compression='gzip')
    size_mb = pkl_path.stat().st_size / (1024**2)
    print(f" Pickle (comprimido): {pkl_path.name} ({size_mb:.1f} MB)")

    # 3. A CSV export is intentionally not produced to save disk space.

    # 4. Human-readable summary.
    info_path = output_dir / "features_info.txt"
    with open(info_path, 'w', encoding='utf-8') as f:
        f.write("DATASET DE FEATURES - RESUMEN\n")
        f.write("="*70 + "\n\n")
        f.write(f"√âpocas totales: {len(df_features):,}\n")
        f.write(f"Sujetos √∫nicos: {df_features['subject_id'].nunique()}\n")
        f.write(f"Canales: {', '.join(df_features['channel_name'].unique())}\n")

        # Every column that is not listed here is counted as a feature.
        metadata_cols = ['subject_id', 'channel_name', 'channel_type', 'epoch_num', 'sfreq', 'sleep_stage']
        feature_cols = [c for c in df_features.columns if c not in metadata_cols]
        f.write(f"Features por √©poca: {len(feature_cols)}\n\n")

        f.write("DISTRIBUCI√ìN DE ETAPAS DE SUE√ëO:\n")
        if 'sleep_stage' in df_features.columns:
            stage_names = {0: 'Wake', 1: 'Stage 1', 2: 'Stage 2', 3: 'Stage 3/4', 4: 'REM'}
            stage_counts = df_features['sleep_stage'].value_counts().sort_index()
            total = len(df_features)
            for stage, count in stage_counts.items():
                # Negative codes are "unlabelled" placeholders; leave them out.
                if stage >= 0:
                    percentage = (count / total) * 100
                    f.write(f"  {stage_names.get(stage, f'Stage {stage}'):12} : {count:7,} ({percentage:5.1f}%)\n")

        f.write(f"\nPRIMEROS 5 FEATURE NAMES:\n")
        for feat in feature_cols[:5]:
            f.write(f"  - {feat}\n")
        # Only mention the remainder when there is one (avoids "(-3 more)").
        remaining = len(feature_cols) - 5
        if remaining > 0:
            f.write(f"  ... ({remaining} m√°s)\n")

    print(f"‚úì Info: {info_path.name}")

    print(f"\n Para cargar:")
    print(f"   df = pd.read_parquet('{parquet_path.name}')  # Recomendado")
    print(f"   df = pd.read_pickle('{pkl_path.name}')       # Alternativa r√°pida")

# ===================== MAIN =====================

if __name__ == "__main__":
    # Script entry point: extract features with a thread pool, attach the
    # sleep-stage labels and write the result to OUTPUT_DIR.
    DWT_RESULTS_DIR = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette\dwt_results"
    HYPNOGRAM_DIR = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette"
    OUTPUT_DIR = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\features_batch"

    # Threads allow more workers than cores (12-24 is the suggested range).
    N_WORKERS = 22
    SAVE_EVERY = 10

    print("\n EXTRACCI√ìN CON THREADING")
    print(f"   Workers: {N_WORKERS}")

    # ``raise SystemExit`` is used instead of the ``exit()`` helper, which is
    # installed by the ``site`` module for interactive use only.
    try:
        NUMPY_BASE_DIR = find_latest_numpy_dir(DWT_RESULTS_DIR)
    except FileNotFoundError as e:
        print(str(e))
        raise SystemExit(1)

    subject_dirs = [d for d in NUMPY_BASE_DIR.iterdir() if d.is_dir()]
    if not subject_dirs:
        print(" Carpeta vac√≠a")
        raise SystemExit(1)

    print(f"‚úì {len(subject_dirs)} sujetos\n")

    # Threaded feature extraction.
    df_features = extract_features_threading(
        NUMPY_BASE_DIR,
        OUTPUT_DIR,
        n_workers=N_WORKERS,
        save_every=SAVE_EVERY
    )

    if df_features.empty:
        print("\n No se generaron features")
        raise SystemExit(1)

    # Attach sleep-stage labels.
    df_features = add_sleep_stages(df_features, HYPNOGRAM_DIR)

    # Persist.
    save_features(df_features, OUTPUT_DIR)

    print(f"\n{'='*70}")
    print(" COMPLETADO")
    print(f"{'='*70}")

# Fix del script anterior

In [None]:
"""
Verificador y Reparador de Sleep Stages
Para el archivo features_complete_labeled.parquet
"""

import pandas as pd
import numpy as np
import mne
from pathlib import Path
from tqdm import tqdm

def verify_labels(parquet_file):
    """Load a feature parquet file and report how well it is labelled.

    Parameters
    ----------
    parquet_file : str or Path
        Parquet file to inspect.

    Returns
    -------
    tuple
        ``(df, needs_fix)``. ``needs_fix`` is ``True`` when fewer than 50% of
        the rows of the first sleep-related column carry a real label. A value
        counts as a label when it is neither null nor the ``-1`` placeholder
        used elsewhere in this notebook for "not labelled yet". When no
        sleep-related column exists the result is ``(df, False)``.
    """
    print("\n" + "="*80)
    print(" VERIFICANDO ETIQUETAS")
    print("="*80)

    print(f"\n Cargando: {Path(parquet_file).name}")
    df = pd.read_parquet(parquet_file)

    print(f"    Shape: {df.shape}")
    print(f"    Tama√±o: {Path(parquet_file).stat().st_size / (1024*1024):.1f} MB")

    # List the available columns.
    print(f"\n Columnas disponibles:")
    for col in df.columns:
        print(f"   - {col}")

    # Any column whose name mentions "sleep" or "stage" is a label candidate.
    sleep_cols = [col for col in df.columns if 'sleep' in col.lower() or 'stage' in col.lower()]

    if not sleep_cols:
        print(f"\n No se encontr√≥ columna de sleep stages")
        return df, False

    print(f"\n Columnas de sleep encontradas: {sleep_cols}")

    def labeled_mask(column):
        # -1 is a placeholder, not a label, so notna() alone over-counts.
        return df[column].notna() & df[column].ne(-1)

    # Per-column report.
    for col in sleep_cols:
        non_null = df[col].notna().sum()
        labeled = labeled_mask(col).sum()
        print(f"\n Analizando columna: {col}")
        print(f"   Tipo: {df[col].dtype}")
        print(f"   Valores √∫nicos: {df[col].nunique()}")
        print(f"   Valores no-nulos: {non_null:,} / {len(df):,}")
        print(f"   Porcentaje etiquetado: {(labeled / len(df) * 100):.1f}%")

        if non_null > 0:
            print(f"\n   Distribuci√≥n de valores:")
            value_counts = df[col].value_counts()
            for val, count in value_counts.items():
                print(f"      {str(val):15s}: {count:8,}")

    # DataFrame-level metadata.
    print(f"\n Metadata del DataFrame:")
    print(f"   Sujetos √∫nicos: {df['subject_id'].nunique() if 'subject_id' in df.columns else 'N/A'}")
    print(f"   Canales √∫nicos: {df['channel'].nunique() if 'channel' in df.columns else 'N/A'}")

    # Small sample of the key columns.
    print(f"\n Primeras 5 filas (columnas principales):")
    cols_to_show = ['subject_id', 'channel', 'epoch_idx'] + sleep_cols
    cols_to_show = [c for c in cols_to_show if c in df.columns]
    print(df[cols_to_show].head())

    # Decide whether a repair is needed, based on the first candidate column.
    needs_fix = False
    main_col = sleep_cols[0]
    labeled_pct = (labeled_mask(main_col).sum() / len(df)) * 100
    if labeled_pct < 50:
        needs_fix = True
        print(f"\n  NECESITA REPARACI√ìN: Solo {labeled_pct:.1f}% etiquetado")
    else:
        print(f"\n BIEN ETIQUETADO: {labeled_pct:.1f}% completo")

    return df, needs_fix


def _expand_hypnogram_annotations(annotations, epoch_duration=30.0):
    """Convert hypnogram annotations into one stage label per epoch.

    Each ``Sleep stage X`` annotation is repeated for as many epochs as its
    duration covers and placed at the epoch given by its onset. Epochs that no
    sleep-stage annotation covers are labelled ``'Unknown'``. Annotations with
    zero duration still count as one epoch.
    """
    stage_map = {
        'W': 'W',
        '?': 'Unknown',
        '1': 'S1',
        '2': 'S2',
        '3': 'S3',
        '4': 'S4',
        'R': 'REM',
    }
    per_epoch = []
    for onset, duration, desc in zip(annotations.onset, annotations.duration, annotations.description):
        if 'Sleep stage' not in desc:
            continue
        code = desc.replace('Sleep stage ', '').strip()
        label = stage_map.get(code, code)
        first = max(0, int(round(float(onset) / epoch_duration)))
        count = max(1, int(round(float(duration) / epoch_duration)))
        last = first + count
        if last > len(per_epoch):
            per_epoch.extend(['Unknown'] * (last - len(per_epoch)))
        per_epoch[first:last] = [label] * count
    return per_epoch


def fix_labels_fast(df, data_path, epoch_duration=30.0):
    """Fill the sleep-stage column of ``df`` from the hypnogram files.

    Parameters
    ----------
    df : pandas.DataFrame
        Feature table with ``subject_id`` and ``epoch_idx`` columns. It is
        modified in place and also returned.
    data_path : str or Path
        Folder searched for ``<subject_id>*Hypnogram*.edf`` files.
    epoch_duration : float, default 30.0
        Epoch length in seconds used to expand annotation durations.
        Assumes ``epoch_idx`` counts epochs of this length from the start of
        the recording — TODO confirm against the feature extractor.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` with string labels ('W', 'S1', ..., 'REM', 'Unknown')
        written for every row whose epoch is covered by a hypnogram.
    """
    print("\n" + "="*80)
    print("REPARANDO ETIQUETAS")
    print("="*80)

    data_path = Path(data_path)

    # Pick the first column whose name mentions both "sleep" and "stage".
    sleep_col = None
    for col in df.columns:
        if 'sleep' in col.lower() and 'stage' in col.lower():
            sleep_col = col
            break

    if sleep_col is None:
        print("     Creando nueva columna 'sleep_stage'")
        sleep_col = 'sleep_stage'
        df[sleep_col] = None

    print(f"    Usando columna: {sleep_col}")

    # String labels are written below; a numeric column must be widened first.
    df[sleep_col] = df[sleep_col].astype(object)

    # Load one hypnogram per subject.
    unique_subjects = df['subject_id'].unique()
    print(f"\n Procesando {len(unique_subjects)} sujetos...")

    hypnogram_cache = {}

    for subject_id in tqdm(unique_subjects, desc="Cargando hypnogramas"):
        hypno_files = list(data_path.glob(f"{subject_id}*Hypnogram*.edf"))

        if not hypno_files:
            continue

        try:
            # No ``verbose`` keyword here: passing one made every call fail,
            # and the failure used to be swallowed silently.
            annotations = mne.read_annotations(hypno_files[0])
            hypnogram_cache[subject_id] = _expand_hypnogram_annotations(annotations, epoch_duration)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"   No se pudo leer {hypno_files[0].name}: {e}")
            continue

    print(f"    Hypnogramas cargados: {len(hypnogram_cache)}")

    # Assign labels, one vectorised write per subject.
    print(f"\n  Asignando etiquetas...")

    assigned = 0
    col_pos = df.columns.get_loc(sleep_col)
    subject_values = df['subject_id'].to_numpy()
    epoch_values = df['epoch_idx'].to_numpy()

    for subject_id, sleep_stages in tqdm(hypnogram_cache.items(), desc="Asignando"):
        rows = np.flatnonzero(subject_values == subject_id)
        epochs = epoch_values[rows]
        # Negative indices would silently wrap around, so reject them too.
        in_range = (epochs >= 0) & (epochs < len(sleep_stages))
        rows = rows[in_range]
        if rows.size == 0:
            continue
        df.iloc[rows, col_pos] = [sleep_stages[int(e)] for e in epochs[in_range]]
        assigned += int(rows.size)

    print(f"\n Etiquetas asignadas: {assigned:,} / {len(df):,}")

    # Show the resulting distribution.
    if assigned > 0:
        print(f"\n Distribuci√≥n de sleep stages:")
        stage_counts = df[sleep_col].value_counts()
        for stage, count in stage_counts.items():
            if stage and pd.notna(stage):
                pct = (count / assigned) * 100
                print(f"   {str(stage):10s}: {count:8,} ({pct:5.1f}%)")

    return df


def save_fixed_file(df, original_file):
    """Write the repaired table next to ``original_file`` with a text summary.

    Parameters
    ----------
    df : pandas.DataFrame
        Repaired feature table.
    original_file : str or Path
        Path of the source file; only its parent folder is used.

    Returns
    -------
    Path
        ``features_complete_labeled_FIXED.parquet`` in that folder. A
        ``.txt`` file with the same stem is written alongside it.
    """
    print("\n" + "="*80)
    print("GUARDANDO ARCHIVO REPARADO")
    print("="*80)

    output_file = Path(original_file).parent / "features_complete_labeled_FIXED.parquet"

    print(f"\n Guardando en: {output_file.name}")
    df.to_parquet(output_file, index=False)

    size_mb = output_file.stat().st_size / (1024*1024)
    print(f"    Tama√±o: {size_mb:.1f} MB")

    # Optional columns are reported as 'N/A' instead of raising KeyError after
    # the parquet file has already been written.
    subjects = df['subject_id'].nunique() if 'subject_id' in df.columns else 'N/A'
    channels = df['channel'].nunique() if 'channel' in df.columns else 'N/A'

    # Summary file.
    info_file = output_file.with_suffix('.txt')
    with open(info_file, 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("ARCHIVO REPARADO\n")
        f.write("="*80 + "\n\n")

        f.write(f"Shape: {df.shape}\n")
        f.write(f"Tama√±o: {size_mb:.1f} MB\n\n")

        f.write("Sleep Stages:\n")
        sleep_cols = [col for col in df.columns if 'sleep' in col.lower()]
        if sleep_cols:
            stage_counts = df[sleep_cols[0]].value_counts()
            for stage, count in stage_counts.items():
                f.write(f"  {str(stage):15s}: {count:8,}\n")

        f.write(f"\nSujetos: {subjects}\n")
        f.write(f"Canales: {channels}\n")
        f.write(f"√âpocas: {len(df):,}\n")

    print(f"    Info guardada: {info_file.name}")

    return output_file


def main():
    """Check the labelled parquet file and repair it when needed.

    Returns ``None`` when one of the hard-coded paths is missing, the
    unchanged DataFrame when no repair is needed or the user declines, and
    the repaired DataFrame otherwise.
    """

    print("\n" + "="*80)
    print("VERIFICADOR Y REPARADOR DE SLEEP STAGES")
    print("="*80)

    # Paths
    parquet_file = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\features_batch\features_complete_labeled.parquet"
    data_path = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette"

    # Both inputs must exist.
    if not Path(parquet_file).exists():
        print(f" No se encuentra: {parquet_file}")
        return

    if not Path(data_path).exists():
        print(f" No se encuentra: {data_path}")
        return

    # 1. Inspect the current state.
    df, needs_fix = verify_labels(parquet_file)

    # 2. Repair only when required.
    if needs_fix:
        print("\n" + "="*80)
        print("  El archivo necesita reparaci√≥n")
        print("="*80)

        try:
            response = input("\n¬øProceder con la reparaci√≥n? (s/n): ").strip().lower()
        except (EOFError, RuntimeError):
            # No usable stdin (batch run, or a front-end without input
            # support): proceed. Ctrl-C is deliberately not caught here.
            print("\n‚úì Continuando autom√°ticamente...")
        else:
            if response != 's':
                print("‚ùå Reparaci√≥n cancelada")
                return df

        # Repair
        df_fixed = fix_labels_fast(df, data_path)

        # Save
        output_file = save_fixed_file(df_fixed, parquet_file)

        print("\n" + "="*80)
        print(" REPARACI√ìN COMPLETADA")
        print("="*80)
        print(f"\n Usa el nuevo archivo:")
        print(f"   df = pd.read_parquet('{output_file.name}')")

        return df_fixed
    else:
        print("\n" + "="*80)
        print(" El archivo est√° correctamente etiquetado")
        print("="*80)
        print(f"\n Puedes usarlo directamente:")
        print(f"   df = pd.read_parquet('{Path(parquet_file).name}')")

        return df


if __name__ == "__main__":
    df = main()

In [None]:
"""
Fix rápido para asignar sleep stages al archivo features_complete_labeled.parquet
Las etiquetas están en -1, necesitamos leer los hypnogramas
"""

import pandas as pd
import numpy as np
import mne
from pathlib import Path
from tqdm import tqdm

def _stages_per_epoch(annotations, epoch_duration=30.0):
    """Expand hypnogram annotations into one stage label per epoch.

    A ``Sleep stage X`` annotation is repeated for every epoch its duration
    covers, starting at the epoch given by its onset; uncovered epochs become
    ``'Unknown'``. Zero-duration annotations still occupy one epoch.
    """
    stage_map = {
        'W': 'W',
        '?': 'Unknown',
        '1': 'S1',
        '2': 'S2',
        '3': 'S3',
        '4': 'S4',
        'R': 'REM',
    }
    labels = []
    for onset, duration, desc in zip(annotations.onset, annotations.duration, annotations.description):
        if 'Sleep stage' not in desc:
            continue
        code = desc.replace('Sleep stage ', '').strip()
        label = stage_map.get(code, code)
        first = max(0, int(round(float(onset) / epoch_duration)))
        count = max(1, int(round(float(duration) / epoch_duration)))
        last = first + count
        if last > len(labels):
            labels.extend(['Unknown'] * (last - len(labels)))
        labels[first:last] = [label] * count
    return labels


def fix_sleep_stages_fast(features_file, data_path, epoch_duration=30.0):
    """Rebuild the ``sleep_stage`` column of a feature file from hypnograms.

    Parameters
    ----------
    features_file : str or Path
        Parquet file with ``subject_id``, ``epoch_idx`` and ``sleep_stage``.
    data_path : str or Path
        Folder searched for ``<subject_id>*Hypnogram*.edf`` files.
    epoch_duration : float, default 30.0
        Epoch length in seconds used to expand annotation durations.
        Assumes ``epoch_idx`` counts epochs of this length from the start of
        the recording — TODO confirm against the feature extractor.

    Returns
    -------
    pandas.DataFrame
        Only the rows that received one of 'W', 'S1', 'S2', 'S3', 'S4', 'REM'.
    """
    print("\n" + "="*80)
    print(" REPARANDO SLEEP STAGES")
    print("="*80)

    # Load the feature table.
    print(f"\n Cargando: {Path(features_file).name}")
    df = pd.read_parquet(features_file)
    print(f"    Shape: {df.shape}")
    print(f"    Sleep stages actuales: {df['sleep_stage'].unique()}")

    # Load one hypnogram per subject.
    data_path = Path(data_path)
    print(f"\n Leyendo hypnogramas desde: {data_path}")

    hypnogram_cache = {}
    unique_subjects = df['subject_id'].unique()

    print(f"   Sujetos a procesar: {len(unique_subjects)}")

    for subject_id in tqdm(unique_subjects, desc="Cargando hypnogramas"):
        hypno_files = list(data_path.glob(f"{subject_id}*Hypnogram*.edf"))

        if not hypno_files:
            continue

        try:
            # No ``verbose`` keyword here: passing one made every call fail,
            # and the failure used to be swallowed silently.
            annotations = mne.read_annotations(hypno_files[0])
            hypnogram_cache[subject_id] = _stages_per_epoch(annotations, epoch_duration)
        except Exception as e:
            # Best effort: report the unreadable file and keep going.
            print(f"   No se pudo leer {hypno_files[0].name}: {e}")
            continue

    print(f"\n   ‚úì Hypnogramas cargados: {len(hypnogram_cache)}")

    # Assign labels, one vectorised write per subject.
    print(f"\n Asignando sleep stages...")

    assigned = 0
    new_labels = np.full(len(df), None, dtype=object)
    subject_values = df['subject_id'].to_numpy()
    epoch_values = df['epoch_idx'].to_numpy()

    for subject_id, sleep_stages in tqdm(hypnogram_cache.items(), desc="Asignando"):
        rows = np.flatnonzero(subject_values == subject_id)
        epochs = epoch_values[rows]
        # Negative indices would silently wrap around, so reject them too.
        in_range = (epochs >= 0) & (epochs < len(sleep_stages))
        rows = rows[in_range]
        if rows.size == 0:
            continue
        new_labels[rows] = [sleep_stages[int(e)] for e in epochs[in_range]]
        assigned += int(rows.size)

    print(f"\n    √âpocas etiquetadas: {assigned:,} / {len(df):,}")

    # Replace the old column with the rebuilt labels.
    df['sleep_stage'] = new_labels

    # Keep only rows with a valid stage.
    valid_stages = ['W', 'S1', 'S2', 'S3', 'S4', 'REM']
    df = df[df['sleep_stage'].isin(valid_stages)]

    print(f"    Shape final: {df.shape}")

    # Show the distribution.
    print(f"\n Distribuci√≥n de Sleep Stages:")
    stage_counts = df['sleep_stage'].value_counts()
    for stage, count in stage_counts.items():
        pct = (count / len(df)) * 100
        print(f"      {stage:5s}: {count:7,} ({pct:5.1f}%)")

    return df


def save_fixed_file(df, original_file):
    """Store ``df`` as ``features_FIXED.parquet`` beside ``original_file``.

    Prints the file name, its size in MB and the table shape, then returns
    the path of the written file.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" GUARDANDO ARCHIVO REPARADO")
    print(rule)

    destination = Path(original_file).parent.joinpath("features_FIXED.parquet")

    print(f"\n Guardando: {destination.name}")
    df.to_parquet(destination, index=False)

    megabytes = destination.stat().st_size / (1024 * 1024)
    print(f"    Tama√±o: {megabytes:.1f} MB")
    print(f"    Shape: {df.shape}")

    return destination


def main():
    """Run the quick sleep-stage repair on the hard-coded feature file.

    Returns ``None`` in every case; when either path is missing a message is
    printed and nothing else happens.
    """
    rule = "=" * 80

    print("\n" + rule)
    print(" FIX R√ÅPIDO - Sleep Stages en -1")
    print(rule)

    # Inputs
    features_file = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\features_batch\features_complete_labeled.parquet"
    data_path = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette"

    # Stop at the first missing input.
    for required in (features_file, data_path):
        if not Path(required).exists():
            print(f" No se encuentra: {required}")
            return

    # Repair, then persist.
    repaired = fix_sleep_stages_fast(features_file, data_path)
    written = save_fixed_file(repaired, features_file)

    print("\n" + rule)
    print(" REPARACI√ìN COMPLETADA")
    print(rule)
    print(f"\n Usa el nuevo archivo:")
    print(f"   {written.name}")
    print(f"\n   df = pd.read_parquet('{written.name}')")


if __name__ == "__main__":
    main()

In [None]:
"""
Diagnóstico completo y fix de sleep stages
Detecta automáticamente el patrón de nombres
"""

import pandas as pd
import numpy as np
import mne
from pathlib import Path
from tqdm import tqdm
import re

def diagnose_files(features_file, data_path):
    """Print a side-by-side view of subject ids and hypnogram file names.

    Loads ``features_file``, lists up to ten of its subject ids, lists up to
    ten ``*Hypnogram*.edf`` files found in ``data_path`` and shows which id
    patterns match the first file name.

    Returns ``(df, hypno_files)``: the loaded DataFrame and the list of
    hypnogram paths.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" DIAGN√ìSTICO")
    print(rule)

    # 1. Subject ids present in the feature table.
    df = pd.read_parquet(features_file)
    subject_column = df['subject_id']
    print(f"\nüìä DataFrame:")
    print(f"   Shape: {df.shape}")
    print(f"   Subject IDs √∫nicos: {subject_column.nunique()}")
    print(f"\n   Primeros 10 subject_ids:")
    for subject in subject_column.unique()[:10]:
        print(f"      '{subject}'")

    # 2. Hypnogram files available on disk.
    hypno_files = list(Path(data_path).glob("*Hypnogram*.edf"))

    print(f"\n Archivos Hypnogram encontrados: {len(hypno_files)}")
    print(f"\n   Primeros 10 archivos:")
    for path in hypno_files[:10]:
        print(f"      {path.name}")

    # 3. Try a few id patterns against the first file name.
    if not hypno_files:
        return df, hypno_files

    sample_name = hypno_files[0].name
    print(f"\n Ejemplo de nombre de archivo:")
    print(f"   {sample_name}")

    # Candidate id shapes, e.g. SC4001E0, ST7011J0, or digits only.
    candidate_patterns = (
        r'(SC\d+[A-Z]\d+)',
        r'(ST\d+[A-Z]\d+)',
        r'(\d+)',
    )
    for found in (re.search(expr, sample_name) for expr in candidate_patterns):
        if found is not None:
            print(f"   Posible ID: '{found.group(1)}'")

    return df, hypno_files


def create_subject_mapping(df_subject_ids, hypno_files):
    """Map each subject id to the first hypnogram file whose name contains it.

    A file matches when the subject id appears in its name, either verbatim
    or after removing ``-``, ``_`` and spaces from both strings.

    Parameters
    ----------
    df_subject_ids : sequence of str
        Subject ids taken from the feature table.
    hypno_files : sequence of Path
        Candidate hypnogram files.

    Returns
    -------
    dict
        ``{subject_id: Path}`` for the subjects that could be matched.
    """
    print("\n" + "="*80)
    print("CREANDO MAPEO")
    print("="*80)

    strip_separators = str.maketrans('', '', '-_ ')
    # Cleaned file names do not depend on the subject: compute them once.
    candidates = [
        (hypno_file, hypno_file.name.translate(strip_separators))
        for hypno_file in hypno_files
    ]

    mapping = {}
    unmatched = 0
    max_warnings = 5  # cap the per-subject warnings so the log stays readable

    for subject_id in tqdm(df_subject_ids, desc="Mapeando sujetos"):
        clean_id = str(subject_id).translate(strip_separators)
        match = next(
            (
                hypno_file
                for hypno_file, clean_name in candidates
                if subject_id in hypno_file.name or clean_id in clean_name
            ),
            None,
        )

        if match is not None:
            mapping[subject_id] = match
            continue

        # Count misses (not successes) so only the first few are reported.
        unmatched += 1
        if unmatched <= max_warnings:
            print(f"     No se encontr√≥ hypnogram para: '{subject_id}'")

    print(f"\n   ‚úì Mapeos exitosos: {len(mapping)} / {len(df_subject_ids)}")

    if len(mapping) > 0:
        print(f"\n   Ejemplos de mapeo:")
        for sid, hfile in list(mapping.items())[:3]:
            print(f"      '{sid}' ‚Üí {hfile.name}")

    return mapping


def load_hypnogram(hypno_file, epoch_duration=30.0):
    """Read a hypnogram file and return one stage label per epoch.

    Parameters
    ----------
    hypno_file : str or Path
        Hypnogram annotation file.
    epoch_duration : float, default 30.0
        Epoch length in seconds. Each ``Sleep stage X`` annotation is repeated
        for every epoch its duration covers, starting at the epoch given by
        its onset. Assumes the consumer's epoch index counts epochs of this
        length from the start of the recording — TODO confirm.

    Returns
    -------
    list of str or None
        Labels 'W', 'S1', 'S2', 'S3', 'S4', 'REM' or 'Unknown' (also used for
        epochs no sleep-stage annotation covers); ``None`` when the file
        cannot be read.
    """
    stage_map = {
        'W': 'W',
        '?': 'Unknown',
        '1': 'S1',
        '2': 'S2',
        '3': 'S3',
        '4': 'S4',
        'R': 'REM'
    }

    try:
        # No ``verbose`` keyword here: passing one made every call fail, and
        # the failure used to be swallowed silently.
        annotations = mne.read_annotations(hypno_file)

        sleep_stages = []
        for onset, duration, desc in zip(annotations.onset, annotations.duration, annotations.description):
            if 'Sleep stage' not in desc:
                continue
            stage = desc.replace('Sleep stage ', '').strip()
            mapped_stage = stage_map.get(stage, stage)

            first = max(0, int(round(float(onset) / epoch_duration)))
            # Zero-duration annotations still occupy one epoch.
            count = max(1, int(round(float(duration) / epoch_duration)))
            last = first + count
            if last > len(sleep_stages):
                sleep_stages.extend(['Unknown'] * (last - len(sleep_stages)))
            sleep_stages[first:last] = [mapped_stage] * count

        return sleep_stages

    except Exception as e:
        # Best effort: report the problem and let the caller skip this file.
        print(f"   No se pudo leer {hypno_file}: {e}")
        return None


def fix_sleep_stages(features_file, data_path):
    """Diagnose, map, load and assign sleep stages in one pass.

    Returns the rows that ended up with one of 'W', 'S1', 'S2', 'S3', 'S4',
    'REM', or ``None`` when no hypnogram file exists or no subject could be
    mapped to one.
    """
    rule = "=" * 80

    # 1. Diagnosis
    df, hypno_files = diagnose_files(features_file, data_path)
    if not hypno_files:
        print("\n No se encontraron archivos hypnogram")
        return None

    # 2. Subject -> file mapping
    mapping = create_subject_mapping(df['subject_id'].unique(), hypno_files)
    if not mapping:
        print("\n No se pudo mapear ning√∫n sujeto")
        print("\n Posibles soluciones:")
        print("   1. Verifica que los subject_ids coincidan con nombres de archivos")
        print("   2. Los archivos deben contener 'Hypnogram' en el nombre")
        return None

    # 3. Load the hypnograms, keeping only non-empty results
    print("\n" + rule)
    print(" CARGANDO HYPNOGRAMAS")
    print(rule)

    hypnogram_cache = {}
    for subject_id, hypno_file in tqdm(mapping.items(), desc="Cargando"):
        loaded = load_hypnogram(hypno_file)
        if loaded:
            hypnogram_cache[subject_id] = loaded

    print(f"   ‚úì Hypnogramas cargados: {len(hypnogram_cache)}")

    # 4. Assign labels row by row
    print("\n  Asignando sleep stages...")

    assigned = 0
    df['sleep_stage_new'] = None

    for subject_id, sleep_stages in tqdm(hypnogram_cache.items(), desc="Asignando"):
        n_stages = len(sleep_stages)
        for row_label in df.index[df['subject_id'] == subject_id]:
            epoch_idx = df.loc[row_label, 'epoch_idx']
            if not epoch_idx < n_stages:
                continue
            df.loc[row_label, 'sleep_stage_new'] = sleep_stages[epoch_idx]
            assigned += 1

    print(f"\n    √âpocas etiquetadas: {assigned:,} / {len(df):,}")

    # 5. Swap the temporary column in as the real one
    df['sleep_stage'] = df.pop('sleep_stage_new')

    # 6. Keep valid stages only
    keep = df['sleep_stage'].isin(['W', 'S1', 'S2', 'S3', 'S4', 'REM'])
    df_valid = df[keep]

    print(f"   Shape final: {df_valid.shape}")

    # 7. Distribution
    print(f"\n Distribuci√≥n de Sleep Stages:")
    n_valid = len(df_valid)
    for stage, count in df_valid['sleep_stage'].value_counts().items():
        pct = (count / n_valid) * 100
        print(f"      {stage:5s}: {count:7,} ({pct:5.1f}%)")

    return df_valid


def save_fixed_file(df, original_file):
    """Write ``df`` to ``features_FIXED.parquet`` in ``original_file``'s folder.

    Prints the file name, size in MB and table shape; returns the output path.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" GUARDANDO")
    print(rule)

    destination = Path(original_file).parent.joinpath("features_FIXED.parquet")
    df.to_parquet(destination, index=False)

    megabytes = destination.stat().st_size / (1024 * 1024)

    for line in (
        f"    Archivo: {destination.name}",
        f"    Tama√±o: {megabytes:.1f} MB",
        f"    Shape: {df.shape}",
    ):
        print(line)

    return destination


def main():
    """Run the diagnosis-and-fix pipeline on the hard-coded feature file.

    Returns the repaired DataFrame, or ``None`` when an input path is missing
    or the repair produced nothing.
    """
    rule = "=" * 80

    print("\n" + rule)
    print(" DIAGN√ìSTICO Y FIX AUTOM√ÅTICO")
    print(rule)

    features_file = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\features_batch\features_complete_labeled.parquet"
    data_path = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette"

    # Stop at the first missing input.
    for required in (features_file, data_path):
        if not Path(required).exists():
            print(f" No se encuentra: {required}")
            return

    # Fix
    df_fixed = fix_sleep_stages(features_file, data_path)
    if df_fixed is None or not len(df_fixed):
        print("\n No se pudo reparar el archivo")
        return

    # Save
    written = save_fixed_file(df_fixed, features_file)

    print("\n" + rule)
    print(" COMPLETADO")
    print(rule)
    print(f"\n Usa:")
    print(f"   df = pd.read_parquet('{written.name}')")

    return df_fixed


if __name__ == "__main__":
    df = main()

# NCA y selección de mejores features

In [None]:
"""
NCA Feature Selection - Análisis exploratorio
Selecciona 30 mejores features basado en varianza y correlación
Ya que no tenemos etiquetas, usamos métodos no supervisados
"""

import pandas as pd
import numpy as np
from pathlib import Path
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.feature_selection import VarianceThreshold, mutual_info_classif
import matplotlib.pyplot as plt
import seaborn as sns
from tqdm import tqdm
import pickle

class FeatureSelectorUnsupervised:
    """
    Selector de features usando m√©todos no supervisados
    """
    
    def __init__(self, features_file, output_dir):
        self.features_file = Path(features_file)
        self.output_dir = Path(output_dir)
        self.output_dir.mkdir(parents=True, exist_ok=True)
        
    def load_data(self):
        """Load the feature file and split its columns into metadata/features.

        Returns
        -------
        tuple
            ``(df, feature_cols, meta_cols)``: the loaded DataFrame, the names
            of the columns treated as features, and the names of the metadata
            columns found in the file (both in file order).
        """
        print("\n" + "="*80)
        print(" CARGANDO DATOS")
        print("="*80)

        df = pd.read_parquet(self.features_file)
        print(f"    Shape: {df.shape}")
        # Never report a negative number of remaining columns.
        print(f"    Columnas: {df.columns.tolist()[:10]} ... (+{max(len(df.columns)-10, 0)} m√°s)")

        # Known metadata names. Includes both naming variants used in this
        # notebook (epoch_idx/epoch_num, channel/channel_name) plus
        # channel_type and sfreq, so bookkeeping columns are not ranked as
        # features.
        possible_meta = ['subject_id', 'epoch_idx', 'sleep_stage', 'channel',
                        'subject', 'epoch', 'label', 'stage',
                        'channel_name', 'channel_type', 'epoch_num', 'sfreq']
        meta_cols = [col for col in df.columns if col in possible_meta]

        print(f"    Columnas de metadata detectadas: {meta_cols}")

        # Everything else is a feature.
        feature_cols = [col for col in df.columns if col not in meta_cols]

        print(f"    Features: {len(feature_cols)}")
        print(f"    Sujetos: {df['subject_id'].nunique() if 'subject_id' in df.columns else 'N/A'}")

        return df, feature_cols, meta_cols
    
    def select_features_by_variance_and_correlation(self, df, feature_cols, n_features=30):
        """Rank numeric features without labels and keep the top ``n_features``.

        Steps, in order:
        1. Keep numeric columns, replace NaN/inf with 0 and standardise.
        2. Drop columns rejected by ``VarianceThreshold(threshold=0.01)``,
           applied to the standardised data.
        3. Drop columns whose absolute correlation with an earlier column
           exceeds 0.95.
        4. Fit a PCA and score each remaining feature by the sum of its
           absolute loadings weighted by each component's explained
           variance ratio.

        Parameters
        ----------
        df : pandas.DataFrame
            Table holding the candidate columns.
        feature_cols : list of str
            Candidate column names; non-numeric ones are ignored.
        n_features : int, default 30
            Number of top-ranked features to return.

        Returns
        -------
        tuple
            ``(selected_features, scaler, pca, weighted_importance)``.
            ``weighted_importance`` is indexed like the features that survived
            the correlation filter. When no column is numeric the result is
            ``([], None, None, None)``.
        """
        print("\n" + "="*80)
        print(" SELECCI√ìN DE FEATURES - M√©todo H√≠brido")
        print("="*80)
        
        # Restrict to the candidate columns
        print(f"\nüîç Filtrando columnas num√©ricas...")
        df_features = df[feature_cols]
        
        # Identify the numeric columns
        numeric_cols = df_features.select_dtypes(include=[np.number]).columns.tolist()
        print(f"   Features num√©ricas: {len(numeric_cols)} / {len(feature_cols)}")
        
        if len(numeric_cols) == 0:
            print("    No hay columnas num√©ricas!")
            return [], None, None, None
        
        # Work on the numeric columns only
        X = df_features[numeric_cols].values
        print(f"   X shape: {X.shape}")
        
        # Clean the data; NaN/inf are only counted when X is float32/float64
        print(f"\n Limpieza:")
        nan_count = np.isnan(X).sum() if X.dtype in [np.float32, np.float64] else 0
        inf_count = np.isinf(X).sum() if X.dtype in [np.float32, np.float64] else 0
        print(f"   NaNs: {nan_count}")
        print(f"   Infs: {inf_count}")
        
        X = np.nan_to_num(X, nan=0.0, posinf=0.0, neginf=0.0)
        
        # From here on feature_cols means "numeric feature columns"
        feature_cols = numeric_cols
        
        # Standardise
        print(f"\n Normalizando...")
        scaler = StandardScaler()
        X_scaled = scaler.fit_transform(X)
        
        # 1. Variance filter (runs on the standardised matrix)
        print(f"\n  1 Filtro por Varianza")
        selector = VarianceThreshold(threshold=0.01)  # drop features with very low variance
        X_var = selector.fit_transform(X_scaled)
        selected_mask = selector.get_support()
        features_after_var = [f for f, m in zip(feature_cols, selected_mask) if m]
        
        print(f"   Features despu√©s de filtro: {len(features_after_var)} / {len(feature_cols)}")
        
        # 2. Correlation analysis
        print(f"\n 2 An√°lisis de Correlaci√≥n")
        df_features = pd.DataFrame(X_var, columns=features_after_var)
        corr_matrix = df_features.corr().abs()
        
        # Keep only the strict upper triangle so each pair is inspected once
        upper_triangle = corr_matrix.where(
            np.triu(np.ones(corr_matrix.shape), k=1).astype(bool)
        )
        
        # A column is dropped when it correlates > 0.95 with any earlier column
        to_drop = [column for column in upper_triangle.columns 
                   if any(upper_triangle[column] > 0.95)]
        
        features_after_corr = [f for f in features_after_var if f not in to_drop]
        X_corr = df_features[features_after_corr].values
        
        print(f"   Features altamente correlacionadas eliminadas: {len(to_drop)}")
        print(f"   Features restantes: {len(features_after_corr)}")
        
        # 3. PCA-based importance
        # NOTE(review): n_components is capped by the feature count only, not
        # by the number of rows.
        print(f"\n 3 PCA para Ranking de Importancia")
        pca = PCA(n_components=min(50, len(features_after_corr)))
        X_pca = pca.fit_transform(X_corr)
        
        # Importance comes from the absolute loadings: features contributing
        # more to the leading components score higher
        components_abs = np.abs(pca.components_)
        
        # Weight each component by its explained variance ratio
        weighted_importance = np.zeros(len(features_after_corr))
        for i, comp in enumerate(components_abs):
            weighted_importance += comp * pca.explained_variance_ratio_[i]
        
        # Take the top N, highest score first
        top_indices = np.argsort(weighted_importance)[-n_features:][::-1]
        selected_features = [features_after_corr[i] for i in top_indices]
        
        print(f"\n    Top {n_features} features seleccionadas")
        
        # Show the first 15
        print(f"\n Top 15 Features:")
        for i, feat in enumerate(selected_features[:15], 1):
            imp = weighted_importance[features_after_corr.index(feat)]
            print(f"      {i:2d}. {feat:60s} (imp: {imp:.6f})")
        
        return selected_features, scaler, pca, weighted_importance
    
    def separate_by_channel_type(self, selected_features):
        """Split the selected feature names into an EOG list and an EEG list.

        Membership is decided by a case-insensitive substring test on each
        name. A name containing both tags appears in both lists; a name
        containing neither appears in none.

        Parameters
        ----------
        selected_features : list of str
            Feature names to classify.

        Returns
        -------
        tuple of (list, list)
            ``(eog_features, eeg_features)``, each in input order.
        """
        rule = "=" * 80
        print("\n" + rule)
        print("SEPARACI√ìN POR CANAL")
        print(rule)

        def _names_tagged(tag):
            # One independent pass over the input per channel tag.
            return [name for name in selected_features if tag in name.upper()]

        eog_features = _names_tagged('EOG')
        eeg_features = _names_tagged('EEG')

        for label, group in (("EOG", eog_features), ("EEG", eeg_features)):
            print(f"   {label} features: {len(group)}")

        return eog_features, eeg_features
    
    def save_results(self, df, selected_features, scaler, pca, output_name, description, meta_cols):
        """Write one feature selection to ``self.output_dir / output_name``.

        The folder is created if needed. Files written: the reduced table as
        Parquet and as a gzip-compressed pickle, a numbered text list of the
        selected features, a pickle holding the scaler / PCA / feature names,
        an ``info.txt`` summary, and the ranking plot drawn by
        ``_plot_feature_importance`` for the first 15 features.

        Parameters
        ----------
        df : pandas.DataFrame
            Full table; must contain every name in ``selected_features``.
        selected_features : list of str
            Feature columns to keep, in ranking order.
        scaler, pca : object
            Stored as-is inside ``feature_selector.pkl``.
        output_name : str
            Sub-folder name under ``self.output_dir``.
        description : str
            Label written into the text reports.
        meta_cols : iterable of str
            Candidate metadata columns; those missing from ``df`` are skipped.

        Returns
        -------
        pandas.DataFrame
            Copy of ``df`` restricted to the metadata and selected columns.
        """
        rule = "=" * 80
        print("\n" + rule)
        print(f" GUARDANDO - {output_name}")
        print(rule)

        target_dir = self.output_dir / output_name
        target_dir.mkdir(parents=True, exist_ok=True)

        # --- 1. Reduced table: metadata that actually exists + selected features
        meta_present = [name for name in meta_cols if name in df.columns]
        df_selected = df[meta_present + selected_features].copy()

        parquet_path = target_dir / "features_selected_30.parquet"
        df_selected.to_parquet(parquet_path, index=False)
        megabytes = parquet_path.stat().st_size / (1024*1024)
        print(f"   ‚úì Parquet: {parquet_path.name} ({megabytes:.1f} MB)")

        pickle_path = target_dir / "features_selected_30.pkl"
        df_selected.to_pickle(pickle_path, compression='gzip')
        print(f"   ‚úì Pickle: {pickle_path.name}")

        # --- 2. Human-readable, numbered list of the selected features
        list_path = target_dir / "selected_features_list.txt"
        with open(list_path, 'w', encoding='utf-8') as handle:
            handle.writelines([
                rule + "\n",
                f"FEATURES SELECCIONADAS - {description}\n",
                rule + "\n\n",
                f"Total: {len(selected_features)}\n",
                "M√©todo: Varianza + Correlaci√≥n + PCA\n\n",
            ])
            handle.writelines(
                f"{rank:2d}. {name}\n"
                for rank, name in enumerate(selected_features, 1)
            )
        print(f"   ‚úì Lista: {list_path.name}")

        # --- 3. Objects needed to re-apply the selection later
        model_path = target_dir / "feature_selector.pkl"
        bundle = {
            'scaler': scaler,
            'pca': pca,
            'selected_features': selected_features,
        }
        with open(model_path, 'wb') as handle:
            pickle.dump(bundle, handle)
        print(f"   ‚úì Modelo: {model_path.name}")

        # --- 4. Summary of the saved table
        info_path = target_dir / "info.txt"
        with open(info_path, 'w', encoding='utf-8') as handle:
            handle.writelines([
                rule + "\n",
                f"INFORMACI√ìN - {description}\n",
                rule + "\n\n",
                f"Shape: {df_selected.shape}\n",
                f"√âpocas: {len(df_selected):,}\n",
                f"Features: {len(selected_features)}\n",
            ])

            # The per-subject breakdown is only possible when the column exists.
            if 'subject_id' in df_selected.columns:
                subjects = df_selected['subject_id']
                handle.write(f"Sujetos: {subjects.nunique()}\n\n")
                handle.write("Sujetos:\n")
                for subject in sorted(subjects.unique()):
                    n_rows = (subjects == subject).sum()
                    handle.write(f"  {subject}: {n_rows:,} √©pocas\n")
        print(f"   ‚úì Info: {info_path.name}")

        # --- 5. Ranking plot (at most the first 15 features)
        self._plot_feature_importance(selected_features[:15], target_dir)

        print(f"\n Guardado en: {target_dir}")

        return df_selected
    
    def _plot_feature_importance(self, top_features, output_path):
        """Save a horizontal bar chart of the ranked features.

        Bar lengths are the descending ranks ``len(top_features) .. 1``, not
        importance scores; the first feature gets the longest bar and is drawn
        at the top. The figure is written to ``feature_ranking.png`` inside
        ``output_path`` at 150 dpi and then closed.

        Parameters
        ----------
        top_features : sequence of str
            Feature names in ranking order (used as y tick labels).
        output_path : pathlib.Path
            Existing folder that receives the PNG.
        """
        count = len(top_features)
        rows = np.arange(count)
        rank_scores = range(count, 0, -1)

        plt.figure(figsize=(12, 8))
        plt.barh(rows, rank_scores, color='steelblue')
        plt.yticks(rows, top_features, fontsize=8)
        plt.xlabel('Ranking', fontsize=11, fontweight='bold')
        plt.title('Top 15 Features Seleccionadas', fontsize=13, fontweight='bold')
        # Flip the axis so the best-ranked feature sits at the top.
        plt.gca().invert_yaxis()
        plt.grid(axis='x', alpha=0.3)
        plt.tight_layout()

        target = output_path / "feature_ranking.png"
        plt.savefig(target, dpi=150, bbox_inches='tight')
        plt.close()

        print(f"   ‚úì Gr√°fico: {target.name}")


def main():
    """Select the 30 best features and save the global, EOG-only and EEG-only sets.

    Loads the labelled feature table from a hard-coded path, runs the
    variance / correlation / PCA selection once over all features and saves
    it. The EOG-only and EEG-only subsets of that same selection are then
    saved too, skipping any subset that is empty. The subsets reuse the
    scaler and PCA fitted on the full selection.

    Returns ``None``. Prints a message and returns early when the input
    file does not exist.
    """
    rule = "=" * 80

    print("\n" + rule)
    print(" SELECCI√ìN DE 30 MEJORES FEATURES")
    print("   M√©todo: Varianza + Correlaci√≥n + PCA")
    print(rule)

    # Configuration (machine-specific paths)
    features_file = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\features_batch\features_complete_labeled.parquet"
    output_dir = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf"

    if not Path(features_file).exists():
        print(f" No se encuentra: {features_file}")
        return

    selector = FeatureSelectorUnsupervised(features_file, output_dir)
    df, feature_cols, meta_cols = selector.load_data()

    # ---- Part 1: selection over every feature (EOG + EEG together) ----
    print("\n" + rule)
    print(" SELECCI√ìN GLOBAL: EOG + EEG")
    print(rule)

    selection = selector.select_features_by_variance_and_correlation(
        df, feature_cols, n_features=30
    )
    selected_all, scaler_all, pca_all, _importance_all = selection

    selector.save_results(
        df, selected_all, scaler_all, pca_all,
        "selected_features_all", "EOG + EEG (30 features)", meta_cols
    )

    # ---- Part 2: per-channel subsets of the same selection ----
    eog_features, eeg_features = selector.separate_by_channel_type(selected_all)

    # (summary position, channel tag, feature names, output folder)
    subsets = (
        (2, "EOG", eog_features, "selected_features_eog_only"),
        (3, "EEG", eeg_features, "selected_features_eeg_only"),
    )

    for _position, channel, names, folder in subsets:
        if len(names) == 0:
            continue

        print("\n" + rule)
        print(f" SUBSET: Solo {channel} ({len(names)} features)")
        print(rule)

        selector.save_results(
            df, names, scaler_all, pca_all,
            folder, f"Solo {channel} ({len(names)} features)", meta_cols
        )

    # ---- Summary ----
    print("\n" + rule)
    print(" COMPLETADO")
    print(rule)

    print(f"\n Resultados en: {output_dir}")
    print("\n Carpetas creadas:")
    print("   1. selected_features_all/       - EOG + EEG (30 features)")
    for position, channel, names, folder in subsets:
        if len(names) > 0:
            print(f"   {position}. {folder}/  - Solo {channel} ({len(names)} features)")

    closing_notes = (
        "\n  NOTA: Las etiquetas de sleep_stage est√°n en -1",
        "   Necesitar√°s cargar las etiquetas reales desde los hypnogramas",
        "   o desde otro archivo para poder entrenar modelos.",
        "\n Para usar:",
        "   df = pd.read_parquet('selected_features_all/features_selected_30.parquet')",
        "   # Asignar etiquetas manualmente despu√©s",
    )
    for note in closing_notes:
        print(note)


if __name__ == "__main__":
    main()

In [None]:
"""
Fix completo: Lee hypnogramas correctamente y actualiza archivos NCA
"""

import pandas as pd
import numpy as np
import mne
from pathlib import Path
from tqdm import tqdm
import re

def parse_sleep_edf_hypnogram(hypno_file, epoch_duration=30):
    """Read a Sleep-EDF hypnogram and return one sleep-stage label per epoch.

    The annotations are stored as EDF+ "time-stamped annotation lists":
    ``+<onset>\\x15<duration>\\x14<text>\\x14``, with onset and duration in
    seconds. A single entry can span many epochs, so each annotation is
    repeated ``round(duration / epoch_duration)`` times (at least once). This
    keeps position ``i`` of the returned list aligned with the i-th epoch of
    the recording. Annotations with a duration but no recognised
    ``Sleep stage X`` text (for example "Movement time") are emitted as
    ``'Unknown'`` so they still occupy their epochs and do not shift the
    alignment.

    When no annotation with a duration can be found, the function falls back
    to one label per ``Sleep stage X`` occurrence.

    Parameters
    ----------
    hypno_file : str or path-like
        Path of the ``*Hypnogram*.edf`` file.
    epoch_duration : int or float, default 30
        Epoch length in seconds used to expand annotation durations.

    Returns
    -------
    list of str or None
        Labels among ``'W'``, ``'S1'``..``'S4'``, ``'REM'`` and ``'Unknown'``.
        ``None`` when the file cannot be read or contains no sleep-stage
        annotation.

    Raises
    ------
    ValueError
        If ``epoch_duration`` is not positive.
    """
    if epoch_duration <= 0:
        raise ValueError("epoch_duration must be positive")

    # Only the file access is allowed to fail quietly; callers treat None as
    # "could not load this hypnogram".
    try:
        with open(hypno_file, 'rb') as f:
            content = f.read()
    except OSError:
        return None

    annotations_start = content.find(b'Sleep stage')
    if annotations_start == -1:
        return None

    stage_map = {
        'W': 'W',
        '?': 'Unknown',
        '1': 'S1',
        '2': 'S2',
        '3': 'S3',
        '4': 'S4',
        'R': 'REM'
    }
    stage_pattern = re.compile(r'Sleep stage ([W1234R\?])')
    # onset, \x15, duration, \x14, annotation text, \x14
    tal_pattern = re.compile(
        rb'[+-]\d+(?:\.\d+)?\x15(\d+(?:\.\d+)?)\x14([^\x14\x00]*)\x14'
    )

    sleep_stages = []
    found_stage = False
    for raw_duration, raw_text in tal_pattern.findall(content):
        n_epochs = max(1, int(round(float(raw_duration) / epoch_duration)))
        match = stage_pattern.search(raw_text.decode('latin-1', errors='ignore'))
        if match:
            found_stage = True
            label = stage_map.get(match.group(1), match.group(1))
        else:
            # Non-stage annotation: keep its slots so later epochs stay aligned.
            label = 'Unknown'
        sleep_stages.extend([label] * n_epochs)

    if found_stage:
        return sleep_stages

    # Fallback for content without duration information: one label per match.
    annotations_text = content[annotations_start:].decode('latin-1', errors='ignore')
    matches = stage_pattern.findall(annotations_text)
    if not matches:
        return None

    return [stage_map.get(stage, stage) for stage in matches]


def create_subject_mapping(subject_ids, data_path):
    """Pair each subject id with a hypnogram file found in ``data_path``.

    A subject matches the first ``*Hypnogram*.edf`` file whose name contains
    the subject id with its last character removed. Subjects without a match
    are left out of the result.

    Parameters
    ----------
    subject_ids : sequence of str
        Identifiers to look up.
    data_path : str or pathlib.Path
        Folder scanned (non-recursively) for hypnogram files.

    Returns
    -------
    dict
        ``{subject_id: pathlib.Path of the hypnogram file}``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print("üîó MAPEANDO SUJETOS CON HYPNOGRAMAS")
    print(rule)

    folder = Path(data_path)
    candidates = list(folder.glob("*Hypnogram*.edf"))

    print(f"   Archivos hypnogram encontrados: {len(candidates)}")

    mapping = {}

    for subject_id in tqdm(subject_ids, desc="Mapeando"):
        # Drop the trailing character of the id before searching file names.
        stem = subject_id[:-1]
        hit = next((path for path in candidates if stem in path.name), None)
        if hit is not None:
            mapping[subject_id] = hit

    print(f"   ‚úì Mapeos exitosos: {len(mapping)} / {len(subject_ids)}")

    if mapping:
        print("\n   Ejemplos:")
        for sid, hfile in list(mapping.items())[:5]:
            print(f"      '{sid}' ‚Üí {hfile.name}")

    return mapping


def load_all_hypnograms(mapping):
    """Parse every mapped hypnogram file and collect the results per subject.

    Parameters
    ----------
    mapping : dict
        ``{subject_id: hypnogram file path}``.

    Returns
    -------
    dict
        ``{subject_id: list of stage labels}`` for every file that
        ``parse_sleep_edf_hypnogram`` could read; subjects whose file yields
        nothing (``None`` or an empty list) are omitted and only reported.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" CARGANDO HYPNOGRAMAS")
    print(rule)

    loaded = {}
    unreadable = []

    for subject_id, hypno_file in tqdm(mapping.items(), desc="Cargando"):
        parsed = parse_sleep_edf_hypnogram(hypno_file)
        if not parsed:
            unreadable.append(subject_id)
            continue
        loaded[subject_id] = parsed

    print(f"\n   ‚úì Exitosos: {len(loaded)}")
    if unreadable:
        print(f"     Fallidos: {len(unreadable)}")
        # Show only the first few failures to keep the output short.
        for sid in unreadable[:5]:
            print(f"      - {sid}")

    return loaded


def assign_labels_to_dataframe(df, hypnogram_cache):
    """Attach hypnogram stage labels to the rows of ``df`` and keep valid ones.

    For every subject in ``hypnogram_cache`` the k-th row of that subject (in
    the current row order of ``df``) receives the k-th label of its hypnogram.
    Rows beyond the end of the hypnogram, and rows of subjects without a
    hypnogram, get no label. The ``sleep_stage`` column is replaced by the new
    labels and only rows whose label is one of W, S1-S4 or REM are returned.

    The input frame is not modified: labels are assembled separately and
    written into a copy. Rows are addressed by position, so repeated index
    labels cannot cause one assignment to overwrite several rows.

    Parameters
    ----------
    df : pandas.DataFrame
        Must contain a ``subject_id`` column. Rows of a subject are assumed
        to be ordered by epoch.
    hypnogram_cache : dict
        ``{subject_id: sequence of stage labels}``.

    Returns
    -------
    pandas.DataFrame
        New frame with the corrected ``sleep_stage`` column, restricted to
        rows carrying a valid stage.
    """
    print("\n" + "="*80)
    print("  ASIGNANDO ETIQUETAS")
    print("="*80)

    # Labels are built in a separate object array (None = not labelled).
    labels = np.full(len(df), None, dtype=object)
    subject_column = df['subject_id']
    assigned = 0

    for subject_id in tqdm(hypnogram_cache.keys(), desc="Asignando"):
        stages = hypnogram_cache[subject_id]

        # Positions (not index labels) of this subject's rows, in row order.
        positions = np.flatnonzero((subject_column == subject_id).to_numpy())

        n_labelled = min(len(positions), len(stages))
        if n_labelled:
            labels[positions[:n_labelled]] = np.asarray(
                list(stages[:n_labelled]), dtype=object
            )
            assigned += n_labelled

    print(f"\n    √âpocas etiquetadas: {assigned:,} / {len(df):,}")

    # Replace the original column on a copy so the caller's frame is untouched.
    df = df.copy()
    df['sleep_stage'] = labels

    # Keep only rows with a recognised stage.
    valid_stages = ['W', 'S1', 'S2', 'S3', 'S4', 'REM']
    df_valid = df[df['sleep_stage'].isin(valid_stages)].copy()

    print(f"   ‚úì √âpocas v√°lidas: {len(df_valid):,}")

    # Stage distribution
    print(f"\n Distribuci√≥n:")
    stage_counts = df_valid['sleep_stage'].value_counts()
    for stage, count in sorted(stage_counts.items()):
        pct = (count / len(df_valid)) * 100
        print(f"      {stage:5s}: {count:7,} ({pct:5.1f}%)")

    return df_valid


def update_nca_folder(folder_path, hypnogram_cache, data_path):
    """Relabel the feature table stored in ``folder_path`` and save it.

    Reads ``features_selected_30.parquet``, assigns stage labels through
    ``assign_labels_to_dataframe`` and, when valid rows remain, writes:
    a backup of the table exactly as it was loaded, the corrected Parquet
    (overwriting the original), a gzip pickle, and ``info_UPDATED.txt``.

    Labelling runs on a copy of the loaded table, so the backup holds the
    data as it was read from disk even if the labelling step modifies the
    frame it receives.

    Parameters
    ----------
    folder_path : pathlib.Path
        Folder holding ``features_selected_30.parquet``.
    hypnogram_cache : dict
        ``{subject_id: list of stage labels}``.
    data_path : str
        Not used by this function; kept for call compatibility.

    Returns
    -------
    pandas.DataFrame or None
        The corrected table, or ``None`` when the Parquet file is missing or
        no row ends up with a valid stage.
    """
    print("\n" + "="*80)
    print(f" ACTUALIZANDO: {folder_path.name}")
    print("="*80)

    parquet_file = folder_path / "features_selected_30.parquet"

    if not parquet_file.exists():
        print(f"   ‚ö†Ô∏è  No se encuentra: {parquet_file.name}")
        return None

    # Load
    df = pd.read_parquet(parquet_file)
    print(f"   ‚úì Cargado: {df.shape}")
    print(f"   ‚úì Sleep stages actuales: {df['sleep_stage'].unique()}")

    # Label a copy: `df` must stay exactly as loaded for the backup below.
    df_fixed = assign_labels_to_dataframe(df.copy(), hypnogram_cache)

    if len(df_fixed) == 0:
        print(f"   ‚ùå No quedaron filas v√°lidas")
        return None

    # Save the corrected version
    print(f"\n Guardando versi√≥n corregida...")

    # Backup of the original (unmodified) table
    backup_file = folder_path / "features_selected_30_BACKUP.parquet"
    df.to_parquet(backup_file, index=False)
    print(f"   ‚úì Backup: {backup_file.name}")

    # Overwrite with the corrected table
    df_fixed.to_parquet(parquet_file, index=False)
    size_mb = parquet_file.stat().st_size / (1024*1024)
    print(f"   ‚úì Actualizado: {parquet_file.name} ({size_mb:.1f} MB)")

    # Pickle as well
    pkl_file = folder_path / "features_selected_30.pkl"
    df_fixed.to_pickle(pkl_file, compression='gzip')
    print(f"   ‚úì Actualizado: {pkl_file.name}")

    # Text summary of the corrected table
    info_file = folder_path / "info_UPDATED.txt"
    with open(info_file, 'w', encoding='utf-8') as f:
        f.write("="*80 + "\n")
        f.write("ARCHIVO ACTUALIZADO CON ETIQUETAS CORRECTAS\n")
        f.write("="*80 + "\n\n")

        f.write(f"Shape: {df_fixed.shape}\n")
        f.write(f"√âpocas: {len(df_fixed):,}\n")
        f.write(f"Sujetos: {df_fixed['subject_id'].nunique()}\n\n")

        f.write("Distribuci√≥n de Sleep Stages:\n")
        stage_counts = df_fixed['sleep_stage'].value_counts()
        for stage, count in sorted(stage_counts.items()):
            pct = (count / len(df_fixed)) * 100
            f.write(f"  {stage:5s}: {count:7,} ({pct:5.1f}%)\n")

        f.write(f"\nSujetos:\n")
        for subject in sorted(df_fixed['subject_id'].unique()):
            n = (df_fixed['subject_id'] == subject).sum()
            f.write(f"  {subject}: {n:,} √©pocas\n")

    print(f"   ‚úì Info: {info_file.name}")

    return df_fixed


def main():
    """Relabel the three saved feature sets with stages read from the hypnograms.

    Steps: read the subject ids from the first output folder, map every
    subject to a hypnogram file, parse the hypnograms, then update each output
    folder through ``update_nca_folder``. Finally prints a per-folder check
    and usage hints.

    Returns ``None``. Stops early, after printing a message, when a required
    path is missing, no subject can be mapped, or no hypnogram can be loaded.
    """
    rule = "=" * 80

    print("\n" + rule)
    print(" FIX COMPLETO: ETIQUETAS + ACTUALIZACI√ìN NCA")
    print(rule)

    # Machine-specific locations
    nca_base_dir = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf"
    data_path = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\sleep-cassette"

    nca_folders = [
        "selected_features_all",
        "selected_features_eog_only",
        "selected_features_eeg_only"
    ]
    base_dir = Path(nca_base_dir)

    # Each prerequisite is checked in turn; the first missing one aborts.
    if not Path(data_path).exists():
        print(f"‚ùå No se encuentra: {data_path}")
        return

    first_folder = base_dir / nca_folders[0]
    if not first_folder.exists():
        print(f"‚ùå No se encuentra: {first_folder}")
        return

    parquet_file = first_folder / "features_selected_30.parquet"
    if not parquet_file.exists():
        print(f"‚ùå No se encuentra: {parquet_file}")
        return

    # The first folder is only read to learn which subjects exist.
    print(f"\n Leyendo: {parquet_file}")
    subject_ids = pd.read_parquet(parquet_file)['subject_id'].unique()
    print(f"   ‚úì Sujetos encontrados: {len(subject_ids)}")

    mapping = create_subject_mapping(subject_ids, data_path)
    if len(mapping) == 0:
        print("\n No se pudo mapear ning√∫n sujeto")
        return

    hypnogram_cache = load_all_hypnograms(mapping)
    if len(hypnogram_cache) == 0:
        print("\n No se pudo cargar ning√∫n hypnogram")
        return

    # Update every output folder that exists
    print("\n" + rule)
    print(" ACTUALIZANDO CARPETAS NCA")
    print(rule)

    for folder_name in nca_folders:
        folder_path = base_dir / folder_name
        if not folder_path.exists():
            print(f"\n  No existe: {folder_name}")
            continue
        update_nca_folder(folder_path, hypnogram_cache, data_path)

    # Final summary: re-read each result and count valid stages
    print("\n" + rule)
    print(" ACTUALIZACI√ìN COMPLETADA")
    print(rule)

    print("\n Carpetas actualizadas:")
    for folder_name in nca_folders:
        folder_path = base_dir / folder_name
        if not folder_path.exists():
            continue
        parquet = folder_path / "features_selected_30.parquet"
        if not parquet.exists():
            continue

        df_check = pd.read_parquet(parquet)
        valid_count = df_check['sleep_stage'].isin(['W','S1','S2','S3','S4','REM']).sum()
        print(f"   ‚úì {folder_name}")
        print(f"      - √âpocas: {len(df_check):,}")
        print(f"      - V√°lidas: {valid_count:,}")
        print("      - Backup guardado: features_selected_30_BACKUP.parquet")

    usage_hints = (
        "\n Para usar:",
        "   import pandas as pd",
        "   df = pd.read_parquet('selected_features_all/features_selected_30.parquet')",
        "   ",
        "   # Ahora sleep_stage tiene valores correctos: W, S1, S2, S3, S4, REM",
        "   print(df['sleep_stage'].value_counts())",
        "\n Para entrenar:",
        "   # Separar por sujetos (no por √©pocas)",
        "   from sklearn.model_selection import train_test_split",
        "   subjects = df['subject_id'].unique()",
        "   train_subj, test_subj = train_test_split(subjects, test_size=0.2, random_state=42)",
    )
    for hint in usage_hints:
        print(hint)


if __name__ == "__main__":
    main()

In [None]:
"""
Diagnóstico: ¿Por qué solo se etiquetaron 21K de 450K épocas?
"""

import pandas as pd
import numpy as np
from pathlib import Path

def diagnose_dataframe(parquet_file):
    """Print a structural report of a feature table and return it.

    The report covers row and column counts, the number of subjects, a sample
    of the first subject's rows, the average number of rows per subject, and,
    when an ``epoch_idx`` column exists, whether that first subject has more
    than one row for the same epoch.

    Parameters
    ----------
    parquet_file : str or pathlib.Path
        Parquet file to load; it must contain a ``subject_id`` column and at
        least one row.

    Returns
    -------
    pandas.DataFrame
        The loaded table, unchanged.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" DIAGN√ìSTICO DE ESTRUCTURA")
    print(rule)

    df = pd.read_parquet(parquet_file)
    has_epoch_idx = 'epoch_idx' in df.columns

    print("\n Informaci√≥n b√°sica:")
    print(f"   Total de filas: {len(df):,}")
    print(f"   Columnas: {df.columns.tolist()}")

    # Per-subject overview
    print("\nüë§ An√°lisis por sujeto:")
    n_subjects = df['subject_id'].nunique()
    print(f"   Sujetos √∫nicos: {n_subjects}")

    # The first subject in the table serves as the worked example.
    first_subject = df['subject_id'].iloc[0]
    subject_df = df.loc[df['subject_id'] == first_subject]

    print(f"\n Ejemplo - Sujeto: {first_subject}")
    print(f"   Filas de este sujeto: {len(subject_df):,}")

    if has_epoch_idx:
        epochs = subject_df['epoch_idx']
        print(f"   Valores √∫nicos de epoch_idx: {epochs.nunique()}")
        print(f"   Rango epoch_idx: {epochs.min()} - {epochs.max()}")

    # Columns shown in the sample: identifiers that exist + first 3 features
    print("\n Primeras 10 filas del sujeto:")
    reserved = ['subject_id', 'epoch_idx', 'sleep_stage']
    display_cols = ['subject_id']
    display_cols += [name for name in reserved[1:] if name in df.columns]
    feature_cols = [col for col in df.columns if col not in reserved]
    display_cols += feature_cols[:3]

    print(subject_df[display_cols].head(10).to_string())

    # Hypothesis based on the average number of rows per subject
    print("\n HIP√ìTESIS:")
    avg_rows_per_subject = len(df) / n_subjects
    print(f"   Promedio filas por sujeto: {avg_rows_per_subject:.0f}")

    if avg_rows_per_subject > 200:
        for line in (
            "     Esto es MUY ALTO para √©pocas de sue√±o (t√≠pico: 100-200)",
            "   ",
            "   Posibles causas:",
            "   1. Hay filas duplicadas por canal (EOG, EEG Fpz-Cz, EEG Pz-Oz)",
            "   2. Las features est√°n 'pivoteadas' pero epoch_idx se repite",
            "   3. Cada fila es una ventana/segmento, no una √©poca completa",
        ):
            print(line)

    # Duplicate-epoch check on the example subject
    if has_epoch_idx:
        print("\nüîç Verificando duplicados de √©poca:")
        rows_per_epoch = subject_df.groupby('epoch_idx').size()
        if rows_per_epoch.max() > 1:
            print("     √âpocas duplicadas encontradas!")
            print(f"   M√°ximo de filas por epoch_idx: {rows_per_epoch.max()}")
            print("   Ejemplo √©poca duplicada:")
            first_dup = rows_per_epoch[rows_per_epoch > 1].index[0]
            print(subject_df[subject_df['epoch_idx'] == first_dup][display_cols].to_string())

    return df


def propose_solution(df):
    """Suggest how to repair the table, based on duplicated epochs.

    Only the first subject of ``df`` is inspected: if that subject has more
    than one row for some ``epoch_idx`` value, aggregation is proposed.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``subject_id`` column and, optionally, ``epoch_idx``.

    Returns
    -------
    str
        ``"aggregate"`` when duplicated epochs were found, otherwise
        ``"unknown"``.
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" SOLUCI√ìN PROPUESTA")
    print(rule)

    if 'epoch_idx' in df.columns:
        # Probe a single subject: the first one appearing in the table.
        probe_subject = df['subject_id'].iloc[0]
        probe_rows = df.loc[df['subject_id'] == probe_subject]
        rows_per_epoch = probe_rows.groupby('epoch_idx').size()

        if (rows_per_epoch > 1).any():
            for line in (
                "\n‚úÖ SOLUCI√ìN: Agregar/promediar filas duplicadas por √©poca",
                "   ",
                "   Estrategia:",
                "   1. Agrupar por (subject_id, epoch_idx)",
                "   2. Promediar features num√©ricas",
                "   3. Tomar primera sleep_stage",
                "   4. Resultado: 1 fila por √©poca real",
            ):
                print(line)

            return "aggregate"

    for line in (
        "\n  Se necesita m√°s informaci√≥n para proponer soluci√≥n",
        "   Por favor comparte:",
        "   1. df.head(20) del archivo original",
        "   2. Confirmaci√≥n de cu√°ntas √©pocas REALES tiene cada sujeto",
    ):
        print(line)

    return "unknown"


def aggregate_duplicates(df):
    """Collapse repeated ``(subject_id, epoch_idx)`` rows into one row each.

    Numeric feature columns are averaged and ``sleep_stage`` (when present)
    takes the first value of each group. Numeric columns are detected with
    pandas' dtype helpers, so every integer and float width is aggregated.
    Boolean and non-numeric feature columns are left out of the result.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with ``subject_id`` and ``epoch_idx`` columns.

    Returns
    -------
    pandas.DataFrame
        One row per ``(subject_id, epoch_idx)``. ``df`` itself is returned
        unchanged when it has no ``epoch_idx`` column or no rows.
    """
    print("\n" + "="*80)
    print("üîß AGREGANDO DUPLICADOS")
    print("="*80)

    if 'epoch_idx' not in df.columns:
        print("    No hay columna epoch_idx")
        return df

    if len(df) == 0:
        # Nothing to group; also avoids dividing by zero in the report below.
        print("    No hay filas que agregar")
        return df

    # Identify columns
    group_cols = ['subject_id', 'epoch_idx']
    meta_cols = ['sleep_stage']
    feature_cols = [col for col in df.columns
                    if col not in group_cols + meta_cols]

    # Features to average: any numeric dtype except booleans.
    numeric_cols = [
        col for col in feature_cols
        if pd.api.types.is_numeric_dtype(df[col])
        and not pd.api.types.is_bool_dtype(df[col])
    ]

    print(f"   Agrupando por: {group_cols}")
    print(f"   Features a promediar: {len(numeric_cols)}")

    agg_dict = {col: 'mean' for col in numeric_cols}

    # Sleep stage: first occurrence within each group
    if 'sleep_stage' in df.columns:
        agg_dict['sleep_stage'] = 'first'

    df_agg = df.groupby(group_cols, as_index=False).agg(agg_dict)

    print(f"\n   Antes: {len(df):,} filas")
    print(f"    Despu√©s: {len(df_agg):,} filas")
    print(f"   Reducci√≥n: {(1 - len(df_agg)/len(df))*100:.1f}%")

    return df_agg


def main():
    """Diagnose the selected-features table and optionally aggregate duplicates.

    Runs ``diagnose_dataframe`` and ``propose_solution`` on a hard-coded
    Parquet file. When aggregation is proposed, the user is asked on stdin
    whether to apply it; answering ``s`` writes
    ``features_selected_30_AGGREGATED.parquet`` next to the input file.

    Only a failure to read the answer is treated as "automatic mode" (nothing
    is aggregated). Errors raised while aggregating or saving propagate
    instead of being hidden behind that message.

    Returns ``None``. Prints a message and returns early when the input
    file does not exist.
    """

    print("\n" + "="*80)
    print(" DIAGN√ìSTICO: ¬øPor qu√© solo 21K de 450K √©pocas?")
    print("="*80)

    parquet_file = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf\selected_features_all\features_selected_30.parquet"

    if not Path(parquet_file).exists():
        print(f" No se encuentra: {parquet_file}")
        return

    # Diagnosis
    df = diagnose_dataframe(parquet_file)

    # Proposed solution
    solution_type = propose_solution(df)

    if solution_type == "aggregate":
        print("\n" + "="*80)
        print("¬øQuieres agregar duplicados ahora? (s/n)")
        print("="*80)

        # Keep the try body minimal: only the prompt may fail "quietly".
        # NOTE(review): NotImplementedError is included for front ends that
        # refuse stdin requests - confirm against the notebook runtime in use.
        try:
            response = input().strip().lower()
        except (EOFError, OSError, NotImplementedError):
            print("\n   Modo autom√°tico, no se agreg√≥")
            response = None

        if response == 's':
            # Aggregate
            df_fixed = aggregate_duplicates(df)

            # Save next to the input file
            output_file = Path(parquet_file).parent / "features_selected_30_AGGREGATED.parquet"
            df_fixed.to_parquet(output_file, index=False)

            print(f"\n Guardado: {output_file.name}")
            print(f"   Shape: {df_fixed.shape}")
            print(f"   Ahora tienes 1 fila por √©poca real")

            # Labels have to be re-assigned on the aggregated file
            print(f" Ahora ejecuta el script de fix de labels nuevamente")
            print(f"   pero usando este archivo agregado")

    print("\n" + "="*80)
    print(" SIGUIENTE PASO")
    print("="*80)
    print(f"Comparte la salida de este diagn√≥stico")
    print(f"para confirmar la soluci√≥n correcta.")


if __name__ == "__main__":
    main()

In [None]:
"""
Limpieza final: Remueve NaN y verifica que todo esté correcto
"""

import pandas as pd
import numpy as np
from pathlib import Path

def analyze_and_clean_folder(folder_path):
    """Report on the feature table of one output folder and return it.

    Despite its name the function removes nothing: it prints the table shape,
    the stage distribution, the NaN statistics of the feature columns and,
    when the table has no ``epoch_idx`` column, adds one numbered
    sequentially within each subject.

    Stage values are listed in the order of their string form, so a column
    that mixes numeric placeholders (such as ``-1``) with stage names can be
    reported without a comparison error.

    Parameters
    ----------
    folder_path : pathlib.Path
        Folder expected to contain ``features_selected_30.parquet``.

    Returns
    -------
    pandas.DataFrame or None
        The loaded table (with ``epoch_idx`` guaranteed), or ``None`` when
        the Parquet file does not exist.
    """
    print("\n" + "="*80)
    print(f" ANALIZANDO: {folder_path.name}")
    print("="*80)

    parquet_file = folder_path / "features_selected_30.parquet"

    if not parquet_file.exists():
        print(f"    No existe: {parquet_file.name}")
        return None

    # Load
    df = pd.read_parquet(parquet_file)

    print(f"\nüìä Estado actual:")
    print(f"   Shape: {df.shape}")
    print(f"   Sujetos: {df['subject_id'].nunique()}")

    # Sleep stage overview
    print(f"\n Sleep stages:")
    if 'sleep_stage' in df.columns:
        stage_counts = df['sleep_stage'].value_counts()
        print(f"   Valores √∫nicos: {df['sleep_stage'].nunique()}")
        # Sort on the string form: values may mix ints (-1) and strings.
        for stage, count in sorted(stage_counts.items(), key=lambda item: str(item[0])):
            print(f"      {stage}: {count:,}")

        # Placeholder / missing labels
        invalid = df['sleep_stage'].isin([-1, 'Unknown', None, np.nan]).sum()
        if invalid > 0:
            print(f"     Inv√°lidas: {invalid}")

    # NaN statistics over the feature columns
    print(f"\n An√°lisis de NaN:")
    feature_cols = [col for col in df.columns
                    if col not in ['subject_id', 'epoch_idx', 'sleep_stage']]

    nan_per_col = df[feature_cols].isna().sum()
    cols_with_nan = nan_per_col[nan_per_col > 0]

    if len(cols_with_nan) > 0:
        print(f"     Columnas con NaN: {len(cols_with_nan)} / {len(feature_cols)}")
        print(f"   Top 10 columnas con m√°s NaN:")
        for col, count in cols_with_nan.nlargest(10).items():
            pct = (count / len(df)) * 100
            print(f"      {col[:50]:50s}: {count:6,} ({pct:5.1f}%)")

        # NaN per row
        nan_per_row = df[feature_cols].isna().sum(axis=1)
        rows_with_nan = (nan_per_row > 0).sum()
        print(f"\n   Filas con alg√∫n NaN: {rows_with_nan:,} / {len(df):,}")
    else:
        print(f"    No hay NaN")

    # epoch_idx check
    print(f"\n Verificar epoch_idx:")
    if 'epoch_idx' in df.columns:
        print(f"    Existe")
    else:
        print(f"     No existe - creando...")
        # Sequential epoch number within each subject, in current row order
        df['epoch_idx'] = df.groupby('subject_id').cumcount()
        print(f"    Creado")

    return df


def clean_dataframe(df):
    """Drop unusable rows and fill the remaining gaps of a feature table.

    Three steps:

    1. keep only rows whose ``sleep_stage`` is W, S1-S4 or REM;
    2. drop rows in which more than 50% of the feature columns are NaN
       (a row with exactly 50% is kept, matching the ">50%" message);
    3. fill the NaN left in each numeric feature column with that column's
       mean over the remaining rows. Non-numeric columns are left untouched.

    Feature columns are all columns other than ``subject_id``, ``epoch_idx``
    and ``sleep_stage``.

    Parameters
    ----------
    df : pandas.DataFrame
        Table with a ``sleep_stage`` column. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Cleaned copy of the table.
    """
    print("\n" + "="*80)
    print(" LIMPIEZA")
    print("="*80)

    original_len = len(df)

    # 1. Remove rows with an invalid sleep_stage
    valid_stages = ['W', 'S1', 'S2', 'S3', 'S4', 'REM']
    df = df[df['sleep_stage'].isin(valid_stages)].copy()

    removed_invalid = original_len - len(df)
    if removed_invalid > 0:
        print(f"   ‚úì Removidas {removed_invalid:,} filas con sleep_stage inv√°lida")

    # 2. Remove rows with too many NaN
    feature_cols = [col for col in df.columns
                    if col not in ['subject_id', 'epoch_idx', 'sleep_stage']]

    nan_per_row = df[feature_cols].isna().sum(axis=1)

    # "More than 50%" means strictly above the threshold; using <= also keeps
    # every row when there are no feature columns at all (0 <= 0).
    threshold = len(feature_cols) * 0.5
    mask_valid = nan_per_row <= threshold

    df_clean = df[mask_valid].copy()

    removed_nan = len(df) - len(df_clean)
    if removed_nan > 0:
        print(f"   ‚úì Removidas {removed_nan:,} filas con >50% NaN")

    # 3. Fill remaining NaN with the column mean (numeric columns only;
    #    a mean is undefined for text columns)
    for col in feature_cols:
        if not pd.api.types.is_numeric_dtype(df_clean[col]):
            continue
        if df_clean[col].isna().sum() > 0:
            mean_val = df_clean[col].mean()
            df_clean[col] = df_clean[col].fillna(mean_val)

    removed_total = removed_invalid + removed_nan
    # Guard the percentage against an empty input table.
    removed_pct = removed_total / original_len * 100 if original_len else 0.0

    print(f"\n    DataFrame limpio:")
    print(f"      Filas: {original_len:,} ‚Üí {len(df_clean):,}")
    print(f"      Reducci√≥n: {removed_total:,} ({removed_pct:.1f}%)")

    return df_clean


def save_cleaned(df, folder_path):
    """Write the cleaned table into ``folder_path`` and print a summary.

    If ``features_selected_30.parquet`` already exists it is first re-saved
    as ``features_selected_30_BEFORE_CLEAN.parquet``. The cleaned table then
    overwrites the Parquet file, is pickled with gzip compression, and is
    described in ``info_FINAL.txt``.

    Parameters
    ----------
    df : pandas.DataFrame
        Cleaned table; must contain ``subject_id`` and ``sleep_stage``.
    folder_path : pathlib.Path
        Destination folder.

    Returns
    -------
    None
    """
    rule = "=" * 80
    print("\n" + rule)
    print(" GUARDANDO VERSI√ìN LIMPIA")
    print(rule)

    parquet_file = folder_path / "features_selected_30.parquet"
    backup_file = folder_path / "features_selected_30_BEFORE_CLEAN.parquet"

    # Preserve whatever is currently on disk before overwriting it.
    if parquet_file.exists():
        previous = pd.read_parquet(parquet_file)
        previous.to_parquet(backup_file, index=False)
        print(f"   ‚úì Backup: {backup_file.name}")

    # Cleaned Parquet
    df.to_parquet(parquet_file, index=False)
    megabytes = parquet_file.stat().st_size / (1024*1024)
    print(f"   ‚úì Actualizado: {parquet_file.name} ({megabytes:.1f} MB)")

    # Cleaned pickle
    pkl_file = folder_path / "features_selected_30.pkl"
    df.to_pickle(pkl_file, compression='gzip')
    print(f"   ‚úì Actualizado: {pkl_file.name}")

    # Text report
    reserved = ['subject_id', 'epoch_idx', 'sleep_stage']
    n_rows = len(df)
    info_file = folder_path / "info_FINAL.txt"
    with open(info_file, 'w', encoding='utf-8') as report:
        report.write(rule + "\n")
        report.write("ARCHIVO FINAL - LIMPIO Y LISTO\n")
        report.write(rule + "\n\n")

        n_features = len([c for c in df.columns if c not in reserved])
        report.write(f"Shape: {df.shape}\n")
        report.write(f"√âpocas: {n_rows:,}\n")
        report.write(f"Features: {n_features}\n")
        report.write(f"Sujetos: {df['subject_id'].nunique()}\n\n")

        report.write("Distribuci√≥n de Sleep Stages:\n")
        stage_counts = df['sleep_stage'].value_counts()
        for stage, count in sorted(stage_counts.items()):
            share = (count / n_rows) * 100
            report.write(f"  {stage:5s}: {count:7,} ({share:5.1f}%)\n")

        # Only the first 20 subjects (sorted) are listed.
        report.write("\nSujetos (primeros 20):\n")
        for subject in sorted(df['subject_id'].unique())[:20]:
            subject_rows = (df['subject_id'] == subject).sum()
            report.write(f"  {subject}: {subject_rows:,} √©pocas\n")

        epoch_mark = '‚úì' if 'epoch_idx' in df.columns else '‚úó'
        report.write("\nCalidad de datos:\n")
        report.write("  - Sin NaN: ‚úì\n")
        report.write("  - Sleep stages v√°lidas: ‚úì\n")
        report.write(f"  - epoch_idx presente: {epoch_mark}\n")

    print(f"   ‚úì Info: {info_file.name}")

    # Console summary
    n_subjects = df['subject_id'].nunique()
    print("\n RESUMEN FINAL:")
    print(f"   √âpocas totales: {n_rows:,}")
    print(f"   Sujetos: {n_subjects}")
    print(f"   Promedio √©pocas/sujeto: {n_rows / n_subjects:.1f}")
    print("\n   Distribuci√≥n sleep stages:")
    for stage, count in sorted(df['sleep_stage'].value_counts().items()):
        share = (count / n_rows) * 100
        print(f"      {stage:5s}: {count:6,} ({share:5.1f}%)")


def main(base_dir=None):
    """Clean every selected-feature folder under ``base_dir`` and print usage notes.

    For each expected feature folder the dataset is loaded with
    ``analyze_and_clean_folder``, passed through ``clean_dataframe`` and
    written back with ``save_cleaned``. A folder is skipped when it does not
    exist or when ``analyze_and_clean_folder`` returns ``None``. After the
    loop, a short how-to (loading the parquet file and a subject-wise
    train/test split) is printed to stdout.

    Parameters
    ----------
    base_dir : str or pathlib.Path, optional
        Directory containing the ``selected_features_*`` folders. When
        omitted, the original hard-coded project location is used, so
        existing ``main()`` calls keep their behavior.

    Returns
    -------
    None
    """

    print("\n" + "="*80)
    print(" LIMPIEZA FINAL - Remover NaN y Verificar")
    print("="*80)

    # The default is the author's local checkout; callers on another machine
    # (or another dataset copy) pass their own location instead.
    if base_dir is None:
        base_dir = r"C:\Users\Alfredo Sempertegui\Documents\Proyecto IC\sleep_edf"
    base_dir = Path(base_dir)

    folders = [
        "selected_features_all",
        "selected_features_eog_only",
        "selected_features_eeg_only",
    ]

    for folder_name in folders:
        folder_path = base_dir / folder_name

        if not folder_path.exists():
            print(f"\n  No existe: {folder_name}")
            continue

        # Load / inspect; None signals there is nothing usable in the folder.
        df = analyze_and_clean_folder(folder_path)
        if df is None:
            continue

        # Clean, then overwrite the stored files with the cleaned data.
        df_clean = clean_dataframe(df)
        save_cleaned(df_clean, folder_path)

    # Final summary
    print("\n" + "="*80)
    print(" PROCESO COMPLETADO")
    print("="*80)

    print("\n Archivos actualizados en:")
    print(f"   {base_dir}")

    # Copy-paste snippet shown to the user. These are plain strings (not
    # f-strings), so the braces are emitted literally as part of the snippet.
    usage_lines = (
        "\n Para usar:",
        "   import pandas as pd",
        "   ",
        "   # Cargar dataset",
        "   df = pd.read_parquet('selected_features_all/features_selected_30.parquet')",
        "   ",
        "   # Verificar",
        "   print(f'Shape: {df.shape}')",
        "   print(f'Sujetos: {df[\"subject_id\"].nunique()}')",
        "   print(df['sleep_stage'].value_counts())",
        "   print(f'NaN: {df.isna().sum().sum()}')",
        "\n Para entrenar:",
        "   from sklearn.model_selection import train_test_split",
        "   ",
        "   # Separar por SUJETOS",
        "   subjects = df['subject_id'].unique()",
        "   train_subj, test_subj = train_test_split(subjects, test_size=0.2, random_state=42)",
        "   ",
        "   train_df = df[df['subject_id'].isin(train_subj)]",
        "   test_df = df[df['subject_id'].isin(test_subj)]",
        "   ",
        "   X_train = train_df.drop(['subject_id', 'sleep_stage', 'epoch_idx'], axis=1)",
        "   y_train = train_df['sleep_stage']",
    )
    for line in usage_lines:
        print(line)


# Entry point: run the cleanup only when this code is executed directly,
# not when it is imported as a module.
if __name__ == "__main__":
    main()