In [1]:
import os

import h5py
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from tsfresh import extract_features, select_features
from tsfresh.feature_extraction import MinimalFCParameters
from tsfresh.utilities.dataframe_functions import impute


In [None]:
train_file, test_file = "train.h5", "test.h5"

# Training file: signals are stored under "x" and labels under "y".
with h5py.File(train_file, mode="r") as h5_train:
    X_train, y_train = h5_train["x"][:], h5_train["y"][:]

# Test file: only the signals under "x" are read.
with h5py.File(test_file, mode="r") as h5_test:
    X_test = h5_test["x"][:]

# Quick look at what was loaded: shapes and dtypes first, then a few raw values.
print(X_train.shape, X_train.dtype)
print(y_train.shape, y_train.dtype)
print(X_train[0, 0, :5])  # first 5 samples at position [0, 0] (first record, first channel)
print(y_train[:5])        # first 5 labels

In [None]:
# Wide-format frame built from index 0 of the second axis of X_train
# (the first channel, per this notebook's comments), plus the labels as a Series.
x_h5_train_df = pd.DataFrame(data=X_train[:, 0, :])
y_h5_train_df = pd.Series(data=y_train)

def parsear_times_series(dataset):
    """Convert a collection of 1-D series into one long-format DataFrame.

    Parameters
    ----------
    dataset : iterable of array-like
        One 1-D sequence per time series (for example the rows of a 2-D
        array). Series may have different lengths.

    Returns
    -------
    pandas.DataFrame
        Columns ``id`` (position of the series within ``dataset``), ``time``
        (0-based sample position within its series) and ``value`` (the
        sample). As in a per-series concatenation, the row index restarts at
        0 for every series. An empty ``dataset`` yields an empty frame with
        the same three columns instead of raising.
    """
    series_list = [np.asarray(series) for series in dataset]

    if not series_list:
        # Concatenating zero pieces raises, so build the empty result explicitly.
        return pd.DataFrame({
            'id': np.arange(0),
            'time': np.arange(0),
            'value': np.empty(0),
        })

    lengths = [len(series) for series in series_list]

    # Assemble each column once from flat arrays instead of creating one
    # DataFrame per series and concatenating them afterwards.
    ids = np.repeat(np.arange(len(series_list)), lengths)
    times = np.concatenate([np.arange(length) for length in lengths])
    values = np.concatenate(series_list)

    # The index repeats the per-series positions, matching what a concat of
    # per-series frames would produce.
    return pd.DataFrame(
        {'id': ids, 'time': times, 'value': values},
        index=times.copy(),
    )

# Long-format frames for train and test, both built from the first channel ([:, 0, :]).
long_train_df, long_test_df = (
    parsear_times_series(split[:, 0, :]) for split in (X_train, X_test)
)

# Training labels wrapped in a Series (default integer index).
y_train_series = pd.Series(data=y_train)

In [None]:
def procesar_en_bloques(long_df, y_labels=None, bloque_size=5, dataset_name="train",
                        output_dir="features_chunks"):
    """Extract tsfresh features chunk by chunk and combine them into one matrix.

    The series in ``long_df`` are processed ``bloque_size`` ids at a time.
    Each chunk's raw features are written to a CSV file in ``output_dir``;
    the chunk files are then read back, concatenated, imputed and, when
    labels are given, filtered with ``select_features``. The combined matrix
    is also written to ``output_dir``.

    Parameters
    ----------
    long_df : pandas.DataFrame
        Long-format data with columns ``id``, ``time`` and the value column(s).
    y_labels : pandas.Series or numpy.ndarray, optional
        Target passed to ``select_features``. When ``None`` no feature
        selection is performed.
    bloque_size : int, default 5
        Number of distinct ids processed per chunk. Must be >= 1.
    dataset_name : str, default "train"
        Prefix used in the names of the CSV files.
    output_dir : str, default "features_chunks"
        Directory that receives the chunk CSVs and the combined CSV.

    Returns
    -------
    pandas.DataFrame
        One row per id and one column per (selected) feature.

    Raises
    ------
    ValueError
        If ``bloque_size`` is smaller than 1 or ``long_df`` has no ids.
    """
    if bloque_size < 1:
        raise ValueError("bloque_size must be a positive integer")

    # Chunk over the ids actually present. Using range(start, end) as id
    # values is only correct when the ids happen to be exactly 0..n-1.
    unique_ids = np.sort(long_df['id'].unique())
    n_ids = len(unique_ids)
    if n_ids == 0:
        raise ValueError("long_df contains no time series (column 'id' is empty)")

    os.makedirs(output_dir, exist_ok=True)
    chunks_files = []

    fc_parameters = MinimalFCParameters()  # small, fast feature set

    for start in range(0, n_ids, bloque_size):
        end = min(start + bloque_size, n_ids)
        bloque_ids = unique_ids[start:end]
        bloque_df = long_df[long_df['id'].isin(bloque_ids)]

        # Extract the features for this chunk only.
        features_bloque = extract_features(
            bloque_df,
            column_id='id',
            column_sort='time',
            default_fc_parameters=fc_parameters
        )

        # Temporary CSV with the raw (not yet imputed) features of the chunk.
        file_path = f"{output_dir}/{dataset_name}_features_{start}_{end}.csv"
        features_bloque.to_csv(file_path)
        chunks_files.append(file_path)
        print(f"Bloque {start}-{end} procesado y guardado en {file_path}")

    # Combine all chunk CSVs into a single feature matrix.
    features_final = pd.concat([pd.read_csv(f, index_col=0) for f in chunks_files])

    # Impute once on the combined matrix (in place). Imputing chunk by chunk
    # would make the fill values depend on bloque_size and on which rows
    # happen to share a chunk.
    impute(features_final)

    # Feature selection only when labels are available.
    if y_labels is not None:
        features_final = select_features(features_final, y_labels)

    # Persist the combined matrix.
    final_file = f"{output_dir}/{dataset_name}_features_final.csv"
    features_final.to_csv(final_file)
    print(f"Features combinadas guardadas en {final_file}")

    return features_final

In [None]:
# Extract features for both splits. Only the training call passes labels,
# so only the training features go through feature selection.
X_train_features = procesar_en_bloques(
    long_train_df,
    y_labels=y_train_series,
    bloque_size=20,
    dataset_name="train",
)
X_test_features = procesar_en_bloques(
    long_test_df,
    y_labels=None,
    bloque_size=20,
    dataset_name="test",
)

# Restrict the test features to the training columns, in the same order.
X_test_features = X_test_features.loc[:, X_train_features.columns]

In [None]:
# Candidate estimators, each paired with the hyper-parameter grid to search.
models = {
    'LogisticRegression': (
        LogisticRegression(max_iter=1000, solver='liblinear'),
        {'C': [0.1, 1, 10], 'penalty': ['l1', 'l2']}
    ),
    'SVM': (
        SVC(),
        {'C': [0.1, 1, 10], 'kernel': ['linear', 'rbf']}
    )
}

# Best fitted pipeline (scaler + estimator) per model name.
best_models = {}

for name, (model, params) in models.items():
    # Neither SVC nor regularised logistic regression is scale invariant, and
    # the extracted features live on very different scales. Standardise inside
    # a pipeline so the scaler is fitted on the training part of each CV fold
    # only and is re-applied automatically at prediction time.
    pipeline = Pipeline([('scaler', StandardScaler()), ('clf', model)])
    pipeline_params = {f'clf__{key}': values for key, values in params.items()}

    grid = GridSearchCV(pipeline, pipeline_params, cv=5, scoring='f1', n_jobs=-1)
    grid.fit(X_train_features, y_train_series)
    best_models[name] = grid.best_estimator_

    # Report the parameters under their plain estimator names (drop the 'clf__' prefix).
    best_params = {key.split('__', 1)[1]: value for key, value in grid.best_params_.items()}
    print(f"Mejor modelo {name}: {best_params}")

In [None]:
# Predict on the test features with every tuned model; no true test labels
# are used here, so the predictions are only displayed.
for name, model in best_models.items():
    y_pred = model.predict(X_test_features)
    # Header line followed by the first 20 predictions.
    print(f"\nPredicciones {name}:", y_pred[:20], sep="\n")