In [None]:
import mne
import numpy as np
import pandas as pd
from pathlib import Path
from scipy import signal
from scipy.stats import entropy
import pywt
from joblib import Parallel, delayed
import warnings
warnings.filterwarnings('ignore')

# =================== HÀM TÍNH FEATURE ===================

def compute_band_power(data, sfreq):
    freqs, psd = signal.welch(data, sfreq, nperseg=min(256, data.shape[-1]))
    bands = {'delta': (0.5,4), 'theta': (4,8), 'alpha': (8,13),
             'beta': (13,30), 'gamma': (30,45)}
    power_features = {}
    for band, (low, high) in bands.items():
        idx = np.logical_and(freqs >= low, freqs <= high)
        power_features[band] = np.trapz(psd[:, idx], freqs[idx], axis=1)
    return power_features

def compute_spectral_entropy(data, sfreq):
    freqs, psd = signal.welch(data, sfreq, nperseg=min(256, data.shape[-1]))
    psd_norm = psd / np.sum(psd, axis=1, keepdims=True)
    return -np.sum(psd_norm * np.log2(psd_norm + 1e-10), axis=1)

def compute_wavelet_features(data, wavelet='db4', level=3):
    n_channels = data.shape[0]
    features = {'coeffs_mean': [], 'coeffs_energy': []}
    for ch in range(n_channels):
        coeffs = pywt.wavedec(data[ch], wavelet, level=level)
        features['coeffs_mean'].append([np.mean(np.abs(c)) for c in coeffs])
        features['coeffs_energy'].append([np.sum(c**2) for c in coeffs])
    for k in features:
        features[k] = np.array(features[k])
    return features

def combine_features(bp, spec_ent, wave_feats, ch_names):
    all_feats = []
    for i, ch in enumerate(ch_names):
        ch_feats = []
        for band in bp:
            ch_feats.append(bp[band][i])
        ch_feats.append(spec_ent[i])
        for lvl in range(wave_feats['coeffs_mean'].shape[1]):
            ch_feats.append(wave_feats['coeffs_mean'][i][lvl])
            ch_feats.append(wave_feats['coeffs_energy'][i][lvl])
        all_feats.extend(ch_feats)
    return all_feats

# =================== XỬ LÝ 1 SUBJECT ===================

def process_subject(preproc_file):
    subject = preproc_file.parent.parent.name 
    
    # --- Tìm file events của subject ---
    events_file = Path(f"data/EEG/{subject}/eeg/{subject}_task-oddball_events.tsv")
    
    if not events_file.exists():
        print(f"Không tìm thấy events cho {subject}")
        return []

    # ---- Load events ----
    events_tsv = pd.read_csv(events_file, sep="\t")

    raw = mne.io.read_raw_fif(preproc_file, preload=True, verbose=False)
    sfreq = raw.info['sfreq']
    ch_names = raw.ch_names

    results = []

    for idx, row in events_tsv.iterrows():
        onset_sample = int(row["onset"])
        label = row["event_type"] 

        segment_samples = int(1.0 * sfreq)
        data = raw.get_data(start=onset_sample, stop=onset_sample + segment_samples)


        bp = compute_band_power(data, sfreq)
        spec_ent = compute_spectral_entropy(data, sfreq)
        wave_feats = compute_wavelet_features(data)
        feats = combine_features(bp, spec_ent, wave_feats, ch_names)

        results.append([subject, preproc_file.name, idx, label] + feats)

    return results

# =================== BATCH PROCESS ===================

def extract_features_batch(derivatives_path, output_file='eeg_features.csv', n_jobs=-1):
    derivatives_path = Path(derivatives_path)
    files = list(derivatives_path.rglob('*_desc-preproc_eeg.fif'))
    print(f"Tìm thấy {len(files)} file EEG")

    # Song song hóa
    all_results = Parallel(n_jobs=n_jobs)(
        delayed(process_subject)(f) for f in files
    )

    flat = [item for sublist in all_results for item in sublist]

    df = pd.DataFrame(flat)
    df = df.rename(columns={0: "subject", 1: "file", 2: "trial", 3: "label"})
    feature_cols = [f"f{i}" for i in range(df.shape[1] - 4)]
    df.columns = ["subject", "file", "trial", "label"] + feature_cols

    df.to_csv(output_file, index=False)
    print("DONE →", output_file, "shape:", df.shape)

    return df

# =================== RUN ===================

if __name__ == "__main__":
    extract_features_batch(
        "data/EEG/derivatives/preprocessing",
        output_file="optimized_eeg_features.csv",
        n_jobs=-1
    )


Tìm thấy 42 file EEG
DONE → optimized_eeg_features.csv shape: (13826, 1782)


In [12]:
import pandas as pd
tsv = pd.read_csv("eeg_features.csv")


In [13]:
tsv.head()

Unnamed: 0,subject,file,trial,label,f0,f1,f2,f3,f4,f5,...,f1768,f1769,f1770,f1771,f1772,f1773,f1774,f1775,f1776,f1777
0,sub-01,sub-01_task-oddball_desc-preproc_eeg.fif,0,S 5,0.0,0.0,0.0,2.810764e-11,5.100409e-12,1.207314,...,2.839575e-12,1.26083,0.000101,2.561104e-06,9.135965e-07,1.814907e-10,6.672152e-08,4.236073e-12,2.90108e-09,1.476699e-14
1,sub-01,sub-01_task-oddball_desc-preproc_eeg.fif,1,S 5,0.0,0.0,0.0,1.188307e-11,5.867867e-12,1.148442,...,4.879814e-12,1.186929,7.2e-05,1.774518e-06,1.054273e-06,2.410315e-10,7.116892e-08,2.195702e-12,3.421914e-09,1.024458e-14
2,sub-01,sub-01_task-oddball_desc-preproc_eeg.fif,2,S 5,0.0,0.0,0.0,1.324512e-11,8.52066e-12,3.39278,...,3.554239e-12,3.29215,1.2e-05,2.908802e-08,1.034735e-06,2.683197e-10,8.124062e-08,8.471733e-12,3.411116e-09,2.55e-14
3,sub-01,sub-01_task-oddball_desc-preproc_eeg.fif,3,S 5,0.0,0.0,0.0,1.258177e-11,8.381647e-12,1.492794,...,3.724248e-12,1.537859,5.9e-05,1.368634e-06,9.047409e-07,1.643513e-10,7.40381e-08,8.404831e-12,3.283984e-09,3.630522e-14
4,sub-01,sub-01_task-oddball_desc-preproc_eeg.fif,4,S 7,0.0,0.0,0.0,3.355817e-11,7.498204e-12,1.44407,...,2.766799e-12,1.493626,0.000133,4.057098e-06,7.838209e-07,1.218728e-10,5.154912e-08,1.252141e-12,2.464448e-09,6.117372e-15


In [6]:
import pandas as pd
tsv = pd.read_csv("data/EEG/sub-01/eeg/sub-01_task-oddball_events.tsv", sep="\t")
tsv["onset"].max()


1308857

In [7]:
raw = mne.io.read_raw_fif("data/EEG/derivatives/preprocessing/sub-01/eeg/sub-01_task-oddball_desc-preproc_eeg.fif", preload=True)
raw.n_times


Opening raw data file data/EEG/derivatives/preprocessing/sub-01/eeg/sub-01_task-oddball_desc-preproc_eeg.fif...
    Range : 0 ... 1371319 =      0.000 ...  1371.319 secs
Ready.
Reading 0 ... 1371319  =      0.000 ...  1371.319 secs...


1371320