# Dependencies

In [11]:
import numpy as np
import pandas as pd
import scipy.io
import matplotlib.pyplot as plt
from scipy import signal
import librosa as lr
import librosa.feature as lrf
from scipy.signal import welch
import pywt
from pywt import *
from scipy.signal import periodogram
#from pyemd import emd
from scipy.signal import hilbert
from scipy.stats import linregress, skew, kurtosis
from scipy.fft import fft, fftfreq

import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)
warnings.simplefilter(action='ignore', category=UserWarning)
warnings.simplefilter(action='ignore', category=RuntimeWarning)

import nolds
from pyentrp import entropy as ent
from scipy.signal import detrend
from nolds import dfa

# Feature Extraction

In [12]:
def hjorth_parameters(eeg_signal):
    """Compute the three Hjorth descriptors of a 1-D signal.

    Definitions used (x = signal, x' and x'' = first and second discrete
    differences obtained with ``np.diff``):

        activity   = var(x)
        mobility   = sqrt(var(x') / var(x))
        complexity = mobility(x') / mobility(x)
                   = sqrt(var(x'') / var(x')) / mobility

    Args:
        eeg_signal: 1-D array-like of samples. It needs at least three samples
            for the second difference to be non-empty.

    Returns:
        Tuple ``(activity, mobility, complexity)``.

    Note:
        A constant signal has zero variance, so the ratios evaluate to NaN
        (NumPy emits a RuntimeWarning rather than raising).
    """
    first_deriv = np.diff(eeg_signal)
    second_deriv = np.diff(first_deriv)

    # Activity is simply the signal power around its mean.
    activity = np.var(eeg_signal)
    var_first = np.var(first_deriv)
    var_second = np.var(second_deriv)

    # Mobility: square root of the variance ratio derivative / signal.
    mobility = np.sqrt(var_first / activity)

    # Complexity: mobility of the derivative relative to mobility of the signal.
    complexity = np.sqrt(var_second / var_first) / mobility

    return activity, mobility, complexity

In [13]:
# Sampling rate handed to librosa (as `sr`) and to Welch's PSD (as `fs`).
sr = 128
sampling_frequency = 128

# How many coefficients / rows of each librosa feature are kept.
n_mfcc, n_chr, n_mel, n_tonnetz = 10, 20, 15, 15

# Band limits as (low, high) pairs. The insertion order matters: the feature
# extractor walks this mapping in reverse when computing band powers.
frequency_bands = dict(
    gamma=(30, 64),
    beta=(13, 30),
    alpha=(8, 13),
    theta=(4, 8),
    delta=(1, 4),
)

# Number of sample points (three times the sampling rate)
N = 3 * sr

# Spacing between consecutive samples
T = 1.0 / sr

In [14]:
def feature_extraction(signal):
    """Build a flat dictionary of scalar features for one 1-D signal.

    Relies on the module-level settings ``sr``, ``sampling_frequency``, ``T``,
    ``n_mfcc``, ``n_chr``, ``n_mel``, ``n_tonnetz`` and ``frequency_bands``.

    Args:
        signal: 1-D numeric array with the samples of a single channel.
            The parameter shadows the ``scipy.signal`` module imported as
            ``signal`` at file level, which is why peak finding below is
            reached through the fully qualified ``scipy.signal.find_peaks``.

    Returns:
        dict mapping feature name to value. Keys are inserted in this order
        (the order becomes the column order of the resulting DataFrame):
        ``spc_cnt``, ``spc_roff``, ``zc``, ``mfcc_*``, ``chr_*``, ``mel_*``,
        ``<band>_power`` (delta to gamma), ``<band>_<other band>`` power
        ratios, ``mean_abs_sec_dif``, ``dfa``, ``slope``, ``skew``,
        ``kurtosis``, ``activity``, ``mobility``, ``complexity``, ``rms``,
        ``ton_*``.
    """
    feature_vector = {}

    # --- Timbral texture features -------------------------------------------
    # librosa returns (rows, frames) arrays; only the first frame is kept,
    # hence the trailing [0] indices.
    feature_vector['spc_cnt'] = lrf.spectral_centroid(y=signal, sr=sr)[0][0]  # spectral centroid
    feature_vector['spc_roff'] = lrf.spectral_rolloff(y=signal, sr=sr)[0][0]  # spectral roll-off
    # Zero-crossing rate: each sign flip contributes |diff(sign)| == 2.
    feature_vector['zc'] = np.array(np.sum(np.abs(np.diff(np.sign(signal)))) / (2 * len(signal)))

    # First frame of each of the n_mfcc MFCCs
    for idx, mfcc in enumerate(lrf.mfcc(y=signal, n_mfcc=n_mfcc, sr=sr)):
        feature_vector['mfcc_' + str(idx)] = mfcc[0]

    # First frame of each chroma bin
    for idx, chroma in enumerate(lrf.chroma_stft(y=signal, n_chroma=n_chr, sr=sr)):
        feature_vector['chr_' + str(idx)] = chroma[0]

    # First frame of the first n_mel mel bands, in dB
    for idx, mel in enumerate(lr.power_to_db(lrf.melspectrogram(y=signal, sr=sr))[:n_mel, :]):
        feature_vector['mel_' + str(idx)] = mel[0]

    # --- Band powers from Welch's PSD ---------------------------------------
    # For inputs shorter than nperseg, SciPy falls back to the input length
    # (and emits a UserWarning).
    frequencies, psd = welch(signal, fs=sampling_frequency, nperseg=1024)

    band_powers = {}
    # Iterations are reversed due to performance differences in certain models.
    # TODO: reversing process should be improved, way too clunky rn.
    for band, (low_freq, high_freq) in reversed(frequency_bands.items()):
        # Bins inside the half-open range [low_freq, high_freq)
        band_indices = np.where((frequencies >= low_freq) & (frequencies < high_freq))
        # Integrate the PSD over the band to get its power
        band_power = np.trapz(psd[band_indices], frequencies[band_indices])
        band_powers[band] = band_power
        feature_vector[band + '_power'] = band_power

    # Power ratio for every unordered pair of bands, higher band first
    # (with the default table: gamma_beta, gamma_alpha, ..., theta_delta).
    band_names = list(band_powers)[::-1]
    for pos, band in enumerate(band_names):
        for child_band in band_names[pos + 1:]:
            feature_vector[band + '_' + child_band] = band_powers[band] / band_powers[child_band]

    # --- Time-domain statistics ---------------------------------------------
    # NOTE: despite the key name, this is the mean absolute *first* difference.
    first_differences = np.diff(signal, n=1)
    feature_vector['mean_abs_sec_dif'] = np.mean(np.abs(first_differences))

    # Detrended fluctuation analysis (nolds), using non-overlapping windows.
    feature_vector['dfa'] = dfa(signal, overlap=False)

    # --- Slope of the dominant spectral peaks -------------------------------
    # Use the actual signal length so the amplitude scaling and the frequency
    # axis stay aligned with the FFT bins for any input length.
    n_samples = len(signal)
    yf = 2.0 / n_samples * np.abs(fft(signal)[:n_samples // 2])
    # np.clip returns a new array; the result must be assigned to take effect.
    yf = np.clip(yf, 0, 15)
    # Min-max normalise to [0, 1]
    yf = (yf - np.min(yf)) / (np.max(yf) - np.min(yf))
    # First pass finds every positive peak; second pass keeps only peaks at
    # least 25% as high as the tallest one.
    all_peaks, _ = scipy.signal.find_peaks(yf, height=0)
    peaks, _ = scipy.signal.find_peaks(yf, height=np.max(yf[all_peaks]) * 0.25)

    xf = fftfreq(n_samples, T)[:n_samples // 2]

    # frequency of the maximum peak
    #feature_vector['peak_freq'] = xf[yf == np.max(yf[peaks])]

    # maximum frequency of peaks
    #feature_vector['max_freq'] = xf[peaks][len(xf[peaks]) - 1]

    # Linear regression of peak height against peak frequency
    res = linregress(xf[peaks], yf[peaks])
    feature_vector['slope'] = res.slope

    feature_vector['skew'] = skew(signal)
    feature_vector['kurtosis'] = kurtosis(signal)

    activity, mobility, complexity = hjorth_parameters(signal)
    feature_vector['activity'] = activity
    feature_vector['mobility'] = mobility
    feature_vector['complexity'] = complexity
    feature_vector['rms'] = np.sqrt(np.mean(signal**2))

    # TODO: group this with the other lrf features
    # NOTE(review): `sr` is not passed here, unlike the other librosa calls, so
    # librosa's default sampling rate applies. Passing sr=128 is presumably not
    # supported by the constant-Q chroma underneath -- confirm before changing.
    for idx, tonal in enumerate(lrf.tonnetz(y=signal)[:n_tonnetz, :]):
        feature_vector['ton_' + str(idx)] = tonal[0]

    return feature_vector

# Dataset Preparation

## I/O

In [15]:
# Load the MATLAB dataset; the path is relative to the notebook's working directory.
dataset_path = 'drowsiness-dataset.mat'
data_dict = scipy.io.loadmat(dataset_path)

# Keep the first element of every row of "substate" / "subindex"
# (presumably single-column arrays -- verify against the .mat file).
states = [i[0] for i in data_dict["substate"]]
subjects = [i[0] for i in data_dict["subindex"]]

# Raw samples; indexed downstream as [row, channel, sample].
eeg = data_dict["EEGsample"]

## Channel Selection

In [16]:
# Positions 0..29 along the channel axis of the EEG array.
channel_idx = list(range(30))

# Electrode label -> position along the channel axis.
channel_names = {
    label: position
    for position, label in enumerate([
        "Fp1", "Fp2", "F7", "F3", "Fz", "F4", "F8", "FT7", "FC3", "FCZ",
        "FC4", "FT8", "T3", "C3", "Cz", "C4", "T4", "TP7", "CP3", "CPz",
        "CP4", "TP8", "T5", "P3", "PZ", "P4", "T6", "O1", "Oz", "O2",
    ])
}

def select_channel(data, channel_list):
    """Stack the requested channels of ``data`` on top of each other.

    Args:
        data: 3-D array indexed as [row, channel, sample].
        channel_list: electrode labels (keys of ``channel_names``) to extract.

    Returns:
        Tuple ``(selected_data, channel_col)``. ``selected_data`` holds the
        rows of the first requested channel, then the rows of the second, and
        so on. ``channel_col`` holds the matching electrode label for every
        row of ``selected_data``.

    Raises:
        KeyError: if a label is not present in ``channel_names``.
    """
    per_channel = [data[:, channel_names[label], :] for label in channel_list]
    per_channel_labels = [[label] * data.shape[0] for label in channel_list]
    return np.concatenate(per_channel), np.concatenate(per_channel_labels)

channel_list = ['F3', 'F4', 'C3', 'Cz', 'Oz', 'Fp1', 'Fp2', 'FT7', 'F8', 'Fz', 'C4']

# select_channel stacks the data channel by channel, so the per-row metadata
# is tiled once per selected channel to stay aligned with it.
labels = states * len(channel_list)
# Rebuilt from data_dict (not from the previous value of `subjects`) so that
# re-running this cell does not keep multiplying the list.
subjects = [i[0] for i in data_dict["subindex"]] * len(channel_list)
data, channel_col = select_channel(eeg, channel_list)

assert len(labels) == len(subjects) == len(channel_col) == data.shape[0], \
    "labels, subjects and channel column must have one entry per selected row"

## Extraction on Selected Channels

In [19]:
# NOTE: extracted_features could be an np array instead of a list.
import time

start = time.time()
# One feature dictionary per row of `data`, in row order.
extracted_features = [feature_extraction(data[i, :]) for i in range(data.shape[0])]
end = time.time()

# Elapsed wall-clock time in seconds
print(end - start)

1979.5855391025543


In [20]:
# One row per extracted signal; features missing from a record become 0.
df = pd.DataFrame.from_records(extracted_features)
df = df.fillna(0)

# Channel label goes first, subject id and class label go last.
df.insert(0, 'channels', channel_col)
df = df.assign(subjects=subjects, label=labels)

df.to_csv("eeg_features.csv")