In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mne
import os
import seaborn as sns
from glob import glob
import warnings
from autopreprocess_pipeline import *
from autopreprocessing import dataset as ds
import shutil 
from tqdm import tqdm

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Conv1D, MaxPool1D, Dense, Flatten, Dropout

2024-06-09 17:11:26.731867: I tensorflow/core/util/port.cc:113] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2024-06-09 17:11:26.946750: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: SSE4.1 SSE4.2 AVX AVX2 AVX512F AVX512_VNNI AVX512_BF16 FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [2]:
main_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data'

In [3]:
df = pd.read_csv(os.path.join(main_path, 'TDBRAIN_ID_and_status.csv')) # convert the .xlsx file into a .csv file beforehand
df_subset = df[['participants_ID', 'formal_status']] # only participants' ID and their status are needed from all columns
df_filtered = df_subset[df_subset['formal_status'].isin(['HEALTHY', 'ADHD'])] # out of the 5+ statuses (classes), only healthy and adhd ones are needed

filtered_file_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/dataTDBRAIN_ID_and_status.csv'
df_filtered.to_csv(filtered_file_path, index=False) # save the .csv file

In [None]:
import shutil

folders_path = os.path.join(main_path, 'TDBRAIN_derivatives_csv\\derivatives')

folders = os.listdir(folders_path)

participant_ids = df_filtered['participants_ID'].tolist() # only the healthy and adhd participants' folders are needed. their IDs are contained in the
                                                          # participant_ID column of the df_filtered data frame

for folder in folders:
    if folder not in participant_ids:
        folder_path = os.path.join(folders_path, folder)
        if os.path.isdir(folder_path):
            print(f"Removing folder: {folder}")
            shutil.rmtree(folder_path)
        else:
            print(f"Skipping {folder} as it is not a directory.")

# the dataset is now significantly smaller, from 90 GB to 8 GB

In [4]:
df_filtered
df_filtered.head()

Unnamed: 0,participants_ID,formal_status
0,sub-87974617,HEALTHY
1,sub-87974621,HEALTHY
2,sub-87974665,HEALTHY
3,sub-87974709,HEALTHY
4,sub-87974841,HEALTHY


In [None]:
text = 'Fp1,Fp2,F7,F3,Fz,F4,F8,FC3,FCz,FC4,T7,C3,Cz,C4,T8,CP3,CPz,CP4,P7,P3,Pz,P4,P8,O1,Oz,O2,VPVA,VNVB,HPHL,HNHR,Erbs,OrbOcc,Mass' # electrode names copied from one .csv eeg recordings file
channel_names = text.split(',')  # split electrode names with commas
channel_names = [f"{name}" for name in channel_names] # create a list of strings 
print(channel_names)

In [None]:
sourcepath = folders_path
preprocpath = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/processed_subjects'

In [None]:
varargsin = {
    'sourcepath' : folders_path,
    'preprocpath' : preprocpath
}

In [None]:
autopreprocess_standard(varargsin=varargsin)

In [5]:
def chop_string_from_end(string, char, flip):
    


    index = string.rfind(char)
    if not flip:
        if index != -1:
            chopped_string = string[index:]
        else:
            chopped_string = string

        return chopped_string
    else:
        if index != -1:
            chopped_string = string[:index]
        else:
            chopped_string = string

        return chopped_string



In [None]:
def segment_csv(path_to_file, path_to_dir, window_length=5000, stride=500):

    df = pd.read_csv(path_to_file)
    
    df = df.drop(columns=['artifacts', 'VEOG', 'HEOG', 'Erbs', 'OrbOcc', 'Mass'], axis=1)
    i=0 
    for i in range(110):
        #while (df.iloc[i*STRIDE + WINDOW_LENGTH] is not None):
        sub_df = df.iloc[i*stride : i*stride + window_length]
        i+=1
        subject_name = chop_string_from_end(path_to_file,"/", flip=0)
        clean_name = chop_string_from_end(subject_name,"eeg_csv",flip=1)

        #print(clean_name)
        seg_path = path_to_dir + "/" + clean_name + "_seg_" + str(i) + ".csv"

        # print(f"saving segment {i} of the patient to {seg_path}, window that captures row {i*stride} to {i*stride + window_length}")
        
        
        sub_df.to_csv(seg_path)       
    

In [6]:
path_to_dir = "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/processed_subjects"

In [None]:

def find_csv_files(directory):
    """
    Recursively searches for .csv files in the given directory and its subdirectories.
    Returns a list of paths to the found .csv files.
    """
    csv_files = []

    # Traverse through all the files and directories in the given directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Check if the file has a .csv extension
            if file.endswith(".csv"):
                # If found, append the path to the list of csv_files
                csv_files.append(os.path.join(root, file))
                
    return csv_files            


In [None]:
csv_files = find_csv_files("/mnt/c/users/lukar/")

In [None]:
csv_files

In [None]:
for file in csv_files: 
    segment_csv(path_to_file = file, path_to_dir = path_to_dir)

In [7]:
print(df_filtered)
df_dict = df_filtered.set_index('participants_ID')['formal_status'].to_dict()
df_dict

    participants_ID formal_status
0      sub-87974617       HEALTHY
1      sub-87974621       HEALTHY
2      sub-87974665       HEALTHY
3      sub-87974709       HEALTHY
4      sub-87974841       HEALTHY
..              ...           ...
119    sub-88073205          ADHD
120    sub-88074021          ADHD
121    sub-88075053       HEALTHY
122    sub-88075101          ADHD
123    sub-88076989          ADHD

[124 rows x 2 columns]


{'sub-87974617': 'HEALTHY',
 'sub-87974621': 'HEALTHY',
 'sub-87974665': 'HEALTHY',
 'sub-87974709': 'HEALTHY',
 'sub-87974841': 'HEALTHY',
 'sub-87974973': 'HEALTHY',
 'sub-87976193': 'HEALTHY',
 'sub-87976369': 'HEALTHY',
 'sub-87976413': 'HEALTHY',
 'sub-87976457': 'HEALTHY',
 'sub-87976461': 'HEALTHY',
 'sub-87976505': 'HEALTHY',
 'sub-87976641': 'HEALTHY',
 'sub-87976773': 'HEALTHY',
 'sub-87976817': 'HEALTHY',
 'sub-87976953': 'HEALTHY',
 'sub-87977045': 'HEALTHY',
 'sub-87980197': 'HEALTHY',
 'sub-87980241': 'HEALTHY',
 'sub-87980329': 'HEALTHY',
 'sub-87980373': 'HEALTHY',
 'sub-87980417': 'HEALTHY',
 'sub-87980689': 'HEALTHY',
 'sub-87980869': 'HEALTHY',
 'sub-87980913': 'HEALTHY',
 'sub-87982225': 'HEALTHY',
 'sub-87982849': 'HEALTHY',
 'sub-88008997': 'HEALTHY',
 'sub-88015117': 'ADHD',
 'sub-88015565': 'ADHD',
 'sub-88024205': 'ADHD',
 'sub-88025421': 'ADHD',
 'sub-88025917': 'ADHD',
 'sub-88026949': 'ADHD',
 'sub-88028433': 'ADHD',
 'sub-88029425': 'ADHD',
 'sub-88029557':

In [8]:
source_dir = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/processed_subjects'
target_dir_adhd = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/adhd'
target_dir_healthy = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/healthy'

In [None]:

for subject in os.listdir(source_dir):
   
    # index = file.rfind("/")
    # subject_name = file[index+1:]
    # subject_index = subject_name.find("_")
    # subject_name = subject_name[:subject_index]
   

    if df_dict[subject] == "HEALTHY":
        shutil.move(os.path.join(source_dir, subject), target_dir_healthy)
    elif df_dict[subject] == "ADHD":
        shutil.move(os.path.join(source_dir, subject), target_dir_adhd)   
                          


In [None]:
adhd_dir = os.listdir(target_dir_adhd)
healthy_dir = os.listdir(target_dir_healthy)

In [None]:
adhd_dir = [os.path.join(target_dir_adhd,subject) for subject in adhd_dir]
healthy_dir = [os.path.join(target_dir_healthy,subject) for subject in healthy_dir]

In [9]:
training_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/training'
validation_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/validation'
testing_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/testing'


In [None]:

for i in range (len(healthy_dir)): #healthy_dir is the smaller set, so we will balance by using its length
    if i<31: # alternate between healthy and adhd subject and add 31 samples of each into training data 
        shutil.move(healthy_dir[i], training_path)
        shutil.move(adhd_dir[i], training_path)
    elif i >= 31 and i < 39:  # alternate between healthy and adhd subject and add 8 samples of each into validation data 
        shutil.move(healthy_dir[i], validation_path)
        shutil.move(adhd_dir[i], validation_path)
    elif i>=39 and i < 47: # alternate between healthy and adhd subject and add 8 samples of each into testing data
        shutil.move(healthy_dir[i], testing_path)
        shutil.move(adhd_dir[i], testing_path)

In [None]:
training_csv_files = find_csv_files(training_path)
validation_csv_files = find_csv_files(validation_path)
testing_csv_files = find_csv_files(testing_path)

In [10]:
training_segmented_path =   "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/training_segmented"
validation_segmented_path =  "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/validation_segmented"
testing_segmented_path = "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/testing_segmented"

In [None]:
for training_file in training_csv_files:
    segment_csv(path_to_file = training_file, path_to_dir = training_segmented_path)

In [None]:
for validation_file in validation_csv_files:
    segment_csv(path_to_file = validation_file, path_to_dir = validation_segmented_path)
   

In [None]:
for testing_file in testing_csv_files:
    segment_csv(path_to_file = testing_file, path_to_dir = testing_segmented_path) 

In [23]:
training_filepaths = [os.path.join(training_segmented_path,file) for file in os.listdir(training_segmented_path)]
validation_filepaths = [os.path.join(validation_segmented_path,file) for file in os.listdir(validation_segmented_path)]
testing_filepaths = [os.path.join(testing_segmented_path,file) for file in os.listdir(testing_segmented_path)]

X_train = []
X_val = []
X_test = [] 

y_train = []
y_val = []
y_test = [] 

In [12]:
import numpy as np
import pywt
from scipy.spatial.distance import pdist
from scipy.stats import entropy
from sklearn.neighbors import NearestNeighbors

def wavelet_entropy(segment, wavelet='db4', level=4):
    coeffs = pywt.wavedec(segment, wavelet, level=level)
    entropy_list = []
    for coeff in coeffs:
        coeff = np.abs(coeff)
        norm_coeff = coeff / np.sum(coeff)
        entropy_list.append(entropy(norm_coeff))
    return np.mean(entropy_list)


import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew, entropy
from scipy.signal import welch
from scipy.integrate import simpson
from sklearn.preprocessing import StandardScaler
from numpy import log2, mean, sqrt
from math import log10

def compute_bandpower(segment, fs, band):
    freqs, psd = welch(segment, fs)
    band_freqs = (freqs >= band[0]) & (freqs <= band[1])
    band_power = simpson(y=psd[band_freqs], x=freqs[band_freqs])
    return band_power, psd, freqs

def hjorth_params(segment):
    # Hjorth Activity
    activity = np.var(segment)
    
    # Hjorth Mobility
    derivative = np.diff(segment)
    mobility = np.std(derivative) / np.std(segment)
    
    # Hjorth Complexity
    second_derivative = np.diff(derivative)
    complexity = (np.std(second_derivative) / np.std(derivative)) / mobility
    
    return activity, mobility, complexity

def spectral_entropy(psd):
    psd_norm = psd / np.sum(psd)
    return entropy(psd_norm)

def shannon_entropy(segment):
    prob_dist, _ = np.histogram(segment, bins=256, density=True)
    prob_dist = prob_dist[prob_dist > 0]
    return -np.sum(prob_dist * np.log2(prob_dist))

def higuchi_fd(segment, k_max):
    L = []
    x = np.asarray(segment)
    N = len(x)

    for k in range(1, k_max):
        Lk = 0
        for m in range(k):
            Lmk = 0
            for i in range(1, int(np.floor((N - m) / k))):
                Lmk += np.abs(x[m + i * k] - x[m + (i - 1) * k])
            Lmk = Lmk * (N - 1) / (int(np.floor((N - m) / k)) * k)
            Lk += Lmk
        L.append(np.log(Lk / k))

    return np.polyfit(np.log(range(1, k_max)), L, 1)[0]

def katz_fd(segment):
    L = np.sum(np.sqrt(np.ediff1d(segment) ** 2 + 1))
    d = np.max(np.abs(segment - segment[0]))
    N = len(segment)
    return log10(L) / (log10(d) + log10(N))

def petrosian_fd(segment):
    n = len(segment)
    diff = np.diff(segment)
    N_delta = np.sum(diff[1:] * diff[:-1] < 0)
    return log10(n) / (log10(n) + log10(n / (n + 0.4 * N_delta)))


In [None]:

def extract_features(df, fs):
    feature_list = []

    for column in df.columns:
        segment = df[column].values
        # Compute time-domain features
        mean_val = np.mean(segment)
        std_val = np.std(segment)
        rms_val = np.sqrt(np.mean(segment**2))
        kurtosis_val = kurtosis(segment)
        skewness_val = skew(segment)

        # Compute Hjorth parameters
        activity, mobility, complexity = hjorth_params(segment)

        # Compute Shannon's entropy
        shannon_entropy_val = shannon_entropy(segment)

        # Compute band powers and PSD entropy
        delta_power, psd, freqs = compute_bandpower(segment, fs, [0.5, 4])
        theta_power, _, _ = compute_bandpower(segment, fs, [4, 8])
        alpha_power, _, _ = compute_bandpower(segment, fs, [8, 13])
        beta_power, _, _ = compute_bandpower(segment, fs, [13, 30])
        gamma_power, _, _ = compute_bandpower(segment, fs, [30, 100])
        
        # Compute spectral entropy
        spectral_entropy_val = spectral_entropy(psd)

        # Compute fractal dimensions
        higuchi_val = higuchi_fd(segment, k_max=10)
        katz_val = katz_fd(segment)
        petrosian_val = petrosian_fd(segment)

        # Compute wavelet entropy
        wavelet_entropy_val = wavelet_entropy(segment)


        # Combine all features
        features = [
            mean_val, std_val, rms_val, kurtosis_val, skewness_val,
            activity, mobility, complexity, shannon_entropy_val, spectral_entropy_val,
            delta_power, theta_power, alpha_power, beta_power, gamma_power,
            higuchi_val, katz_val, petrosian_val, wavelet_entropy_val,
        ]
        feature_list.append(features)

    # Convert to a NumPy array
    feature_array = np.array(feature_list)

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(feature_array)

    return normalized_features


In [None]:
for csv in tqdm(training_filepaths): 

    
    index = csv.rfind("/")
    subject_name = csv[index+1:]
    subject_index = subject_name.find("_")
    subject_name = subject_name[:subject_index]
   

    if df_dict[subject_name] == "HEALTHY":
        y_train.append(1)
    elif df_dict[subject_name] == "ADHD":
         y_train.append(0)

    
    segment = pd.read_csv(csv)
    segment = segment.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])

    feature_file = extract_features(df = segment,fs = 500)
        
    X_train.append(feature_file)

print (f"Training data loaded! Training set: {len(X_train)}, labels: {len(y_train)}")

In [None]:
for csv in tqdm(validation_filepaths): 
    
    index = csv.rfind("/")
    subject_name = csv[index+1:]
    subject_index = subject_name.find("_")
    subject_name = subject_name[:subject_index]
   

    if df_dict[subject_name] == "HEALTHY":
        y_val.append(1)
    elif df_dict[subject_name] == "ADHD":
         y_val.append(0)

    
    segment = pd.read_csv(csv)
    segment = segment.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])
    
    feature_file = extract_features(df = segment,fs = 500)
        
    
        
    X_val.append(feature_file)

print (f"Validation data loaded! Validation set: {len(X_val)}, labels: {len(y_val)}")

In [None]:
for csv in tqdm(testing_filepaths): 
    
    index = csv.rfind("/")
    subject_name = csv[index+1:]
    subject_index = subject_name.find("_")
    subject_name = subject_name[:subject_index]
   

    if df_dict[subject_name] == "HEALTHY":
        y_test.append(1)
    elif df_dict[subject_name] == "ADHD":
         y_test.append(0)

    
    segment = pd.read_csv(csv)
    segment = segment.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])
    
    feature_file = extract_features(df = segment, fs = 500)
  
    X_test.append(feature_file)

print (f"Testing data loaded! Testing set: {len(X_test)}, labels: {len(y_test)}")

In [119]:
X_train = np.load("X_train.npy")
X_test = np.load("X_test.npy")
X_val = np.load("X_val.npy")
y_train = np.load("y_train.npy")
y_test = np.load("y_test.npy")
y_val = np.load("y_val.npy")

In [120]:
X_train = X_train.reshape(14300, 26*19)
X_test = X_test.reshape(3520,26*19)
X_val = X_val.reshape(3520, 26*19)

In [123]:
y_train = y_train[:-3]

In [121]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

In [124]:
adaboost = AdaBoostClassifier(n_estimators = 50, algorithm="SAMME")
adaboost.fit(X_train,y_train)

In [125]:
preds = adaboost.predict(X_test)

In [126]:
print(classification_report(preds,y_test))

              precision    recall  f1-score   support

           0       0.59      0.68      0.63      1526
           1       0.72      0.64      0.68      1994

    accuracy                           0.66      3520
   macro avg       0.66      0.66      0.66      3520
weighted avg       0.67      0.66      0.66      3520



In [127]:
print(len(adaboost.feature_importances_))

494


In [128]:
important_feature_indices = np.where(adaboost.feature_importances_ > 0)[0]

print("Indices of important features:", important_feature_indices)

Indices of important features: [ 11  14  15  18  48  49  51  88  93  97 108 110 140 148 197 201 240 241
 268 273 280 298 300 313 317 319 337 356 386 395 397 398 433 451 462 486]


In [129]:
X_train_selected = X_train[:, important_feature_indices]

In [130]:
X_test_selected = X_test[:,important_feature_indices]
X_val_selected = X_val[:, important_feature_indices]

In [131]:
from sklearn.svm import SVC

In [133]:
svm_classifier = SVC()
svm_classifier.fit(X_train_selected, y_train)

In [134]:
y_pred = svm_classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification Accuracy: {accuracy}')

Classification Accuracy: 0.6869318181818181


In [135]:
from sklearn.neural_network import MLPClassifier

In [154]:
mlp_classifier = MLPClassifier(hidden_layer_sizes=(8,16), max_iter=2000, random_state=42)
mlp_classifier.fit(X_train_selected, y_train)

# Predict and evaluate
y_pred = mlp_classifier.predict(X_test_selected)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)


In [155]:
y_prob = mlp_classifier.predict_proba(X_test_selected)

In [156]:
print(accuracy)
print(report)

0.7775568181818182
              precision    recall  f1-score   support

           0       0.85      0.68      0.75      1760
           1       0.73      0.88      0.80      1760

    accuracy                           0.78      3520
   macro avg       0.79      0.78      0.78      3520
weighted avg       0.79      0.78      0.78      3520



In [153]:
len(y_prob)/110

32.0

In [84]:
adaboost = AdaBoostClassifier(n_estimators = 60, algorithm="SAMME")
adaboost.fit(X_train_selected,y_train)

In [85]:
predictions = adaboost.predict(X_test_selected)

In [86]:
print(classification_report(y_test,predictions))

              precision    recall  f1-score   support

           0       0.68      0.59      0.63      1760
           1       0.64      0.72      0.67      1760

    accuracy                           0.65      3520
   macro avg       0.66      0.65      0.65      3520
weighted avg       0.66      0.65      0.65      3520



In [87]:
print(adaboost.feature_importances_)

[0.0181952  0.01538185 0.03641875 0.01607019 0.02750439 0.01075643
 0.02935811 0.04575906 0.04522285 0.02598295 0.04389864 0.10225107
 0.01066167 0.02168312 0.0229266  0.02286938 0.01512248 0.02403128
 0.01687637 0.02232871 0.06682118 0.01326659 0.0250338  0.02436231
 0.02478969 0.00828876 0.03650852 0.02755209 0.01916114 0.03636577
 0.03316869 0.0184407  0.02503268 0.01521495 0.01445491 0.03823914]


In [89]:
top_25_indices = np.argsort(adaboost.feature_importances_)[-25:][::-1]

print(top_25_indices)

[11 20  7  8 10 35 26  2 29 30  6 27  4  9 22 32 24 23 17 14 15 19 13 28
 31]


In [90]:
X_train_selected_2 = X_train_selected[:, top_25_indices]
X_test_selected_2 = X_test_selected[:,top_25_indices]
X_val_selected_2 = X_val_selected[:, top_25_indices]

In [117]:
svm_classifier = SVC(kernel='rbf')
svm_classifier.fit(X_train_selected_2, y_train)
y_pred = svm_classifier.predict(X_test_selected_2)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification Accuracy: {accuracy}')

Classification Accuracy: 0.6485795454545454


In [107]:
mlp_classifier = MLPClassifier(hidden_layer_sizes=(), max_iter=2000, random_state=42)
mlp_classifier.fit(X_train_selected_2, y_train)

# Predict and evaluate
y_pred = mlp_classifier.predict(X_test_selected_2)
accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

In [108]:
print(accuracy)
print(report)

0.6383522727272727
              precision    recall  f1-score   support

           0       0.67      0.55      0.60      1760
           1       0.62      0.73      0.67      1760

    accuracy                           0.64      3520
   macro avg       0.64      0.64      0.64      3520
weighted avg       0.64      0.64      0.64      3520



In [None]:
print(len(important_feature_indices))

In [109]:
from sklearn.ensemble import RandomForestClassifier

In [114]:
rf = RandomForestClassifier(
    n_estimators = 100,
    min_samples_split=20,
    min_samples_leaf=20
)

In [115]:
rf.fit(X_train_selected,y_train)

In [116]:
predictions_rf = rf.predict(X_test_selected)
print(classification_report(y_test,predictions_rf))

              precision    recall  f1-score   support

           0       0.59      0.65      0.62      1760
           1       0.61      0.54      0.57      1760

    accuracy                           0.59      3520
   macro avg       0.60      0.59      0.59      3520
weighted avg       0.60      0.59      0.59      3520



# Спектрограми

In [157]:
from scipy import signal

In [158]:
training_filepaths = [os.path.join(training_segmented_path,file) for file in os.listdir(training_segmented_path)]
validation_filepaths = [os.path.join(validation_segmented_path,file) for file in os.listdir(validation_segmented_path)]
testing_filepaths = [os.path.join(testing_segmented_path,file) for file in os.listdir(testing_segmented_path)]

In [166]:
X_train_spectrograms = []
X_test_spectrograms = [] 
X_val_spectrograms = [] 
y_train_spectrograms = []
y_test_spectrograms = [] 
y_val_spectrograms = [] 

In [169]:
fs = 500  # Sampling frequency
nperseg = 256  # Length of each segment
noverlap = 128  # Overlap between segments

def generate_spectrograms(training_filepaths, y_train_spectrograms):
    spectrogram_list = []
    for filepath in tqdm(training_filepaths):
        # Load the EEG data from CSV file
        eeg_data = pd.read_csv(filepath)
        
        index = filepath.rfind("/")
        subject_name = filepath[index+1:]
        subject_index = subject_name.find("_")
        subject_name = subject_name[:subject_index]
   

        if df_dict[subject_name] == "HEALTHY":
            y_train_spectrograms.append(1)
        elif df_dict[subject_name] == "ADHD":
             y_train_spectrograms.append(0)
        # Extract the data for each channel
        channels_data = {}
        for channel_name in eeg_data.columns:
            channels_data[channel_name] = eeg_data[channel_name].values

        # Create spectrogram for each channel
        spectrograms = {}
        for channel_name, channel_values in channels_data.items():
            f, t, Sxx = signal.spectrogram(channel_values, fs=fs, nperseg=nperseg, noverlap=noverlap)
            spectrograms[channel_name] = Sxx.T  # Transpose the spectrogram matrix

        spectrogram_list.append(spectrograms)

    return spectrogram_list, y_train

In [None]:

X_train_spectrograms, y_train_spectrograms = generate_spectrograms(training_filepaths,y_train_spectrograms)

 60%|█████████████████████████████████████████████▎                              | 8528/14300 [05:38<9:08:03,  5.70s/it]

In [None]:
X_test_spectrograms, y_test_spectrograms = generate_spectrograms(testing_filepaths, y_test_spectrograms)

In [None]:
X_val_spectrograms, y_val_spectrograms = generate_spectrograms(validation_filepaths, y_val_spectrograms)