In [21]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import mne
import os
import seaborn as sns
from glob import glob
import warnings
from autopreprocess_pipeline import *
from autopreprocessing import dataset as ds
import shutil 
from tqdm import tqdm

from tensorflow.python.keras.models import Sequential
from tensorflow.python.keras.layers import Conv1D, MaxPool1D, Dense, Flatten, Dropout

In [2]:
main_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data'

In [3]:
df = pd.read_csv(os.path.join(main_path, 'TDBRAIN_ID_and_status.csv')) # convert the .xlsx file into a .csv file beforehand
df_subset = df[['participants_ID', 'formal_status']] # only participants' ID and their status are needed from all columns
df_filtered = df_subset[df_subset['formal_status'].isin(['HEALTHY', 'ADHD'])] # out of the 5+ statuses (classes), only healthy and adhd ones are needed

filtered_file_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/dataTDBRAIN_ID_and_status.csv'
df_filtered.to_csv(filtered_file_path, index=False) # save the .csv file

In [None]:
import shutil

folders_path = os.path.join(main_path, 'TDBRAIN_derivatives_csv\\derivatives')

folders = os.listdir(folders_path)

participant_ids = df_filtered['participants_ID'].tolist() # only the healthy and adhd participants' folders are needed. their IDs are contained in the
                                                          # participant_ID column of the df_filtered data frame

for folder in folders:
    if folder not in participant_ids:
        folder_path = os.path.join(folders_path, folder)
        if os.path.isdir(folder_path):
            print(f"Removing folder: {folder}")
            shutil.rmtree(folder_path)
        else:
            print(f"Skipping {folder} as it is not a directory.")

# the dataset is now significantly smaller, from 90 GB to 8 GB

In [4]:
df_filtered
df_filtered.head()

Unnamed: 0,participants_ID,formal_status
0,sub-87974617,HEALTHY
1,sub-87974621,HEALTHY
2,sub-87974665,HEALTHY
3,sub-87974709,HEALTHY
4,sub-87974841,HEALTHY


In [None]:
text = 'Fp1,Fp2,F7,F3,Fz,F4,F8,FC3,FCz,FC4,T7,C3,Cz,C4,T8,CP3,CPz,CP4,P7,P3,Pz,P4,P8,O1,Oz,O2,VPVA,VNVB,HPHL,HNHR,Erbs,OrbOcc,Mass' # electrode names copied from one .csv eeg recordings file
channel_names = text.split(',')  # split electrode names with commas
channel_names = [f"{name}" for name in channel_names] # create a list of strings 
print(channel_names)

In [None]:
sourcepath = folders_path
preprocpath = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/processed_subjects'

In [None]:
varargsin = {
    'sourcepath' : folders_path,
    'preprocpath' : preprocpath
}

In [None]:
autopreprocess_standard(varargsin=varargsin)

In [None]:
def chop_string_from_end(string, char, flip):
    


    index = string.rfind(char)
    if not flip:
        if index != -1:
            chopped_string = string[index:]
        else:
            chopped_string = string

        return chopped_string
    else:
        if index != -1:
            chopped_string = string[:index]
        else:
            chopped_string = string

        return chopped_string



In [None]:
def segment_csv(path_to_file, path_to_dir, window_length=5000, stride=500):

    df = pd.read_csv(path_to_file)
    
    df = df.drop(columns=['artifacts', 'VEOG', 'HEOG', 'Erbs', 'OrbOcc', 'Mass'], axis=1)
    i=0 
    for i in range(110):
        #while (df.iloc[i*STRIDE + WINDOW_LENGTH] is not None):
        sub_df = df.iloc[i*stride : i*stride + window_length]
        i+=1
        subject_name = chop_string_from_end(path_to_file,"/", flip=0)
        clean_name = chop_string_from_end(subject_name,"eeg_csv",flip=1)

        #print(clean_name)
        seg_path = path_to_dir + "/" + clean_name + "_seg_" + str(i) + ".csv"

        # print(f"saving segment {i} of the patient to {seg_path}, window that captures row {i*stride} to {i*stride + window_length}")
        
        
        sub_df.to_csv(seg_path)       
    

In [5]:
path_to_dir = "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/processed_subjects"

In [None]:

def find_csv_files(directory):
    """
    Recursively searches for .csv files in the given directory and its subdirectories.
    Returns a list of paths to the found .csv files.
    """
    csv_files = []

    # Traverse through all the files and directories in the given directory
    for root, dirs, files in os.walk(directory):
        for file in files:
            # Check if the file has a .csv extension
            if file.endswith(".csv"):
                # If found, append the path to the list of csv_files
                csv_files.append(os.path.join(root, file))
                
    return csv_files            


In [None]:
csv_files = find_csv_files("/mnt/c/users/lukar/")

In [None]:
csv_files

In [None]:
for file in csv_files: 
    segment_csv(path_to_file = file, path_to_dir = path_to_dir)

In [6]:
print(df_filtered)
df_dict = df_filtered.set_index('participants_ID')['formal_status'].to_dict()
df_dict

    participants_ID formal_status
0      sub-87974617       HEALTHY
1      sub-87974621       HEALTHY
2      sub-87974665       HEALTHY
3      sub-87974709       HEALTHY
4      sub-87974841       HEALTHY
..              ...           ...
119    sub-88073205          ADHD
120    sub-88074021          ADHD
121    sub-88075053       HEALTHY
122    sub-88075101          ADHD
123    sub-88076989          ADHD

[124 rows x 2 columns]


{'sub-87974617': 'HEALTHY',
 'sub-87974621': 'HEALTHY',
 'sub-87974665': 'HEALTHY',
 'sub-87974709': 'HEALTHY',
 'sub-87974841': 'HEALTHY',
 'sub-87974973': 'HEALTHY',
 'sub-87976193': 'HEALTHY',
 'sub-87976369': 'HEALTHY',
 'sub-87976413': 'HEALTHY',
 'sub-87976457': 'HEALTHY',
 'sub-87976461': 'HEALTHY',
 'sub-87976505': 'HEALTHY',
 'sub-87976641': 'HEALTHY',
 'sub-87976773': 'HEALTHY',
 'sub-87976817': 'HEALTHY',
 'sub-87976953': 'HEALTHY',
 'sub-87977045': 'HEALTHY',
 'sub-87980197': 'HEALTHY',
 'sub-87980241': 'HEALTHY',
 'sub-87980329': 'HEALTHY',
 'sub-87980373': 'HEALTHY',
 'sub-87980417': 'HEALTHY',
 'sub-87980689': 'HEALTHY',
 'sub-87980869': 'HEALTHY',
 'sub-87980913': 'HEALTHY',
 'sub-87982225': 'HEALTHY',
 'sub-87982849': 'HEALTHY',
 'sub-88008997': 'HEALTHY',
 'sub-88015117': 'ADHD',
 'sub-88015565': 'ADHD',
 'sub-88024205': 'ADHD',
 'sub-88025421': 'ADHD',
 'sub-88025917': 'ADHD',
 'sub-88026949': 'ADHD',
 'sub-88028433': 'ADHD',
 'sub-88029425': 'ADHD',
 'sub-88029557':

In [None]:
source_dir = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/processed_subjects'
target_dir_adhd = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/adhd'
target_dir_healthy = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/healthy'

In [None]:

for subject in os.listdir(source_dir):
   
    # index = file.rfind("/")
    # subject_name = file[index+1:]
    # subject_index = subject_name.find("_")
    # subject_name = subject_name[:subject_index]
   

    if df_dict[subject] == "HEALTHY":
        shutil.move(os.path.join(source_dir, subject), target_dir_healthy)
    elif df_dict[subject] == "ADHD":
        shutil.move(os.path.join(source_dir, subject), target_dir_adhd)   
                          


In [None]:
adhd_dir = os.listdir(target_dir_adhd)
healthy_dir = os.listdir(target_dir_healthy)

In [None]:
adhd_dir = [os.path.join(target_dir_adhd,subject) for subject in adhd_dir]
healthy_dir = [os.path.join(target_dir_healthy,subject) for subject in healthy_dir]

In [None]:
training_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/training'
validation_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/validation'
testing_path = '/mnt/c/Users/lukar/Desktop/Faks/EEG/data/testing'


In [None]:

for i in range (len(healthy_dir)): #healthy_dir is the smaller set, so we will balance by using its length
    if i<31: # alternate between healthy and adhd subject and add 31 samples of each into training data 
        shutil.move(healthy_dir[i], training_path)
        shutil.move(adhd_dir[i], training_path)
    elif i >= 31 and i < 39:  # alternate between healthy and adhd subject and add 8 samples of each into validation data 
        shutil.move(healthy_dir[i], validation_path)
        shutil.move(adhd_dir[i], validation_path)
    elif i>=39 and i < 47: # alternate between healthy and adhd subject and add 8 samples of each into testing data
        shutil.move(healthy_dir[i], testing_path)
        shutil.move(adhd_dir[i], testing_path)

In [None]:
training_csv_files = find_csv_files(training_path)
validation_csv_files = find_csv_files(validation_path)
testing_csv_files = find_csv_files(testing_path)

In [7]:
training_segmented_path =   "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/training_segmented"
validation_segmented_path =  "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/validation_segmented"
testing_segmented_path = "/mnt/c/Users/lukar/Desktop/Faks/EEG/data/testing_segmented"

In [None]:
for training_file in training_csv_files:
    segment_csv(path_to_file = training_file, path_to_dir = training_segmented_path)

In [None]:
for validation_file in validation_csv_files:
    segment_csv(path_to_file = validation_file, path_to_dir = validation_segmented_path)
   

In [None]:
for testing_file in testing_csv_files:
    segment_csv(path_to_file = testing_file, path_to_dir = testing_segmented_path) 

In [22]:
training_filepaths = [os.path.join(training_segmented_path,file) for file in os.listdir(training_segmented_path)]
validation_filepaths = [os.path.join(validation_segmented_path,file) for file in os.listdir(validation_segmented_path)]
testing_filepaths = [os.path.join(testing_segmented_path,file) for file in os.listdir(testing_segmented_path)]

X_train = []
X_val = []
X_test = [] 

y_train = []
y_val = []
y_test = [] 

In [18]:
import numpy as np
import pandas as pd
from scipy.stats import kurtosis, skew
from scipy.signal import welch
from scipy.integrate import simpson
from sklearn.preprocessing import StandardScaler

def compute_bandpower(segment, fs, band):
    freqs, psd = welch(segment, fs)
    band_freqs = (freqs >= band[0]) & (freqs <= band[1])
    band_power = simpson(y=psd[band_freqs], x=freqs[band_freqs])
    return band_power

def extract_features(df, fs):
    feature_list = []

    for column in df.columns:
        segment = df[column].values
        # Compute time-domain features
        mean_val = np.mean(segment)
        std_val = np.std(segment)
        kurtosis_val = kurtosis(segment)
        skewness_val = skew(segment)

        # Compute band powers
        delta_power = compute_bandpower(segment, fs, [0.5, 4])
        theta_power = compute_bandpower(segment, fs, [4, 8])
        alpha_power = compute_bandpower(segment, fs, [8, 13])
        beta_power = compute_bandpower(segment, fs, [13, 30])
        gamma_power = compute_bandpower(segment, fs, [30, 100])

        # Combine all features
        features = [mean_val, std_val, kurtosis_val, skewness_val, delta_power, theta_power, alpha_power, beta_power, gamma_power]
        feature_list.append(features)

    # Convert to a NumPy array
    feature_array = np.array(feature_list)

    # Normalize the features
    scaler = StandardScaler()
    normalized_features = scaler.fit_transform(feature_array)

    return normalized_features


In [23]:

for csv in tqdm(training_filepaths): 

    
    index = csv.rfind("/")
    subject_name = csv[index+1:]
    subject_index = subject_name.find("_")
    subject_name = subject_name[:subject_index]
   

    if df_dict[subject_name] == "HEALTHY":
        y_train.append(1)
    elif df_dict[subject_name] == "ADHD":
         y_train.append(0)

    
    segment = pd.read_csv(csv)
    segment = segment.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])

    feature_file = extract_features(df = segment,fs = 500)
        
    X_train.append(feature_file)

print (f"Training data loaded! Training set: {len(X_train)}, labels: {len(y_train)}")

100%|█████████████████████████████████████████████████████████████████████████████| 14300/14300 [13:45<00:00, 17.32it/s]

Training data loaded! Training set: 14300, labels: 14300





In [24]:
for csv in tqdm(validation_filepaths): 
    
    index = csv.rfind("/")
    subject_name = csv[index+1:]
    subject_index = subject_name.find("_")
    subject_name = subject_name[:subject_index]
   

    if df_dict[subject_name] == "HEALTHY":
        y_val.append(1)
    elif df_dict[subject_name] == "ADHD":
         y_val.append(0)

    
    segment = pd.read_csv(csv)
    segment = segment.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])
    
    feature_file = extract_features(df = segment,fs = 500)
        
    
        
    X_val.append(feature_file)

print (f"Validation data loaded! Validation set: {len(X_val)}, labels: {len(y_val)}")

100%|███████████████████████████████████████████████████████████████████████████████| 3520/3520 [03:33<00:00, 16.51it/s]

Validation data loaded! Validation set: 3520, labels: 3520





In [26]:
for csv in tqdm(testing_filepaths): 
    
    index = csv.rfind("/")
    subject_name = csv[index+1:]
    subject_index = subject_name.find("_")
    subject_name = subject_name[:subject_index]
   

    if df_dict[subject_name] == "HEALTHY":
        y_test.append(1)
    elif df_dict[subject_name] == "ADHD":
         y_test.append(0)

    
    segment = pd.read_csv(csv)
    segment = segment.drop(columns = ['Unnamed: 0.1', 'Unnamed: 0'])
    
    feature_file = extract_features(df = segment, fs = 500)
  
    X_test.append(feature_file)

print (f"Testing data loaded! Testing set: {len(X_test)}, labels: {len(y_test)}")

100%|███████████████████████████████████████████████████████████████████████████████| 3520/3520 [03:33<00:00, 16.52it/s]

Testing data loaded! Testing set: 3520, labels: 3521





In [31]:
adaboost = AdaBoostClassifier(n_estimators = 100) 
adaboost.fit(np.array(X_train), np.array(y_train))



ValueError: Found array with dim 3. DecisionTreeClassifier expected <= 2.

In [53]:
from sklearn.ensemble import AdaBoostClassifier
from sklearn.metrics import classification_report, accuracy_score

In [34]:
X_train = np.array(X_train)
X_test = np.array(X_test)
X_val = np.array(X_val)

In [37]:
X_train = X_train.reshape(14300, 26*9)
X_test = X_test.reshape(3520,26*9)
X_val = X_val.reshape(3520, 26*9)

In [43]:
adaboost = AdaBoostClassifier(n_estimators = 100, algorithm="SAMME")
adaboost.fit(X_train,y_train)

In [44]:
preds = adaboost.predict(X_test)

In [47]:
y_test.pop()

1

In [48]:
print(classification_report(preds,y_test))

              precision    recall  f1-score   support

           0       0.65      0.69      0.67      1675
           1       0.70      0.67      0.68      1845

    accuracy                           0.68      3520
   macro avg       0.68      0.68      0.68      3520
weighted avg       0.68      0.68      0.68      3520



In [51]:
print(adaboost.feature_importances_)

[0.         0.         0.         0.         0.         0.
 0.         0.         0.03383097 0.         0.00865103 0.
 0.         0.         0.         0.         0.         0.02044355
 0.         0.         0.         0.         0.02568942 0.01897258
 0.         0.02462753 0.         0.         0.         0.
 0.         0.         0.         0.01159154 0.         0.
 0.         0.         0.         0.         0.         0.
 0.0255797  0.         0.01273871 0.         0.01467629 0.
 0.         0.02578309 0.         0.         0.03391185 0.03355687
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.02605348 0.
 0.         0.         0.         0.         0.         0.02046
 0.         0.         0.         0.         0.         0.
 0.         0.         0.         0.         0.         0.00871658
 0.         0.         0.         0.01256039 0.01139633 0.
 0.         0.         0.         0.         0.         0.02183424
 0.        

In [52]:
from sklearn.svm import SVC

In [54]:
svm_classifier = SVC()
svm_classifier.fit(X_train, y_train)

In [55]:
y_pred = svm_classifier.predict(X_test)
accuracy = accuracy_score(y_test, y_pred)
print(f'Classification Accuracy: {accuracy}')

Classification Accuracy: 0.6130681818181818
