In [57]:
import os
import numpy as np
import pyedflib
from glob import glob

# Function to read txt file data
def read_txt_file(filepath):
    """Load the numeric contents of a text file into a NumPy array.

    Parameters
    ----------
    filepath : str or path-like
        Location of the text file; it is opened in text mode for reading.

    Returns
    -------
    numpy.ndarray
        Whatever ``np.loadtxt`` parses from the file with its default settings.
    """
    with open(filepath, 'r') as txt_handle:
        # Parse while the handle is still open and hand the array straight back.
        return np.loadtxt(txt_handle)

# Function to write data to EDF
def write_to_edf(output_path, signals, signal_labels, sampling_frequency):
    """Write a set of signals to a single EDF+ file, one channel per signal.

    Parameters
    ----------
    output_path : str
        Destination path of the EDF file.
    signals : sequence of array-like
        One sample array per channel.
    signal_labels : sequence of str
        Channel labels; entry ``i`` labels ``signals[i]``.
    sampling_frequency : int or float
        Sample rate stored in every channel header.

    Raises
    ------
    ValueError
        If ``signals`` is empty or fewer labels than signals are supplied.
        Both are checked before the output file is opened, so no partial
        file is left behind for these cases.
    """
    num_signals = len(signals)
    if num_signals == 0:
        raise ValueError("write_to_edf needs at least one signal to write.")
    if len(signal_labels) < num_signals:
        raise ValueError(
            f"Got {num_signals} signals but only {len(signal_labels)} labels."
        )

    # Build every header before touching the file so bad data fails early.
    signal_headers = []
    for i in range(num_signals):
        physical_min = np.min(signals[i])
        physical_max = np.max(signals[i])
        if physical_min == physical_max:
            # EDF requires the physical range to be non-degenerate; widen a
            # flat channel's range so its header stays valid.
            physical_max = physical_min + 1
        signal_headers.append({
            'label': signal_labels[i],
            'dimension': 'uV',
            # NOTE(review): newer pyedflib releases appear to prefer the key
            # 'sample_frequency' -- confirm against the installed version.
            'sample_rate': sampling_frequency,
            'physical_min': physical_min,
            'physical_max': physical_max,
            'digital_min': -32768,
            'digital_max': 32767
        })

    # Initialize EdfWriter with the required number of channels (n_channels)
    edf_writer = pyedflib.EdfWriter(output_path, n_channels=num_signals, file_type=pyedflib.FILETYPE_EDFPLUS)
    try:
        edf_writer.setSignalHeaders(signal_headers)
        edf_writer.writeSamples(signals)
    finally:
        # Always release the file handle, even if header setup or writing fails.
        edf_writer.close()

# Main function to process all patients and convert txt to edf
def convert_txt_to_edf(base_folder, output_folder, sampling_frequency=128):
    """Convert every patient sub-folder of ``base_folder`` into one EDF file.

    Each sub-folder is expected to hold one ``.txt`` file per electrode. The
    files are read in sorted name order, the file name (without extension)
    becomes the channel label, and the result is written to
    ``<output_folder>/<sub-folder name>.edf``.

    Parameters
    ----------
    base_folder : str
        Directory whose sub-directories are treated as patients.
    output_folder : str
        Directory receiving the EDF files; created if it does not exist.
    sampling_frequency : int or float, optional
        Sample rate passed on to ``write_to_edf`` (default 128).

    Notes
    -----
    Non-directory entries of ``base_folder`` are ignored. Sub-folders that
    contain no ``.txt`` files are skipped with a message instead of being
    passed to the writer with zero channels.
    """
    edf_pattern = os.path.join(output_folder, '*.edf')

    # Count all .edf files in output folder before conversion
    all_edf_files = glob(edf_pattern)
    print(f"Found {len(all_edf_files)} existing .edf files in the output folder.")

    # exist_ok avoids the check-then-create race of a separate exists() test.
    os.makedirs(output_folder, exist_ok=True)

    # Sorted so the processing order (and the log) is the same on every run.
    for patient in sorted(os.listdir(base_folder)):
        patient_folder = os.path.join(base_folder, patient)
        if not os.path.isdir(patient_folder):
            continue

        # Collect all electrode files for this patient
        electrode_files = sorted(f for f in os.listdir(patient_folder) if f.endswith('.txt'))
        if not electrode_files:
            print(f"Skipping {patient}: no .txt electrode files found.")
            continue

        signals = [read_txt_file(os.path.join(patient_folder, f)) for f in electrode_files]
        # Use the file name (without extension) as the channel label
        signal_labels = [os.path.splitext(f)[0] for f in electrode_files]

        output_edf_path = os.path.join(output_folder, f"{patient}.edf")
        write_to_edf(output_edf_path, signals, signal_labels, sampling_frequency)
        print(f"Converted {patient} to EDF.")

    # After conversion, print the number of .edf files in the output folder again
    all_edf_files = glob(edf_pattern)
    print(f"Total .edf files in the output folder after conversion: {len(all_edf_files)}")

# Input root (one sub-folder per patient, one .txt file per electrode) and the
# folder that receives one <patient>.edf file per sub-folder.
# NOTE(review): both are absolute paths on one specific machine; edit them (or
# derive them from a configurable data directory) before running elsewhere.
base_folder = "/Users/kanisha/Downloads/EEG_data1/Eyes_closed"
output_folder = "/Users/kanisha/Downloads/Files" 

# Run the conversion with the default sampling frequency (128).
convert_txt_to_edf(base_folder, output_folder)



Found 92 existing .edf files in the output folder.
Converted Paciente62 to EDF.
Converted Paciente91 to EDF.
Converted Paciente65 to EDF.
Converted Paciente53 to EDF.
Converted Paciente54 to EDF.
Converted Paciente38 to EDF.
Converted Paciente31 to EDF.
Converted Paciente36 to EDF.
Converted Paciente55 to EDF.
Converted Paciente52 to EDF.
Converted Paciente64 to EDF.
Converted Paciente90 to EDF.
Converted Paciente63 to EDF.
Converted Paciente37 to EDF.
Converted Paciente30 to EDF.
Converted Paciente39 to EDF.
Converted Paciente23 to EDF.
Converted Paciente24 to EDF.
Converted Paciente8 to EDF.
Converted Paciente1 to EDF.
Converted Paciente12 to EDF.
Converted Paciente15 to EDF.
Converted Paciente6 to EDF.
Converted Paciente41 to EDF.
Converted Paciente79 to EDF.
Converted Paciente46 to EDF.
Converted Paciente70 to EDF.
Converted Paciente84 to EDF.
Converted Paciente48 to EDF.
Converted Paciente83 to EDF.
Converted Paciente77 to EDF.




Converted Paciente7 to EDF.
Converted Paciente14 to EDF.
Converted Paciente13 to EDF.
Converted Paciente25 to EDF.
Converted Paciente9 to EDF.
Converted Paciente22 to EDF.
Converted Paciente49 to EDF.
Converted Paciente76 to EDF.
Converted Paciente82 to EDF.
Converted Paciente85 to EDF.
Converted Paciente71 to EDF.
Converted Paciente78 to EDF.
Converted Paciente47 to EDF.
Converted Paciente40 to EDF.
Converted Paciente35 to EDF.
Converted Paciente32 to EDF.
Converted Paciente92 to EDF.
Converted Paciente66 to EDF.
Converted Paciente59 to EDF.
Converted Paciente61 to EDF.
Converted Paciente57 to EDF.
Converted Paciente68 to EDF.
Converted Paciente50 to EDF.
Converted Paciente33 to EDF.
Converted Paciente34 to EDF.
Converted Paciente51 to EDF.
Converted Paciente56 to EDF.
Converted Paciente69 to EDF.
Converted Paciente60 to EDF.
Converted Paciente67 to EDF.
Converted Paciente58 to EDF.
Converted Paciente45 to EDF.
Converted Paciente89 to EDF.
Converted Paciente42 to EDF.
Converted Pacien



Converted Paciente27 to EDF.
Converted Paciente18 to EDF.
Converted Paciente20 to EDF.
Converted Paciente5 to EDF.
Converted Paciente16 to EDF.
Converted Paciente29 to EDF.
Converted Paciente11 to EDF.
Converted Paciente2 to EDF.
Converted Paciente86 to EDF.
Converted Paciente72 to EDF.
Converted Paciente75 to EDF.
Converted Paciente81 to EDF.
Converted Paciente88 to EDF.
Converted Paciente43 to EDF.
Converted Paciente44 to EDF.
Converted Paciente3 to EDF.
Converted Paciente10 to EDF.
Converted Paciente17 to EDF.
Converted Paciente4 to EDF.
Converted Paciente28 to EDF.
Converted Paciente21 to EDF.
Converted Paciente26 to EDF.
Converted Paciente19 to EDF.
Total .edf files in the output folder after conversion: 92




In [58]:
from glob import glob
import os
import mne
import numpy as np
import pandas
import matplotlib.pyplot as plt

In [59]:
all_edf_files = glob(os.path.join(output_folder, '*.edf'))
print(f"Found {len(all_edf_files)} existing .edf files in the output folder.")

Found 92 existing .edf files in the output folder.


In [60]:
all_edf_files[0]

'/Users/kanisha/Downloads/Files/Paciente20.edf'

In [61]:
import os
from glob import glob

# Function to extract patient number from file path
def get_patient_number(file_path):
    """Return the integer patient id encoded in an EDF file name.

    Every occurrence of ``'Paciente'`` is removed from the base name of
    ``file_path``; the text before the first ``'.'`` of what remains is then
    parsed as an integer, e.g. ``'/data/Paciente12.edf'`` -> ``12``.

    Raises
    ------
    ValueError
        Propagated from ``int`` when that text is not an integer literal.
    """
    base_name = os.path.basename(file_path)
    without_prefix = base_name.replace('Paciente', '')
    number_text, _, _ = without_prefix.partition('.')
    return int(number_text)

# Path to your EDF files
all_file_path = glob('/Users/kanisha/Downloads/Files/*.edf')

# Filter for healthy patients (81 to 92)
healthy_file_path = [i for i in all_file_path if 81 <= get_patient_number(i) <= 92]

# Filter for AD patients (1 to 80)
patient_file_path = [i for i in all_file_path if 1 <= get_patient_number(i) <= 80]

# Print the number of files in each category
print("Total files:", len(all_file_path))
print("Healthy patients (81-92):", len(healthy_file_path))
print("AD patients (1-80):", len(patient_file_path))

# The epoch and label lists are assembled further down, once the recordings
# have actually been read; building them here referenced names that do not
# exist yet on a fresh kernel.

Total files: 92
Healthy patients (81-92): 12
AD patients (1-80): 80


In [62]:
import mne

def read_data(file_path, filter_data=True, l_freq=0.5, h_freq=30):
    """Load one EDF recording and cut it into overlapping 1-second epochs.

    Steps, in this order: read the file with ``preload=True``, apply the
    average EEG reference, optionally band-pass filter, keep whichever of the
    19 expected scalp channels are present, then epoch with 0.5 s overlap.

    Parameters
    ----------
    file_path : str
        Path of the EDF file to read.
    filter_data : bool, optional
        Whether to filter the recording (default True).
    l_freq, h_freq : float, optional
        Filter band edges passed to ``Raw.filter`` (defaults 0.5 and 30).

    Returns
    -------
    numpy.ndarray or None
        The array returned by ``Epochs.get_data()``, or ``None`` when any
        step raises; the error is printed rather than propagated.
    """
    # Scalp electrodes this analysis expects to find in every recording.
    wanted = ['Fp1', 'Fp2', 'F3', 'F4', 'F7', 'F8', 'Fz', 'C3', 'C4', 'Cz',
              'P3', 'P4', 'Pz', 'T3', 'T4', 'T5', 'T6', 'O1', 'O2']

    try:
        raw = mne.io.read_raw_edf(file_path, preload=True)

        # Re-reference first, so the reference is computed before any
        # channels are dropped below.
        raw.set_eeg_reference()

        if filter_data:
            raw.filter(l_freq=l_freq, h_freq=h_freq)

        present = [name for name in wanted if name in raw.info['ch_names']]
        if len(present) != len(wanted):
            print(f"Warning: Some channels are missing from the data. Available channels: {present}")

        # Keep only the expected channels that actually exist in this file.
        raw.pick_channels(present)

        # 1-second windows, each overlapping its predecessor by half a second.
        segments = mne.make_fixed_length_epochs(raw, duration=1, overlap=0.5)
        return segments.get_data()

    except Exception as e:
        print(f"Error reading the file {file_path}: {e}")
        return None


In [63]:
# Smoke-test the loader on the first healthy recording before the batch run.
# Requires healthy_file_path and read_data from the cells above.
sample_data = read_data(healthy_file_path[0])

# read_data signals failure by returning None, so branch on that sentinel.
if sample_data is None:
    print("Error reading the data from the file.")
else:
    print("Sample Data Shape:", sample_data.shape)


Extracting EDF parameters from /Users/kanisha/Downloads/Files/Paciente81.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 1023  =      0.000 ...     7.992 secs...
EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 30 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 30.00 Hz
- Upper transition bandwidth: 7.50 Hz (-6 dB cutoff frequency: 33.75 Hz)
- Filter length: 845 samples (6.602 s)

NOTE: pick_channels() is a legacy function. New code should use inst.pick(...).
Not setting metadata
15 match

[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.0s


In [64]:
sample_data.shape

(15, 19, 128)

In [65]:
%%capture
# Suppress this cell's output: every file read below produces a long log.
# NOTE(review): read_data returns None on failure and %%capture also hides its
# error message, so a failed file only surfaces later (e.g. when len() is
# taken of each entry); consider checking for None entries after this cell.
control_epochs_array = [read_data(i) for i in healthy_file_path]
patient_epochs_array = [read_data(i) for i in patient_file_path]

In [66]:
control_epochs_array[0].shape,control_epochs_array[1].shape

((15, 19, 128), (15, 19, 128))

In [67]:
# One label per epoch, kept grouped per recording: 0 = control, 1 = patient.
control_epoch_labels = [[0] * len(epochs) for epochs in control_epochs_array]
patient_epoch_labels = [[1] * len(epochs) for epochs in patient_epochs_array]
len(control_epoch_labels), len(patient_epoch_labels)

(12, 80)

In [68]:
# Concatenate the per-recording lists, controls first, in the same order for
# both, so data_list[k] and label_list[k] describe the same recording.
data_list = control_epochs_array + patient_epochs_array
label_list = control_epoch_labels + patient_epoch_labels

In [69]:
# Tag every epoch with the index of the recording it came from; these ids are
# what the grouped cross-validation later receives as `groups`.
group_list = [[rec_idx] * len(rec_epochs) for rec_idx, rec_epochs in enumerate(data_list)]
len(group_list)

92

In [70]:
# Flatten the per-recording lists into per-epoch arrays: epochs are stacked
# along the first axis, and labels / group ids become 1-D arrays of the same
# length (see the printed shapes).
data_array = np.vstack(data_list)
label_array = np.hstack(label_list)
group_array = np.hstack(group_list)
print(data_array.shape,label_array.shape,group_array.shape)

(1380, 19, 128) (1380,) (1380,)


In [71]:
from scipy import stats
import numpy as np

def mean(x):
    """Arithmetic mean along the last axis."""
    return np.mean(x, axis=-1)


def std(x):
    """Standard deviation (NumPy default ddof) along the last axis."""
    return np.std(x, axis=-1)


def ptp(x):
    """Peak-to-peak range (max minus min) along the last axis."""
    return np.ptp(x, axis=-1)


def var(x):
    """Variance (NumPy default ddof) along the last axis."""
    return np.var(x, axis=-1)


def minim(x):
    """Smallest value along the last axis."""
    return np.min(x, axis=-1)


def maxim(x):
    """Largest value along the last axis."""
    return np.max(x, axis=-1)


def argminim(x):
    """Index of the smallest value along the last axis."""
    return np.argmin(x, axis=-1)


def argmaxim(x):
    """Index of the largest value along the last axis."""
    return np.argmax(x, axis=-1)


def res(x):
    """Root of the mean of squares along the last axis."""
    mean_square = np.mean(x * x, axis=-1)
    return np.sqrt(mean_square)


def abs_diff_signal(x):
    """Sum of absolute first differences along the last axis."""
    steps = np.diff(x, axis=-1)
    return np.abs(steps).sum(axis=-1)


def kurtosis(x):
    """Kurtosis along the last axis, using ``scipy.stats.kurtosis`` defaults."""
    return stats.kurtosis(x, axis=-1)


def skewness(x):
    """Skewness along the last axis, using ``scipy.stats.skew`` defaults."""
    return stats.skew(x, axis=-1)


def concatenate_feature(x):
    """Join all twelve per-row statistics of ``x`` into one feature array.

    Each extractor reduces the last axis of ``x``; the results are joined
    along the (new) last axis in the fixed order listed below, so a
    ``(channels, samples)`` input yields ``12 * channels`` values.
    """
    extractors = (
        mean, std, ptp, var, minim, maxim,
        argminim, argmaxim, res, abs_diff_signal,
        kurtosis, skewness,
    )
    return np.concatenate([extract(x) for extract in extractors], axis=-1)

In [78]:
# One feature vector per epoch: iterating data_array yields one epoch at a time.
features = [concatenate_feature(epoch) for epoch in data_array]

In [79]:
features_array = np.array(features)
features_array.shape

(1380, 228)

In [80]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold,GridSearchCV

In [83]:
# Flatten each epoch into a single row (epochs x everything else), e.g. for a
# raw-signal baseline.
data_array_2d = data_array.reshape(data_array.shape[0], -1)
# NOTE: the grid-search object (`gscv`) is only created in a later cell, and its
# GroupKFold splitter needs `groups=`. If the raw-signal baseline is wanted,
# fit it after that cell with:
#   gscv.fit(data_array_2d, label_array, groups=group_array)

In [85]:
# Base classifier; its C value is tuned by the grid search below.
clf = LogisticRegression()

# Grouped 5-fold cross-validation: with groups=group_array (recording index per
# epoch) all epochs of one recording stay in the same fold.
gkf = GroupKFold(n_splits=5)

# Create a pipeline with a scaler and classifier, so scaling is re-fit on the
# training part of every fold.
pipe = Pipeline([
    ('scaler', StandardScaler()),  # Apply standard scaling
    ('clf', clf)  # Use Logistic Regression as classifier
])

# Define the hyperparameter grid to search for the best hyperparameters
param_grid = {'clf__C': [0.1, 0.5, 0.7, 1, 3, 5, 7]}  # C is the inverse of the regularization strength (smaller = stronger)

# Perform grid search with cross-validation using GroupKFold
# NOTE(review): no `scoring` is passed, so the search presumably uses the
# classifier's default score (accuracy). With 80 patient vs 12 control
# recordings, a majority-class predictor already reaches about 0.87 --
# consider a balanced metric before interpreting best_score_.
gscv = GridSearchCV(pipe, param_grid, cv=gkf, n_jobs=12)  # n_jobs=12 requests up to 12 parallel jobs

# Fit the model on the features, labels, and groups
gscv.fit(features_array, label_array, groups=group_array)

In [86]:
gscv.best_score_ #0.92 before

np.float64(0.9221052631578948)