In [1]:
import scipy.io
import numpy as np
import pandas as pd
import os
import wfdb

In [2]:
def load_mat(fileName):
    """Load a MATLAB ``.mat`` file and return the entry stored under ``'val'``.

    Args:
        fileName: Path of the ``.mat`` file to read.

    Returns:
        The object that ``scipy.io.loadmat`` stores under the ``'val'`` key.

    Raises:
        KeyError: If the file has no ``'val'`` variable.
    """
    return scipy.io.loadmat(fileName)['val']


In [3]:
def load_patient_data(file_path):
    """Parse a ``key: value`` text file into a dictionary.

    Each non-blank line is split on its first colon; the text before it
    becomes the key and the text after it the value, both stripped of
    surrounding whitespace. Blank lines are ignored, and a value may itself
    contain colons.

    Args:
        file_path: Path of the text file to read.

    Returns:
        dict mapping attribute name (str) to attribute value (str).

    Raises:
        ValueError: If a non-blank line contains no colon.
    """
    parsed_data = {}
    with open(file_path, 'r') as f:
        for line in f:
            line = line.strip()
            if not line:
                # Tolerate empty lines (e.g. a trailing newline at end of file).
                continue

            # Split on the first colon only so values such as "12:30" survive.
            attribute, separator, value = line.partition(':')
            if not separator:
                raise ValueError(f"Expected 'key: value' but got: {line!r}")

            parsed_data[attribute.strip()] = value.strip()

    return parsed_data

In [4]:
def extract_patient_data(parent_dir):
    """Collect the parsed metadata of every patient folder under ``parent_dir``.

    For each sub-directory ``<name>`` the file ``<name>/<name>.txt`` is parsed
    with ``load_patient_data``. Entries that are not directories are ignored
    silently; directories without the text file are reported on stdout and
    contribute nothing to the result.

    Args:
        parent_dir: Directory whose sub-directories are patient folders.

    Returns:
        list of dicts, one per patient folder that had a text file, in
        ``os.listdir`` order.
    """
    records = []

    for entry in os.listdir(parent_dir):
        # Only sub-directories are treated as patients.
        if not os.path.isdir(os.path.join(parent_dir, entry)):
            continue

        # The metadata file is named after the folder itself.
        metadata_path = os.path.join(parent_dir, entry, f'{entry}.txt')
        if not os.path.isfile(metadata_path):
            print(f"No text file found for patient {entry}")
            continue

        records.append(load_patient_data(metadata_path))

    return records

In [5]:
# def extract_all_eeg(root_dir):
#     # Initialize the list to store all patient EEG data
#     all_eeg_data = []

#     # Iterate over the patient folders
#     for patient_folder in os.listdir(root_dir):
#         patient_dir = os.path.join(root_dir, patient_folder)

#         # Initialize a list to store the channel data for the patient
#         eeg_data_patient = []

#         # Iterate over the files in the patient folder
#         for file in os.listdir(patient_dir):
#             file_path = os.path.join(patient_dir, file)

#             # Check if the file is a MATLAB MAT file
#             if file.endswith('.mat'):
#                 # Load the EEG data from the MATLAB MAT file
                
#                 eeg_data = load_mat(file_path)

#                 # Append the channel data to the patient's EEG data list
#                 eeg_data_patient.append(eeg_data)

#         # Append the patient's EEG data list to the list of all EEG data
#         all_eeg_data.append(eeg_data_patient)
#     return all_eeg_data
def extract_all_eeg(root_dir):
    """Load every EEG ``.mat`` recording found in each patient folder.

    A file counts as an EEG recording when its name ends with ``.mat`` and
    contains ``EEG``. Entries of ``root_dir`` that are not directories are
    skipped (previously they made ``os.listdir`` raise); the sibling loaders
    ``extract_patient_data`` and ``extract_all_headers`` skip them too, so the
    per-folder lists stay comparable.

    Args:
        root_dir: Directory whose sub-directories are patient folders.

    Returns:
        list with one entry per patient folder, in ``os.listdir`` order. Each
        entry is a list of the arrays returned by ``load_mat`` for that
        folder (empty when the folder holds no EEG ``.mat`` file).
    """
    all_eeg_data = []

    for patient_folder in os.listdir(root_dir):
        patient_dir = os.path.join(root_dir, patient_folder)

        # Stray files next to the patient folders are not patients.
        if not os.path.isdir(patient_dir):
            continue

        eeg_data_patient = [
            load_mat(os.path.join(patient_dir, file))
            for file in os.listdir(patient_dir)
            if file.endswith('.mat') and 'EEG' in file
        ]
        all_eeg_data.append(eeg_data_patient)

    return all_eeg_data


In [6]:
# def extract_all_headers(root_dir):
#     patient_data_array = []
#     channel_names = []

#     # Iterate over the patient folders
#     for patient_folder in os.listdir(root_dir):
#         patient_folder_path = os.path.join(root_dir, patient_folder)
#         # Check if the folder is a directory
#         if os.path.isdir(patient_folder_path):
#             hea_file_path = None  # Initialize with None
#             # Search for any file with the patient ID in its name
#             for file in os.listdir(patient_folder_path):
#                 if patient_folder in file and file.endswith('.hea'):
#                     hea_file_path = os.path.join(patient_folder_path, file)
#                     break  # Found the .hea file, exit the loop
            
#             if hea_file_path is not None:
#                 # Read the .hea file
#                 record = wfdb.rdheader(hea_file_path[:-4])
                
#                 # Extract the desired information from the header
#                 sampling_frequency = record.fs
#                 channel = record.sig_name
#                 # Add the patient information to the data array
#                 patient_data_array.append(sampling_frequency)
#                 channel_names.append(channel)
#             else:
#                 # Append None to the data array if no .hea file found
#                 patient_data_array.append(None)
#                 channel_names.append(None)
#         else:
#             print(f"Invalid folder found: {patient_folder}")
#     return patient_data_array, channel_names

def extract_all_headers(root_dir):
    """Read sampling frequency and channel names for each patient folder.

    For every sub-directory of ``root_dir`` the EEG ``.mat`` file with the
    most samples is located and the WFDB header that shares its base name is
    read. The sample count is taken from ``shape[1]`` of the loaded array and
    ties keep the first file encountered, which is the same selection rule
    ``eeg_one_file`` applies, so the header belongs to the recording that
    function picks.

    Args:
        root_dir: Directory whose sub-directories are patient folders.

    Returns:
        Tuple ``(patient_data_array, channel_names)``: two lists with one
        entry per patient folder, holding ``record.fs`` and
        ``record.sig_name`` respectively, or ``None`` in both lists when the
        folder has no EEG ``.mat`` file. Non-directory entries are reported
        on stdout and contribute nothing.
    """
    patient_data_array = []
    channel_names = []

    for patient_folder in os.listdir(root_dir):
        patient_folder_path = os.path.join(root_dir, patient_folder)

        if not os.path.isdir(patient_folder_path):
            print(f"Invalid folder found: {patient_folder}")
            continue

        # Start below zero so that even an empty recording can be selected,
        # exactly as eeg_one_file does.
        max_samples = -1
        max_samples_file = None

        for file in os.listdir(patient_folder_path):
            if 'EEG' in file and file.endswith('.mat'):
                eeg_data = load_mat(os.path.join(patient_folder_path, file))
                # Axis 1 is the sample axis used by eeg_one_file; len() would
                # count axis 0 and select a different file.
                samples = eeg_data.shape[1]

                if samples > max_samples:
                    max_samples = samples
                    max_samples_file = file

        if max_samples_file is None:
            patient_data_array.append(None)
            channel_names.append(None)
            continue

        # wfdb.rdheader takes the record path without any extension.
        record_name = max_samples_file[:-len('.mat')]
        record = wfdb.rdheader(os.path.join(patient_folder_path, record_name))

        patient_data_array.append(record.fs)
        channel_names.append(record.sig_name)

    return patient_data_array, channel_names



In [7]:
# def reorder_eeg_channels(eeg_data, channel_names, desired_channel_order):
    

#     final_mapping_dicts = []
#     reordered_eeg_data = []
#     max_num_samples = 0

#     for i in range(len(eeg_data)):
#         patient_eeg = eeg_data[i]
#         patient_channels = channel_names[i]

#         # Create a mapping dictionary for the patient's channel names
#         mapping_dict = {}
#         reordered_channels = []
#         for channel in desired_channel_order:
#             if channel in patient_channels:
#                 index = patient_channels.index(channel)
#                 mapping_dict[channel] = index
#                 reordered_channels.append(patient_eeg[index])

#         final_mapping_dicts.append(mapping_dict)
#         reordered_eeg_data.append(reordered_channels)
#         max_num_samples = max(max_num_samples, len(reordered_channels))

#     # Pad or truncate channels to match the length of the longest channel
#     for i in range(len(reordered_eeg_data)):
#         channel_data = reordered_eeg_data[i]
#         channel_length = len(channel_data)
#         if channel_length < max_num_samples:
#             padded_data = np.pad(channel_data, (0, max_num_samples - channel_length), mode='constant')
#             reordered_eeg_data[i] = padded_data
#         elif channel_length > max_num_samples:
#             truncated_data = channel_data[:max_num_samples]
#             reordered_eeg_data[i] = truncated_data

#     # Delete channels that are not in the desired channel list
#     for i in range(len(reordered_eeg_data)):
#         patient_channels = list(final_mapping_dicts[i].keys())
#         channels_to_delete = [channel for channel in patient_channels if channel not in desired_channel_order]
#         for channel in channels_to_delete:
#             del final_mapping_dicts[i][channel]

#     return reordered_eeg_data, final_mapping_dicts

In [8]:
def delete_channels(clean_df, max_channels=19):
    """Truncate each row's EEG to at most ``max_channels`` leading channels.

    The frame is modified in place: any ``'EEG'`` cell whose first axis is
    longer than ``max_channels`` is replaced by a NumPy array holding only
    the first ``max_channels`` entries along that axis. Other cells are left
    untouched.

    Args:
        clean_df: DataFrame with an ``'EEG'`` column of array-like values.
        max_channels: Maximum length kept along the first axis (default 19).

    Returns:
        None.
    """
    for i, eeg in clean_df['EEG'].items():
        eeg_data = np.array(eeg)  # Lists become arrays so they can be sliced
        # 0-d values (e.g. None) have no channel axis and are skipped.
        if eeg_data.ndim >= 1 and eeg_data.shape[0] > max_channels:
            clean_df.at[i, 'EEG'] = eeg_data[:max_channels]

In [9]:
def eeg_one_file(all_eeg_data):
    """Pick, for every patient, the recording with the most samples.

    The sample count of a recording is ``shape[1]``. When several recordings
    share the largest count the first one in the patient's list is kept.

    Args:
        all_eeg_data: List with one list of arrays per patient.

    Returns:
        list with one element per patient: the selected array, or ``None``
        when the patient's list is empty.
    """
    return [
        # max() returns the first maximal element, so ties keep the earliest file.
        max(recordings, key=lambda recording: recording.shape[1])
        if len(recordings) > 0
        else None
        for recordings in all_eeg_data
    ]



In [10]:
def reorder_eeg_channels(eeg_data, channel_names):
    """Select and reorder each patient's EEG channels into a fixed order.

    For every patient the channels named in the fixed 19-name list below are
    picked, in that order, from the patient's recording; channels the patient
    does not have are simply left out. Patients that end up with fewer
    channels than the patient with the most matches are padded with all-zero
    channels appended after their last real channel (note: at the end, not at
    the position of the missing channel). Only the channel axis is padded;
    the sample axis is left unchanged.

    Args:
        eeg_data: Per-patient recordings, indexable by ``0..len-1``; each
            recording is indexed by channel position.
        channel_names: Per-patient lists of channel names, indexable the same
            way; position ``k`` names channel ``k`` of the recording.

    Returns:
        Tuple ``(reordered_eeg_data, final_mapping_dicts)``.
        ``reordered_eeg_data`` is a list with one entry per patient: a list of
        channel arrays when no padding was needed, otherwise a NumPy array.
        ``final_mapping_dicts`` is a list of dicts mapping each channel name
        that was found to its index in the patient's original recording.
    """
    desired_channel_order = ['Fp1', 'Fp2', 'F3', 'F4', 'C3', 'C4', 'P3', 'P4', 'O1', 'O2', 'F7', 'F8', 'T3', 'T4', 'T5', 'T6', 'Fz', 'Cz', 'Pz']

    final_mapping_dicts = []
    reordered_eeg_data = []
    max_num_channels = 0

    for i in range(len(eeg_data)):
        patient_eeg = eeg_data[i]
        patient_channels = channel_names[i]

        # Map each wanted channel to its position in this patient's recording.
        mapping_dict = {}
        reordered_channels = []
        for channel in desired_channel_order:
            if channel in patient_channels:
                index = patient_channels.index(channel)
                mapping_dict[channel] = index
                reordered_channels.append(patient_eeg[index])

        final_mapping_dicts.append(mapping_dict)
        reordered_eeg_data.append(reordered_channels)
        max_num_channels = max(max_num_channels, len(reordered_channels))

    # Bring every patient up to the same number of channels.
    for i, channel_data in enumerate(reordered_eeg_data):
        missing = max_num_channels - len(channel_data)
        if missing > 0:
            stacked = np.asarray(channel_data)
            # Pad axis 0 (channels) only. A bare (0, missing) would be
            # broadcast to every axis and also append zero samples.
            pad_width = [(0, missing)] + [(0, 0)] * (stacked.ndim - 1)
            reordered_eeg_data[i] = np.pad(stacked, pad_width, mode='constant')

    return reordered_eeg_data, final_mapping_dicts

In [11]:
# Folder that holds one sub-directory per patient.
root_dir = r'c:\Users\sendm\physionet.org\files\i-care\2.0\training'
all_eeg_data = extract_all_eeg(root_dir)
# Print the shape of every EEG recording, grouped by patient
for patient_idx, eeg_data_patient in enumerate(all_eeg_data):
    patient_shape = [channel_data.shape for channel_data in eeg_data_patient]
    # The shapes used to be computed and then dropped; show them as intended.
    print(f"Patient {patient_idx}: {patient_shape}")

In [None]:
# Keep a single recording per patient: the one with the most samples (None if the patient has none).
eeg_data_one_file = eeg_one_file(all_eeg_data)

In [None]:
# Inspect the recording selected for the first patient folder (None when it had no EEG file).
print(eeg_data_one_file[0])

In [None]:
# Per patient folder: sampling frequency and channel-name list read from a WFDB header
# (None in both lists when the folder has no EEG .mat file).
all_hea_data, channel_names = extract_all_headers(root_dir)

In [None]:
# Count the patients for which at least one EEG recording was loaded
num_patients_with_eeg = len([recordings for recordings in all_eeg_data if len(recordings) > 0])

# Report the count
print(f"Number of patients with EEG data: {num_patients_with_eeg}")

In [None]:
# One-column table holding the selected recording (or None) for each patient folder.
df2 = pd.DataFrame({"EEG":eeg_data_one_file})


In [None]:
# Read the metadata from the same folder the EEG was loaded from. The path used
# to be typed out a second time here; reusing root_dir keeps the two in sync.
parent_dir = root_dir
all_patient_data = extract_patient_data(parent_dir)


In [None]:
# One row per parsed patient text file; columns are the attribute names found in those files.
df = pd.DataFrame(data=all_patient_data)


In [None]:
# Put metadata and EEG side by side. Rows are matched on the default integer index, i.e. by
# position, and join='inner' keeps only positions present in both tables.
# NOTE(review): this pairs the right patient with the right EEG only if both lists were built
# in the same folder order with no folder skipped by one loader but not the other - confirm.
final_df = pd.concat([df, df2], axis=1, join='inner') 


In [None]:
# Show the combined metadata + EEG table.
print(final_df)

In [None]:
# Attach the header information as new columns. Both lists must have exactly as many
# entries as final_df has rows, otherwise pandas raises a ValueError.
final_df["FS"] = all_hea_data
final_df['Channel Names'] = channel_names

In [None]:
# Index labels of the rows that have no EEG recording, found in a single vectorised pass
# (the previous loop recomputed isnull() per row and fed index labels to .iloc).
bad_indexes = final_df.index[final_df['EEG'].isnull()].tolist()
# Drop those rows and renumber the remaining ones from 0.
clean_df = final_df.drop(bad_indexes).reset_index(drop=True)


In [None]:
# Show the table after rows without an EEG recording were removed.
print(clean_df)

In [None]:
# Remove the 'Hospital', 'Patient' and 'TTM' columns from the table.
clean_df = clean_df.drop(columns=['Hospital', 'Patient', 'TTM'])


In [None]:
# Encode the two-valued text columns as 0/1. For each column the first label
# becomes 0 and the second becomes 1 (note that "True" maps to 0 and "False" to 1).
binary_label_pairs = (
    ('Outcome', "Good", "Poor"),
    ('Sex', "Male", "Female"),
    ('Shockable Rhythm', "True", "False"),
    ('OHCA', "True", "False"),
)
for column_name, zero_label, one_label in binary_label_pairs:
    clean_df[column_name] = clean_df[column_name].replace(zero_label, 0)
    clean_df[column_name] = clean_df[column_name].replace(one_label, 1)


In [None]:
# Columns in which a missing value is stored as the literal string "nan".
nan_string_columns = ['ROSC', 'Sex', 'OHCA', 'Shockable Rhythm']

# Turn every "nan" string into a real NaN first, so that no column is assigned
# on the result of a dropna (which is what triggers SettingWithCopyWarning).
for nan_column in nan_string_columns:
    clean_df[nan_column] = clean_df[nan_column].replace("nan", np.nan)

# Drop rows missing any of those values or the sampling frequency, then renumber from 0.
clean_df = clean_df.dropna(subset=nan_string_columns + ['FS']).reset_index(drop=True)

In [None]:
# Tally the outcome labels: a value of 0 counts as good, anything else as bad.
good = int((clean_df['Outcome'] == 0).sum())
bad = len(clean_df) - good

print("Number of good labels: " + str(good))
print("Number of bad labels: " + str(bad))

In [None]:
# Show the table after label encoding and removal of rows with missing values.
print(clean_df)

In [None]:
# Reorder every patient's channels into the fixed order defined inside reorder_eeg_channels.
# The function indexes both Series with 0..len-1, so it relies on clean_df having been
# re-indexed from 0 in the preceding clean-up step.
reordered_eeg, map_dicts = reorder_eeg_channels(clean_df['EEG'], clean_df['Channel Names'])

In [None]:
# Inspect the reordered recordings.
print(reordered_eeg)

In [None]:
# Replace the original recordings with their channel-reordered versions.
clean_df['EEG'] = reordered_eeg

In [None]:
# The channel names were only needed for the reordering step above.
del clean_df['Channel Names']

In [None]:
# Cap every recording at 19 channels; clean_df is modified in place.
delete_channels(clean_df)

In [None]:
# Keep references to these three columns; the following cells delete them from the table
# and add them back, which moves them to the end of the column order.
outcome_list = clean_df['Outcome']
eeg_list = clean_df['EEG']
fs_list = clean_df['FS']

In [None]:
# Remove the three columns that were stashed in outcome_list / eeg_list / fs_list.
del clean_df['Outcome']
del clean_df['EEG']
del clean_df['FS']

In [None]:
# Add the stashed columns back; new columns are appended, so the table now ends with
# EEG, FS, Outcome in that order.
clean_df['EEG'] = eeg_list
clean_df['FS'] = fs_list
clean_df['Outcome'] = outcome_list


In [None]:
# Renumber the rows from 0. Without drop=True the previous index is kept as a new
# leading column named 'index'.
# NOTE(review): that extra 'index' column ends up in the saved file - confirm it is wanted.
final_clean_df = clean_df.reset_index()

In [None]:
# Show the final table before type conversion and saving.
print(final_clean_df)

In [None]:
# Convert the numeric metadata columns to integers.
# NOTE(review): 'Age' did not go through the earlier "nan" clean-up; astype(int) will
# raise if it contains a value that cannot be converted - confirm it has none.
for int_column in ('Age', 'Sex', 'ROSC', 'OHCA', 'Shockable Rhythm'):
    final_clean_df[int_column] = final_clean_df[int_column].astype(int)

# final_clean_df.to_hdf('CCIR_DF_Large2.h5', key='data', mode='w')
# Persist the finished table as a pickle in the current working directory.
final_clean_df.to_pickle('CCIR_DF_Large.pkl')
