In [1]:
import sys
from glob2 import glob

import numpy as np
import pandas as pd # dataframes, tables 
import seaborn as sns # plotting
import matplotlib.pyplot as plt
from scipy.cluster.hierarchy import dendrogram, linkage

import networkx as nx

import mne
from mne.datasets import eegbci
from mne.io import concatenate_raws, read_raw_edf

In [2]:
from glob2 import glob

# Collect the paths of all EDF recordings of the EEG Epilepsy Corpus.
# Alternative corpus, kept for reference:
#   glob("../tuh_eeg_abnormal/edf/*/*/*/*/*/*/*.edf")
edf_file_list = glob("../tuh_eeg_epilepsy/edf/*/*/*/*/*/*.edf")

# The patient id is the part of the file name before the first underscore.
# Going through a set removes repeated ids, so the list order is arbitrary.
unique_epilepsy_patient_ids = list({
    edf_path.split("/")[-1].split("_")[0] for edf_path in edf_file_list
})

# number of distinct patient ids (shown as the cell output)
len(unique_epilepsy_patient_ids)

200

# 1. Functions

In [3]:
def read_edf_to_pandas(edf_filename, select_channels = True):
    """ Reads data from an edf file to a Pandas dataframe (one column per channel).

        If 'select_channels=True', channel names are shortened with the rule
        ``name.split(' ')[-1].split('-')[0]`` and only the 19 channels listed in
        'channels_to_use' are kept, in exactly that order (left-to-right and
        top-down). Otherwise every channel of the file is returned under its
        original name and in file order.

        Args:
            edf_filename: path of the edf file, passed to ``read_raw_edf``.
            select_channels: whether to rename and restrict the channels.

        Returns: dataframe, channel labels (the dataframe's column names, in order)

        Raises:
            ValueError: if 'select_channels=True' and at least one of the 19
                required channels is not present in the file after renaming.
    """
    # read edf file (data are only loaded when get_data() is called)
    raw_data = read_raw_edf(edf_filename, verbose=False, preload=False)

    if not select_channels:
        # keep every channel, named and ordered as in the edf file
        channel_labels = raw_data.info["ch_names"]
        df = pd.DataFrame(raw_data.get_data().T, columns=channel_labels)
        return df[channel_labels], channel_labels

    # the TUEP database has 3 EEG channel configurations: '02_tcp_le', '03_tcp_ar_a', '01_tcp_ar'
    # number of channels and channel names differ within these configurations
    # to be able to compare the different EEG readings we need to select channels
    # that are common for all configurations

    # the list of 19 channels (their short labels) that we will use for analysing EEG data
    channels_to_use = ['FP1', 'FP2', 'F7', 'F3', 'FZ', 'F4', 'F8',
                       'T3', 'C3', 'CZ', 'C4', 'T4', 'T5',
                       'P3', 'PZ', 'P4', 'T6', 'O1', 'O2']

    # rename the original channel names to their short labels;
    # the update is written into the in-memory edf object
    raw_data.rename_channels(mapping=lambda ch: ch.split(' ')[-1].split('-')[0])

    # Fail early and explicitly when a required channel is absent. Previously this
    # case only printed a message and then failed further down when the missing
    # column was selected from the dataframe.
    available_channels = set(raw_data.info["ch_names"])
    missing_channels = [ch for ch in channels_to_use if ch not in available_channels]
    if missing_channels:
        raise ValueError(
            'Not all required channels are in the edf file. '
            f'Missing: {missing_channels} (file: {edf_filename})'
        )

    # restrict the recording to the selected channels once and reuse the result
    # for both the data and the column names
    picked = raw_data.pick_channels(channels_to_use)
    df = pd.DataFrame(picked.get_data().T, columns=picked.info['ch_names'])

    # column order as specified by us (left-to-right and top-down),
    # not as given in the edf file
    channel_labels = channels_to_use

    return df[channel_labels], channel_labels

# TODO (open decision): which channel order should the returned dataframe use?
#      - the order given in the edf file, or
#      - the order chosen by us: left-to-right and top-down
#        (this is what read_edf_to_pandas currently returns for selected channels)

In [4]:
def compute_corr_matrix(edf_filename):
    """ Computes the channel-by-channel correlation matrix of one edf file.

        The file is read with 'read_edf_to_pandas' using its default arguments,
        so the common channels are selected and renamed.

        Returns: correlation matrix (result of DataFrame.corr())
    """
    # only the dataframe is needed here; the channel labels are its columns
    eeg_df, _ = read_edf_to_pandas(edf_filename)
    return eeg_df.corr()


def plot_correlation_matrix(edf_filename):
    """ Plots the channel correlation matrix of one edf file as a heatmap.

        The matrix is obtained from 'compute_corr_matrix', i.e. it covers the
        channels that 'read_edf_to_pandas' selects with its default arguments
        (not every channel stored in the file).

        Args:
            edf_filename: path of the edf file
                (ex. ../tuh_eeg_epilepsy/edf/*/*/*/*/*/*.edf).

        Returns: None. The figure is created with plt.subplots and left to the
        plotting backend to display.
    """
    # reuse the shared helper instead of repeating the read + corr steps
    corr_matrix = compute_corr_matrix(edf_filename)
    channel_labels = corr_matrix.columns.tolist()

    # draw on an explicit axes so that title and labels cannot end up on
    # another axes (e.g. the colorbar) through pyplot's implicit state
    _, ax = plt.subplots(1, 1, figsize=(8, 6))
    sns.heatmap(corr_matrix,
                xticklabels=channel_labels,
                yticklabels=channel_labels,
                cmap=plt.cm.jet,
                ax=ax)

    ax.set(title='Correlation Matrix',
           xlabel='channel_names',
           ylabel='channel_names')
    

In [5]:
def entropy(bins, *X):
    """ Histogram estimate of the (joint) Shannon entropy, in bits.

        Computes H(X,Y,...,Z) = sum(-P(x,y,...,z) * log2(P(x,y,...,z))) where the
        probabilities are the normalized counts of a histogram over the given
        variables.

        Args:
            bins: bin specification passed to np.histogramdd.
            *X: one or more equally long 1-D sequences, one per variable.

        Returns: the entropy as a float; NaN if the histogram holds no samples.
    """
    # binning of the data (the bin edges are not needed)
    counts, _ = np.histogramdd(X, bins=bins)

    total = counts.sum()
    if total == 0:
        # no samples at all: the probabilities are undefined
        return float('nan')

    # calculate probabilities
    probabilities = counts.astype(float) / total

    # Empty bins contribute nothing (p*log2(p) -> 0 as p -> 0), so they are left
    # out instead of adding a small constant inside the log. That constant biased
    # every term and produced a slightly negative entropy when all samples fell
    # into a single bin.
    occupied = probabilities[probabilities > 0]

    return -np.sum(occupied * np.log2(occupied))


def mutual_information(bins, X, Y):
    """ Mutual information of two variables from histogram entropies.

        Uses I(X,Y) = H(X) + H(Y) - H(X,Y), with every entropy computed by
        'entropy' using the same 'bins' specification.

        Returns: I(X,Y)
    """
    marginal_sum = entropy(bins, X) + entropy(bins, Y)
    joint = entropy(bins, X, Y)
    return marginal_sum - joint

# The number of bins defaults to Sturges' rule
def compute_mi_matrix(df, n_bins=None):
    """ Compute Mutual Information matrix of all column pairs of a dataframe.

        Args:
            df: dataframe with one variable per column and one sample per row.
            n_bins: number of histogram bins per variable. If None (default),
                Sturges' rule k = 1 + log2(n_samples) is used, truncated to an
                integer. Pass a value explicitly to use any other binning.

        Return: mi_matrix (n_cols x n_cols numpy array, symmetric)

        Raises:
            ValueError: if 'df' has no rows and 'n_bins' is not given.
    """
    n_samples, n_cols = df.shape

    if n_bins is None:
        if n_samples == 0:
            raise ValueError('Cannot derive the number of bins from an empty dataframe.')
        # Sturges' rule: k = 1 + log2(n), which is the same as 1 + 3.322*log10(n).
        # The factor 3.322 (= log2(10)) must not be combined with log2, which
        # would inflate the number of bins by about 3.3 times.
        n_bins = int(1 + np.log2(n_samples))

    columns = [df.iloc[:, k] for k in range(n_cols)]

    # each marginal entropy is needed for a whole row/column of the matrix,
    # so compute it once per column instead of once per pair
    marginal_entropies = [entropy(n_bins, column) for column in columns]

    mi_matrix = np.zeros([n_cols, n_cols])

    # I(X,Y) = H(X) + H(Y) - H(X,Y) is symmetric: fill the upper triangle
    # (including the diagonal) and mirror it
    for i in range(n_cols):
        for j in range(i, n_cols):
            joint_entropy = entropy(n_bins, columns[i], columns[j])
            mi = marginal_entropies[i] + marginal_entropies[j] - joint_entropy
            mi_matrix[i, j] = mi
            mi_matrix[j, i] = mi

    return mi_matrix
    

def compute_normed_mi_matrix(mi_matrix):
    """ Normalize a Mutual Information matrix by its diagonal.

        Element (i, j) is divided by sqrt(mi_matrix[i, i] * mi_matrix[j, j]).

        Return: normed_mi_matrix
    """
    diagonal = np.diag(mi_matrix)

    # broadcasting a column against a row yields all pairwise products
    # of the diagonal elements
    scale = np.sqrt(diagonal[:, np.newaxis] * diagonal[np.newaxis, :])

    return mi_matrix / scale

# 2. Compute MI matrices

In [6]:
# Overview table of the files in the TUEP (there are 1648 files in the TUEP):
# one row per edf file. The path components from the fourth one onwards become
# columns and the complete path is kept in 'file_path'.
# NOTE: assumes every path has exactly six components after '../tuh_eeg_epilepsy/edf/'.
df_files_overview = pd.DataFrame(
    [edf_path.split('/')[3:] + [edf_path] for edf_path in edf_file_list],
    columns=['text_label', 'ch_conf', '-', 'patient_id', 'session_date', 'filename', 'file_path'],
).assign(
    # 'session_date' is split on '_' into its first three fields
    session=lambda frame: frame['session_date'].apply(lambda s: s.split('_')[0]),
    year=lambda frame: frame['session_date'].apply(lambda s: s.split('_')[1]),
    month=lambda frame: frame['session_date'].apply(lambda s: s.split('_')[2]),
    # 1 for 'epilepsy', 0 for 'no_epilepsy'
    numeric_label=lambda frame: frame['text_label'].replace(to_replace=['epilepsy', 'no_epilepsy'], value=[1,0]),
    # last '_'-separated part of the file name without its 4-character extension
    token=lambda frame: frame['filename'].apply(lambda s: s.split('_')[-1][:-4]),
)

In [7]:
# for every patient id take the first file listed for it in the overview table
edf_files_per_patient = [
    df_files_overview.loc[df_files_overview['patient_id'] == patient_id, 'file_path'].values[0]
    for patient_id in unique_epilepsy_patient_ids
]

# overview rows that belong to the selected files
df_files_per_patient = df_files_overview[df_files_overview['file_path'].isin(edf_files_per_patient)]

# selected files grouped by label: paths containing '/epilepsy/' first,
# then paths containing 'no_epilepsy'
chosen_files = [
    file_path
    for marker in ('/epilepsy/', 'no_epilepsy')
    for file_path in edf_files_per_patient
    if marker in file_path
]

In [14]:
from tqdm import tqdm

from multiprocessing import Pool
from compute_mi_matrices import compute_mi_form_edf_file

# files to process and number of worker processes
frames_list = chosen_files

max_pool = 5

# Expensive cell: the recorded run needed about 20 minutes for 200 files.
# The `with` line has to stay active. When it is commented out, the indented
# body below is an IndentationError and `p` is undefined. To avoid recomputing,
# skip this cell and load previously saved results instead.
with Pool(max_pool) as p:
    pool_outputs = list(
        tqdm(
            p.imap(compute_mi_form_edf_file,
                   frames_list),
            total=len(frames_list)
        )
    )

# print(pool_outputs)
# dict() requires every worker result to be a (key, value) pair
new_dict = dict(pool_outputs)
# print("dict:", new_dict)

100%|██████████| 200/200 [20:25<00:00,  6.13s/it] 


In [16]:
# number of entries in the result dictionary
len(new_dict)

200

In [8]:
# save the obtained dictionary into a file using joblib

import joblib

# joblib.dump(new_dict, './matrices/mi_matrices_v0.data')

In [2]:
# Restore a previously saved result dictionary from disk instead of recomputing it.
# The file must already exist, and `joblib` must have been imported before this cell runs.
# NOTE(review): joblib files are pickle-based as far as I know; only load files you created yourself.
new_dict = joblib.load('./matrices/mi_matrices_v0.data')

In [14]:
from tqdm import tqdm
from multiprocessing import Pool
from compute_mi_for_bands import compute_mi_over_bands_form_edf_file

# files to process and number of worker processes
frames_list = chosen_files
max_pool = 5

# Expensive cell: the recorded run needed close to four hours for 200 files.
# Results are consumed in input order while tqdm reports the progress.
with Pool(max_pool) as p:
    pool_outputs = [
        result
        for result in tqdm(
            p.imap(compute_mi_over_bands_form_edf_file, frames_list),
            total=len(frames_list),
        )
    ]

# dict() requires every worker result to be a (key, value) pair
new_dict = dict(pool_outputs)

100%|██████████| 200/200 [3:51:55<00:00, 69.58s/it]    


In [15]:
# Save the result dictionary currently bound to `new_dict` into a file using joblib.
# The target directory './matrices' is not created here.
# NOTE(review): confirm it exists before running the long computation.

import joblib

joblib.dump(new_dict, './matrices/mi_matrices_for_bands.data')

['./matrices/mi_matrices_for_bands.data']