In [44]:
from google.colab import drive
drive.mount('/gdrive')

Mounted at /gdrive


<h1><b><FONT COLOR="0DA2EC">Preparing raw schizophrenia EEG dataset </h1><br>
<h3><b> Reference Paper : <a href=https://doi.org/10.1371/journal.pone.0188629>Graph-based analysis of brain connectivity in
schizophrenia</a>  </b>

In [45]:
fname='/content/drive/MyDrive/dataset_EEG/*.edf'

In [46]:
!pip install mne



In [47]:
from glob import glob
import os
import mne
import numpy as np
import pandas
import matplotlib.pyplot as plt

In [48]:
glob('/gdrive/MyDrive/dataset_EEG/*.edf')

['/gdrive/MyDrive/dataset_EEG/h01.edf',
 '/gdrive/MyDrive/dataset_EEG/h02.edf',
 '/gdrive/MyDrive/dataset_EEG/h03.edf',
 '/gdrive/MyDrive/dataset_EEG/h04.edf',
 '/gdrive/MyDrive/dataset_EEG/h06.edf',
 '/gdrive/MyDrive/dataset_EEG/h07.edf',
 '/gdrive/MyDrive/dataset_EEG/h05.edf',
 '/gdrive/MyDrive/dataset_EEG/h08.edf',
 '/gdrive/MyDrive/dataset_EEG/h09.edf',
 '/gdrive/MyDrive/dataset_EEG/h10.edf',
 '/gdrive/MyDrive/dataset_EEG/h11.edf',
 '/gdrive/MyDrive/dataset_EEG/h12.edf',
 '/gdrive/MyDrive/dataset_EEG/h13.edf',
 '/gdrive/MyDrive/dataset_EEG/h14.edf',
 '/gdrive/MyDrive/dataset_EEG/s01.edf',
 '/gdrive/MyDrive/dataset_EEG/s08.edf',
 '/gdrive/MyDrive/dataset_EEG/s05.edf',
 '/gdrive/MyDrive/dataset_EEG/s03.edf',
 '/gdrive/MyDrive/dataset_EEG/s02.edf',
 '/gdrive/MyDrive/dataset_EEG/s04.edf',
 '/gdrive/MyDrive/dataset_EEG/s06.edf',
 '/gdrive/MyDrive/dataset_EEG/s10.edf',
 '/gdrive/MyDrive/dataset_EEG/s09.edf',
 '/gdrive/MyDrive/dataset_EEG/s07.edf',
 '/gdrive/MyDrive/dataset_EEG/s12.edf',


In [49]:
all_file_path=glob('/gdrive/MyDrive/dataset_EEG/*.edf')
print(len(all_file_path))

28


In [50]:
all_file_path[0:10]

['/gdrive/MyDrive/dataset_EEG/h01.edf',
 '/gdrive/MyDrive/dataset_EEG/h02.edf',
 '/gdrive/MyDrive/dataset_EEG/h03.edf',
 '/gdrive/MyDrive/dataset_EEG/h04.edf',
 '/gdrive/MyDrive/dataset_EEG/h06.edf',
 '/gdrive/MyDrive/dataset_EEG/h07.edf',
 '/gdrive/MyDrive/dataset_EEG/h05.edf',
 '/gdrive/MyDrive/dataset_EEG/h08.edf',
 '/gdrive/MyDrive/dataset_EEG/h09.edf',
 '/gdrive/MyDrive/dataset_EEG/h10.edf']

In [51]:
all_file_path[0].split('/')[4]

'h01.edf'

In [52]:
healthy_file_path=[i for i in all_file_path if 'h' in i.split('/')[4]]
patient_file_path=[i for i in all_file_path if 's' in i.split('/')[4]]

In [53]:
healthy_file_path

['/gdrive/MyDrive/dataset_EEG/h01.edf',
 '/gdrive/MyDrive/dataset_EEG/h02.edf',
 '/gdrive/MyDrive/dataset_EEG/h03.edf',
 '/gdrive/MyDrive/dataset_EEG/h04.edf',
 '/gdrive/MyDrive/dataset_EEG/h06.edf',
 '/gdrive/MyDrive/dataset_EEG/h07.edf',
 '/gdrive/MyDrive/dataset_EEG/h05.edf',
 '/gdrive/MyDrive/dataset_EEG/h08.edf',
 '/gdrive/MyDrive/dataset_EEG/h09.edf',
 '/gdrive/MyDrive/dataset_EEG/h10.edf',
 '/gdrive/MyDrive/dataset_EEG/h11.edf',
 '/gdrive/MyDrive/dataset_EEG/h12.edf',
 '/gdrive/MyDrive/dataset_EEG/h13.edf',
 '/gdrive/MyDrive/dataset_EEG/h14.edf']

In [54]:
patient_file_path

['/gdrive/MyDrive/dataset_EEG/s01.edf',
 '/gdrive/MyDrive/dataset_EEG/s08.edf',
 '/gdrive/MyDrive/dataset_EEG/s05.edf',
 '/gdrive/MyDrive/dataset_EEG/s03.edf',
 '/gdrive/MyDrive/dataset_EEG/s02.edf',
 '/gdrive/MyDrive/dataset_EEG/s04.edf',
 '/gdrive/MyDrive/dataset_EEG/s06.edf',
 '/gdrive/MyDrive/dataset_EEG/s10.edf',
 '/gdrive/MyDrive/dataset_EEG/s09.edf',
 '/gdrive/MyDrive/dataset_EEG/s07.edf',
 '/gdrive/MyDrive/dataset_EEG/s12.edf',
 '/gdrive/MyDrive/dataset_EEG/s13.edf',
 '/gdrive/MyDrive/dataset_EEG/s11.edf',
 '/gdrive/MyDrive/dataset_EEG/s14.edf']

In [55]:
def read_data(file_path):
  data=mne.io.read_raw_edf(file_path,preload=True)
  data.set_eeg_reference() #auto referencing with one electrode
  data.filter(l_freq=0.5,h_freq=45) # bandpass filter of 0.5Hz to 45Hz
  epochs=mne.make_fixed_length_epochs(data,duration=5,overlap=1)
  array=epochs.get_data()
  return array

In [56]:
sample_data = read_data(healthy_file_path[0])

Extracting EDF parameters from /gdrive/MyDrive/dataset_EEG/h01.edf...
EDF file detected
Setting channel info structure...
Creating raw.info structure...
Reading 0 ... 231249  =      0.000 ...   924.996 secs...
EEG channel type selected for re-referencing
Applying average reference.
Applying a custom ('EEG',) reference.
Filtering raw data in 1 contiguous segment
Setting up band-pass filter from 0.5 - 45 Hz

FIR filter parameters
---------------------
Designing a one-pass, zero-phase, non-causal bandpass filter:
- Windowed time-domain design (firwin) method
- Hamming window with 0.0194 passband ripple and 53 dB stopband attenuation
- Lower passband edge: 0.50
- Lower transition bandwidth: 0.50 Hz (-6 dB cutoff frequency: 0.25 Hz)
- Upper passband edge: 45.00 Hz
- Upper transition bandwidth: 11.25 Hz (-6 dB cutoff frequency: 50.62 Hz)
- Filter length: 1651 samples (6.604 s)

Not setting metadata
231 matching events found
No baseline correction applied
0 projection items activated
Using da

[Parallel(n_jobs=1)]: Done  17 tasks      | elapsed:    0.3s


In [57]:
sample_data.shape #no of epochs, channels, length signal

(231, 19, 1250)

In [58]:
%%capture
control_epochs_array=[read_data(i) for i in healthy_file_path]
patient_epochs_array=[read_data(i) for i in patient_file_path]

In [59]:
#control_epochs_array is the 14 arrays of 231 epoch with 19 channel and 1250 ms length signal
#patients_epochs_array is the 14 arrays of 231 epoch with 19 channel and 1250 ms length signal

control_epochs_array[0].shape,patient_epochs_array[0].shape

((231, 19, 1250), (211, 19, 1250))

In [60]:
# lebel the signals
# for control epochs label will be 0
# for patient epochs label with be 1

control_epochs_label = [len(i)*[0] for i in control_epochs_array]
patient_epochs_label = [len(i)*[1] for i in patient_epochs_array]

#create 14 list of 231 0s list for healthy signal
#create 14 list of 231 1s list for patient signal

len(control_epochs_label),len(patient_epochs_label)

(14, 14)

In [61]:
len(control_epochs_label[0])

231

In [62]:
data_list = control_epochs_array + patient_epochs_array
label_list = control_epochs_label + patient_epochs_label

In [63]:
enumerate(data_list)

<enumerate at 0x7894c3711940>

In [64]:
#example of enumerate
l1 = ["eat", "sleep", "repeat"]
s1 = "geek"

# creating enumerate objects
obj1 = enumerate(l1)
obj2 = enumerate(s1)

print ("Return type:", type(obj1))
print (list(enumerate(l1)))

# changing start index to 2 from 0
print (list(enumerate(s1, 2)))

Return type: <class 'enumerate'>
[(0, 'eat'), (1, 'sleep'), (2, 'repeat')]
[(2, 'g'), (3, 'e'), (4, 'e'), (5, 'k')]


In [65]:
group_list = [ [i]*len(j) for i,j in enumerate(data_list)]
len(group_list)

28

In [66]:
#converting list to array
data_array = np.vstack(data_list)
label_array= np.hstack(label_list)
group_array=np.hstack(group_list)

print(data_array.shape,label_array.shape,group_array.shape)

(7201, 19, 1250) (7201,) (7201,)


<h2> <FONT COLOR="FF0000">Feature Extraction from data_array (raw EEG)


In [68]:
#shape of the 1 feature should be 7201,19
#shape of the 10 feature should be 7201,19*10

from scipy import stats
def mean(x):
  return np.mean(x,axis=-1)
def std(x):
  return np.std(x,axis=-1)
def ptp(x):
  return np.ptp(x,axis=-1)
def var(x):
  return np.var(x,axis=-1)
def minim(x):
  return np.min(x,axis=-1)
def maxim(x):
  return np.max(x,axis=-1)

def argminim(x):
  return np.argmin(x,axis=-1)
def argmaxim(x):
  return np.argmax(x,axis=-1)

def rms(x):
  return np.sqrt(np.mean(x**2,axis=-1))

def abs_diff_signal(x):
  return np.sum(np.abs(np.diff(x,axis=-1)),axis=-1)
def skewness(x):
  return stats.skew(x,axis=-1)
def kurtosis(x):
  return stats.kurtosis(x,axis=-1)

def concatenate_features(x):
  return np.concatenate((mean(x),std(x),ptp(x),var(x),minim(x),maxim(x),argminim(x),argmaxim(x),rms(x),abs_diff_signal(x),skewness(x),kurtosis(x)),axis=-1)



In [70]:
features=[]
for d in data_array:
  features.append(concatenate_features(d))

In [71]:
features_array = np.array(features)
features_array.shape

(7201, 228)

In [72]:
#total 228/19==12 features

In [73]:
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import GroupKFold,GridSearchCV

In [79]:
clf= LogisticRegression()
gkf= GroupKFold(10)
pipe=Pipeline([('scaler',StandardScaler()),('clf',clf)])
param_grid={'clf__C':[0.1,0.5,0.7,1,3,5,7,9]}
gscv=GridSearchCV(pipe,param_grid,cv=gkf,n_jobs=12)
gscv.fit(features_array,label_array,groups=group_array)

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [80]:
gscv.best_score_

0.65348837358381