In [1]:
import os
import sys
raw_data_dir = ''
acc_key = 'acc'
plot_examples = False # notebook file size will increase by 30 to 60MB if set to True; size is <1MB otherwise

if  'COLAB_GPU' in os.environ :
    print('Using Google Colab. Setting up environment')
    raw_data_dir = '/content/drive/My Drive/Colab Notebooks/' 
    #raw_data_dir = 'Raw/'

    !pip install mne==0.19.2
    !pip install pyedflib==0.1.15
    !pip install chart_studio==1.0.0


    print('\n \n To load files from Google Drive, account validation is required.')
    #mount to drive -- files should be located in the /Colab Notebooks directory
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)
    
    if not os.path.exists('/content/tmp/eeg_sz/ReadData'):
      os.makedirs('/content/tmp/eeg_sz/ReadData')
      os.makedirs('/content/tmp/eeg_sz/utils')
    # download project utilities and data reader 
    !wget -O/content/tmp/eeg_sz/ReadData/RawDataReader.py https://raw.githubusercontent.com/WinAIML/schizophrenia/master/ReadData/RawDataReader.py
    !wget -O/content/tmp/eeg_sz/utils/ModelBuilder.py -P/utils https://raw.githubusercontent.com/WinAIML/schizophrenia/master/MLModels/utils/ModelBuilder.py
    !wget -O/content/tmp/eeg_sz/utils/ChartBuilder.py -P/utils https://raw.githubusercontent.com/WinAIML/schizophrenia/master/MLModels/utils/ChartBuilder.py
    sys.path.append('/content/tmp/eeg_sz/')

elif 'KAGGLE_URL_BASE' in os.environ:
    acc_key = 'accuracy'
    print('Using Kaggle kernel. Setting up environment')
    !pip install mne==0.19.2
    !pip install pyedflib==0.1.15
    !pip install chart_studio==1.0.0
    !svn checkout https://github.com/WinAIML/schizophrenia/trunk/Data/Raw
    raw_data_dir = 'Raw/'
    
    # download project utilities and data reader 
    !svn checkout https://github.com/WinAIML/schizophrenia/trunk/ReadData
    !svn checkout https://github.com/WinAIML/schizophrenia/trunk/MLModels/utils Utils


else:
    # assuming that a local run will be launched only from a github project; 
    # add the utils and ReadData directories to the temporary path
    if 'HOMEPATH' in os.environ:
        print('Using homepath ' + os.environ['HOMEPATH'])
    raw_data_dir = '../../Data/Raw/'
    
    from pathlib import Path
    import sys
    sys.path.append(os.path.realpath('..'))
    path = Path(os.getcwd())
    sys.path.append(str(path.parent.parent))


import tensorflow as tf
from tensorflow.keras.models import Model
from tensorflow.keras.layers import Dense, Activation, Permute, Dropout
from tensorflow.keras.layers import Conv2D, Conv1D, MaxPooling1D, MaxPooling2D, AveragePooling2D, AveragePooling1D
from tensorflow.keras.layers import SeparableConv2D, DepthwiseConv2D
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras.layers import SpatialDropout1D
from tensorflow.keras.regularizers import l1_l2
from tensorflow.keras.layers import Input, Flatten
from tensorflow.keras.constraints import max_norm

import pandas as pd
import numpy as np
import random 

from importlib import reload  #reload(chart_builder)


#################
# import project utilities and the raw data reader
# Kaggle environment does not accept 'utils' as a file, so it must be accessed separately

import ReadData.RawDataReader as data_reader
if 'KAGGLE_URL_BASE' in os.environ:
    import Utils.ModelBuilder as model_builder
    import Utils.ChartBuilder as chart_builder
else:
    import utils.ModelBuilder as model_builder
    import utils.ChartBuilder as chart_builder



Using homepath \Users\marit




In [3]:
import pandas as pd
import numpy as np

import random 
import math

#svm imports
from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import accuracy_score

In [4]:
# Patient ids (e.g. 's12') whose files are skipped when the groups are loaded;
# empty here, so no file is skipped.
ignore_list = [] #['s12', 's14']  #list of patient files that should be skipped
resolution_hz =  250 #EEG values per second
# Number of EEG values in one slice: 15 seconds' worth at resolution_hz values per second
# (this is a sample count, not a number of seconds).
time_window = resolution_hz * 15


In [5]:
import pyedflib
def get_edf_file_duration(file_name):
    """Return the duration, in seconds, of the EDF file at ``file_name``.

    The file is opened with ``pyedflib.EdfReader`` and the reader's
    ``file_duration`` attribute is returned. The reader is always closed,
    including when reading the attribute raises.

    Parameters
    ----------
    file_name : str
        Path of the EDF file.

    Returns
    -------
    The value of ``EdfReader.file_duration`` for the file.
    """
    reader = pyedflib.EdfReader(file_name)
    try:
        return reader.file_duration
    finally:
        # Release the file handle even if the header attribute cannot be read.
        reader.close()

def get_minimum_duration(group_directory_name, patient_group_file_prefix, n_files=14):
    """Return the shortest recording duration within one patient group.

    Files are expected at
    ``raw_data_dir + '<group_directory_name>/<prefix><NN>.edf'`` where ``NN`` is
    the two-digit, 1-based file number.

    Parameters
    ----------
    group_directory_name : str
        Sub-directory of ``raw_data_dir`` holding the group's EDF files.
    patient_group_file_prefix : str
        Prefix of the file names in that group (e.g. 'h' or 's').
    n_files : int, optional
        Number of consecutively numbered files to inspect. Defaults to 14,
        which is the count that was previously hard-coded.

    Returns
    -------
    The minimum of ``get_edf_file_duration`` over the inspected files.

    Raises
    ------
    ValueError
        If ``n_files`` is smaller than 1 (``min`` of an empty sequence).
    """
    file_names = (
        raw_data_dir + '{}/{}.edf'.format(
            group_directory_name,
            "{}{:02d}".format(patient_group_file_prefix, i))
        for i in range(1, n_files + 1)
    )
    return min(get_edf_file_duration(file_name) for file_name in file_names)


# Shortest recording across both groups; later cells use it to trim every
# recording to a common length.
minimum_duration = min(get_minimum_duration("Healthy Controls", "h"), get_minimum_duration('SZ Patients', 's'))
print('Minimum duration: ', minimum_duration)
# Names of the 19 channels listed for the recordings.
all_channels = ['Fp2', 'F8', 'T4', 'T6', 'O2', 'Fp1', 'F7', 'T3', 'T5', 'O1', 'F4',
                'C4' , 'P4', 'F3', 'C3', 'P3', 'Fz' , 'Cz', 'Pz']

# Channels passed as `exclude` when the EDF files are read (9 of the 19 above).
excluded_channels = ['F8', 'T3', 'T5', 'T6', 'O1', 'F3', 'Fp1', 'Fp2', 'P4']


def get_raw_eeg_mne(file_name, resolution_hz, tmin=None, tmax=None, exclude=[]):
    """Read an EDF file with MNE, crop it in time and resample it.

    Parameters
    ----------
    file_name : str
        Path of the EDF file.
    resolution_hz : number
        Sampling frequency the recording is resampled to.
    tmin : number, optional
        Start of the kept time range in seconds. When omitted (``None``) the
        first second is dropped (``tmin = 1``). An explicit ``0`` is honoured;
        it used to be replaced by 1 because the check was truthiness-based.
    tmax : number, optional
        End of the kept time range in seconds. When omitted (``None``) it is
        the file duration minus one second.
    exclude : list of str, optional
        Channel names forwarded to ``mne.io.read_raw_edf(exclude=...)``.
        The default list is never mutated here.

    Returns
    -------
    The preloaded, cropped and resampled MNE raw object.
    """
    # preload=True already loads the data, so no separate load_data() call is needed.
    raw = mne.io.read_raw_edf(file_name, preload=True, exclude=exclude)
    raw.set_montage("standard_1020") #set montage to 10-20
    if tmin is None:
        tmin = 1
    if tmax is None:
        # per the original note, get_edf_file_duration rounds values up, hence the -1
        tmax = get_edf_file_duration(file_name) - 1
    raw.crop(tmin=tmin, tmax=tmax)
    raw.resample(resolution_hz, npad="auto") #set sampling frequency (dataset is set to 250Hz)

    return raw



Minimum duration:  740


In [6]:
import mne
mne.set_log_level("WARNING")

In [7]:

# get a list of randomly selected sets of numbers based on a range
# the proportion of values selected for each set is determined by the ratio_array
import itertools
def get_mixed_indexes_for_ml_train_test(length, ratios_array):
    """Randomly split ``range(length)`` into disjoint index lists.

    Parameters
    ----------
    length : int
        Number of items to split; indexes ``0 .. length - 1`` are used.
    ratios_array : sequence of float
        One ratio per output list. Each list receives
        ``floor(ratio * length)`` indexes, so some indexes can remain unused.

    Returns
    -------
    list of list of int
        One randomly sampled, mutually disjoint index list per ratio, in the
        order of ``ratios_array``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a ratio asks for more indexes than
        are still unassigned.
    """
    remaining = list(range(0, length))
    taken = set()  # O(1) membership test instead of re-chaining all earlier picks
    output_indexes = []
    for ratio in ratios_array:
        remaining = [i for i in remaining if i not in taken]
        # round() removes binary floating-point noise before flooring, e.g.
        # 0.29 * 100 == 28.999999999999996 must yield 29, not 28.
        count = math.floor(round(ratio * length, 9))
        selection = random.sample(remaining, k=count)
        taken.update(selection)
        output_indexes.append(selection)
    return output_indexes

In [8]:


# modified based on https://stackoverflow.com/a/48704557/2466781
def chunk(seq, size):
    """Cut ``seq`` into consecutive pieces of exactly ``size`` items.

    Each piece is ``seq[start:start + size]`` converted with ``np.asarray``.
    Trailing items that do not fill a whole piece are left out.

    Returns a list of arrays (empty when ``seq`` is shorter than ``size``).
    """
    usable_length = (len(seq) // size) * size  # largest multiple of size that fits
    starts = range(0, usable_length, size)
    return [np.asarray(seq[start:start + size]) for start in starts]

def chunk_list(nested_list, size):
    """Chunk every entry of ``nested_list`` and return all pieces in one flat list.

    Each entry is converted to an array, wrapped in a ``pandas.DataFrame`` and
    passed to ``chunk`` with the given ``size``; the pieces of all entries are
    collected in input order.
    """
    pieces = []
    for item in nested_list:
        frame = pd.DataFrame(np.asarray(item))
        pieces.extend(chunk(frame, size))
    return pieces

# modified version of process_patient_group in older notebooks
# Adapted from page 1 of https://buildmedia.readthedocs.org/media/pdf/pyedflib/latest/pyedflib.pdf
def process_patient_group(group_directory_name, patient_group_file_prefix, 
                          minimum_original_duration,
                          resolution_hz,
                          plot_channels = False,
                          excluded_channels = []):
    """Load the 14 EDF recordings of one group as equally sized arrays.

    Every recording is read with ``get_raw_eeg_mne``, cropped to the range from
    120 seconds to ``minimum_original_duration - 120`` seconds (so all files are
    limited to the range of the shortest one), converted to a data frame and
    then to an array.

    Parameters
    ----------
    group_directory_name : str
        Sub-directory of ``raw_data_dir`` with the group's files.
    patient_group_file_prefix : str
        File-name prefix of the group (e.g. 'h' or 's'); files are named
        ``<prefix><NN>.edf`` with NN from 01 to 14.
    minimum_original_duration : number
        Duration in seconds of the shortest recording. It is now actually used
        for the crop; the body previously read the global ``minimum_duration``
        and ignored this argument.
    resolution_hz : number
        Sampling frequency forwarded to ``get_raw_eeg_mne``.
    plot_channels : bool, optional
        Unused; kept so existing calls keep working.
    excluded_channels : list of str, optional
        Channels to leave out when reading. The default list is never mutated.

    Returns
    -------
    numpy.ndarray
        One array per loaded recording, stacked with ``np.asarray``.
        Patients listed in the global ``ignore_list`` are skipped before their
        file is read.
    """
    trim_seconds = 120  # based on visual inspection, drop the first 120 seconds (and the same at the end)
    meta = []

    for i in range(1, 15): # reading 14 files
        patient_id = "{}{:02d}".format(patient_group_file_prefix, i)
        if patient_id in ignore_list:
            # skip before reading: no need to load a file that is discarded anyway
            continue

        file_name = raw_data_dir + '{}/{}.edf'.format(group_directory_name, patient_id)
        data = get_raw_eeg_mne(file_name, resolution_hz, exclude=excluded_channels,
                               tmin=trim_seconds,
                               tmax=minimum_original_duration - trim_seconds)
        meta.append(np.asarray(data.to_data_frame()))

    return np.asarray(meta)



In [9]:

# Load both groups with the same trimming and channel exclusion, then show the
# array shapes (the recorded output is (14, 125001, 10) for each group).
hc_data = process_patient_group('Healthy Controls', 'h', minimum_duration, resolution_hz, excluded_channels=excluded_channels)
display(np.asarray(hc_data).shape)

sz_data = process_patient_group('SZ Patients', 's', minimum_duration, resolution_hz, excluded_channels=excluded_channels)
display(np.asarray(sz_data).shape)

(14, 125001, 10)

(14, 125001, 10)

In [10]:


def plot_examples(file_name, excluded_channels, resolution_hz):
    """Plot one recording twice with MNE: as read, and after cleaning.

    The first plot uses ``get_raw_eeg_mne`` with its default time range and no
    channel exclusion. The second excludes ``excluded_channels`` and keeps the
    range from 120 seconds to ``minimum_duration - 120`` seconds (global
    ``minimum_duration``). The data-frame shape is printed for both.

    NOTE(review): this function has the same name as the ``plot_examples`` flag
    assigned in the first cell, so defining it rebinds that name.

    Parameters
    ----------
    file_name : path-like
        EDF file to plot; only forwarded to ``get_raw_eeg_mne`` (the unused
        patient-id parsing that required a ``str`` was removed).
    excluded_channels : list of str
        Channels left out of the cleaned plot.
    resolution_hz : number
        Sampling frequency forwarded to ``get_raw_eeg_mne``.
    """
    print('Raw Data')
    raw = get_raw_eeg_mne(file_name, resolution_hz)
    #events = mne.find_events(raw, stim_channel=raw.ch_names, initial_event=True, consecutive=True)
    raw.plot()
    print('Shape: ', raw.to_data_frame().shape)

    print('Cleaned Data ')
    print('Excluding channels {}; '.format(", ".join(excluded_channels)))
    print('Removing first 120s of data; and last 120s of shortest sample. Limiting all other samples to range of shortest sample')
    # get_raw_eeg_mne already crops to [tmin, tmax]; the former argument-less crop() was a no-op
    cleaned = get_raw_eeg_mne(file_name, resolution_hz, tmin=120, tmax=minimum_duration-120, exclude=excluded_channels)
    cleaned.plot()
    print('Shape: ', cleaned.to_data_frame().shape)

from plotly.graph_objs import Layout, Scatter, Figure, Marker
import matplotlib.pyplot as plt

import plotly    
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
#import plotly.graph_objs.layout.scene.Annotation
cf.go_offline()
import plotly.graph_objs as go
import plotly.tools as tls
#cl_layout = go.Layout(width=950, height=800)


def plot_eeg_plotly(raw, excluded_channels, resolution_hz):
    """Render the channels of ``raw`` as stacked plotly traces in the notebook.

    Each channel gets its own y-axis whose ``domain`` is one horizontal band of
    the figure, all sharing the x-axis; channel names are added as annotations
    and the figure is shown with ``iplot``.

    Parameters:
        raw: object indexed as ``raw[picks, start:stop]`` and providing
            ``time_as_index`` and ``info['ch_names']`` (an MNE raw object in
            the visible callers).
        excluded_channels: only its length is used, to derive the number of
            channels from the global ``all_channels``.
        resolution_hz: not used in this function.

    Returns nothing.
    """
    # The channel count is derived from the global lists, not from `raw` itself
    # -- assumes `raw` was read with exactly these channels excluded; TODO confirm.
    n_channels = len(all_channels) - len(excluded_channels)
    picks = range(n_channels)
    # NOTE(review): the second value is a time of -1; how the resulting `stop`
    # index behaves in the slice below depends on MNE -- verify the plotted range.
    start, stop = raw.time_as_index([0, -1])
    
    ####################################
    #stop = 500 #dbg

    data, times = raw[picks[:n_channels], start:stop]
    ch_names = [raw.info['ch_names'][p] for p in picks[:n_channels]]
    
    # fraction of the figure height given to each channel band
    step = 1. / n_channels
    kwargs = dict(domain=[1 - step, 1], showticklabels=False, zeroline=False, showgrid=False)
    mc = 'rgb(27,61,120)'  # trace colour used for every channel
    # create objects for layout and traces
    layout = Layout(yaxis=go.layout.YAxis(kwargs), showlegend=False)
    layout.update({'yaxis%d' % (0 + 1): go.layout.YAxis(kwargs), 'showlegend': False})
    traces = [Scatter(x=times, y=data.T[:, 0], marker_color=mc)]
    #display(raw.to_data_frame().head(stop))
    

    # loop over the channels
    for ii in range(1, n_channels):
            # move the band one step further down for each following channel
            kwargs.update(domain=[1 - (ii + 1) * step, 1 - ii * step])
            layout.update({'yaxis%d' % (ii + 1): go.layout.YAxis(kwargs), 'showlegend': False})
            traces.append(Scatter(x=times, y=data.T[:, ii], marker_color = mc, yaxis='y%d' % (ii + 1)))

    # add channel names using Annotations
    annotations = [go.layout.Annotation(x=-0.06, y=0, xref='paper', yref='y%d' % (ii + 1),
                                          text=ch_name, showarrow=False)
                              for ii, ch_name in enumerate(ch_names)]
    layout.update(annotations=annotations)
    traces.reverse() #set the first trace to the bottom of the plot since it is the only one with xaxis

    # set the size of the figure and plot it
    layout.update(autosize=False, width=900, height=400)
    fig = Figure(data=traces, layout=layout)
    fig.update_xaxes(ticks="outside", tickwidth=2, tickcolor='black', ticklen=10, side='top')

    iplot(fig, filename='shared xaxis')
    
    
#adapted from https://plot.ly/python/v3/ipython-notebooks/mne-tutorial/
def plot_examples_plotly(file_name, excluded_channels, resolution_hz):
    """Plot the cleaned version of one recording with ``plot_eeg_plotly``.

    The recording is read with ``get_raw_eeg_mne``, excluding
    ``excluded_channels`` and keeping the range from 120 seconds to
    ``minimum_duration - 120`` seconds (global ``minimum_duration``).

    Parameters
    ----------
    file_name : path-like
        EDF file to plot; only forwarded to ``get_raw_eeg_mne`` (the unused
        patient-id parsing that required a ``str`` was removed).
    excluded_channels : list of str
        Channels left out when reading; also forwarded to the plot helper.
    resolution_hz : number
        Sampling frequency forwarded to both helpers.
    """
    print('\nCleaned Data')
    print('Excluding channels {}; '.format(", ".join(excluded_channels)))
    print('Removing first 120s of data; and last 120s of shortest sample. Limiting all other samples to range of shortest sample')

    # get_raw_eeg_mne already crops to [tmin, tmax]; the former argument-less crop() was a no-op
    cleaned = get_raw_eeg_mne(file_name, resolution_hz, tmin=120, tmax=minimum_duration-120, exclude=excluded_channels)
    plot_eeg_plotly(cleaned, excluded_channels, resolution_hz)

# Pick one random SZ patient and one random control recording as plot examples.
# Patients 12 and 14 are never chosen for the SZ example.
sz_patient_list = [number for number in range(1, 15) if number not in (12, 14)]
rand_patient_id = random.choice(sz_patient_list)
rand_patient_file = raw_data_dir + 'SZ Patients/{}{:02d}.edf'.format('s', rand_patient_id)

rand_control_id = random.randrange(1, 15)
rand_control_file = raw_data_dir + 'Healthy Controls/{}{:02d}.edf'.format('h', rand_control_id)

# The example plots are switched off; re-enable the calls below to render them.
#print('Example of Input Data From Random Patient')
#print('Sz patient #{:02d}'.format(rand_patient_id))
#plot_examples_plotly(rand_patient_file, excluded_channels, resolution_hz)

#print("Example of Input Data From Random Control")
#print('Control subject #{:02d}'.format(rand_control_id))
#plot_examples_plotly(rand_control_file, excluded_channels, resolution_hz)

print('Ignored files: ')
print(",".join(ignore_list))
    
    

Ignored files: 



In [11]:
# Make sure both groups are numpy arrays; np.asarray does not change the
# shape here (no dimension is dropped).
hc_data_all = np.asarray(hc_data)
sz_data_all = np.asarray(sz_data)


In [12]:
import sklearn.decomposition as decomposition

def select_denoised_data(patient_data, max_components=14):
    """Fit a PCA per recording and return its leading components.

    For every entry of ``patient_data`` a ``decomposition.PCA`` is fitted on
    the transposed entry, with as many components as the size of the third
    axis of ``patient_data``. The first ``max_components`` rows of
    ``components_`` are kept.

    Parameters
    ----------
    patient_data : array-like with three axes
        One 2-D entry per recording.
    max_components : int, optional
        Upper bound on the number of components kept per recording. Defaults
        to 14, the value that was previously hard-coded; the original comment
        says it was chosen from a chart of explained variance.

    Returns
    -------
    list of numpy.ndarray
        One array of selected components per recording; an empty list for
        empty input.
    """
    if len(patient_data) == 0:
        return []

    # Convert once: the component count is the same for every recording.
    patient_array = np.asarray(patient_data)
    n_components = patient_array.shape[2]

    all_features = []
    for entry in patient_array:
        pca_denoise = decomposition.PCA(n_components=n_components)
        pca_denoise.fit(entry.transpose())
        all_features.append(np.asarray(pca_denoise.components_[:max_components]))

    return all_features
    

# NOTE(review): sample_size is not read by the calls below and is reassigned in a later cell.
sample_size = minimum_duration #use entire window
# Send all loaded channels and all loaded patient data through the PCA step.
# NOTE(review): the original note said "s07 is still skipped", but ignore_list
# is empty in this notebook -- confirm whether any file is meant to be skipped.
hc_data_all_denoised_selected = select_denoised_data(hc_data_all)
sz_data_all_denoised_selected = select_denoised_data(sz_data_all)

# Recorded output: (14, 10, 125001) for each group.
print('Shape of denoised data (extracted components) :')
print(np.asarray(hc_data_all_denoised_selected).shape)
print(np.asarray(sz_data_all_denoised_selected).shape)

Shape of denoised data (extracted components) :
(14, 10, 125001)
(14, 10, 125001)


In [13]:
# flatten the feature vectors so that input can be used in scikit learn 
def flatten_features(data):
    """Turn every entry of ``data`` into one flat float32 feature vector.

    For each entry the first axis is moved to the last position before the
    values are laid out in one dimension; the vectors are then stacked into a
    single ``float32`` array with one row per entry.
    """
    rows = [np.moveaxis(sample, 0, -1).flatten() for sample in data]
    return np.asarray(rows, dtype=np.float32)


In [14]:
# Build the model input from the extracted components of both groups.

# Stack healthy controls first, then SZ patients, along the first axis.
X =  np.concatenate((hc_data_all_denoised_selected, sz_data_all_denoised_selected), axis=0)
X_original = X.copy()  # unflattened copy, kept before X is overwritten below
print('Input size: ', X.shape)
# Labels in the same order as X: 0 for every healthy control, 1 for every SZ patient.
y = ([0] * len(hc_data_all_denoised_selected)) +( [1] * len(sz_data_all_denoised_selected))
# NOTE(review): sample_size is not read anywhere in the visible cells after this assignment.
sample_size = 2

# One flat float32 vector per recording (recorded output: (28, 1250010)).
X = flatten_features(X)
print('Flattened input size: ', X.shape)



Input size:  (28, 10, 125001)
Flattened input size:  (28, 1250010)


In [15]:
# get a list of randomly selected sets of numbers based on a range
# the proportion of values selected for each set is determined by the ratio_array
import itertools
def get_mixed_indexes_for_ml_train_test(length, ratios_array):
    """Randomly split ``range(length)`` into disjoint index lists.

    This re-defines the helper of the same name from an earlier cell; once
    this cell has run, this definition is the one in effect.

    Parameters
    ----------
    length : int
        Number of items to split; indexes ``0 .. length - 1`` are used.
    ratios_array : sequence of float
        One ratio per output list. Each list receives
        ``floor(ratio * length)`` indexes, so some indexes can remain unused.

    Returns
    -------
    list of list of int
        One randomly sampled, mutually disjoint index list per ratio, in the
        order of ``ratios_array``.

    Raises
    ------
    ValueError
        Raised by ``random.sample`` when a ratio asks for more indexes than
        are still unassigned.
    """
    remaining = list(range(0, length))
    taken = set()  # O(1) membership test instead of re-chaining all earlier picks
    output_indexes = []
    for ratio in ratios_array:
        remaining = [i for i in remaining if i not in taken]
        # round() removes binary floating-point noise before flooring, e.g.
        # 0.29 * 100 == 28.999999999999996 must yield 29, not 28.
        count = math.floor(round(ratio * length, 9))
        selection = random.sample(remaining, k=count)
        taken.update(selection)
        output_indexes.append(selection)
    return output_indexes
    



In [16]:
# Random 80/20 split of the recordings into training and test indexes.
# Each share is floored, so with 28 recordings the split is 22 + 5 and one
# recording ends up in neither set (see the recorded output).
train_idxs, test_idxs = get_mixed_indexes_for_ml_train_test(len(X), [.80, 0.20])

X_train, X_test = X[train_idxs], X[test_idxs]
Y_train, Y_test = np.asarray(y)[train_idxs], np.asarray(y)[test_idxs]

print('Training on {} samples; testing on {} samples'.format(len(X_train), len(X_test)))

Training on 22 samples; testing on 5 samples


In [17]:
from sklearn.model_selection import cross_val_score

from sklearn import tree
from sklearn.naive_bayes import GaussianNB
from sklearn import neighbors
from sklearn.ensemble import AdaBoostClassifier
from sklearn.ensemble import GradientBoostingClassifier

def run_traditional_model(clf, model_name, X, y, k_fold=12, scale_data=False):
    """Cross-validate a classifier and print its mean accuracy.

    Gives every traditional model the same cross-validation call and the same
    result line.

    Parameters
    ----------
    clf : the scikit-learn classifier to be used
    model_name : str
        Name of the model, used in the printed line.
    X : flattened data
    y : labels
    k_fold : int, optional
        Number of folds (must be <= the number of participants in each group).
    scale_data : bool, optional
        When True the classifier is wrapped in a pipeline that applies
        ``StandardScaler`` first. Per the original notes this is most helpful
        for SVM, has no significant impact on most other classifiers and seems
        to be problematic for decision trees.

    Returns
    -------
    The array of per-fold scores returned by ``cross_val_score``.
    """
    estimator = clf
    scaling_note = ''
    if scale_data:
        estimator = Pipeline(steps=[('s', StandardScaler()), ('m', clf)])
        scaling_note = ' using scaled data '

    scores = cross_val_score(estimator, X, y, cv=k_fold)
    headline = '{} mean cross-validation accuracy (k-fold={}) {}:'.format(
        model_name, k_fold, scaling_note)
    print('%s %3.f%%' % (headline, scores.mean() * 100.))
    return scores



# Compare several scikit-learn classifiers on the flattened features with the
# same k-fold cross-validation. Each *_scores variable holds the per-fold
# scores returned by run_traditional_model.

# Number of folds = size of the smaller group (14 in the recorded run).
k_fold=min(len(hc_data_all_denoised_selected), len(sz_data_all_denoised_selected))
print('Assessing various classifiers. Using k-fold value of ',k_fold,'\n')

# SVM is the only model evaluated on standardized data (scale_data=True).
model_name = 'SVM'
# NOTE(review): `degree` is documented as applying to the 'poly' kernel only -- verify it has no effect with kernel='rbf'.
clf = SVC(gamma='scale', kernel='rbf', degree=3)## default values 
svm_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold, scale_data=True)

model_name = 'Gaussian Naive Bayes'
clf = GaussianNB()
gnb_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold)


model_name = 'K Nearest Neighbors'
n_neighbors = 2
clf = neighbors.KNeighborsClassifier(n_neighbors, weights='uniform') #weights={'uniform', 'distance'}
knn_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold)


model_name = 'Nearest Centroid'
clf = neighbors.NearestCentroid()
nc_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold)


# No random_state is set for the tree or for AdaBoost below, so their scores can differ between runs.
model_name = 'Decision Tree'
clf = tree.DecisionTreeClassifier()
decision_tree_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold)


model_name = 'Adaboost'
clf = AdaBoostClassifier(n_estimators=100)
adaboost_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold)


# Gradient boosting is the only model here with a fixed random_state.
model_name = 'Gradient Boosting Classifier'
clf = GradientBoostingClassifier(n_estimators=100, learning_rate=1.0,
    max_depth=1, random_state=0)
gbc_scores = run_traditional_model(clf, model_name, X, y, k_fold=k_fold)




Assessing various classifiers. Using k-fold value of  14 

SVM mean cross-validation accuracy (k-fold=14)  using scaled data :  43%
Gaussian Naive Bayes mean cross-validation accuracy (k-fold=14) :  54%
K Nearest Neighbors mean cross-validation accuracy (k-fold=14) :  50%
Nearest Centroid mean cross-validation accuracy (k-fold=14) :  29%
Decision Tree mean cross-validation accuracy (k-fold=14) :  93%
Adaboost mean cross-validation accuracy (k-fold=14) :  86%
Gradient Boosting Classifier mean cross-validation accuracy (k-fold=14) :  86%


In [18]:
## End of implementation code  
# Everything below only records the environment the notebook was run in.

print('Printing environment settings')

from platform import python_version
print('\nPython version: ', python_version())
print('\nInstalled modules:\n')

# Shell call: lists every installed package with its version.
!pip freeze

Printing environment settings

Python version:  3.7.1

Installed modules:

absl-py==0.7.1
alabaster==0.7.12
altgraph==0.16.1
anaconda-client==1.7.2
anaconda-navigator==1.9.6
anaconda-project==0.8.2
asn1crypto==0.24.0
astor==0.8.0
astroid==2.1.0
astropy==3.1
atomicwrites==1.2.1
attrs==18.2.0
Babel==2.6.0
backcall==0.1.0
backports.os==0.1.1
backports.shutil-get-terminal-size==1.0.0
beautifulsoup4==4.6.0
bitarray==0.8.3
bkcharts==0.2
blaze==0.11.3
bleach==3.0.2
blis==0.2.4
bokeh==1.0.2
boto==2.49.0
boto3==1.9.169
botocore==1.12.169
Bottleneck==1.2.1
cachetools==3.1.1
certifi==2018.11.29
cffi==1.11.5
chardet==3.0.4
Click==7.0
cloudpickle==0.6.1
clyent==1.2.2
colorama==0.4.1
colorlover==0.3.0
comtypes==1.1.7
conda==4.8.2
conda-build==3.17.6
conda-package-handling==1.3.11
conda-verify==3.1.1
contextlib2==0.5.5
cryptography==2.4.2
cufflinks==0.15
cycler==0.10.0
cymem==2.0.2
Cython==0.29.2
cytoolz==0.9.0.1
dask==1.0.0
datashape==0.5.4
decorator==4.3.0
defusedxml==0.5.0
distributed==1.25.1
docu

In [2]:
# Same channel lists as assigned in the cell that computes minimum_duration;
# repeated here to derive the channels that remain after exclusion.
all_channels = ['Fp2', 'F8', 'T4', 'T6', 'O2', 'Fp1', 'F7', 'T3', 'T5', 'O1', 'F4',
                'C4' , 'P4', 'F3', 'C3', 'P3', 'Fz' , 'Cz', 'Pz']

excluded_channels = ['F8', 'T3', 'T5', 'T6', 'O1', 'F3', 'Fp1', 'Fp2', 'P4']

# Channels kept for analysis, in the order of all_channels (10 names in the recorded output).
selected_channels = [ch for ch in all_channels if ch not in excluded_channels]
selected_channels

['T4', 'O2', 'F7', 'F4', 'C4', 'C3', 'P3', 'Fz', 'Cz', 'Pz']