In [9]:
import os
raw_data_dir = ''

if  'COLAB_GPU' in os.environ:
    print('Using Google Colab. Setting up environment')
    raw_data_dir = '/content/drive/My Drive/Colab Notebooks/' 
    !pip install mne
    !pip install pyedflib

    print('\n \n To load files from Google Drive, account validation is required.')
    #mount to drive -- files should be located in the Colab notebooks directory
    from google.colab import drive
    drive.mount("/content/drive", force_remount=True)
else:
    if 'HOMEPATH' in os.environ:
        print('Using homepath ' + os.environ['HOMEPATH'])
    #declare local data directory here:
    raw_data_dir = '../Data/Raw/' 

    


Using homepath \Users\marit


In [12]:
import pandas as pd
import numpy as np

import random 
import math

from sklearn.svm import SVC
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import accuracy_score

import pyedflib
import mne



In [7]:
# tmf
ignore_list = ['s12', 's14']  #list of patient files that should be skipped
#seconds of data to include in one slice
time_window = 250 * 20



In [10]:
# tmf

#returns file duration in seconds
def get_edf_file_duration(file_name):
    f = pyedflib.EdfReader(file_name)
    duration = f.file_duration
    f.close()
    return  duration

# get the minimum length of the files
def get_minimum_duration(group_directory_name, patient_group_file_prefix):
    file_durations = []
    for i in range (1, 15): # reading 14 files
        patient_id = "{}{:02d}".format(patient_group_file_prefix, i)
        file_name = raw_data_dir +'{}/{}.edf'.format(group_directory_name, patient_id)
        file_durations.append(get_edf_file_duration(file_name))

    return(min(file_durations))


minimum_duration = min(get_minimum_duration("Healthy Controls", "h"), get_minimum_duration('SZ Patients', 's'))
print('Minimum duration: ', minimum_duration)
all_channels = ['Fp2', 'F8', 'T4', 'T6', 'O2', 'Fp1', 'F7', 'T3', 'T5', 'O1', 'F4',
                'C4', 'P4', 'F3', 'C3', 'P3', 'Fz', 'Cz', 'Pz']
excluded_channels = ['O1', 'O2', 'T3', 'Fp1', 'Fp2', 'T6', 'F3', 'F4', 'F7', 'F8', 'Fz']


def get_raw_eeg_mne(file_name, tmin=None, tmax=None, exclude=[]):
    raw = mne.io.read_raw_edf(file_name, preload=True, exclude=exclude).load_data()
    raw.set_montage("standard_1020") #set montage to 10-20
    #print('tmin: ', tmin)
    #print('tmax: ', tmax)
    tmin = tmin if tmin else 1
    tmax = tmax if tmax else (get_edf_file_duration(file_name)-1) #get_edf_file_duration rounds values up
    raw.crop(tmin=tmin, tmax=tmax)
    raw.resample(250, npad="auto") #set sampling frequency to 250Hz
    
    return raw



Minimum duration:  740


In [13]:
# tmf
mne.set_log_level("WARNING")

def plot_examples(file_name, excluded_channels):
    patient_id = file_name.split("\\")[-1][:-4]
    print('Raw Data')
    raw = get_raw_eeg_mne(file_name)
    #events = mne.find_events(raw, stim_channel=raw.ch_names, initial_event=True, consecutive=True)
    raw.plot()
    df = raw.to_data_frame()
    print('Shape: ', df.shape)

    print('Cleaned Data ')
    print('Excluding channels {}; '.format(", ".join(excluded_channels)))
    print('Removing first 120s of data; and last 120s of shortest sample. Limiting all other samples to range of shortest sample')
    cleaned = get_raw_eeg_mne(file_name, tmin=120, tmax=minimum_duration-120, exclude=excluded_channels)
    cleaned.crop()
    cleaned.plot()
    print('Shape: ', cleaned.to_data_frame().shape)

from plotly.graph_objs import Layout, Scatter, Figure, Marker
import matplotlib.pyplot as plt

import plotly    
import chart_studio.plotly as py
from plotly.offline import download_plotlyjs, init_notebook_mode, iplot
import cufflinks as cf
#import plotly.graph_objs.layout.scene.Annotation
cf.go_offline()
import plotly.graph_objs as go
import plotly.tools as tls
#cl_layout = go.Layout(width=950, height=800)


def plot_eeg_plotly(raw, excluded_channels):
    n_channels = len(all_channels) - len(excluded_channels)
    picks = range(n_channels)
    start, stop = raw.time_as_index([0, -1])

    data, times = raw[picks[:n_channels], start:stop]
    ch_names = [raw.info['ch_names'][p] for p in picks[:n_channels]]
    
    step = 1. / n_channels
    kwargs = dict(domain=[1 - step, 1], showticklabels=False, zeroline=False, showgrid=False)
    mc = 'rgb(27,61,120)'
    # create objects for layout and traces
    layout = Layout(yaxis=go.layout.YAxis(kwargs), showlegend=False)
    layout.update({'yaxis%d' % (0 + 1): go.layout.YAxis(kwargs), 'showlegend': False})
    traces = [Scatter(x=times, y=data.T[:, 0], marker_color=mc)]
    

    # loop over the channels
    for ii in range(1, n_channels):
            kwargs.update(domain=[1 - (ii + 1) * step, 1 - ii * step])
            layout.update({'yaxis%d' % (ii + 1): go.layout.YAxis(kwargs), 'showlegend': False})
            traces.append(Scatter(x=times, y=data.T[:, ii], marker_color = mc, yaxis='y%d' % (ii + 1)))

    # add channel names using Annotations
    annotations = [go.layout.Annotation(x=-0.06, y=0, xref='paper', yref='y%d' % (ii + 1),
                                          text=ch_name, showarrow=False)
                              for ii, ch_name in enumerate(ch_names)]
    layout.update(annotations=annotations)
    traces.reverse() #set the fist trace to the bottom of the plot sine it is the only one with xaxis

    # set the size of the figure and plot it
    layout.update(autosize=False, width=900, height=400)
    fig = Figure(data=traces, layout=layout)
    fig.update_xaxes(ticks="outside", tickwidth=2, tickcolor='black', ticklen=10, side='top')

    iplot(fig, filename='shared xaxis')
    
    
#adapted from https://plot.ly/python/v3/ipython-notebooks/mne-tutorial/
def plot_examples_plotly(file_name, excluded_channels):
    patient_id = file_name.split("\\")[-1][:-4]
    ### print('\nRaw Data')
    ### raw = get_raw_eeg_mne(file_name)
    ### plot_eeg_plotly(raw, [])
    
    print('\nCleaned Data')
    print('Excluding channels {}; '.format(", ".join(excluded_channels)))
    print('Removing first 120s of data; and last 120s of shortest sample. Limiting all other samples to range of shortest sample')

    cleaned = get_raw_eeg_mne(file_name, tmin=120, tmax=minimum_duration-120, exclude=excluded_channels)
    cleaned.crop()
    plot_eeg_plotly(cleaned, excluded_channels)

# plot a random patient
sz_patient_list = list(range(1, 15, 1))
sz_patient_list.remove(12) #drop value from list of exclusions
sz_patient_list.remove(14) #drop value from list of exclusions
rand_patient_id =  random.choice(sz_patient_list)
rand_patient_file =  raw_data_dir + 'SZ Patients/{}.edf'.format("{}{:02d}").format('s', rand_patient_id)

rand_control_id = random.randrange(1, 15, 1)
rand_control_file = raw_data_dir + 'Healthy Controls/{}{:02d}.edf'.format('h', rand_control_id)

print('Example of Input Data From Random Patient')
print('Sz patient #{:02d}'.format(rand_patient_id))
plot_examples_plotly(rand_patient_file, [])

print("Example of Input Data From Random Control")
print('Control subject #{:02d}'.format(rand_control_id))
plot_examples_plotly(rand_control_file, [])

print('Ignored files: ')
print(",".join(ignore_list))
    
    

Example of Input Data From Random Patient
Sz patient #07

Cleaned Data
Excluding channels ; 
Removing first 120s of data; and last 120s of shortest sample. Limiting all other samples to range of shortest sample


KeyboardInterrupt: 

In [14]:
#tmf


# modified based on https://stackoverflow.com/a/48704557/2466781
def chunk(seq, size):
    sl = len(seq) - (len(seq)%size) #exclude values that will be out of range
    r = [np.asarray(seq[pos:pos + size]) for pos in range(0, sl, size)]
    """print('r begin')
    print(r)
    print('r end')"""
    return r

def chunk_list(nested_list, size):
    v = []
    for d in nested_list:
        df = pd.DataFrame(np.asarray(d))
        c = chunk(df, size)
        for e in c:
            v.append(e)
    return v

# modified version of process_patient_group in older notebooks
# Uses the raw EDF files and converts to dataframe, dropping the first 150 and last 30 seconds of the shortest  file
# All other files are trimmed similarly to produce the same size
# Adapted from page 1 of https://buildmedia.readthedocs.org/media/pdf/pyedflib/latest/pyedflib.pdf
def process_patient_group(group_directory_name, patient_group_file_prefix, 
                          minimum_original_duration,
                          plot_channels = False,
                          excluded_channels = []):
    meta_df = pd.DataFrame()
    meta = []
    patient_id_list = []

    for i in range (1, 15): # reading 14 files
        patient_id = "{}{:02d}".format(patient_group_file_prefix, i)
        patient_id_list.append(patient_id)
        
        file_name = raw_data_dir + '{}/{}.edf'.format(group_directory_name, patient_id)
        data = get_raw_eeg_mne(file_name, exclude=excluded_channels, tmin=120, tmax=minimum_duration-120)
        df = data.to_data_frame()
        ## based on visual inspection, drop the first 120 seconds
        if patient_id not in ignore_list:
            meta.append(np.asarray(df))            
            

    return np.asarray(meta)



In [15]:
#tmf - setup function

# create the /tmp directory if it doesn't already exist
import os
if not os.path.exists('tmp'):
    os.makedirs('tmp')


    
hc_data_all = process_patient_group('Healthy Controls', 'h', minimum_duration, excluded_channels=[])
display(np.asarray(hc_data_all).shape)

sz_data_all = process_patient_group('SZ Patients', 's', minimum_duration, excluded_channels=[])
display(np.asarray(sz_data_all).shape)
#print(sz_data)

(14, 125001, 19)

(12, 125001, 19)

In [18]:

# get ideal number of components
import numpy as np
from sklearn.decomposition import PCA
from sklearn.preprocessing import MinMaxScaler
#plt.grid()

X =  np.concatenate((hc_data_all, sz_data_all), axis=0)
data = X.reshape(X.shape[0] * X.shape[1], X.shape[2])

scaler = MinMaxScaler(feature_range=[0, 1])
#data_rescaled = scaler.fit_transform(data[1:, 0:8])
data_rescaled = scaler.fit_transform(data)

#Fitting the PCA algorithm with our Data
pca = PCA().fit(data_rescaled)
#Plotting the Cumulative Summation of the Explained Variance
plt.figure(figsize=(20,10))

plt.plot(np.cumsum(pca.explained_variance_ratio_))
plt.xlabel('Number of Components')
plt.ylabel('Variance (%)') #for each component
plt.title('RepOD EEG in Sz Dataset Explained Variance')
plt.grid()


MemoryError: 

In [96]:
feature_ratings = {}


from sklearn import decomposition
def rate_features(patient_data):
    all_features = []
  
    
    for entry in patient_data[:1]:
        sample_features = []
        pca = decomposition.PCA(n_components=19)
        df = entry.transpose()
        # normalize data
        #from sklearn import preprocessing
        #df_scaled = pd.DataFrame(preprocessing.scale(df)) 
        df_norm = (df - df.mean()) / df.std()

        pca_output = pca.fit_transform(df_norm)
        print('explained variance: ')
        print(pca.explained_variance_)
        print(pca.singular_values_)
        #print(pca_outuput.explained_variance)
    
        #print(pca.components_)
        #print(pca_denoise.components_)
        denoised_data = pca.components_
        
        #get coefficients used to calculate components
        #https://stackoverflow.com/questions/22984335/recovering-features-names-of-explained-variance-ratio-in-pca-with-sklearn
        #i = np.identity(df.shape[1])
        i = np.eye(df.shape[1])
        print(df.shape[1])
        coeffs = pca.transform(i)
        
        print(np.allclose(df_norm.values.dot(coef), pca.fit_transform(df_norm.values)))
        
        
        #print(pd.DataFrame(pca.components_))

        
        ######all_features.append(np.asarray(denoised_data)) 
        #print(denoised_data)
        
        #denoised_data = pca.transform(entry.transpose())
        denoised_patient_data = [x for _,x in sorted(zip(pca_output.explained_variance_,denoised_data))]
        
        # use explained variance to sort features by salience
        print(denoised_patient_data)
        
        
       



    return all_features
    

sample_size = minimum_duration #use entire window
#send all channels and all patient data; s07 is still skipped

rate_features(hc_data_all)
hc_data_all_denoised_selected = select_denoised_data(hc_data_all)
sz_data_all_denoised_selected = select_denoised_data(sz_data_all)

print('Shape of denoised data (extracted components) :')
print(np.asarray(hc_data_all_denoised_selected).shape)
print(np.asarray(sz_data_all_denoised_selected).shape)

explained variance: 
[5.29569690e+04 1.09464990e+04 1.04057472e+04 6.72521991e+03
 3.99676697e+03 2.22050034e+03 1.42506224e+03 1.28504007e+03
 1.10029635e+03 8.26151145e+02 5.16024148e+02 4.24550647e+02
 3.50432209e+02 2.78608594e+02 2.32296231e+02 2.22213583e+02
 1.87225583e+02 1.41160324e+02 9.16303963e-26]
[9.76332649e+02 4.43888480e+02 4.32785685e+02 3.47928094e+02
 2.68219696e+02 1.99922500e+02 1.60159671e+02 1.52087874e+02
 1.40731426e+02 1.21945564e+02 9.63765255e+01 8.74180281e+01
 7.94215321e+01 7.08163447e+01 6.46632210e+01 6.32443238e+01
 5.80522222e+01 5.04072001e+01 1.28426910e-12]


MemoryError: 

In [111]:
#https://machinelearningmastery.com/feature-selection-machine-learning-python/
X =  np.concatenate((hc_data_all, sz_data_all), axis=0)
y = ([0] * len(hc_data_all.reshape(-1, hc_data_all.shape[-1]))) + ( [1] * len(sz_data_all.reshape(-1, sz_data_all.shape[-1])))
from sklearn.ensemble import ExtraTreesClassifier

# feature selection
feature_selection_model = ExtraTreesClassifier(n_estimators=19)
print(len(y))
print(np.asarray(X.reshape(-1, X.shape[-1])).shape)
flattened_X = X.reshape(-1, X.shape[-1])
feature_selection_model.fit(flattened_X, y)


3250026
(3250026, 19)


ExtraTreesClassifier(bootstrap=False, ccp_alpha=0.0, class_weight=None,
                     criterion='gini', max_depth=None, max_features='auto',
                     max_leaf_nodes=None, max_samples=None,
                     min_impurity_decrease=0.0, min_impurity_split=None,
                     min_samples_leaf=1, min_samples_split=2,
                     min_weight_fraction_leaf=0.0, n_estimators=19, n_jobs=None,
                     oob_score=False, random_state=None, verbose=0,
                     warm_start=False)

In [112]:
#print(feature_selection_model.feature_importances_)

#sort the features in order of the importance score generated by the model
ranking = [x for _,x in sorted(zip(feature_selection_model.feature_importances_, all_channels), reverse=True)]
 
#print(sorted(feature_selection_model.feature_importances_, reverse=True))
print('Ranking of Channels, according to Feature Importance Model (Tree Classifier)')
print(ranking)

print('\n\nScores: ')
print(sorted(feature_selection_model.feature_importances_, reverse=True))
        

Ranking of Channels, according to Feature Importance Model (Tree Classifier)
['Cz', 'P3', 'P4', 'Pz', 'T4', 'O2', 'T3', 'C4', 'C3', 'T6', 'T5', 'F3', 'F4', 'O1', 'Fz', 'F7', 'Fp2', 'F8', 'Fp1']


Scores: 
[0.08060796353071435, 0.0658660290172496, 0.06373515775957168, 0.06188804002592751, 0.055908688057012816, 0.054996887616922265, 0.05264403404881429, 0.052529385820546885, 0.051163713488321244, 0.05045640505967478, 0.049220405286900984, 0.04915219964460713, 0.048336622980680144, 0.046249768898948204, 0.04412461680367867, 0.04407183106096143, 0.04323023450713641, 0.04291598476083345, 0.04290203163149817]


In [None]:
#

In [6]:
# flatten the feature vectors so that input can be used in scikit learn 
def flatten_features(data):
    flattened_data = []
    for entry in data: 
        # shift axes so that data shape is time * channels * features. Then flatten data
        flattened_data.append(np.moveaxis(entry, 0, -1).flatten())
    return np.asarray(flattened_data, dtype=np.float32)


In [7]:
# load extracted features

X =  np.concatenate((hc_data_all_denoised_selected, sz_data_all_denoised_selected), axis=0)
X_original = X.copy()
print('Input size: ', X.shape)
y = ([0] * len(hc_data_all_denoised_selected)) +( [1] * len(sz_data_all_denoised_selected))
sample_size = 2

X = flatten_features(X)
print('Flattened input size: ', X.shape)



Input size:  (17, 12, 740)
Flattened input size:  (17, 8880)


In [8]:
# get a list of randomly selected sets of numbers based on a range
# the proportion of values selected for each set is determined by the ratio_array
def get_mixed_indexes_for_ml_train_test(length, ratios_array):
    input_indexes = range(0, length)
    output_indexes = []
    for ratio in ratios_array:
        selection = random.choices(input_indexes, k=math.floor(ratio * length))
        input_indexes = [i for i in input_indexes if i not in selection]
        output_indexes.append(selection)
    return output_indexes
    



In [9]:
train_idxs, test_idxs = get_mixed_indexes_for_ml_train_test(len(X), [.70, 0.30])

X_train      = X[train_idxs][0:,]
Y_train      = np.asarray(y)[train_idxs]
X_test       = X[test_idxs][0:,]
Y_test       = np.asarray(y)[test_idxs]

print('Training on {} samples; testing on {} samples'.format(len(X_train), len(X_test)))

Training on 11 samples; testing on 5 samples


In [10]:
# from https://machinelearningmastery.com/indoor-movement-time-series-classification-with-machine-learning-algorithms/

scaler = StandardScaler()
##kernel = ‘linear’, ‘poly’, ‘rbf’, ‘sigmoid’,  | gamma = 'scale', 'auto'
svm = SVC(gamma='scale', kernel='rbf', degree=3, decision_function_shape = 'ovr')#default values 

model = Pipeline(steps=[('s',scaler), ('m', svm)]) 
model.fit(X_train, Y_train)
# predict
yhat = model.predict(X_test)
# evaluate
score = accuracy_score(Y_test, yhat) * 100
# summarize
print('%s %.3f%%' % ('SVM accuracy for Y_test: ', score))

SVM accuracy for Y_test:  60.000%


In [11]:
## End of implementation code  

print('Printing environment settings')

from platform import python_version
print('\nPython version: ', python_version())
print('\nInstalled modules:\n')

!pip freeze

Printing environment settings

Python version:  3.7.1

Installed modules:

absl-py==0.7.1
alabaster==0.7.12
altgraph==0.16.1
anaconda-client==1.7.2
anaconda-navigator==1.9.6
anaconda-project==0.8.2
asn1crypto==0.24.0
astor==0.8.0
astroid==2.1.0
astropy==3.1
atomicwrites==1.2.1
attrs==18.2.0
Babel==2.6.0
backcall==0.1.0
backports.os==0.1.1
backports.shutil-get-terminal-size==1.0.0
beautifulsoup4==4.6.0
bitarray==0.8.3
bkcharts==0.2
blaze==0.11.3
bleach==3.0.2
blis==0.2.4
bokeh==1.0.2
boto==2.49.0
boto3==1.9.169
botocore==1.12.169
Bottleneck==1.2.1
cachetools==3.1.1
certifi==2018.11.29
cffi==1.11.5
chardet==3.0.4
Click==7.0
cloudpickle==0.6.1
clyent==1.2.2
colorama==0.4.1
colorlover==0.3.0
comtypes==1.1.7
conda==4.5.12
conda-build==3.17.6
conda-verify==3.1.1
contextlib2==0.5.5
cryptography==2.4.2
cufflinks==0.15
cycler==0.10.0
cymem==2.0.2
Cython==0.29.2
cytoolz==0.9.0.1
dask==1.0.0
datashape==0.5.4
decorator==4.3.0
defusedxml==0.5.0
distributed==1.25.1
docutils==0.14
entrypoints==0.2.3
