In [1]:

from keras.utils import to_categorical
from keras.optimizers import Adam
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, Dropout, Flatten
from keras.layers.convolutional import MaxPooling2D
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
from scipy import signal
from glob import glob
import matplotlib.pyplot as plt

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# constants:
SUBSAMPLE = 1000  # scales the data by this factor
COLUMNS = ['HandStart', 'FirstDigitTouch',
        'BothStartLoadPhase', 'LiftOff',
        'Replace', 'BothReleased']
SUBJECTS = range(1, 2)
N_LABELS = 6

# Below are the paths to the data. Please pay attention to the % and * signs, these are needed in the loop.
TRAIN_DATA_PATH = 'C:/Users/Sebastiaan/Desktop/Programming/MachineLearning/Datasets/EEG/train/subj%d_series*_data.csv'
#The path below is for the test data used for a kaggle submission. This is not very relevant to our project.
TEST_DATA_PATH = 'C:/Users/Sebastiaan/Desktop/Programming/MachineLearning/Datasets/EEG/test/subj%d_series*_data.csv'
#TRAIN_DATA_PATH = 'C:/Users/bas/Documents/MachineLearning/train/subj%d_series*_data.csv' #path on my laptop
#TEST_DATA_PATH =  'C:/Users/bas/Documents/MachineLearning/test/subj%d_series*_data.csv' 

SUBMISSION_FOLDER = 'C:/Users/Sebastiaan/Desktop/Programming/MachineLearning/'
SUBMISSION_NAME = 'subbmision_vu_48_sub_pca_4.csv'

PCA_COMPONENTS = 0.8
CUTT_OFF_FREQUENCY = 2
ORDER = 4
SAMPLE_FREQUENCY = 500

In [3]:
def prepare_training_data(data_path):
    data = pd.read_csv(data_path)
    event_path = data_path.replace('_data', '_events')
    labels = pd.read_csv(event_path)
    clean_data = data.drop(['id'], axis = 1)
    labels = labels.drop(['id'], axis = 1)
    return clean_data, labels

In [4]:
def prepare_test_data(data_path):
    data = pd.read_csv(data_path)
    return data

In [5]:
def scaler_transform(data, scaler):
    if scaler == None:
        scaler = StandardScaler()
        return scaler.fit_transform(data), scaler
    else:
        return scaler.transform(data)        

In [6]:
def butter_worth(data, cut_off, hz, order):
    filtered_data = np.empty(np.shape(data))
    wn = cut_off / (hz/2)
    b, a = signal.butter(order, wn, analog = False)
    for i in range(len(data[0])):
        filtered_data[:,i] = signal.lfilter(b, a, data[:,i])    
    return filtered_data

In [7]:
def pca_transform(data, n_components, pca_model):
    if pca_model == None:
        pca = PCA(n_components, whiten = True) 
        return pca.fit_transform(data), pca
    else:
        return pca_model.transform(data)

In [8]:
def read_training_data(train_data_paths):
    labels_raw = []
    features_raw = []
    for data_path in train_data_paths:
        data, labels = prepare_training_data(data_path)
        features_raw.append(data)
        labels_raw.append(labels)
    features_raw = pd.concat(features_raw)
    labels_raw = pd.concat(labels_raw)
    x_train = np.asarray(features_raw.astype(float))
    y_train = np.asarray(labels_raw.astype(float))
    return x_train, y_train

In [9]:
def read_test_data(test_data_paths):
    test_features_raw = []
    ids = []
    for data_path in test_data_paths:
        data = prepare_test_data(data_path)
        test_features_raw.append(data)
        ids.append(np.array(data['id']))
    test_features_raw = pd.concat(test_features_raw)
    ids = np.concatenate(ids)
    test_features_raw = test_features_raw.drop(['id'], axis = 1)
    x_test = np.asarray(test_features_raw.astype(float))
    return x_test, ids

In [10]:
def train_model(x_train, y_train, model, subsample):
    model.fit(x_train[::subsample,:], y_train[::subsample])
    return model

In [12]:
def make_prediction(x_test, model):
    prediction = model.predict_proba(x_test)[:,1]
    return prediction

In [13]:
def metric_auc_score(predictions, y_test, with_plot):
    scores = []
    legend_text = []
    for i in range(N_LABELS):
        fpr, tpr, _  = roc_curve(y_test[:,i], predictions[:,i], 1)
        scores.append(roc_auc_score(y_test[:,i], predictions[:,i]))
        legend_text.append(COLUMNS[i]+' (area = %.3f)' % (scores[i]))
        if with_plot == True:
            plt.plot(fpr, tpr)
    if with_plot == True:
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves')
        plt.legend(legend_text)
        plt.show()
    return scores

In [14]:
def all_auc_scores(prediction_total, test_data_total, subjects, with_plot):
    scores = []
    for subject in subjects:
        score = metric_auc_score(prediction_total[subject-1],
                                 test_data_total[subject-1], with_plot)
        scores.append(score)
        print('Mean AUC Score of Subject %d: %.3f' % \
              (subject, np.mean(score)))
    return scores

In [15]:
def make_submission_file(name, ids_total, prediction_total, columns, path):
    submission = pd.DataFrame(index = np.concatenate(ids_total), columns = columns,
                              data = np.concatenate(prediction_total))
    submission.to_csv(path+name, index_label = 'id', float_format = '%.3f')

In [29]:
def generator(x_train, y_train, image_size):
    empty_matrix = np.expand_dims(np.zeros(np.shape(x_train[0:image_size])), axis=3)
    while True:
        for i in range(len(x_train)):
            empty_matrix = np.expand_dims(np.zeros(np.shape(x_train[0:image_size])), axis=3)
            if i-image_size < 0:
                yield np.array([empty_matrix]), np.expand_dims(y_train[i], axis=2)
            else:
                yield np.array([np.expand_dims(x_train[i-image_size:i],axis=3)]), np.expand_dims(y_train[i],axis=2)

In [None]:
def init_cnn(window):
    model = Sequential()
    model.add(Conv2D(16, kernel_size=(3, 3), activation='relu', input_shape=(window,32,1)))
    model.add(Conv2D(32, (3, 3), activation='relu'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25)) # Dropout 25% of the nodes of the previous layer during training
    model.add(Flatten())     # Flatten, and add a fully connected layer
    model.add(Dense(128, activation='relu')) 
    model.add(Dropout(0.5))
    model.add(Dense((1), activation='softmax')) # Last layer: 10 class nodes, with dropout
    model.summary()
    optimizer = Adam()
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [16]:
prediction_total = []
test_data_total = []
ids_total = []
for subject in SUBJECTS:
    test_features_raw = []
    train_data_paths = glob(TRAIN_DATA_PATH % (subject))
    test_data_paths =  glob(TEST_DATA_PATH % (subject))
  
    x_train, y_train = read_training_data(train_data_paths)  
    x_train, x_test, y_train, y_test = train_test_split(x_train, y_train, test_size=0.2)
    
    #Below you can put preprocessing functions. You can make a decision in which method to use
    #by commenting certain lines or not.
#     x_train = butter_worth(x_train, CUTT_OFF_FREQUENCY, SAMPLE_FREQUENCY, ORDER)
#     x_test = butter_worth(x_test, CUTT_OFF_FREQUENCY, SAMPLE_FREQUENCY, ORDER)
#     x_train, pca_model = pca_transform(x_train, PCA_COMPONENTS, None)
#     x_test = pca_transform(x_test, PCA_COMPONENTS, pca_model)
    x_train, scaler = scaler_transform(x_train, None)
    x_test = scaler_transform(x_test, scaler)
    #############################################
    

    predictions = np.empty((x_test.shape[0],6))
#     Below you define the model you want to use.
    logreg = LogisticRegression()
    for i in range(N_LABELS):
        print('Train subject %d, class %s' % (subject, COLUMNS[i]))
        model = train_model(x_train, y_train[:,i], logreg, SUBSAMPLE)
        predictions[:,i] = make_prediction(x_test, model)
    test_data_total.append(y_test)   
    prediction_total.append(predictions)

Train subject 1, class HandStart
Train subject 1, class FirstDigitTouch
Train subject 1, class BothStartLoadPhase
Train subject 1, class LiftOff
Train subject 1, class Replace
Train subject 1, class BothReleased


In [22]:
model = init_cnn(window=200)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_3 (Conv2D)            (None, 198, 30, 16)       160       
_________________________________________________________________
conv2d_4 (Conv2D)            (None, 196, 28, 32)       4640      
_________________________________________________________________
max_pooling2d_2 (MaxPooling2 (None, 98, 14, 32)        0         
_________________________________________________________________
dropout_3 (Dropout)          (None, 98, 14, 32)        0         
_________________________________________________________________
flatten_2 (Flatten)          (None, 43904)             0         
_________________________________________________________________
dense_3 (Dense)              (None, 128)               5619840   
_________________________________________________________________
dropout_4 (Dropout)          (None, 128)               0         
__________

In [30]:
my_generater = generator(x_train[::SUBSAMPLE], y_train[:,0][::SUBSAMPLE], 200)
model.fit_generator(my_generater, steps_per_epoch=1, epochs=15, verbose=2);

Epoch 1/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 2/15


  
  """
  import sys


 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 3/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 4/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 5/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 6/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 7/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 8/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 9/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 10/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 11/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 12/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 13/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 14/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00
Epoch 15/15
 - 0s - loss: 15.9424 - acc: 0.0000e+00


  This is separate from the ipykernel package so we can avoid doing imports until


1137913

[array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([[1.]], dtype=float32),
 array([

In [None]:
scores = all_auc_scores(prediction_total, test_data_total, SUBJECTS, with_plot=True)

In [None]:
print('Mean Columnwise AUC ROC Score: %.3f, %.3f' % (np.mean(scores), np.std(scores)))

In [None]:
#########################KAGGLE SUBMISSION CODE

# prediction_total= []
# ids_total = []
# for subject in SUBJECTS:
#     test_features_raw = []
#     train_data_paths = glob(TRAIN_DATA_PATH % (subject))
#     test_data_paths =  glob(TEST_DATA_PATH % (subject))
  
#     x_train, y_train = read_training_data(train_data_paths)
#     x_test, ids = read_test_data(test_data_paths)
#     ids_total.append(ids)
    
    
    
    #Below you can put preprocessing functions. You can make a decision in which method to use
    #by commenting certain lines or not.#     x_train = butter_worth(x_train, CUTT_OFF_FREQUENCY, SAMPLE_FREQUENCY, ORDER)
#     x_test = butter_worth(x_test, CUTT_OFF_FREQUENCY, SAMPLE_FREQUENCY, ORDER)
#     x_train, pca_model = pca_transform(x_train, PCA_COMPONENTS, None)
#     x_test = pca_transform(x_test, PCA_COMPONENTS, pca_model)
#     x_train, scaler = scaler_transform(x_train, None)
#     x_test = scaler_transform(x_test, scaler)
#     #############################################
    
#     predictions = np.empty((x_test.shape[0],6))
#     #Below you define the model you want to use.
#     logreg = LogisticRegression()
#     for i in range(N_LABELS):
#         print('Train subject %d, class %s' % (subject, COLUMNS[i]))
#         model = train_model(x_train, y_train[:,i], logreg, SUBSAMPLE)
#         predictions[:,i] = make_prediction(x_test, model)
        
#     prediction_total.append(predictions)

# make_submission_file(SUBMISSION_NAME, ids_total, prediction_total, COLUMNS, SUBMISSION_FOLDER)