In [1]:
import sys
import warnings
from glob import glob

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, Dropout, Flatten, Conv1D
from keras.layers.convolutional import MaxPooling2D, MaxPooling1D
from keras.models import Sequential
from keras.models import load_model
from keras.optimizers import Adam, Adadelta
from keras.utils import to_categorical
from scipy import signal
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.utils import class_weight
from sklearn.utils import resample
warnings.filterwarnings('ignore')

  from ._conv import register_converters as _register_converters
Using TensorFlow backend.


In [2]:
# Constants shared by the cells below.
# NOTE(review): SUBSAMPLE, SUBMISSION_FOLDER, SUBMISSION_NAME, PCA_COMPONENTS,
# CUTT_OFF_FREQUENCY, ORDER, SAMPLE_FREQUENCY and EPOCHS are not referenced by
# any code visible in this notebook; presumably left over from an earlier
# pipeline -- confirm before removing.
SUBSAMPLE = 100  # scales the data by this factor; dividing the steps per epoch by the batch size is an alternative
# Names of the six event-label columns read from the *_events.csv files.
COLUMNS = ['HandStart', 'FirstDigitTouch',
        'BothStartLoadPhase', 'LiftOff',
        'Replace', 'BothReleased']
SUBJECTS = range(1, 2)  # yields only subject 1
N_LABELS = 6

# Paths to the data. The %d (subject number) and * (series wildcard) are
# required: the path is formatted with the subject and then passed to glob.
# NOTE(review): these are absolute, machine-specific paths; the notebook will
# not run elsewhere without editing them.
#TRAIN_DATA_PATH = 'C:/Users/Sebastiaan/Desktop/Programming/MachineLearning/Datasets/EEG/train/subj%d_series*_data.csv'
TRAIN_DATA_PATH = 'C:/Users/Gebruiker/Documents/Untitled Folder/train/train/subj%d_series*_data.csv'
# The path below is for the test data used for a Kaggle submission. It is not very relevant to this project.
# TEST_DATA_PATH = 'C:/Users/Sebastiaan/Desktop/Programming/MachineLearning/Datasets/EEG/test/subj%d_series*_data.csv'
# TRAIN_DATA_PATH = 'C:/Users/bas/Documents/MachineLearning/train/subj%d_series*_data.csv'  # path on my laptop
# TEST_DATA_PATH =  'C:/Users/bas/Documents/MachineLearning/test/subj%d_series*_data.csv'

SUBMISSION_FOLDER = 'C:/Users/Sebastiaan/Desktop/Programming/MachineLearning/'
SUBMISSION_NAME = 'subbmision_vu_48_sub_pca_4.csv'

PCA_COMPONENTS = 0.8
CUTT_OFF_FREQUENCY = 2
ORDER = 4
SAMPLE_FREQUENCY = 500

EPOCHS = 5
WINDOW_SIZE = 500  # number of consecutive rows per window in the generator cells

In [20]:
def get_complete_data(
        data_path="C:/Users/Gebruiker/Documents/Untitled Folder/train/train/subj1_series1_data.csv",
        events_path="C:/Users/Gebruiker/Documents/Untitled Folder/train/train/subj1_series1_events.csv",
        start=4500,
        stop=5500):
    """Load one recording together with its event labels as a float array.

    The signal CSV is read, its 'id' column is dropped, and the six event
    columns from the events CSV are appended on the right.

    Args:
        data_path: CSV file holding the signal columns plus an 'id' column.
        events_path: CSV file holding the six event-label columns.
        start: first row (inclusive) of the slice that is returned.
        stop: last row (exclusive) of the slice that is returned.

    Returns:
        2-D float ndarray of rows ``start:stop``; the last six columns are the
        event labels, in the order listed below.

    The defaults reproduce the original hard-coded behaviour (subject 1,
    series 1, rows 4500-5499).
    """
    data = pd.read_csv(data_path).drop(['id'], axis=1)
    labels = pd.read_csv(events_path)
    # Append the label columns in a fixed order; callers rely on the labels
    # being the last six columns of the returned array.
    for column in ('HandStart', 'FirstDigitTouch', 'BothStartLoadPhase',
                   'LiftOff', 'Replace', 'BothReleased'):
        data[column] = labels[column]
    return np.asarray(data.iloc[start:stop].astype(float))

In [21]:
def prepare_training_data(data_path):
    """Read one training recording and its matching events file.

    Args:
        data_path: path to a ``*_data.csv`` file. The events file is expected
            next to it, under the same name with ``_data`` replaced by
            ``_events``.

    Returns:
        ``(features, labels)`` DataFrames, each with the 'id' column removed.
    """
    # Swap only the LAST '_data' so that a directory whose name happens to
    # contain '_data' is left untouched. A path without '_data' is unchanged.
    event_path = '_events'.join(data_path.rsplit('_data', 1))
    clean_data = pd.read_csv(data_path).drop(['id'], axis=1)
    labels = pd.read_csv(event_path).drop(['id'], axis=1)
    return clean_data, labels

In [22]:
def prepare_test_data(data_path):
    """Load one test CSV and return it as a DataFrame with all columns intact."""
    return pd.read_csv(data_path)

In [23]:
def scaler_transform(data, scaler):
    """Standardise ``data``, fitting a new scaler when none is supplied.

    Args:
        data: samples to transform.
        scaler: an already fitted scaler, or ``None`` to fit a fresh
            ``StandardScaler`` on ``data``.

    Returns:
        ``(transformed, scaler)`` when ``scaler`` is ``None``; otherwise only
        the transformed data. The two shapes differ on purpose to stay
        compatible with existing callers.
    """
    # Identity check: `== None` would defer to a custom __eq__ on the scaler.
    if scaler is None:
        fitted = StandardScaler()
        return fitted.fit_transform(data), fitted
    return scaler.transform(data)

In [24]:
def read_training_data(train_data_paths):
    """Collect a fixed row window from every training file as float arrays.

    For each path the recording and its events are loaded with
    ``prepare_training_data`` and rows 1722-2070 of both are kept. The kept
    pieces are concatenated in path order.

    Returns:
        ``(x_train, y_train)`` float ndarrays built from the features and the
        labels respectively.
    """
    window = slice(1722, 2071)
    feature_frames, label_frames = [], []
    for path in train_data_paths:
        features, events = prepare_training_data(path)
        feature_frames.append(features[window])
        label_frames.append(events[window])
    all_features = pd.concat(feature_frames)
    all_labels = pd.concat(label_frames)
    return (np.asarray(all_features.astype(float)),
            np.asarray(all_labels.astype(float)))

In [25]:
def read_test_data(test_data_paths):
    """Load every test file and split it into a feature array and the row ids.

    Returns:
        ``(x_test, ids)``: the concatenated non-id columns as a float ndarray,
        and the concatenated 'id' values in the same file order.
    """
    frames = [prepare_test_data(path) for path in test_data_paths]
    id_chunks = [np.array(frame['id']) for frame in frames]
    combined = pd.concat(frames)
    ids = np.concatenate(id_chunks)
    without_ids = combined.drop(['id'], axis=1)
    return np.asarray(without_ids.astype(float)), ids

In [26]:
def metric_auc_score(predictions, y_test, with_plot):
    """Compute the ROC AUC of each of the N_LABELS label columns.

    Args:
        predictions: 2-D array of scores, indexed as ``predictions[:, i]``.
        y_test: 2-D array of true labels, indexed as ``y_test[:, i]``.
        with_plot: when truthy, draw one ROC curve per label and show the
            figure with a legend listing each label's AUC.

    Returns:
        List of N_LABELS AUC scores, in COLUMNS order.
    """
    scores = []
    legend_text = []
    for i in range(N_LABELS):
        truth, scored = y_test[:, i], predictions[:, i]
        scores.append(roc_auc_score(truth, scored))
        legend_text.append(COLUMNS[i]+' (area = %.3f)' % (scores[i]))
        if with_plot:
            # The curve is only needed for plotting. pos_label is keyword-only
            # in current scikit-learn, so it must not be passed positionally.
            fpr, tpr, _ = roc_curve(truth, scored, pos_label=1)
            plt.plot(fpr, tpr)
    if with_plot:
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves')
        plt.legend(legend_text)
        plt.show()
    return scores

In [27]:
def single_metric_auc_score(predictions, y_test, with_plot):
    """Compute, print and optionally plot the ROC AUC of a single label.

    Args:
        predictions: predicted scores for one label.
        y_test: true values for the same label.
        with_plot: when truthy, draw and show the ROC curve.

    Returns:
        The AUC score.
    """
    score = roc_auc_score(y_test, predictions)
    # NOTE(review): the printed name is always COLUMNS[0], whichever label was
    # actually passed in -- confirm callers only score the first label.
    print(COLUMNS[0]+' AUC score = %.3f' % (score))
    if with_plot:
        # pos_label is keyword-only in current scikit-learn.
        fpr, tpr, _ = roc_curve(y_test, predictions, pos_label=1)
        plt.plot(fpr, tpr)
        plt.xlabel('False Positive Rate')
        plt.ylabel('True Positive Rate')
        plt.title('ROC Curves')
        plt.show()
    return score

In [28]:
def all_auc_scores(prediction_total, test_data_total, subjects, with_plot):
    """Score every subject with ``metric_auc_score`` and print each mean AUC.

    Subject numbers are 1-based; subject ``s`` uses entry ``s - 1`` of both
    ``prediction_total`` and ``test_data_total``.

    Returns:
        One list of per-label AUC scores per subject, in iteration order.
    """
    per_subject = []
    for subject_id in subjects:
        slot = subject_id - 1
        subject_scores = metric_auc_score(prediction_total[slot],
                                          test_data_total[slot], with_plot)
        per_subject.append(subject_scores)
        mean_auc = np.mean(subject_scores)
        print('Mean AUC Score of Subject %d: %.3f' % (subject_id, mean_auc))
    return per_subject

In [29]:
# def image_mappping(x_train, WINDOW_SIZE):
#     result = []
#     empty_matrix = np.atleast_3d(np.zeros(np.shape(x_train[0:WINDOW_SIZE])))
#     for i in range(len(x_train)):
#         if i-WINDOW_SIZE < 0:
#             result.append(empty_matrix)
#         else:
#             result.append(np.atleast_3d(x_train[i-WINDOW_SIZE:i]))
#     return result

def image_mapping(x_train, WINDOW_SIZE):
    """Turn a sequence of rows into one trailing window per row.

    Entry ``i`` of the result holds the ``WINDOW_SIZE`` rows that precede
    row ``i`` (``x_train[i-WINDOW_SIZE:i]``; row ``i`` itself is excluded).
    The first ``WINDOW_SIZE`` entries have no complete history and are all
    zeros instead.

    Args:
        x_train: 2-D array-like, one row per time step.
        WINDOW_SIZE: number of rows per window.

    Returns:
        List with ``len(x_train)`` entries, each ``WINDOW_SIZE`` rows tall.
    """
    # Build the padding from WINDOW_SIZE itself rather than from a slice of
    # the data, so it is full height even when the input is shorter than one
    # window. The same zero array is reused for every padded entry; do not
    # mutate it in place.
    zero_window = np.zeros((WINDOW_SIZE,) + tuple(np.shape(x_train)[1:]))
    return [
        zero_window if end < WINDOW_SIZE else x_train[end - WINDOW_SIZE:end]
        for end in range(len(x_train))
    ]

In [30]:
def train_generator(x_train, y_train, WINDOW_SIZE):
    """Endlessly yield ``(window, label)`` batches of size one.

    Args:
        x_train: rows to be windowed with ``image_mapping``.
        y_train: one label row per row of ``x_train``.
        WINDOW_SIZE: number of rows per window.

    Yields:
        ``(np.array([window]), np.array([label]))``, cycling over the data
        forever.

    Raises:
        ValueError: if there is nothing to yield; without this check the
            outer loop would spin forever without producing a batch.
    """
    # The helper defined in this notebook is `image_mapping`; the previous
    # spelling `image_mappping` raised NameError on a fresh kernel.
    windows = image_mapping(x_train, WINDOW_SIZE)
    if len(windows) == 0 or len(y_train) == 0:
        raise ValueError('train_generator needs at least one sample and one label')
    while True:
        for image, task in zip(windows, y_train):
            yield np.array([image]), np.array([task])

In [31]:
def test_generator(x_test, WINDOW_SIZE):
    """Endlessly yield single-window batches for prediction.

    Args:
        x_test: rows to be windowed with ``image_mapping``.
        WINDOW_SIZE: number of rows per window.

    Yields:
        ``np.array([window])``, cycling over the data forever.

    Raises:
        ValueError: if ``x_test`` is empty; without this check the outer loop
            would spin forever without producing a batch.
    """
    # The helper defined in this notebook is `image_mapping`; the previous
    # spelling `image_mappping` raised NameError on a fresh kernel.
    windows = image_mapping(x_test, WINDOW_SIZE)
    if len(windows) == 0:
        raise ValueError('test_generator needs at least one sample')
    while True:
        for image in windows:
            yield np.array([image])

In [32]:
def simple_val(vals):
    """Collapse label rows into a single class number per row.

    Each row maps to ``index + 1`` of its first element equal to 1, or to 0
    when no element equals 1.

    Args:
        vals: sequence of label rows.

    Returns:
        Float ndarray of shape ``(len(vals), 1)``.
    """
    # Start from zeros so rows without a 1 (including zero-length rows) get a
    # defined value rather than whatever np.empty left in memory.
    new = np.zeros([len(vals), 1])
    for i, value in enumerate(vals):
        for index, flag in enumerate(value):
            if flag == 1:
                new[i] = index + 1
                break
    return new

In [33]:
def class_weights(y_train):
    """Compute 'balanced' class weights for the first label column.

    Args:
        y_train: 2-D label array; only column 0 is used.

    Returns:
        Dict with keys 0 and 2 holding the weights, kept exactly as the
        original code mapped them (see the review note below).
    """
    first_label = y_train[:, 0]
    # Use a distinct local name: assigning to `class_weight` here made it a
    # local variable and raised UnboundLocalError before the sklearn module
    # could be reached. The arguments are keyword-only in current scikit-learn.
    weights = class_weight.compute_class_weight(
        'balanced', classes=np.unique(first_label), y=first_label)
    # NOTE(review): the weights come back in sorted class order, so this
    # gives key 0 the weight of the second class and key 2 the weight of the
    # first. That looks swapped, and key 2 matches no class of a binary
    # column -- confirm the intended mapping.
    return {0 : weights[1], 2: weights[0]}

In [34]:
def resampling(comb):
    """Balance the rows by upsampling those that carry at least one label.

    A row belongs to the majority group when its last six values sum to 0
    and to the minority group otherwise. The minority group is resampled
    with replacement (fixed seed 123) up to the size of the majority group.

    Returns:
        ndarray holding the majority rows followed by the upsampled minority
        rows.
    """
    idle_rows, event_rows = [], []
    for row in comb:
        bucket = idle_rows if sum(row[-6:]) == 0 else event_rows
        bucket.append(row)
    df_majority = np.array(idle_rows)
    df_minority = np.array(event_rows)
    upsampled = resample(
        df_minority,
        replace=True,                # draw with replacement
        n_samples=len(df_majority),  # match the majority group size
        random_state=123,            # reproducible draw
    )
    return np.concatenate((df_majority, upsampled))

In [119]:
def init_cnn(window):
    """Build and compile the 2-D CNN used on 8x4x1 samples.

    Layers: two 2x2 convolutions (16 then 32 filters), 2x2 max pooling,
    25% dropout, a 128-unit dense layer, 50% dropout and a 6-unit sigmoid
    output. The model is compiled with Adadelta and binary cross-entropy,
    and its summary is printed.

    Args:
        window: unused by this architecture; kept so existing calls such as
            ``init_cnn(window=593)`` keep working. An earlier Conv1D variant
            of this function used it as the input length.

    Returns:
        The compiled Keras ``Sequential`` model.
    """
    model = Sequential()
    model.add(Conv2D(16, kernel_size=(2, 2), activation='relu', input_shape=(8,4,1)))
    # Name the kernel size explicitly. The old positional form `Conv2D(32, 2, 2)`
    # only meant a 2x2 kernel through Keras' legacy argument conversion; under
    # the current signature the third positional argument is `strides`. The
    # recorded summary (output 6x2x32, 2080 parameters) matches a 2x2 kernel
    # with stride 1, which is what this line builds.
    model.add(Conv2D(32, kernel_size=(2, 2), activation='relu'))
    model.add(MaxPool2D(pool_size=(2, 2)))
    model.add(Dropout(0.25))  # drop 25% of the previous layer's outputs during training
    model.add(Flatten())      # flatten before the fully connected layers
    model.add(Dense(128, activation='relu'))
    model.add(Dropout(0.5))
    model.add(Dense(6, activation='sigmoid'))  # output layer: 6 sigmoid units
    model.summary()
    optimizer = Adadelta()
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    return model

In [120]:
# Print arrays in full instead of summarising them. NumPy rejects NaN as a
# threshold; sys.maxsize is the documented way to switch summarisation off.
# The setting is global, so it is applied once rather than once per subject.
np.set_printoptions(threshold=sys.maxsize)

for subject in SUBJECTS:
    # NOTE(review): test_features_raw and train_data_paths are not used by any
    # code visible in this notebook; they are kept because later cells may
    # read these kernel-global names.
    test_features_raw = []
    train_data_paths = glob(TRAIN_DATA_PATH % (subject))

    # Load the fixed slice of subject 1 / series 1 and balance it so rows with
    # an event are as frequent as rows without one.
    complete_x = get_complete_data()
    complete_resampled = resampling(complete_x)

    # The last N_LABELS columns are the labels; everything before them is signal.
    x = complete_resampled[:, :-N_LABELS]
    y = complete_resampled[:, -N_LABELS:]
    print(x.shape)

    # Fixed seed (the same one resampling uses) so the split is reproducible
    # across kernel restarts.
    x_train, x_test, y_train, y_test = train_test_split(
        x, y, test_size=0.5, random_state=123)

(1186, 32)


In [121]:
# Build and compile the CNN. init_cnn never reads `window` in its executable
# code, so the value 593 has no effect on the resulting model.
model = init_cnn(window=593)

_________________________________________________________________
Layer (type)                 Output Shape              Param #   
conv2d_15 (Conv2D)           (None, 7, 3, 16)          80        
_________________________________________________________________
conv2d_16 (Conv2D)           (None, 6, 2, 32)          2080      
_________________________________________________________________
max_pooling2d_6 (MaxPooling2 (None, 3, 1, 32)          0         
_________________________________________________________________
dropout_9 (Dropout)          (None, 3, 1, 32)          0         
_________________________________________________________________
flatten_11 (Flatten)         (None, 96)                0         
_________________________________________________________________
dense_15 (Dense)             (None, 128)               12416     
_________________________________________________________________
dropout_10 (Dropout)         (None, 128)               0         
__________

In [122]:
# Arrange the 32 values of each sample as an 8x4 grid and append a channel
# axis, giving the (8, 4, 1) input shape that init_cnn declares.
x_train = x_train.reshape(len(x_train),8,4)
x_train = x_train[:, :, :, None]
# Keras holds back one sixth of the training rows for validation.
model.fit(x_train,y_train,epochs=10, batch_size=30, validation_split=1/6)

Train on 494 samples, validate on 99 samples
Epoch 1/10
Epoch 2/10
Epoch 3/10
Epoch 4/10
Epoch 5/10
Epoch 6/10
Epoch 7/10
Epoch 8/10
Epoch 9/10
Epoch 10/10


<keras.callbacks.History at 0x273187edba8>

In [126]:
#model.evaluate(x_test,y_test)
#np.set_printoptions(threshold=np.nan)
# Apply the same 8x4x1 reshaping that was used for the training samples.
x_test = x_test.reshape(len(x_test),8,4)
x_test = x_test[:, :, :, None]
print(x_test.shape)
predictions = model.predict(x_test)
# NOTE(review): this displays the whole prediction array, and the recorded
# output is rows of zeros; showing predictions[:5] would be enough.
predictions

(593, 8, 4, 1)


array([[0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0., 0., 0., 0., 0., 0.],
       [0.

In [46]:
# NOTE(review): this import sits in the middle of the notebook, and the whole
# cell is repeated verbatim further down; the import belongs in the first
# cell and the body in a shared function.
from sklearn.metrics import accuracy_score, confusion_matrix
pred = model.predict(x_test)
# Reduce predictions and labels to the index of their largest column.
# NOTE(review): argmax of an all-zero label row is 0, so rows without any
# event are counted under the same index as the first label column.
pred = np.argmax(pred, axis=1)
y_compare = np.argmax(y_test,axis=1)
score = accuracy_score(y_compare,pred)
cm = confusion_matrix(y_compare,pred)
print(cm)
print(score)

[[902   0   0   0]
 [134   0   0   0]
 [ 48   0   0   0]
 [312   0   0   0]]
0.6461318051575932


In [91]:
# Windowed training run (window of 250 rows). The validation generator is also
# built with train_generator because validation needs (window, label) pairs.
# NOTE(review): in this notebook `class_weights` is a function, and it is
# passed here without being called; Keras expects a dict -- confirm what the
# name was bound to when this cell ran.
# NOTE(review): len(...)/50 is a float in Python 3; use // if whole steps are
# intended.
my_generater = train_generator(x_train, y_train, 250)
my_test_generater = train_generator(x_test, y_test, 250)
model.fit_generator(my_generater, steps_per_epoch=len(x_train)/50, epochs=4, 
                     verbose=1, class_weight=class_weights,
                    validation_data=my_test_generater, 
                    validation_steps = len(x_test)/50)

Epoch 1/4
Epoch 2/4
Epoch 3/4
Epoch 4/4


<keras.callbacks.History at 0x20e001917f0>

In [62]:
# Predict on windows of 250 rows. Each generator step yields one window, so
# only about len(x_test)/100 windows are scored, not the whole test set.
# NOTE(review): the recorded output is 14 predictions that are all 1.
my_test_generater = test_generator(x_test, 250)
predictions = model.predict_generator(my_test_generater, steps=len(x_test)/100)
predictions

array([[1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.],
       [1.]], dtype=float32)

In [103]:
# NOTE(review): the recorded run of this cell failed with
# "IndexError: index 250 is out of bounds for axis 0 with size 1".
# The index 250 selects a single sample (dropping the first axis) rather than
# a range, and the assignment overwrites x_train in place, so the cell is not
# safe to re-run.
x_train = x_train[250,:,:,]
model.fit(x_train,y_train,epochs=10, batch_size=30, validation_split=1/6)

IndexError: index 250 is out of bounds for axis 0 with size 1

In [65]:
# NOTE(review): verbatim copy of the earlier accuracy / confusion-matrix cell.
# The recorded run failed because the model held by the kernel at that time
# expected 3-dimensional input (conv1d_17_input) while x_test was
# (1396, 8, 4, 1).
from sklearn.metrics import accuracy_score, confusion_matrix
pred = model.predict(x_test)
# Reduce predictions and labels to the index of their largest column.
pred = np.argmax(pred, axis=1)
y_compare = np.argmax(y_test,axis=1)
score = accuracy_score(y_compare,pred)
cm = confusion_matrix(y_compare,pred)
print(cm)
print(score)

ValueError: Error when checking : expected conv1d_17_input to have 3 dimensions, but got array with shape (1396, 8, 4, 1)

In [None]:
# Windowed training run using the WINDOW_SIZE constant. The validation
# generator is also built with train_generator because validation needs
# (window, label) pairs.
# NOTE(review): this cell has no execution count, so it was not run in the
# saved session. len(...)/50 is a float in Python 3; use // if whole steps
# are intended.
my_generater = train_generator(x_train, y_train, WINDOW_SIZE)
my_test_generater = train_generator(x_test, y_test, WINDOW_SIZE)
model.fit_generator(my_generater, steps_per_epoch=len(x_train)/50, epochs=4, 
                     verbose=1, 
                    validation_data=my_test_generater, 
                    validation_steps = len(x_test)/50)

In [None]:
# Predict on WINDOW_SIZE-row windows. Each generator step yields one window,
# so only about len(x_test)/50 windows are scored, not the whole test set.
my_test_generater = test_generator(x_test, WINDOW_SIZE)
predictions = model.predict_generator(my_test_generater, steps=len(x_test)/50)

In [None]:
# Overwrite the first prediction with 0, then display the array.
# NOTE(review): presumably because the first windows are zero padding with no
# real history -- confirm. This mutates `predictions` in place.
predictions[0] = 0
predictions

In [None]:
# Score the held-out windows. The predictions must come from x_test so that
# they line up row-for-row with y_test; previously they were computed from
# x_train through the undefined name `image_mappping`.
# image_mapping returns a Python list, so it is stacked into one array first.
y_pred = model.predict_proba(np.array(image_mapping(x_test, WINDOW_SIZE)))
roc_auc_score(y_test, y_pred)

In [None]:
# Compute the AUC with a ROC plot. single_metric_auc_score already prints the
# score itself, so the value is printed twice.
score = single_metric_auc_score(predictions, y_test, True)
print('AUC score: %.3f' % (score))

In [None]:
model.save('my_model.h5')  # writes the model to the HDF5 file 'my_model.h5' in the working directory

In [36]:
# NOTE(review): leftover debugging aid -- dumps the numbered inputs and outputs
# of history entries 0-50 from the kernel session; remove before sharing.
%history -n -o 0-50

   0: 
   1:

from keras.utils import to_categorical
from keras.optimizers import Adam, Adadelta
from keras.models import Sequential
from keras.layers import Dense, Activation, Conv2D, MaxPool2D, Dropout, Flatten, Conv1D
from keras.layers.convolutional import MaxPooling2D, MaxPooling1D
from keras.models import load_model
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_curve, auc, roc_auc_score
from sklearn.utils import class_weight
from scipy import signal
from glob import glob
import matplotlib.pyplot as plt
   2:
# constants:
SUBSAMPLE = 100  # scales the data by this factor can be replaced by divding the steps per epoch by batch size
COLUMNS = ['HandStart', 'FirstDigitTouch',
        'BothStartLoadPhase', 'LiftOff',
        'Replace', 'BothReleased']
SUBJECTS = range(1, 2