# A Basic EEG Classifier with TPOT
 
1. Perform the same data wrangling techniques in the inception classifer to get panels filled with 2s epoch data
2. Send data into a TPOT pipeline and see what happens

In [1]:
import pandas as pd
import numpy as np
import scipy.stats as stats
import scipy.signal as signal
import matplotlib.pyplot as plt
from sklearn.cross_validation import train_test_split  
from tpot import TPOTClassifier  

EPOCH_LENGTH = 440
VARIANCE_THRESHOLD = 600



In [3]:
# Data has been collected, let's import it

open_data = pd.read_csv("../muse-data//DanoThursdayOpenRawEEG0.csv", header=0, index_col=False)
closed_data = pd.read_csv("../muse-data/DanoThursdayClosedRawEEG1.csv", header=0, index_col=False)

In [4]:
# Unfortunately, haven't come up with a good way to feed multi-dimensional data (i.e. including all 4 channels) into sklearn yet.
# To get around this, we'll drop everything except Channel 1's EEG data so everything works

open_array = open_data['Channel 1']
closed_array = closed_data['Channel 1']

In [5]:
# Prune a few rows from the tail of these arrays so that they are all divisible by our desired epoch length

open_overflow = open_array.size % EPOCH_LENGTH
open_array = open_array[0:-open_overflow]
closed_overflow = closed_array.size % EPOCH_LENGTH
closed_array = closed_array[0:-closed_overflow]

In [6]:
'''
Split DataFrames into many different dataframes 440 samples long
np.array_split breaks apart a single array into arrays with a certain length
in this case, it splits every 440 rows into different arrays
np.stack puts multiple arrays on top of each other along an axis
here it stacks all the 440-length arrays we created on top of each other as different rows in a matrix
'''

split_open_data = np.stack(np.array_split(open_array, EPOCH_LENGTH), axis=1)
split_closed_data = np.stack(np.array_split(closed_array, EPOCH_LENGTH), axis=1)

# Transform data into a 3D pandas Panel ( n epochs x 4 channels x 440 samples )

open_df = pd.DataFrame(split_open_data)
closed_df = pd.DataFrame(split_closed_data)
open_df.shape[0]


202

In [31]:
# Remove epochs with too much variance

def removeNoise(df):
    for index, row in df.iterrows():
        if np.var(row) > VARIANCE_THRESHOLD:
            print('variance ', np.var(row))
            df.drop(row)
    return df

open_df = removeNoise(open_df)
closed_df = removeNoise(closed_df)
closed_df.shape

(0, 440)

In [8]:
'''
Create a combined dataframe with both the open and closed eye data stacked on top of each other (epochs x EPOCH_LENGTH)
The first closed_df.shape[0] rows will be 1s, indicating eyes closed, and the rest will be 0s
'''

combined_df = pd.concat([closed_df, open_df], axis=0, ignore_index=True)
labels = np.append(np.ones(closed_df.shape[0]),np.zeros(open_df.shape[0]))

# Create a sklearn train test split with this big combined df

X_train, X_test, y_train, y_test = train_test_split(combined_df, labels,  
                                                    train_size=0.75,  
                                                    test_size=0.25)

X_train


Unnamed: 0,0,1,2,3,4,5,6,7,8,9,...,430,431,432,433,434,435,436,437,438,439
157,801.105427,855.389778,858.679739,837.294994,834.005034,845.519896,830.715073,857.034758,843.874916,838.939975,...,840.584955,835.650014,832.360053,861.969699,838.939975,838.939975,855.389778,842.229935,853.744798,845.519896
346,807.685348,863.614680,847.164876,842.229935,852.099817,837.294994,842.229935,860.324719,847.164876,848.809857,...,834.005034,850.454837,853.744798,845.519896,838.939975,842.229935,840.584955,908.029149,848.809857,847.164876
36,850.454837,855.389778,850.454837,855.389778,857.034758,837.294994,842.229935,842.229935,845.519896,824.135152,...,858.679739,857.034758,845.519896,804.395387,850.454837,835.650014,842.229935,843.874916,847.164876,837.294994
98,842.229935,855.389778,857.034758,832.360053,852.099817,852.099817,852.099817,850.454837,850.454837,840.584955,...,843.874916,861.969699,857.034758,852.099817,847.164876,848.809857,843.874916,832.360053,850.454837,861.969699
54,857.034758,850.454837,860.324719,832.360053,850.454837,863.614680,850.454837,852.099817,834.005034,830.715073,...,870.194601,840.584955,835.650014,824.135152,857.034758,834.005034,840.584955,830.715073,829.070093,840.584955
42,852.099817,855.389778,858.679739,852.099817,834.005034,834.005034,857.034758,847.164876,852.099817,852.099817,...,866.904640,858.679739,843.874916,817.555230,850.454837,843.874916,845.519896,842.229935,843.874916,852.099817
300,866.904640,845.519896,863.614680,843.874916,853.744798,855.389778,837.294994,806.040368,848.809857,837.294994,...,858.679739,829.070093,842.229935,843.874916,855.389778,848.809857,850.454837,838.939975,842.229935,861.969699
6,860.324719,807.685348,840.584955,852.099817,845.519896,834.005034,834.005034,827.425112,845.519896,843.874916,...,852.099817,771.495781,850.454837,799.460446,850.454837,858.679739,852.099817,850.454837,852.099817,845.519896
178,804.395387,853.744798,842.229935,830.715073,840.584955,848.809857,848.809857,853.744798,852.099817,838.939975,...,779.720682,847.164876,840.584955,852.099817,848.809857,848.809857,845.519896,852.099817,853.744798,847.164876
163,815.910250,835.650014,868.549621,840.584955,848.809857,843.874916,824.135152,848.809857,848.809857,832.360053,...,827.425112,842.229935,832.360053,837.294994,858.679739,840.584955,832.360053,842.229935,843.874916,840.584955


In [8]:
# Create a TPOTClassifier that will run for 10 generations

my_tpot = TPOTClassifier(generations=10)  

# Fit this baby! Takes a long time to run

my_tpot.fit(X_train, y_train)  
  
# See what kind of score we get
print(my_tpot.score(X_test, y_test))

1.0


In [9]:
# Holy crap! That's really good (or perhaps broken). Let's export the pipeline and see what TPOT came up with

my_tpot.export('exported_pipeline.py')  

Here's what it came up with,

```
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.svm import LinearSVC

# NOTE: Make sure that the class is labeled 'class' in the data file
tpot_data = np.recfromcsv('PATH/TO/DATA/FILE', delimiter='COLUMN_SEPARATOR', dtype=np.float64)
features = np.delete(tpot_data.view(np.float64).reshape(tpot_data.size, -1), tpot_data.dtype.names.index('class'), axis=1)
training_features, testing_features, training_classes, testing_classes = \
    train_test_split(features, tpot_data['class'], random_state=42)

exported_pipeline = LinearSVC(C=25.0, dual=False, penalty="l1", tol=0.1)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)
```



In [80]:
# Let's import some new data and test this classifier out

new_open_data = pd.read_csv("../Muse Data/DanoEyesOpenRawEEG0.csv", header=0, index_col=False)
new_closed_data = pd.read_csv("../Muse Data/DanoEyesClosedRawEEG1.csv", header=0, index_col=False)

In [89]:
# Get channel 1 data 

open_array = new_open_data['Channel 1']
closed_array = new_closed_data['Channel 1']

# Prune a few rows from the tail of these arrays so that they are all divisible by our desired epoch length

open_overflow = open_array.size % EPOCH_LENGTH
open_array = open_array[0:-open_overflow]
closed_overflow = closed_array.size % EPOCH_LENGTH
closed_array = closed_array[0:-closed_overflow]

# Split into multiple arrays of EPOCH_LENGTH

split_open_data = np.stack(np.array_split(open_array, EPOCH_LENGTH), axis=1)
split_closed_data = np.stack(np.array_split(closed_array, EPOCH_LENGTH), axis=1)

# Transform data into a 3D pandas Panel ( n epochs x 4 channels x 440 samples )

open_df = pd.DataFrame(split_open_data)
closed_df = pd.DataFrame(split_closed_data)

# Remove noise
open_df = removeNoise(open_df)
closed_df = removeNoise(closed_df)


In [90]:
new_combined_df = pd.concat([closed_df, open_df], axis=0, ignore_index=True)
new_labels = np.append(np.ones(closed_df.shape[0]),np.zeros(open_df.shape[0]))

# Create a sklearn train test split with this big combined df

X_train, X_test, y_train, y_test = train_test_split(new_combined_df, new_labels,  
                                                    train_size=0.75,  
                                                    test_size=0.25)

In [91]:
# Since we're doing this by hand in the notebook, I'll just use the meaty parts of the exported pipeline

from sklearn.svm import LinearSVC

training_features, testing_features, training_classes, testing_classes = \
    train_test_split(new_combined_df, new_labels, random_state=42)

exported_pipeline = LinearSVC(C=25.0, dual=False, penalty="l1", tol=0.1)

exported_pipeline.fit(training_features, training_classes)
results = exported_pipeline.predict(testing_features)

In [95]:
# Let's check the accuracy of this guy

exported_pipeline.score(testing_features, testing_classes) 

1.0

Welp, there it is again