In [1]:
%load_ext line_profiler

In [2]:
from src.data_processing import load_data, data_preprocessing
from src.data_processing.config import processed_dir_path
from time import time
import pandas as pd
import numpy as np
import scipy.stats as stats
from functools import partial

# Class names; their position in this list is what later cells use as the
# integer target via labels.index(...).
labels = ['left', 'right', 'takeoff', 'land', 'forward', 'backward']

# Load the processed recordings through the project helper and report how
# long the call took (wall-clock seconds).
start = time()
df = load_data.load_csvs(processed_dir_path)
print(f"Time to load data: {time() - start}")

# Order rows by session, then trial, then timestamp. Done in place, so the
# object returned by load_csvs is itself reordered.
df.sort_values(by=['session', 'trial', ' Timestamp'], inplace=True)

# Signal columns are the ones whose name starts with ' EXG' (leading space
# included, as in the CSV header).
cols = [str(name) for name in df.columns if name.startswith(' EXG')]

Time to load data: 3.7515220642089844


In [3]:
# Row count of the longest trial; used later to size the FFT.
max_sample_number = df['trial'].value_counts(sort=False).max()
max_sample_number

1636

In [4]:
# Number of distinct trial ids in the loaded data.
num_trials = df['trial'].nunique()
num_trials

1999

# Run the project's apply_filters helper on each trial separately; sort=False
# keeps trials in the order they already appear in df.
f_df = df.groupby('trial', sort=False, as_index=False).apply(data_preprocessing.apply_filters)

# Robust per-channel outlier score computed over every sample of every trial.
n = f_df[cols].to_numpy()
# Centre of the score: the per-channel median of the absolute amplitudes.
n_med = np.median(np.abs(n), axis=0)
# `center` has to be a callable; this one ignores whatever it is given so the
# deviation is measured around n_med rather than around the plain median.
mad = stats.median_abs_deviation(n, center=lambda *_, **__: n_med, axis=0)
# NOTE(review): a channel whose mad is 0 yields inf/NaN scores here, which the
# mask below then blanks - confirm that is intended.
m_scores = np.abs(n - n_med) / mad

# Every sample whose score is not strictly below the threshold (NaN scores
# included) is replaced by 0.
max_ok_score = 4
ok_scores = m_scores < max_ok_score
n[np.logical_not(ok_scores)] = 0

# Write the cleaned values back and display the frame.
f_df[cols] = n
f_df

import matplotlib.pyplot as plt

# Visual check of the filtered signal: one subplot per EXG column for the
# single trial whose id is 1000.
dfg1 = f_df.loc[f_df['trial'] == 1000].groupby('trial', sort=False)[cols]
axes1 = dfg1.plot(subplots=True, figsize=(10, 10))

# The grouped plot returns one collection of axes per group; give every
# subplot the same y-range.
for group_axes in axes1:
    for channel_ax in group_axes:
        channel_ax.set_ylim(-50, 50)

plt.show()

def all_trials_in_order(d):
    """Return True when ' Timestamp' never decreases inside any trial.

    ``d`` is grouped by its ``trial`` column (groups taken in order of first
    appearance) and each group's ``' Timestamp'`` column is checked for being
    monotonically non-decreasing. A frame with no rows yields True.
    """
    return all(
        rows[' Timestamp'].is_monotonic_increasing
        for _, rows in d.groupby('trial', sort=False)
    )

from scipy.fft import rfft, rfftfreq, next_fast_len

# Common FFT length for all trials: at least as long as the longest trial, so
# shorter trials are zero-padded by rfft.
n = next_fast_len(max_sample_number)
num_freqs = n // 2 + 1  # number of bins rfft returns for a length-n input

# Row i holds (label, session) of the i-th trial in iteration order.
label_session_map = np.zeros((num_trials, 2), dtype=object)
# Magnitude spectra, indexed as [frequency bin, channel, trial].
freq_matrix = np.zeros((num_freqs, len(cols), num_trials))

for i, (trial, group) in enumerate(f_df.groupby('trial', sort=False)):
    # label and session are read from the first row of each trial.
    label_session_map[i, :] = group['label'].iat[0], group['session'].iat[0]
    trial_data = group[cols].to_numpy(copy=True)
    signal_fft = rfft(trial_data, n=n, axis=0)
    freq_matrix[..., i] = np.abs(signal_fft)

# Frequency axis for the bins above; d = 1/125 is the sample spacing
# (presumably a 125 Hz sampling rate - confirm against the recording setup).
signal_freq = rfftfreq(n, d=1 / 125)
# Average spectrum over all trials.
mean_freq_matrix = freq_matrix.mean(axis=2)
mean_freq_matrix.shape

from scipy.signal import correlate

# Mean magnitude spectrum per class: for each label, average freq_matrix over
# the trials whose entry in label_session_map[:, 0] equals that label.
mean_freq_by_label = {
    label: np.mean(freq_matrix[:, :, label_session_map[:, 0] == label], axis=2)
    for label in labels
}

# One panel per class, showing the autocorrelation of the first channel's
# mean spectrum. The panel count follows the label set instead of a literal 6.
fig, axes = plt.subplots(1, len(mean_freq_by_label), figsize=(20, 10))
for i, (label, label_mean_freq_matrix) in enumerate(mean_freq_by_label.items()):
    self_correlation = correlate(label_mean_freq_matrix[:, 0], label_mean_freq_matrix[:, 0], mode='same')
    axes[i].plot(signal_freq, self_correlation, label=label)
    # The label passed to plot() is only rendered by a legend, and none is
    # drawn here; title each panel so the classes can be told apart.
    axes[i].set_title(label)
plt.tight_layout()
plt.show()

# One row per channel; every class's mean spectrum is overlaid on that
# channel's axes.
fig, axes = plt.subplots(freq_matrix.shape[1], 1, figsize=(20, 20))
for j, channel_ax in enumerate(axes):
    channel_ax.set_title(f'Channel {j}')
    for label, label_mean_freq_matrix in mean_freq_by_label.items():
        channel_ax.plot(signal_freq, label_mean_freq_matrix[:, j], label=label)
plt.tight_layout()
# plt.legend() acts on the current axes, i.e. the last subplot created.
plt.legend()
plt.show()

## Model Training
Testing `GroupKFold` cross-validation first,
using the raw channel data as the input features.

In [5]:
def _held_out_frame(label, trials):
    """Build a two-column ``label``/``trial`` frame for one label's held-out trials."""
    frame = trials.to_frame(name='trial')
    frame.insert(0, 'label', label)
    return frame


def _concat_held_out(frames):
    """Concatenate the per-label frames once; empty input gives an empty frame."""
    if not frames:
        return pd.DataFrame(columns=['label', 'trial'])
    return pd.concat(frames)


def split_on_trial(dataframe=None):
    """Choose the trials to hold out by sampling 10% of each label's trials.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame with ``label`` and ``trial`` columns. When omitted, the
        notebook-level ``df`` is used, so the original zero-argument call
        keeps working.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``trial``, one row per held-out trial.
    """
    source = df if dataframe is None else dataframe
    held_out = []
    for label, group in source.groupby('label', sort=False):
        # Fixed seed: the same trials are selected on every run.
        sampled = group.trial.drop_duplicates().sample(frac=0.1, random_state=42)
        held_out.append(_held_out_frame(label, sampled))
    return _concat_held_out(held_out)


def split_on_session(dataframe=None):
    """Choose the trials to hold out by sampling 10% of each label's sessions.

    Every trial recorded in a sampled session is held out for that label.

    Parameters
    ----------
    dataframe : pandas.DataFrame, optional
        Frame with ``label``, ``session`` and ``trial`` columns. When omitted,
        the notebook-level ``df`` is used.

    Returns
    -------
    pandas.DataFrame
        Columns ``label`` and ``trial``, one row per held-out trial.

    Notes
    -----
    NOTE(review): for a label with only a handful of sessions, the 10% sample
    may round down to zero sessions and hold nothing out - confirm the
    session count is large enough.
    """
    source = df if dataframe is None else dataframe
    held_out = []
    for label, group in source.groupby('label', sort=False):
        sessions_to_take_out = group.session.drop_duplicates().sample(frac=0.1, random_state=42)
        in_held_out_session = group.session.isin(sessions_to_take_out)
        trials = group.loc[in_held_out_session, 'trial'].drop_duplicates()
        held_out.append(_held_out_frame(label, trials))
    return _concat_held_out(held_out)


def train_test_split(dataframe, split_on: str = 'trial'):
    """Split ``dataframe`` into training and testing rows by whole trials.

    The held-out trials are selected from ``dataframe`` itself rather than
    from the notebook-level ``df``, so the function no longer depends on
    hidden global state.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Frame with ``label`` and ``trial`` columns (plus ``session`` when
        ``split_on='session'``).
    split_on : str
        ``'trial'`` to sample trials directly, ``'session'`` to hold out all
        trials of sampled sessions.

    Returns
    -------
    tuple of pandas.DataFrame
        ``(training_set, testing_set)``; together they contain every row of
        ``dataframe`` exactly once.

    Raises
    ------
    ValueError
        If ``split_on`` is neither ``'trial'`` nor ``'session'``.
    """
    if split_on == 'trial':
        saved_trials = split_on_trial(dataframe)
    elif split_on == 'session':
        saved_trials = split_on_session(dataframe)
    else:
        raise ValueError('split_on must be either "trial" or "session"')

    # Compute the membership mask once and use it for both halves.
    is_held_out = dataframe.trial.isin(saved_trials.trial)
    training_set = dataframe[~is_held_out]
    testing_set = dataframe[is_held_out]

    return training_set, testing_set

In [6]:
from sklearnex.ensemble import RandomForestClassifier

from sklearn.model_selection import GroupKFold, cross_validate

# Re-establish the session / trial / timestamp row order (in place) before
# grouping, so this cell does not depend on what earlier cells did to df.
df.sort_values(by=['session', 'trial', ' Timestamp'], inplace=True)

# Apply the project's filter helper to each trial separately.
filtered_df = df.groupby('trial', sort=False, as_index=False).apply(data_preprocessing.apply_filters)

# Hold out whole sessions for the final test set.
training_set, testing_set = train_test_split(filtered_df, split_on='session')

# Features: one row per sample, one column per EXG channel.
X = training_set[cols].to_numpy().astype(np.float64)
# Targets: position of each row's label in the `labels` list.
y = training_set['label'].map(labels.index).to_numpy()
# Session id per row, passed to the cross-validator as the grouping key.
groups = training_set['session'].to_numpy()

  filtered_df = df.groupby('trial', sort=False, as_index=False).apply(data_preprocessing.apply_filters)


In [None]:
# Classifier under test and the grouped 5-fold scheme (groups = sessions).
clf = RandomForestClassifier(random_state=42, n_estimators=100, max_depth=32)
gkf = GroupKFold(n_splits=5)
scoring = dict(accuracy='accuracy')

# Keep the fitted estimator of every fold: they are reused on the held-out
# set in a later cell.
cv_results = cross_validate(
    clf,
    X,
    y,
    groups=groups,
    cv=gkf,
    scoring=scoring,
    return_train_score=True,
    return_estimator=True,
    n_jobs=-1,
    verbose=2,
)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 20 concurrent workers.


In [None]:
# Display what cross_validate returned in the previous cell.
cv_results

In [None]:
from sklearn.metrics import accuracy_score, confusion_matrix

# Score each fold's fitted model on the held-out testing_set.
estimators = cv_results['estimator']
test_X = testing_set[cols].to_numpy().astype(np.float64)
test_y = testing_set['label'].map(labels.index).to_numpy()

# accuracy[k] is fold k's accuracy; y_pred[:, k] holds that fold's predictions.
accuracy = np.zeros(len(estimators))
y_pred = np.zeros((test_X.shape[0], len(estimators)))
for fold, fold_model in enumerate(estimators):
    y_pred[:, fold] = fold_model.predict(test_X)
    accuracy[fold] = accuracy_score(test_y, y_pred[:, fold])

accuracy