In [72]:
import numpy as np
from numpy.lib.stride_tricks import sliding_window_view
np.set_printoptions( threshold=3) # Summarize any printed array that has more than 3 elements: only the leading and trailing items of each dimension are shown

In [73]:
# Training inputs. Shape in the recorded run: (5, 8, 230000); the cells below
# index it as (session, electrode, time sample).
x = np.load("./guided/guided_dataset_X.npy")
# Training targets aligned with x. Shape in the recorded run: (5, 51, 230000),
# i.e. 51 target signals (hand position) per session and time sample.
y = np.load("./guided/guided_dataset_y.npy")
# Test inputs. Shape in the recorded run: (5, 332, 8, 500) — presumably already
# cut into 500-sample windows as (session, window, electrode, time); TODO confirm.
testset = np.load("./guided/guided_testset_X.npy")


In [74]:
# Show the shape of each loaded array, one per line: test set, inputs, targets.
print(testset.shape, x.shape, y.shape, sep="\n")


(5, 332, 8, 500)
(5, 8, 230000)
(5, 51, 230000)


#### Question 1

In [66]:
from scipy.signal import butter, sosfiltfilt, firwin

In [67]:
# Band-pass filter every recording between 20 Hz and 450 Hz.
# The cut-offs are expressed as fractions of the Nyquist frequency, which is
# what `butter` expects when no `fs` argument is given.
nyq  = 1024 / 2   # Nyquist frequency in Hz (assumes a 1024 Hz sampling rate — TODO confirm)
low  = 20  / nyq  # lower cut-off, normalised
high = 450 / nyq  # upper cut-off, normalised

# Butterworth band-pass designed with N=4, returned as second-order sections.
sos = butter(4, [low, high], btype='band', output='sos')

# `sosfiltfilt` runs forward and backward along one axis of an N-D array, so a
# single call on the time axis (the last one) filters every session/electrode
# pair; no Python-level loop is needed.
# NOTE: the result is written back into `x` in place (same array, same dtype),
# so re-running this cell filters the already-filtered data a second time;
# reload `x` before re-running.
x[...] = sosfiltfilt(sos, x, axis=-1)

#### Question 2

For this question, we decided to use the `sliding_window_view` function from the NumPy library for several reasons:

- NumPy functions are implemented in C, making them faster than a double loop written in Python.

- The `sliding_window_view` function creates a view of the array rather than copying the data, minimizing memory usage.

- The function simplifies the implementation by automating window creation and indexing.

In [75]:
def create_overlap_windows(x, y, window_size, axis, overlap, verbose=True):
    """Cut ``x`` and ``y`` into aligned, overlapping windows along ``axis``.

    Both inputs must be 3-D and have the same length along ``axis`` (for
    example ``(session, electrode, time)`` and ``(session, target, time)``
    with ``axis=2``).

    Parameters
    ----------
    x, y : array_like, 3-D
        Input signals and the targets sampled on the same time base.
    window_size : int
        Number of samples per window.
    axis : int
        Axis of ``x`` and ``y`` along which the windows slide.
    overlap : float
        Fraction of a window shared with the next one (0.5 means 50 %).
        The step between window starts is ``window_size * (1 - overlap)``,
        truncated to an integer.
    verbose : bool, default True
        Print the intermediate shapes (window extraction, step selection,
        axis reordering, target selection).

    Returns
    -------
    x_w : ndarray, 4-D
        Windows of ``x`` with the window index on axis 1 and the samples of
        each window on the last axis, e.g. ``(session, window, electrode, time)``.
    y_w : ndarray, 3-D
        Value of ``y`` at the last sample of each window, e.g.
        ``(session, window, target)``.

    Raises
    ------
    ValueError
        If the step is smaller than one sample, if an input is not 3-D, or if
        ``x`` and ``y`` do not yield the same number of windows.
    """
    # Remove floating-point noise before truncating: 1 - 0.9 is slightly below
    # 0.1, so a bare int() would give a step of 49 instead of 50 for
    # window_size=500 (and 0 instead of 1 for window_size=10).
    step = int(round(window_size * (1 - overlap), 9))
    if step < 1:
        raise ValueError(
            f"window_size={window_size!r} with overlap={overlap!r} gives a step of "
            f"{step} sample(s); the step must be at least 1"
        )

    if np.ndim(x) != 3 or np.ndim(y) != 3:
        raise ValueError("x and y must both be 3-D arrays")

    # sliding_window_view returns every possible window (step of 1 sample) as a
    # view: the slid axis now indexes the windows and a new last axis holds
    # the samples of each window.
    x_w = sliding_window_view(x, window_size, axis)
    y_w = sliding_window_view(y, window_size, axis)

    # Position of the window index in the 4-D views (handles negative axes).
    win_axis = axis % 3
    if x_w.shape[win_axis] != y_w.shape[win_axis]:
        raise ValueError("x and y must have the same length along `axis`")

    if verbose:
        print(x_w.shape)
        print(y_w.shape)

    # Keep only the windows whose start index is a multiple of the step.
    keep = [slice(None)] * 4
    keep[win_axis] = slice(None, None, step)
    keep = tuple(keep)
    x_w = x_w[keep]
    y_w = y_w[keep]

    if verbose:
        print(x_w.shape)
        print(y_w.shape)

    # Put the window index right after the first axis:
    # (session, electrode, window, time) -> (session, window, electrode, time).
    x_w = np.moveaxis(x_w, win_axis, 1)
    y_w = np.moveaxis(y_w, win_axis, 1)

    if verbose:
        print(x_w.shape)
        print(y_w.shape)

    # The target of a window is the hand position at its last sample, so only
    # the final value of each y window is kept: (session, window, target).
    y_w = y_w[..., -1]

    if verbose:
        print(y_w.shape)

    return x_w, y_w

# 500-sample windows along the time axis (axis 2) with 50 % overlap.
x_windows,y_windows = create_overlap_windows(x,y,500,2,0.5)

# With a 250-sample step, the second half of window 0 and the first half of
# window 1 cover the same samples of session 1.
tail_of_first = x_windows[0,0,:,250:]
head_of_second = x_windows[0,1,:,:250]

print("250 last data in the 1st window/session 1")
print(tail_of_first)
print("250 first data in the  2st window/session 1")
print(head_of_second)

# The printed arrays are summarised to a few edge values, so check the whole
# overlap explicitly instead of relying on the visual comparison.
assert np.array_equal(tail_of_first, head_of_second, equal_nan=True), \
    "consecutive windows do not share the expected 250 samples"
print("We can see that 50% of the value are the same")


(5, 8, 229501, 500)
(5, 51, 229501, 500)
(5, 8, 919, 500)
(5, 51, 919, 500)
(5, 919, 8, 500)
(5, 919, 51, 500)
(5, 919, 51)
250 last data in the 1st window/session 1
[[  1.0986581   -8.41673625  -4.84972558 ...  -1.76873947  -6.56577493
  -10.88819442]
 [ 12.21837735  15.3262705    4.77276143 ...  -3.94538397  -5.50181954
   -7.3692657 ]
 [ -1.02035145   2.75337535 -18.0932997  ...  -9.69223461 -21.23125079
   -6.27851702]
 ...
 [ -4.18208003  -8.56125167  -3.20060061 ... -22.45992589 -95.24715478
  -53.12527546]
 [-14.74996522 -16.58254728 -10.64747431 ...  -1.2357502  -12.37910359
   -1.52116206]
 [-11.45309457 -12.52041445   2.03051126 ...   1.56962661   1.08560054
   -2.22653059]]
250 first data in the  2st window/session 1
[[  1.0986581   -8.41673625  -4.84972558 ...  -1.76873947  -6.56577493
  -10.88819442]
 [ 12.21837735  15.3262705    4.77276143 ...  -3.94538397  -5.50181954
   -7.3692657 ]
 [ -1.02035145   2.75337535 -18.0932997  ...  -9.69223461 -21.23125079
   -6.27851702]
 

#### Question 3

For this question, we considered various cross-validation methods. First, our data are continuous because they come from a signal, so preserving the temporal structure is important. We can’t use a cross-validation method that randomly shuffles our windows. 

We also need to prevent data leakage, so we can't use a method that puts windows from the same session in both the training and the validation set. Because the windows overlap within a session, two windows of the same session can share the same data; if one of them is in the training set and the other in the validation set, this leads to data leakage and overly optimistic performance (data in the training set would also be in the validation set). 

We therefore chose the "Leave One Group Out" method, which uses each session as the validation set once and the other sessions for training. This completely prevents data leakage because each session is independent of the others, and it reduces the bias because every session is used for validation.

In our case, "LOGO" and "GroupKFold(5)" produce the same splits, but we chose "LOGO" because it is more explicit: readers immediately see that one session is used for validation each time, whereas "GroupKFold" needs to be given 5 as a parameter to do the same thing.

In [76]:
x_shape = x_windows.shape
y_shape = y_windows.shape
n_sessions, n_windows = x_shape[0], x_shape[1]
n_samples = n_sessions * n_windows

# Session label of every window: 1 repeated n_windows times, then 2, and so on.
groups = np.repeat(np.arange(1, n_sessions + 1), n_windows)
print(groups.shape)

# LeaveOneGroupOut and cross_val_score expect 2-D inputs (one row per sample),
# so the session and window axes are merged into a single sample axis and, for
# x, the electrode and time axes into a single feature axis:
#   x: [5, 919, 8, 500] -> [4595, 4000]   (4595 = 5 * 919, 4000 = 8 * 500)
#   y: [5, 919, 51]     -> [4595, 51]
# Rows keep the session-major order, so `groups` tells which session each row
# belongs to: the third window (x_windows_flat[2]) has groups[2] == 1.
x_windows_flat = x_windows.reshape(n_samples, x_shape[2] * x_shape[3])
y_windows_flat = y_windows.reshape(y_shape[0] * y_shape[1], y_shape[2])

print("New shape of x:", x_windows_flat.shape, " and y:", y_windows_flat.shape)
# Flattening only changes the layout; the values are the same as in x_windows.

(4595,)
New shape of x: (4595, 4000)  and y: (4595, 51)


In [77]:
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error
from sklearn.linear_model import Ridge


In [78]:
def _rmse(y_true, y_pred):
    """Square root of scikit-learn's mean squared error between targets and predictions."""
    return np.sqrt(mean_squared_error(y_true, y_pred))


# One fold per group: each session is held out once for validation.
logo = LeaveOneGroupOut()

# Model evaluated by the cross-validation (placeholder for the future model).
model = Ridge()

# greater_is_better=False makes the scorer return the negated RMSE, because
# scikit-learn always treats a higher score as better.
rmse_scorer = make_scorer(_rmse, greater_is_better=False)

# Run the cross-validation; the result holds one (negated) RMSE per fold.
# n_jobs=-1 evaluates the folds in parallel on all available CPU cores.
neg_rmse_scores = cross_val_score(
    model,
    x_windows_flat,
    y_windows_flat,
    groups=groups,
    cv=logo,
    scoring=rmse_scorer,
    n_jobs=-1,
)

# Flip the sign back to obtain positive RMSE values.
rmse_scores = -neg_rmse_scores
print("RMSE for each folder:", rmse_scores)

RMSE for each folder: [59.78330527 57.89521081 70.76477708 59.82600232 64.81092694]


We can see in the code below that every fold contains four sessions in the training set and one in the test set. The sessions are independent of one another, so the data is completely safe and we have no data leakage.

In [79]:

# For every fold, list which sessions end up in the training set and which one
# is held out. train_i / test_i are the row indices of the windows in each set.
splits = logo.split(x_windows_flat, y_windows_flat, groups)
for fold, (train_i, test_i) in enumerate(splits, start=1):
    train_groups, test_groups = set(groups[train_i]), set(groups[test_i])

    print("Fold ", fold)
    print(train_groups)
    print(test_groups)




Fold  1
{np.int64(2), np.int64(3), np.int64(4), np.int64(5)}
{np.int64(1)}
Fold  2
{np.int64(1), np.int64(3), np.int64(4), np.int64(5)}
{np.int64(2)}
Fold  3
{np.int64(1), np.int64(2), np.int64(4), np.int64(5)}
{np.int64(3)}
Fold  4
{np.int64(1), np.int64(2), np.int64(3), np.int64(5)}
{np.int64(4)}
Fold  5
{np.int64(1), np.int64(2), np.int64(3), np.int64(4)}
{np.int64(5)}
