# INFO-f422: ML Project

authors:
+ 1 
+ 2
+ 3

### Imports

In [54]:
# models
from sklearn.linear_model import Ridge, Lasso, LogisticRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# model selection
from sklearn.model_selection import LeaveOneGroupOut, cross_val_score
from sklearn.metrics import make_scorer, mean_squared_error

# preprocessing
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# utils
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from enum import Enum

### Data loading

In [29]:
data_dir = "data"

X_g_train = np.load(f"{data_dir}/guided/guided_dataset_X.npy")
y_g_train = np.load(f"{data_dir}/guided/guided_dataset_y.npy")
X_g_test = np.load(f"{data_dir}/guided/guided_testset_X.npy")

X_f_train = np.load(f"{data_dir}/freemoves/freemoves_dataset_X.npy")
y_f_train = np.load(f"{data_dir}/freemoves/freemoves_dataset_y.npy")
X_f_test = np.load(f"{data_dir}/freemoves/freemoves_testset_X.npy")


In [30]:
print("Guided:")
print(f"X_g_train {X_g_train.shape} / y_g_train{y_g_train.shape} / X_g_test{X_g_test.shape}\n")
print("Free moves:")
print(f"X_f_train{X_f_train.shape} / y_f_train{y_f_train.shape} / X_f_test{X_f_test.shape}")

Guided:
X_g_train (5, 8, 230000) / y_g_train(5, 51, 230000) / X_g_test(5, 332, 8, 500)

Free moves:
X_f_train(5, 8, 270000) / y_f_train(5, 51, 270000) / X_f_test(5, 308, 8, 500)


### 1) Signal filtering

TODO: data exploration to take informed decision on filter (type of noise,....) to use and on filter parametres (no magic number)

In [53]:
# from scipy.signal import butter, sosfiltfilt, firwin

# nyq  = 1024 / 2
# low  = 20  / nyq
# high = 450 / nyq

# sos = butter(4,[low,high], btype='band', output= 'sos')

# for sess in range(X_g_train.shape[0]):
#     for elec in range(X_g_train.shape[1]):
#         # Application of the filtrage for x
#         X_g_train[sess, elec, :] = sosfiltfilt(sos, X_g_train[sess, elec, :])

### 2) Dataset preparation

For this question, we decided to use the sliding_window_view function from the Numpy library for several reasons:

+ Fast vectorized numpy operations, compiled c-code (no python overhead, interpreter).

+ sliding_window_view function returns a view, no copy.

+ The function simplifies the implementation by automating window creation and indexing.

In [24]:
class FeatureWindowAugment(BaseEstimator, TransformerMixin):
    """
    + Augment X through overlapping windows
    """
    
    def __init__(self, window_size=500, overlap=0.5):
        self.window_size = window_size
        self.overlap = overlap
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, X, y=None):
        axis = 2 # time
        step = int(self.window_size * (1 - self.overlap))
        
        # sliding_windows_view Generate all possible windows with the corresponding step, that not what we want.
        X_windows = np.lib.stride_tricks.sliding_window_view(X,self.window_size, axis)

        # only keep windows where the step is a multiple of our step 
        X_windows = X_windows[:,:,::step,:]
         
        # (session, electrode, window, time) to (session, window, electrode, time)
        X_windows = X_windows.transpose(0, 2, 1, 3) 
        
        return X_windows   

In [43]:
class TargetStrategy(Enum):
    """
    strategy to select the target (unique hand pose) from a window
    """
    
    MEAN = 0
    FIRST = 1
    LAST = 2
    MEDIAN = 3
    
class WindowTargetExtractor(BaseEstimator, TransformerMixin):
    """
    + augment y through overlapping windows 
    + extract the target, i.e. unique hand pose (51 val) of the correspondoing window (500 'samples')
    """
    
    def __init__(self, window_size=500, overlap=0.5, target_strat=TargetStrategy.LAST):
        self.window_size = window_size
        self.overlap = overlap
        self.target_strat = target_strat
    
    def fit(self, X, y=None):
        return self
    
    def transform(self, y):
        axis = 2 # time
        step = int(self.window_size * (1 - self.overlap))
        
        # ----- window augment -----
        
        # sliding_windows_view Generate all possible windows with the corresponding step, that not what we want.
        y_windows = np.lib.stride_tricks.sliding_window_view(y,self.window_size, axis)

        # only keep windows where the step is a multiple of our step 
        y_windows = y_windows[:,:,::step,:]
         
        # (session, angles, window, time) to (session, window, angles, time)
        y_windows = y_windows.transpose(0, 2, 1, 3)     
        
        # ----- target extract -----

        match self.target_strat:
            case TargetStrategy.LAST:
                y_windows = y_windows[:, :, :, -1]
            case TargetStrategy.FIRST:
                y_windows = y_windows[:, :, :, 0]
            case TargetStrategy.MEAN:
                y_windows = np.mean(y_windows, axis=3)
            case TargetStrategy.MEDIAN:
                y_windows = np.median(y_windows, axis=3)
            case _:
                raise ValueError(f"Unknown TargetStrategy enum value")

        return y_windows

In [27]:
class TimeFeatureExtractor(BaseEstimator, TransformerMixin):
    """
    Extracts common time-domain features:
        - Mean Absolute Value (MAV)
        - Root Mean Square (RMS)
        - Variance
        - Standard Deviation (STD)
        - Zero Crossing (ZC)
        - Myopulse Percentage Rate (MPR)
    """
    
    def __init__(self, threshold=10):
        self.threshold = threshold
    
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        
        # TODO: all features/indicators
        
        return X

In [52]:
X_pipeline = Pipeline([
    ('window_extractor', FeatureWindowAugment()),
    ('feat_extractor', TimeFeatureExtractor()),
    # ('scaler', StandardScaler())
])

y_pipeline = WindowTargetExtractor()

X_g_train_final = X_pipeline.fit_transform(X_g_train)
y_g_train_final = y_pipeline.fit_transform(y_g_train)

print(f"X_g_train_final{X_g_train_final.shape} / y_g_train_final{y_g_train_final.shape}")

X_g_train_final(5, 919, 8, 500) / y_g_train_final(5, 919, 51)


In [49]:
assert np.array_equal(X_g_train_final[0, 0, 0, :10], X_g_train[0, 0, :10]) # (sess 0) first 10 of electrode 0 in window 0
assert np.array_equal(X_g_train_final[0, 1, 0, :10], X_g_train[0, 0, 250:260]) # (sess 0) first 10 of electrode 0 in window 1
assert np.array_equal(X_g_train_final[0, 1, 4, :10], X_g_train[0, 4, 250:260]) # (sess 0) first 10 of electrode 4 in window 1
assert np.array_equal(X_g_train_final[0, 918, 0, -10:], X_g_train[0, 0, 229990:230000]) # (sess 0) last 10 of electrode 0 in last window (918) - (perfect fit!)

#### 3) Cross validation strategy

For this question, we have thought about various methods of cross validation. First, our data are continous because it's a signal, so preserving temporal structure is important. We can’t use a method of cross validation which randomly shuffles our windows. 

We also need to prevents data leaking so we can't use a methode who use the windows of one session for training AND validation because we have overlapping data in each session, two windows in the same session can share the same datas, and if these two windows are in train and validation, it will lead to data leakage and overly optimistic performance (data in the train set will also be in the validation set). 

So it's naturally that we have chosen the "Leave One Group Out" method, this method will use each session as the validation set once and the other for training. We completly prevent data leakage because each session is indepandent from the other, and we reduce the bias because each session will be used for validation.

In our case, "LOGO" and "GroupKFold(5)" produce the same splits, but we choose "LOGO" because it's more explicit, readers will immediatly see that we use one session for validation each time while "GroupKFold" need to have 5 in parameter to do the same thong

In [9]:
x_shape = X_g_train_wdw.shape
y_shape = y_g_train_wdw.shape

groups = np.repeat(np.arange(1,x_shape[0]+1),x_shape[1] ) # 111 (919 times), 222 (919 times), ...
print(f"groups{groups.shape}\n")

# We need to flatten the dataset x and y because the function logo (and latter "croos_val_score"
# want all the data in a 2d list, we will know have  the dataset X for exemple.
# [4595, 4000] and not [5,919,8,500], 4595 is the multiplication of 5 and 919 (3500 = 8*500), and y 
# [4595,51] and not [5,919,51].
# Now all the windows are store in a list and the "groups" list above allow the function 
# logo to know at wich session each windows belong
# The windows 3 for example (x_windows_flat[2]) belong to the sessions groups[2] = 1
X_g_train_wdw_flat = X_g_train_wdw.reshape(x_shape[0] * x_shape[1], x_shape[2] * x_shape[3])
y_g_train_wdw_flat = y_g_train_wdw.reshape(y_shape[0] * y_shape[1], y_shape[2])

print("Guided windowed flattened:")
print(f"X_g_train_wdw_flat{X_g_train_wdw_flat.shape} / y_g_train_wdw_flat{y_g_train_wdw_flat.shape}")

groups(4595,)

Guided windowed flattened:
X_g_train_wdw_flat(4595, 4000) / y_g_train_wdw_flat(4595, 51)


In [109]:
# loss functions/scorer
def mse(y, y_hat):
    return np.mean((y-y_hat)**2)

def rmse(y, y_hat):
    return np.sqrt(np.mean((y-y_hat)**2))

def nmse(y, y_hat):
    return np.mean((y-y_hat)**2)/np.std(y)**2

rmse_scorer = make_scorer(rmse, greater_is_better=False)
nmse_scorer = make_scorer(nmse, greater_is_better=False)

# cv function (TODO: by hand ?)
logo = LeaveOneGroupOut()
def logo_cv(X, y, groups, model, scorer):
    cv_scores = cross_val_score(model, X, y, groups=groups, cv=logo, scoring=scorer, n_jobs=-1) # n_jobs=-1 --> use all cores
    return np.mean(cv_scores)

# Lasso example
# logo = LeaveOneGroupOut()
# lasso_reg = Lasso(max_iter=1)
# res = logo_cv(X_g_train_wdw_flat, y_g_train_wdw_flat, logo, groups, lasso_reg, rmse_scorer)
# print(f"cv score = {res}")

In [107]:
# for i, (train_index, test_index) in enumerate(logo.split(X_g_train_wdw_flat, y_g_train_wdw_flat, groups)):
#     print(f"Fold {i}")
#     print(f"   train groups: {np.unique(groups[train_index])}")
#     print(f"   test groups: {np.unique(groups[test_index])}")

### 4) Baseline

In [None]:
models = {
    'logistic': LogisticRegression(),
    'lasso': Lasso(),
    'dec_tree': DecisionTreeRegressor(),
    'random_forest': RandomForestRegressor()
}

results = {}
for name, model in models.items():
    
    model.fit(X_g_train_final, y_g_train_final)
    y_g_pred = model.predict(X_g_test) # transform X_g_test ??
    
    # logo cv

SyntaxError: invalid syntax (4041499946.py, line 11)