In [1]:
# ***       Parameters cell        ***
# *** Used to automate experiments ***

# View > Cell Toolbar > Tags: set the tag as "parameters"

p = 8000 #2000        # Default value
n_ = 63 #  40          # Default value # Not the real n! Real n defined below...
percent = 0.75 # 1.25  # Default value

# Genetic Application: Synthetic

## Intro

- We consider $n$ samples for **few-shot learning**
- The matrix $\mathbf{X}$ now consists of $n$ samples of size $p = 7129$, with $n << p$.
- The vector $\mathbf{y}$ consists of the $n$ labels of $\mathbf{X}$ (i.e. $y_i \in \{-1, +1\}$ for all $i = 1,\dots,n$)
- We consider the following **feature selection** task: learn a **sparse set of explainable features** (i.e. pixels) $\boldsymbol{\hat \beta}$ from which a small set of training samples samples $\mathbf{X}, \mathbf{y}$ can be classified as gppd as possible. Formally,

\begin{eqnarray}
\text{minimize}_{\boldsymbol{\beta} \in \mathbb{B}^p} && \| \mathbf{X} \boldsymbol{\beta} - \mathbf{y} \|^2_2 \\
\text{subject to} && \| \boldsymbol{\beta} \|_1 \leq \eta
\end{eqnarray}

where $\mathbf{X} \in \mathbb{R}^{n \times p}$ and $\mathbf{y} \in \{-1, +1\}^{n}$ are the few training samples and labels, respectively (with $n << p$), and $\eta$ is the number of explainable features to be selected.

## Librairies

In [2]:
# !pip install deeplake
# !pip install -U scikit-learn

# !pip install fcbf
# # https://github.com/m-martin-j/fcbf

# # --- For automated experiments --- #
# !pip install papermill
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user

In [3]:
import papermill as pm

import os

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

import pickle

import random
import pandas as pd
import numpy as np
import scipy.io

#import deeplake
import sklearn
from sklearn.datasets import make_classification
from sklearn.datasets import fetch_openml

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import KFold

from fcbf import fcbf, data
from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif, r_regression, chi2

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.decomposition import PCA, SparsePCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score

from utils.utils import *
from utils.models import *

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [4]:
%load_ext autoreload
%autoreload 2

## Parameters

In [5]:
# The parameters cell must be the first cell of the notebook

In [6]:
np.random.seed(42)

n = p // n_    # redefine n
nb_fts = int(p * percent // 100)

print(f"Number of selected features N_z:\t{nb_fts}")
print(f"Number of observations n:\t\t{n}")

Number of selected features N_z:	60
Number of observations n:		126


In [7]:
results_folder = os.path.join( f"results/synthetic_data_{p}" ) # separate folders for different p

if not os.path.exists(results_folder):
    os.mkdir(results_folder)

results_folder = os.path.join( results_folder, f"{n}" )  # separate folders for different n

if not os.path.exists(results_folder):
    os.mkdir(results_folder)
    
results_folder = os.path.join( results_folder, f"{nb_fts}" )  # separate folders for different nb_fts

if not os.path.exists(results_folder):
    os.mkdir(results_folder)

## Models and Methods

In [8]:
# Choose your models

models_l = ["knn", 
            "lr", 
            "svc", 
            "nb-gaussian", 
            ### "nb-bernouilli", 
            ### "nb-categorical",
            ### "rf"
           ]

# Choose your feature selection methods
fts_modes_l = ["full", 
               "random", 
               "k-best", 
               #"k-best-mi", # XXX
               ###"pca", 
               # "sparse-pca",  # takes huge time...
               ###"lfs", 
               ###"lbs", 
               ###"fcbf"        # do it one time, since always the same
              ]

## Create dataset

In [9]:
# *** NEW ***

# Parameters
n_features = p          # Total number of features
n_observations = n      # Number of samples (rows)
n_informative = nb_fts  # Number of informative (relevant) features

# Seed for reproducibility
#np.random.seed(42)

# Generate the synthetic dataset
X, y = make_classification(
    n_samples=n_observations,
    n_features=n_features,
    n_informative=n_informative,
    n_redundant=0,          # No redundant features
    n_repeated=0,           # No repeated features
    n_classes=2,            # Binary classification
    n_clusters_per_class=1, # Single cluster per class
    weights=None,           # Balanced classes
    flip_y=0.0,             # No noise to the labels (default 0.01)
    class_sep=1.0,          # Separation between the classes
    shuffle=True,           # if False, informative features first. True improves the results! # XXX
    random_state=42
)

fts_index = list(range(0, n_informative))

In [10]:
data = X
data.shape, y.shape
# fts_index

((126, 8000), (126,))

In [11]:
y

array([1, 0, 1, 1, 0, 1, 1, 0, 1, 0, 1, 1, 1, 0, 1, 1, 0, 0, 0, 0, 0, 0,
       0, 0, 0, 0, 1, 1, 0, 0, 1, 0, 0, 1, 0, 1, 0, 1, 1, 0, 1, 1, 0, 1,
       0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 1, 1, 0, 1, 1, 0, 0,
       0, 1, 1, 0, 0, 1, 0, 0, 0, 1, 0, 1, 1, 1, 1, 1, 1, 1, 0, 0, 1, 0,
       0, 1, 1, 0, 1, 1, 0, 0, 1, 1, 1, 0, 0, 0, 1, 1, 1, 1, 1, 0, 0, 1,
       1, 1, 1, 1, 1, 1, 0, 1, 0, 0, 0, 1, 1, 0, 1, 1])

## Ten times 10-fold cross validation

In [12]:
def get_CV_splits(data=data, seed=42):

    cv_d = {"train_splits": [], "test_splits": []}

    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    kf.get_n_splits(data)

    for train_index, test_index in kf.split(data):

        cv_d["train_splits"].append(train_index)
        cv_d["test_splits"].append(test_index)
        
    return cv_d

In [13]:
cv_d = get_CV_splits(data=data, seed=42)

In [14]:
# 10 times 10-fold CV

cv_splits_all = []

for seed in tqdm([33, 42, 1, 5, 1979, 2024, 22, 12, 1996, 11]):
    
    cv_d = get_CV_splits(data, seed=seed)
    
    cv_splits_all.append(cv_d)

100%|███████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:00<00:00, 2203.70it/s]


In [15]:
# cv_splits_all

In [16]:
# # *** Uncomment and debug this for the case `shuffle=True` ***
# # *** in the `make_classification` function                ***

# def get_percentage_retreived_fts(fts_mode, X, y, fts_index=fts_index):
#     """Gets the percentage of retrieved features."""
    
#     # sparse-pca
#     if isinstance(fts_mode, sklearn.decomposition._sparse_pca.SparsePCA):
        
#         # select fts with largest weights
#         selected_fts = fts_mode.components_
#         selected_fts = set(np.argmax(selected_fts, axis=1))
    
#     # k-best and k-best-mi
#     elif isinstance(fts_mode, sklearn.feature_selection._univariate_selection.SelectKBest):
        
#         # select fts with largest scores
#         # for k-best
#         if fts_mode.score_func == sklearn.feature_selection._univariate_selection.f_classif:
#             scores = fts_mode.get_params()["score_func"](X, y)[0]
            
#         # for k-best-mi  
#         elif fts_mode.score_func == sklearn.feature_selection._mutual_info.mutual_info_classif:
#             scores = fts_mode.get_params()["score_func"](X, y)
            
#         scores_tmp = scores.copy()
#         scores_tmp.sort()
#         max_scores = scores_tmp[-nb_fts:]
#         selected_fts = [np.where(scores == x)[0].item() for x in max_scores] # one-liner
    
#     intersection = set(selected_fts).intersection(set(fts_index))
#     retreived_fts_p = len(intersection) / len(fts_index)

#     return retreived_fts_p

In [17]:
# *** new function ***
def select_features(train_indices, test_indices, data=data, y=y, 
                    norm=True, fts_mode="full", fts_index=fts_index):

    # 2. fts selection
    if fts_mode == "random":
        rand_ind = np.random.randint(low=0, high=data.shape[1], size=nb_fts, dtype=int)
        current_data = data[:, rand_ind]

        # percentage of retreiveed features
        intersection = set(rand_ind).intersection(set(fts_index))
        retreived_fts_p = len(intersection) / len(fts_index)

    else:
        current_data = data
        retreived_fts_p = 0.  # dummy value for "full" mode

    # 2. split
    # train set
    X_train_split = current_data[train_indices, :]
    if norm:
        X_train_split = normalize(X_train_split, axis=0)
    y_train_split = y[train_indices]

    label_encoder = LabelEncoder()
    y_train_split = label_encoder.fit_transform(y_train_split)
    y_train_split = 2 * y_train_split - 1               # rescale targets in {-1, +1}
    
    # test set
    X_test_split = current_data[test_indices, :]
    if norm:
        X_test_split = normalize(X_test_split, axis=0)
    y_test_split = y[test_indices]
    y_test_split = label_encoder.transform(y_test_split)
    y_test_split = 2 * y_test_split - 1                 # rescale targets in {-1, +1}

    if fts_mode == "pca": # unsupervised
        pca = PCA(n_components=min(nb_fts, len(X_train_split))) # PCA limited by nb of rows of X (64)
        X_train_split = pca.fit_transform(X_train_split)
        X_test_split = pca.transform(X_test_split)

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "sparse-pca": # unsupervised
        sparse_pca = SparsePCA(n_components=nb_fts, alpha=0.5, tol=1e-4, verbose=False)
        X_train_split = sparse_pca.fit_transform(X_train_split)
        X_test_split = sparse_pca.transform(X_test_split)

#         retreived_fts_p = get_percentage_retreived_fts(sparse_pca, 
#                                                        X_train_split, 
#                                                        y_train_split, 
#                                                        fts_index) # new

        retreived_fts_p = 0.


    if fts_mode == "lfs": # supervised
        # Note that the model used in the LFS algo and the downstream classifier (current_model) are the same!
        lfs = SequentialFeatureSelector(current_model, n_features_to_select=nb_fts, direction="forward")
        X_train_split = lfs.fit_transform(X_train_split, y_train_split)
        X_test_split = lfs.transform(X_test_split)

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "lbs": # supervised
        # Note that the model used in the LFS algo and the downstream classifier (current_model) are the same!
        lfs = SequentialFeatureSelector(current_model, n_features_to_select=nb_fts, direction="backward")
        X_train_split = lfs.fit_transform(X_train_split, y_train_split)
        X_test_split = lfs.transform(X_test_split)

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "k-best": # supervised
        # k_best = SelectKBest(chi2, k=nb_fts)
        k_best = SelectKBest(f_classif, k=nb_fts)
        X_train_split = k_best.fit_transform(X_train_split, y_train_split)
        X_test_split = k_best.transform(X_test_split)  # no y here!
        
#         retreived_fts_p = get_percentage_retreived_fts(k_best, 
#                                                        X_train_split, 
#                                                        y_train_split, 
#                                                        fts_index)  # new

        retreived_fts_p = 0.  # to be implemented if needed
    

    if fts_mode == "k-best-mi": # supervised
        k_best = SelectKBest(mutual_info_classif, k=nb_fts)
        X_train_split = k_best.fit_transform(X_train_split, y_train_split)
        X_test_split = k_best.transform(X_test_split)  # no y here!

#         retreived_fts_p = get_percentage_retreived_fts(k_best, 
#                                                        X_train_split, 
#                                                        y_train_split, 
#                                                        fts_index)  # new

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "fcbf": # supervised
        X_train_split_df = pd.DataFrame(X_train_split)
        X_test_split_df = pd.DataFrame(X_test_split)
        y_train_split_df = pd.Series(y_train_split).astype(int)

        nb_cols = X_train
        _split.shape[1] + 1
        dataset = pd.concat([X_train_split_df, y_train_split_df], axis=1)
        dataset.columns = list(range(nb_cols))
        X_train_split_df = dataset.iloc[:, :-1]
        y_train_split_df = dataset.iloc[:, -1].astype(int)
        
        fts_ind, _, _ = fcbf(X_train_split_df, y_train_split_df, su_threshold=0.1, base=2)
        print("Selected features", fts_ind)       # only one feature selected whatever the threshold???
        X_train_split = X_train_split[:, fts_ind]
        X_test_split = X_test_split[:, fts_ind]

        retreived_fts_p = 0.  # to be implemented if needed

    return X_train_split, y_train_split, X_test_split, y_test_split, retreived_fts_p

In [18]:
# *** new function ***
def fit_model(X_train_split, y_train_split, X_test_split, y_test_split, model="knn"):
    
    # 1. model
    if model == "knn":
        current_model = KNeighborsClassifier()
    elif model == "lr":
        current_model = LogisticRegression()
    elif model == "svc":
        current_model = SVC()
    elif model == "nb-gaussian":
        current_model = GaussianNB()
    elif model == "nb-complement":
        current_model = ComplementNB()
    elif model == "nb-bernouilli":
        current_model = BernoulliNB()
    elif model == "nb-categorical":
        current_model = CategoricalNB()
    elif model == "rf":
        current_model = RandomForestClassifier()
    
    current_model.fit(X_train_split, y_train_split)
    y_test_preds = current_model.predict(X_test_split)

    # results
    # report = classification_report(y_test_split, y_test_preds)
    f1 = f1_score(y_test_split, y_test_preds, average='macro')
    b_acc = balanced_accuracy_score(y_test_split, y_test_preds)
        
    return f1, b_acc

## All experiments except Pk-LPNN at once

> - The following cell runs all feature selection modes (`fts_modes_l`) and all dowstream models (`models_l`).
> 
> - The results are then saved in `results_folder/`.
>
> - Hence, the individual sections (Full features, Random features, etc.) do not need to be executed anymore.

In [19]:
# *** new loop ***
# 10 times 10-fold CV: 100 experiments

results_all_d = {}

# 1. loop over feat modes:
for fts_mode in fts_modes_l:
        
    results_all_d[fts_mode] = {}

    # 2. 10 times 10-fold CV: 100 experiments
    for cv_d in tqdm(cv_splits_all):
        for train_indices, test_indices in zip(cv_d["train_splits"], cv_d["test_splits"]):
        
            X_train_split, y_train_split, X_test_split, y_test_split, retreived_fts_p = select_features(train_indices, 
                                                                                                        test_indices,
                                                                                                        data=data,
                                                                                                        y=y,
                                                                                                        norm=False, # xxx
                                                                                                        fts_mode=fts_mode)       
            # 3. loop over models
            for model in models_l:
    
                if model not in results_all_d[fts_mode].keys():
                    results_all_d[fts_mode][model] = {"f1" : [], "b_acc" : [], "retreived_fts_p" : []}
                
                f1, b_acc = fit_model(X_train_split, 
                                      y_train_split, 
                                      X_test_split, 
                                      y_test_split, 
                                      model=model)
                
                results_all_d[fts_mode][model]["f1"].append(f1)
                results_all_d[fts_mode][model]["b_acc"].append(b_acc)
                results_all_d[fts_mode][model]["retreived_fts_p"].append(retreived_fts_p)

    # save all results for fts_mode
    for model in models_l:
        
        with open(os.path.join(results_folder, f"{fts_mode}_{nb_fts}_{model}.pkl"), "wb") as fh:
            pickle.dump(results_all_d[fts_mode][model], fh)

100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:10<00:00,  1.07s/it]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:01<00:00,  5.84it/s]
100%|█████████████████████████████████████████████████████████████████████████████████████████████████████████████| 10/10 [00:03<00:00,  2.80it/s]


In [20]:
for fts_mode in fts_modes_l:

    for model in models_l:
        print("*"*60)
        
        scores_full_fts = results_all_d[fts_mode][model]
    
        print(f"*** Features mode: {fts_mode} - Model: {model} ***")
        print(f"""Test: macro F1 (mean, std): \t\t{np.mean(scores_full_fts["f1"])}""")
        print(f"""Test: balanced accuracy (mean, std): \t{np.mean(scores_full_fts["b_acc"])}""")

************************************************************
*** Features mode: full - Model: knn ***
Test: macro F1 (mean, std): 		0.8505012454865397
Test: balanced accuracy (mean, std): 	0.8690871212121213
************************************************************
*** Features mode: full - Model: lr ***
Test: macro F1 (mean, std): 		0.8287126917630013
Test: balanced accuracy (mean, std): 	0.8429280303030304
************************************************************
*** Features mode: full - Model: svc ***
Test: macro F1 (mean, std): 		0.7246005024387376
Test: balanced accuracy (mean, std): 	0.7752074314574315
************************************************************
*** Features mode: full - Model: nb-gaussian ***
Test: macro F1 (mean, std): 		0.4657757626360567
Test: balanced accuracy (mean, std): 	0.5215842352092352
************************************************************
*** Features mode: random - Model: knn ***
Test: macro F1 (mean, std): 		0.4862052776944419
Test: ba

> **STOP**

## Pk-LPNN-selected features (normalized)

> - This is the code for the Pk-LPNN experiments.

In [21]:
# Repeat 10 times:
#   10-fold CV
#   train PK-LPNN on 9 folds                     -> Nz selected fts
#   test PK-LPNN on 1 fold (KNN + selected fts)  -> b_acc, F1-score

In [22]:
# Security...

try:
    del X_train_split
    del y_train_split
    del X_test_split
    del y_test_split
    del f1
    del b_acc
    print("Variables deleted...")
except:
    pass

Variables deleted...


In [23]:
def LPNN_experiment(X_train_split, y_train_split,
                    p, Nz, k, mu_0=0.5, train_indices=None):

    # 1. normalize (here and not later!)
    X_train_split = normalize(X_train_split, axis=0)
    
    # 2. Initialization
    beta_0, mu_0 = beta_0_and_mu_0(p=p, Nz=Nz, k=k, mu_0=mu_0, method="Pk-LPNN_v2")
    # check_conditions(beta, X, beta_0, n, Nz, k, method=method)

    # 3. dynamical system
    z0 = np.hstack([beta_0, mu_0])
    t_span = (0, 30) # (0, 30)
    t = t_span[1]
    eta = Nz

    # with tqdm() as pbar: # too much printing
        
    sol = solve_ivp(LPNN, 
                    t_span=t_span, 
                    y0=z0, 
                    args=(X_train_split, y_train_split, eta, k, "Pk-LPNN_v2"), #, pbar),
                    method="RK45", # DOP853, RK45
                    dense_output=False, 
                    max_step=0.1, 
                    atol=1.2e-4, 
                    rtol=1e-4)

    beta_sol = sol["y"][:-1, -1]
    mu_sol = sol["y"][-1, -1]

    selected_ind = np.argpartition(np.abs(beta_sol), -Nz)[-Nz:]
    
    return list(selected_ind)

In [24]:
# single experiment for selected features

def downstream_models(data=data, y=y, norm=True,
               train_indices=None, test_indices=None, selected_ind=None, 
               model="knn"):
    
    # 1. fts selection    
    current_data = data[:, selected_ind]

    # 2. split
    # train set
    X_train_split = current_data[train_indices, :]
    if norm:
        X_train_split = normalize(X_train_split, axis=0)
    y_train_split = y[train_indices]
    y_train_split = 2 * y_train_split - 1                        # rescale targets in {-1, +1}

    # test set
    X_test_split = current_data[test_indices, :]
    if norm:
        X_test_split = normalize(X_test_split, axis=0)
    y_test_split = y[test_indices]
    y_test_split = 2 * y_test_split - 1                          # rescale targets in {-1, +1}

    # 3. model
    if model == "knn":
        current_model = KNeighborsClassifier()
    elif model == "lr":
        current_model = LogisticRegression()
    elif model == "svc":
        current_model = SVC()
    elif model == "nb-gaussian":
        current_model = GaussianNB()
    elif model == "nb-complement":
        current_model = ComplementNB()
    elif model == "nb-bernouilli":
        current_model = BernoulliNB()
    elif model == "nb-categorical":
        current_model = CategoricalNB()
    elif model == "rf":
        current_model = RandomForestClassifier()
    
    current_model.fit(X_train_split, y_train_split)
    y_test_preds = current_model.predict(X_test_split)

    # results
    # report = classification_report(y_test_split, y_test_preds)
    f1 = f1_score(y_test_split, y_test_preds, average='macro')
    b_acc = balanced_accuracy_score(y_test_split, y_test_preds)
    
    return f1, b_acc

In [25]:
# All experiments: 10 times 10-fold CV: 100 experiments

results_d = {}

for i, cv_d in tqdm(enumerate(cv_splits_all)):

    for train_indices, test_indices in zip(cv_d["train_splits"], cv_d["test_splits"]):
            
        # train set
        X_train_split = data[train_indices, :]
        y_train_split = y[train_indices]
        y_train_split = 2 * y_train_split - 1        # rescale targets in {-1, +1}

        # parameters
        k = 1000     # check effect of k
        Nz = nb_fts  # i.e. 178 Nz_l[0]
        # sigma = 0.02  # useless here, no noise
        mu_0 = 0.5

        # Pk-LPNN ft selection
        selected_ind = LPNN_experiment(X_train_split, y_train_split,
                                       p, Nz, k, mu_0=0.5, train_indices=train_indices)
        
        true_cap_retreived_fts = set(selected_ind).intersection(set(fts_index))
        retreived_fts_p = len(true_cap_retreived_fts) / nb_fts
                
        # model with selected fts
        for model in models_l:

            if model not in results_d: # create dict if not exists
                results_d[model] = {"f1" : [], "b_acc" : [], "retreived_fts_p" : []}
            
            f1, b_acc = downstream_models(data=data, y=y,
                                          norm=False, # xxx 
                                          train_indices=train_indices, 
                                          test_indices=test_indices, 
                                          selected_ind=selected_ind, 
                                          model=model)
                        
            results_d[model]["f1"].append(f1)
            results_d[model]["b_acc"].append(b_acc)
            results_d[model]["retreived_fts_p"].append(retreived_fts_p)

    print(f"CV {i+1} finished for all models.")
    

# save results
for model in models_l:
    
    with open(os.path.join(results_folder, f"pk-lpnn_{nb_fts}_{model}.pkl"), "wb") as fh:
        pickle.dump(results_d[model], fh)

    print(f"*** Features mode: Pk-LPNN - Model: {model} ***")
    print(f"""Test: macro F1 (mean, std): \t\t{np.mean(results_d[model]["f1"])}""")
    print(f"""Test: balanced accuracy (mean, std): \t{np.mean(results_d[model]["b_acc"])}""")

1it [02:38, 158.21s/it]

CV 1 finished for all models.


2it [05:23, 162.21s/it]

CV 2 finished for all models.


3it [07:56, 157.93s/it]

CV 3 finished for all models.


4it [10:29, 156.11s/it]

CV 4 finished for all models.


5it [12:57, 153.21s/it]

CV 5 finished for all models.


6it [15:26, 151.75s/it]

CV 6 finished for all models.


7it [17:55, 150.86s/it]

CV 7 finished for all models.


8it [20:24, 150.20s/it]

CV 8 finished for all models.


9it [22:59, 151.69s/it]

CV 9 finished for all models.


10it [25:26, 152.70s/it]

CV 10 finished for all models.
*** Features mode: Pk-LPNN - Model: knn ***
Test: macro F1 (mean, std): 		0.6279194956660127
Test: balanced accuracy (mean, std): 	0.6516280663780664
*** Features mode: Pk-LPNN - Model: lr ***
Test: macro F1 (mean, std): 		0.6030572139418888
Test: balanced accuracy (mean, std): 	0.6218993506493506
*** Features mode: Pk-LPNN - Model: svc ***
Test: macro F1 (mean, std): 		0.6317216250381885
Test: balanced accuracy (mean, std): 	0.6568495670995671
*** Features mode: Pk-LPNN - Model: nb-gaussian ***
Test: macro F1 (mean, std): 		0.5902101535375529
Test: balanced accuracy (mean, std): 	0.6069330808080808





> **TO DO**
> - Get percentage of retreived features for
>   - ✅ `Pk-LPNN`
>   - ✅ `sparse-pca`
>   - ✅ `k-best`
>   - ✅ `k-best-mi`
>   - ✅ `random`
> - ✅ Add these to the resutls
> - Automate experiments
> - Run experiments
>   - `p = 2000` : `percent = 1.25%, 2.5%, 5%, 10%` / `n_ = 100, 40, 20` => `n = 20,  50,  100`
>   - `p = 4000` : `percent = 1.25%, 2.5%, 5%, 10%` / `n_ = 100, 40, 20` => `n = 40,  100, 200`
>   - `p = 6000` : `percent = 1.25%, 2.5%, 5%, 10%` / `n_ = 100, 40, 20` => `n = 60,  150, 300`
>   - `p = 8000` : `percent = 1.25%, 2.5%, 5%, 10%` / `n_ = 100, 40, 20` => `n = 80,  200, 400`
>   - `p = 10000`: `percent = 1.25%, 2.5%, 5%, 10%` / `n_ = 100, 40, 20` => `n = 100, 250, 500`

**Remarks**
1. Rescaling targets between -1 and +1 does not affect classical feature selection methods but drastically improves Pk-LPNN. Hence, we adopt the rescaling targets schemes.
2. Normalizing data decreases the results! We don't normalize now. But we still normalize in `LPNN_experiment` function.
3. In `make_classification`, setting `shuffle = True` drastically improves the results of Pk-LPNN.

**Troubleshooting:**
- Le problème venait de la seed! La fonction `LPNN_experiment` appelle `beta_0_and_mu_0` de `models/` qui utilise de l'aléatoire (`np.random`). En ajoutant/supprimant/ré-exécutant des cellules du notebook, le fontionnement de `beta_0_and_mu_0` est moodifié, et donc les résultats également.

**Remarque**
- Tester avec $N_z \in \{ 0.75\%, 1.00\%, 1.25\% \}$ et $n \simeq N_z \cdot \log(\frac{p}{N_z})$
-Se comparer à `k-best` et `k-best-mi` et pas à `full-fts` (qui est excellent).