In [1]:
# ***       Parameters cell        ***
# *** Used to automate experiments ***

# View > Cell Toolbar > Tags: set the tag as "parameters"

p = 4000                  # 2000  # Default value
n_ = 10                   #  10   # Default value # Not the real n! Real n defined below...
percent_relevent = 4.25   # 3, 5  # Defalut value
percent = 2.5             # 2.5  # Default value (varying)

In [2]:
# Parameters
p = 4000
n_ = 10
percent_relevent = 4.25
percent = 2.5


# Genetic Application: Synthetic

## Librairies

In [3]:
# !pip install deeplake
# !pip install -U scikit-learn

# # --- For automated experiments --- #
# !pip install papermill
# !pip install jupyter_contrib_nbextensions
# !jupyter contrib nbextension install --user

In [4]:
import os
import sys

module_path = os.path.abspath(os.path.join('..'))

if module_path not in sys.path:
    sys.path.append(module_path)

In [5]:
import papermill as pm

import warnings
warnings.filterwarnings('ignore')

from tqdm import tqdm

import pickle

import random
import pandas as pd
import numpy as np
import scipy.io

#import deeplake
import sklearn
from sklearn.datasets import make_classification
from sklearn.datasets import fetch_openml

from sklearn.preprocessing import normalize
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import KFold

from sklearn.feature_selection import SequentialFeatureSelector, SelectKBest
from sklearn.feature_selection import f_classif, mutual_info_classif, r_regression, chi2

from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split

from sklearn.linear_model import Lasso
from sklearn.decomposition import PCA, SparsePCA
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.naive_bayes import GaussianNB, BernoulliNB, CategoricalNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

from sklearn.metrics import classification_report
from sklearn.metrics import accuracy_score
from sklearn.metrics import roc_auc_score
from sklearn.metrics import f1_score
from sklearn.metrics import balanced_accuracy_score

from src.utils import *
from src.models import *

import matplotlib.pyplot as plt
plt.style.use('ggplot')

In [6]:
%load_ext autoreload
%autoreload 2

## Parameters

In [7]:
# The parameters cell must be the first cell of the notebook

In [8]:
np.random.seed(42)

n =  p // n_    # redefine n
nb_fts = int(p * percent // 100)
n_relevant = int((percent_relevent / 100) * p)

print("*** Data ***")
print(f"Number features p:\t\t\t {p}")
print(f"Number of observations n:\t\t {n}")
print(f"Number of relevant features n_relevant:\t {n_relevant}")

print("*** Model ***")
print(f"Number of selected features N_z:\t {nb_fts}")

*** Data ***
Number features p:			 4000
Number of observations n:		 400
Number of relevant features n_relevant:	 170
*** Model ***
Number of selected features N_z:	 100


In [9]:
results_folder = os.path.join( f"../results/Synthetic/synthetic_data_{p}" ) # separate folders for different p

if not os.path.exists(results_folder):
    os.mkdir(results_folder)

results_folder = os.path.join( results_folder, f"{n}" )       # separate folders for different n

if not os.path.exists(results_folder):
    os.mkdir(results_folder)
    
results_folder = os.path.join( results_folder, f"{nb_fts}" )  # separate folders for different nb_fts

if not os.path.exists(results_folder):
    os.mkdir(results_folder)

## Models and Methods

In [10]:
# Choose your models

models_l = ["knn", 
            "lr", 
            "svc", 
            "nb-gaussian", 
            ### "nb-bernouilli", 
            ### "nb-categorical",
            ### "rf"
           ]

# Choose your feature selection methods
fts_modes_l = [#"full", 
               #"random", 
               #"k-best", 
               "k-best-mi",
               #"lasso",
               ###"pca", 
               # "sparse-pca",  # takes huge time...
               ###"lfs", 
               ###"lbs", 
              ]

## Create dataset

In [11]:
# Parameters
n_features = p             # Total number of features
n_observations = n         # Number of samples (rows)
n_important = n_relevant  # Number of informative (relevant) features
fts_index = None
np.random.seed(42)

In [12]:
def generate_classification_data(n, n_important, n_features):
    
    # Step 1: Generate the matrix of relevant features (random normal values)
    important_features = np.random.randn(n, n_important)              # Values from a normal distribution

    # Step 2: Generate redundant features (random linear combinations of relevant features)
    weights = np.random.randn(n_important, n_features - n_important)  # Random weights
    redundant_features = np.dot(important_features, weights)          # Random linear combinations

    # Step 3: Combine relevant and redundant features
    full_matrix = np.hstack((important_features, redundant_features)) # Concatenate horizontally

    # Step 4: Randomly shuffle the order of columns to disperse relevant features
    column_order = np.random.permutation(n_features)                  # Uniform random permutation
    data = full_matrix[:, column_order]                               # Apply shuffled order

    # Step 5: Generate a random model for binary classification
    random_vector = np.random.randn(n_important, 1)                   # Random vector from normal distribution
    scores = np.dot(important_features, random_vector)                # Compute scores

    # Classify: Positive scores as 1, negative scores as 0
    class_labels = np.ones(n)  # Initialize all to 1
    class_labels[scores.flatten() < np.mean(scores)] = 0              # Assign 0 for scores below the mean

    # Step 6: Verify properties
    rank_important = np.linalg.matrix_rank(important_features)        # Rank of relevant features matrix
    print(f"Rank of important features matrix:\t {rank_important}")

    # Display dimensions
    print(f"Size of full features matrix:\t\t {data.shape}")

    return data, class_labels

In [13]:
data, class_labels = generate_classification_data(n=n, n_important=n_relevant, n_features=p)
# print("Classification labels:")
# print(class_labels)

Rank of important features matrix:	 170
Size of full features matrix:		 (400, 4000)


In [14]:
y = class_labels # new
data.shape, y.shape

((400, 4000), (400,))

In [15]:
np.linalg.matrix_rank(data)

np.int64(170)

## Ten times 10-fold cross validation

In [16]:
def get_CV_splits(data=data, seed=42):

    cv_d = {"train_splits": [], "test_splits": []}

    kf = KFold(n_splits=10, shuffle=True, random_state=seed)
    kf.get_n_splits(data)

    for train_index, test_index in kf.split(data):

        cv_d["train_splits"].append(train_index)
        cv_d["test_splits"].append(test_index)
        
    return cv_d

In [17]:
cv_d = get_CV_splits(data=data, seed=42)

In [18]:
# 10 times 10-fold CV

cv_splits_all = []

for seed in tqdm([33, 42, 1, 5, 1979, 2024, 22, 12, 1996, 11]):
    
    cv_d = get_CV_splits(data, seed=seed)
    
    cv_splits_all.append(cv_d)

  0%|          | 0/10 [00:00<?, ?it/s]

100%|██████████| 10/10 [00:00<00:00, 3544.88it/s]




In [19]:
# cv_splits_all

In [20]:
# *** new function ***
def select_features(train_indices, test_indices, data=data, y=y, 
                    norm=True, fts_mode="full", fts_index=fts_index):

    # 2. fts selection
    if fts_mode == "random":
        rand_ind = np.random.randint(low=0, high=data.shape[1], size=nb_fts, dtype=int)
        current_data = data[:, rand_ind]

        # # percentage of retreiveed features
        # intersection = set(rand_ind).intersection(set(fts_index))
        # retreived_fts_p = len(intersection) / len(fts_index)
        retreived_fts_p = 0.

    else:
        current_data = data
        retreived_fts_p = 0.  # dummy value for "full" mode

    # 2. split
    # train set
    X_train_split = current_data[train_indices, :]
    if norm:
        X_train_split = normalize(X_train_split, axis=0)
    y_train_split = y[train_indices]

    label_encoder = LabelEncoder()
    y_train_split = label_encoder.fit_transform(y_train_split)
    y_train_split = 2 * y_train_split - 1               # rescale targets in {-1, +1}
    
    # test set
    X_test_split = current_data[test_indices, :]
    if norm:
        X_test_split = normalize(X_test_split, axis=0)
    y_test_split = y[test_indices]
    y_test_split = label_encoder.transform(y_test_split)
    y_test_split = 2 * y_test_split - 1                 # rescale targets in {-1, +1}
    
    if fts_mode == "lasso": # supervised
        lasso = Lasso(alpha=1)
        lasso.fit(X_train_split, y_train_split)
        coeffs = lasso.coef_[lasso.coef_ != 0]
        coeffs = np.abs(coeffs)
        coeffs = np.sort(coeffs)[-nb_fts:]
        lasso_idx = np.argwhere(np.abs(lasso.coef_) >= coeffs[0]).reshape(-1)
        X_train_split = X_train_split[:, lasso_idx]
        X_test_split = X_test_split[:, lasso_idx]
    
    if fts_mode == "pca": # unsupervised
        pca = PCA(n_components=min(nb_fts, len(X_train_split))) # PCA limited by nb of rows of X (64)
        X_train_split = pca.fit_transform(X_train_split)
        X_test_split = pca.transform(X_test_split)

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "sparse-pca": # unsupervised
        sparse_pca = SparsePCA(n_components=nb_fts, alpha=0.5, tol=1e-4, verbose=False)
        X_train_split = sparse_pca.fit_transform(X_train_split)
        X_test_split = sparse_pca.transform(X_test_split)

#         retreived_fts_p = get_percentage_retreived_fts(sparse_pca, 
#                                                        X_train_split, 
#                                                        y_train_split, 
#                                                        fts_index) # new

        retreived_fts_p = 0.


    if fts_mode == "lfs": # supervised
        # Note that the model used in the LFS algo and the downstream classifier (current_model) are the same!
        lfs = SequentialFeatureSelector(current_model, n_features_to_select=nb_fts, direction="forward")
        X_train_split = lfs.fit_transform(X_train_split, y_train_split)
        X_test_split = lfs.transform(X_test_split)

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "lbs": # supervised
        # Note that the model used in the LFS algo and the downstream classifier (current_model) are the same!
        lfs = SequentialFeatureSelector(current_model, n_features_to_select=nb_fts, direction="backward")
        X_train_split = lfs.fit_transform(X_train_split, y_train_split)
        X_test_split = lfs.transform(X_test_split)

        retreived_fts_p = 0.  # to be implemented if needed

    if fts_mode == "k-best": # supervised
        # k_best = SelectKBest(chi2, k=nb_fts)
        k_best = SelectKBest(f_classif, k=nb_fts)
        X_train_split = k_best.fit_transform(X_train_split, y_train_split)
        X_test_split = k_best.transform(X_test_split)  # no y here!
        
#         retreived_fts_p = get_percentage_retreived_fts(k_best, 
#                                                        X_train_split, 
#                                                        y_train_split, 
#                                                        fts_index)  # new

        retreived_fts_p = 0.  # to be implemented if needed
    

    if fts_mode == "k-best-mi": # supervised
        k_best = SelectKBest(mutual_info_classif, k=nb_fts)
        X_train_split = k_best.fit_transform(X_train_split, y_train_split)
        X_test_split = k_best.transform(X_test_split)  # no y here!

#         retreived_fts_p = get_percentage_retreived_fts(k_best, 
#                                                        X_train_split, 
#                                                        y_train_split, 
#                                                        fts_index)  # new

        retreived_fts_p = 0.  # to be implemented if needed

    return X_train_split, y_train_split, X_test_split, y_test_split, retreived_fts_p

In [21]:
# *** new function ***
def fit_model(X_train_split, y_train_split, X_test_split, y_test_split, model="knn"):
    
    # 1. model
    if model == "knn":
        current_model = KNeighborsClassifier()
    elif model == "lr":
        current_model = LogisticRegression()
    elif model == "svc":
        current_model = SVC()
    elif model == "nb-gaussian":
        current_model = GaussianNB()
    elif model == "nb-complement":
        current_model = ComplementNB()
    elif model == "nb-bernouilli":
        current_model = BernoulliNB()
    elif model == "nb-categorical":
        current_model = CategoricalNB()
    elif model == "rf":
        current_model = RandomForestClassifier()
    
    current_model.fit(X_train_split, y_train_split)
    y_test_preds = current_model.predict(X_test_split)

    # results
    # report = classification_report(y_test_split, y_test_preds)
    f1 = f1_score(y_test_split, y_test_preds, average='macro')
    b_acc = balanced_accuracy_score(y_test_split, y_test_preds)
        
    return f1, b_acc

## All experiments except Pk-LPNN at once

In [22]:
# *** new loop ***
# 10 times 10-fold CV: 100 experiments

results_all_d = {}

# 1. loop over feat modes:
for fts_mode in fts_modes_l:
        
    results_all_d[fts_mode] = {}

    # 2. 10 times 10-fold CV: 100 experiments
    for cv_d in tqdm(cv_splits_all):
        for train_indices, test_indices in zip(cv_d["train_splits"], cv_d["test_splits"]):
        
            X_train_split, y_train_split, X_test_split, y_test_split, retreived_fts_p = select_features(train_indices, 
                                                                                                        test_indices,
                                                                                                        data=data,
                                                                                                        y=y,
                                                                                                        norm=False, # drastically influences the results
                                                                                                        fts_mode=fts_mode)       
            # 3. loop over models
            for model in models_l:
    
                if model not in results_all_d[fts_mode].keys():
                    results_all_d[fts_mode][model] = {"f1" : [], "b_acc" : [], "retreived_fts_p" : []}
                
                f1, b_acc = fit_model(X_train_split, 
                                      y_train_split, 
                                      X_test_split, 
                                      y_test_split, 
                                      model=model)
                
                results_all_d[fts_mode][model]["f1"].append(f1)
                results_all_d[fts_mode][model]["b_acc"].append(b_acc)
                results_all_d[fts_mode][model]["retreived_fts_p"].append(retreived_fts_p)

    # save all results for fts_mode
    for model in models_l:
        
        with open(os.path.join(results_folder, f"{fts_mode}_{nb_fts}_{model}.pkl"), "wb") as fh:
            pickle.dump(results_all_d[fts_mode][model], fh)

  0%|          | 0/10 [00:00<?, ?it/s]

 10%|█         | 1/10 [00:34<05:09, 34.35s/it]

 20%|██        | 2/10 [01:08<04:33, 34.14s/it]

 30%|███       | 3/10 [01:42<04:00, 34.30s/it]

 40%|████      | 4/10 [02:16<03:24, 34.15s/it]

 50%|█████     | 5/10 [02:51<02:51, 34.37s/it]

 60%|██████    | 6/10 [03:25<02:17, 34.27s/it]

 70%|███████   | 7/10 [03:59<01:42, 34.18s/it]

 80%|████████  | 8/10 [04:33<01:08, 34.15s/it]

 90%|█████████ | 9/10 [05:07<00:34, 34.11s/it]

100%|██████████| 10/10 [05:41<00:00, 34.17s/it]

100%|██████████| 10/10 [05:41<00:00, 34.20s/it]




In [23]:
for fts_mode in fts_modes_l:

    for model in models_l:
        print("*"*60)
        
        scores_full_fts = results_all_d[fts_mode][model]
    
        print(f"*** Features mode: {fts_mode} - Model: {model} ***")
        print(f"""Test: macro F1 (mean, std): \t\t{np.mean(scores_full_fts["f1"])}""")
        print(f"""Test: balanced accuracy (mean, std): \t{np.mean(scores_full_fts["b_acc"])}""")

************************************************************
*** Features mode: k-best-mi - Model: knn ***
Test: macro F1 (mean, std): 		0.6030860655072339
Test: balanced accuracy (mean, std): 	0.612323114287561
************************************************************
*** Features mode: k-best-mi - Model: lr ***
Test: macro F1 (mean, std): 		0.7233683154953965
Test: balanced accuracy (mean, std): 	0.7291976229386866
************************************************************
*** Features mode: k-best-mi - Model: svc ***
Test: macro F1 (mean, std): 		0.7370353231106832
Test: balanced accuracy (mean, std): 	0.7436315102181436
************************************************************
*** Features mode: k-best-mi - Model: nb-gaussian ***
Test: macro F1 (mean, std): 		0.7148623810385659
Test: balanced accuracy (mean, std): 	0.722134821835446


In [24]:
# Break here to avoid Pk-LPNN experiments
# Comment for running all experiments
print("Experiments finished.")

Experiments finished.


## Pk-LPNN-selected features (normalized)

**Remark**
- Use $N_z \in \{ 0.75\%, 1.00\%, 1.25\% \}$ et $n \simeq N_z \cdot \log(\frac{p}{N_z})$