In [9]:
%cd /kaggle/input/chemcancer-v2/src/

/kaggle/input/chemcancer-v2/src


In [10]:
pip install rvfln

Note: you may need to restart the kernel to use updated packages.


In [11]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from tensorflow.keras.optimizers import Adam
from data import *
from machine_learning_models import *
from deep_learning_models import *
from vision_transformer import *
from utils_dl_model import *
from utils_ml_model import print_ml_results
from sklearn.model_selection import train_test_split
from rvfln.bls import BLSClassifier
from sklearn.metrics import accuracy_score

In [12]:
# Set the seed value.
SEED = 7
np.random.seed(SEED)

# Deep Learning parameters
DL_EPOCH = 500
DL_BATCH_SIZE = 32
DL_CNN_VERSION = 3
DL_TRANSFORMER_VISION_VERSION = 11
DL_BLS_VERSION = 1

DO_DL = True
CV_DL = True
OPT_DL = False

DO_CNN = False
DO_TRANSFORMER_VISION = False
DO_BLS = True
DO_ML = False

# Percentage of test set out of the dataset.
TEST_SET = 0.2

# Percentage of validation set out of the training dataset.
VAL_SET = 0.2

In [13]:
# Folder path associated with deep learning models
dl_models_folder = "/kaggle/working/Deep_Learning_models/"
dl_metrics_folder = "/kaggle/working/Deep_Learning_metrics/"
dl_weights_folder = "/kaggle/working/Deep_Learning_weights/"
dl_cv_models_folder = "/kaggle/working/Deep_Learning_CV/"
dl_cv_results_folder = "/kaggle/working/Deep_Learning_CV_results/"

# Folder path associated with machine learning models
ml_models_folder = "/kaggle/working/Machine_Learning_models/"
ml_models_results_folder = "/kaggle/working/Machine_Learning_models_results/"

# Model names (Saved in h5 format)
cnn_model_name = f"cnn_v{DL_CNN_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"
transformer_vis_model_name = f"transformer_vision_v{DL_TRANSFORMER_VISION_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"
bls_model_name = f"bls_v{DL_BLS_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"

# Metric filenames
cnn_metrics_filename = f"metrics_{cnn_model_name}.json"
transformer_vis_metrics_filename = f"metrics_{transformer_vis_model_name}.json"

# Weight filenames
cnn_weights_filename = f"weights_{cnn_model_name}.json"
transformer_vis_weights_filename = f"weights_{transformer_vis_model_name}.json"

# Deep Learning models path
if DO_CNN:
    dl_model_path = os.path.join(
        dl_models_folder, cnn_model_name)
elif DO_TRANSFORMER_VISION:
    dl_model_path = os.path.join(
        dl_models_folder, transformer_vis_model_name)


# Deep Learning metrics path
if DO_CNN:
    dl_metrics_path = os.path.join(
        dl_metrics_folder, cnn_metrics_filename)
elif DO_TRANSFORMER_VISION:
    dl_metrics_path = os.path.join(
        dl_metrics_folder, transformer_vis_metrics_filename)


# Deep Learning weights path
if DO_CNN:
    dl_weights_path = os.path.join(
        dl_weights_folder, cnn_weights_filename)

elif DO_TRANSFORMER_VISION:
    dl_weights_path = os.path.join(
        dl_weights_folder, transformer_vis_weights_filename)

In [22]:
# Load the data
data_file = "../Data/HC05_HC07.csv"

if OPT_DL:
    X_filtered, y = preprocess_cv_raw_data(data_file)
    optimize_hyperparameters(X_filtered, y, n_trials=50)

else:
    # Train deep learning model with cross validation
    if CV_DL:
        print("CV preprocessing")
        X_filtered, y = preprocess_cv_raw_data(data_file)
    else:
        # Preprocess the raw data
        X_train, X_test, y_train, y_test = preprocess_raw_data(
            data_file, TEST_SET)        

CV preprocessing
Load the data
Data shape : (2373, 274)
Extract the feature and target data
X shaped: (2373, 270)
y shaped: (2373,)
Apply filters and background substraction to the features dataset


In [None]:
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from rvfln.bls import BLSClassifier

def objective(trial,X,y):
    n_z = trial.suggest_int('n_z', 10, 40)
    n_z_features = trial.suggest_int('n_z_features', 50, 200)
    n_h = trial.suggest_int('n_h', 50, 200)
    alpha = trial.suggest_loguniform('alpha', 0.0001, 0.1)

    bls = BLSClassifier(n_z=n_z, n_z_features=n_z_features, n_h=n_h, alpha=alpha)

    fold = 1
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_value)
    scores = []

    for train_index, test_index in skf.split(X, y):
        # Split the data into training and validation sets
        print("=" * 40)
        print(f"Fold: {fold}")
        print("Splitting the data")
        X_train_full, X_test = X[train_index], X[test_index]
        y_train_full, y_test = y[train_index], y[test_index]

        # Standardize the data
        print("Standardizing the data")
        X_train_full, X_test = standardize_data(X_train_full, X_test)

        # Split the training data into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=seed_value)

        # Fit the model and compute the accuracy
        bls.fit(X_train, y_train)
        score = bls.score(X_test, y_test)
        scores.append(score)
        print(f"Fold {fold}, BLS Test Score: {score:.4f}")
        fold += 1
    
    mean_scores = np.mean(scores)
    print(f"Mean BLS Test score over stratified {n_splits}-fold cross-validation: {mean_scores:.4f}")
    
    return mean_scores

# Create a study object
study = optuna.create_study(direction='maximize')

# Optimize the study
study.optimize(lambda trial: objective(trial, X_filtered, y), n_trials=100)

# Print the best parameters and the corresponding score
print("Best parameters: ", study.best_params)
print("Best score: ", study.best_value)

In [15]:
X_train, X_test, y_train, y_test = preprocess_raw_data(
    data_file, TEST_SET)

Load the data
Data shape : (2373, 274)
Extract the feature and target data
X shaped: (2373, 270)
y shaped: (2373,)
Apply filters and background substraction to the features dataset
Splitting the data...
Standardizing the data...


In [50]:
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from rvfln.bls import BLSClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def perform_bls_cv(X,y,n_z, n_z_features, n_h, alpha):

    bls = BLSClassifier(n_z=n_z, n_z_features=n_z_features, n_h=n_h, alpha=alpha)

    fold = 1
    n_splits = 5
    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=seed_value)
    scores = []
    # Initialize lists to store precision, recall, and F1 scores
    accuracies = []
    precisions = []
    recalls = []
    f1_scores = []

    for train_index, test_index in skf.split(X, y):
        # Split the data into training and validation sets
        print("=" * 40)
        print(f"Fold: {fold}")
        print("Splitting the data")
        X_train_full, X_test = X[train_index], X[test_index]
        y_train_full, y_test = y[train_index], y[test_index]

        # Split the training data into training and validation sets
        X_train, X_test, y_train, y_test = train_test_split(X_train_full, y_train_full, test_size=0.2, random_state=seed_value)
        
        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")
        
        # Fit the bls and compute the accuracy
        bls.fit(X_train, y_train)
        score = bls.score(X_test, y_test)
        y_pred = bls.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)
        
        # Calculate precision, recall, and F1 score
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')

        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        # Append the scores to the lists
        accuracies.append(accuracy)
        precisions.append(precision)
        recalls.append(recall)
        f1_scores.append(f1)
        scores.append(score)
        
        
        
        print(f"Fold {fold}, BLS Test Score: {score:.4f}")
        fold += 1
    
    scores = np.array(scores)
    accuracies = np.array(accuracies)
    precisions = np.array(precisions)
    recalls = np.array(recalls)
    f1_scores = np.array(f1_scores)
    
    # Calculate and print the mean and standard deviation of precision, recall, and F1 score
    print(f"Mean BLS Test score over stratified {n_splits}-fold cross-validation: {np.mean(scores):.4f}")
    print(f"Mean accuracies over stratified {n_splits}-fold cross-validation: {np.mean(accuracies):.4f}")
    print(f"Mean precision over stratified {n_splits}-fold cross-validation: {np.mean(precisions):.4f}")
    print(f"Mean recall over stratified {n_splits}-fold cross-validation: {np.mean(recalls):.4f}")
    print(f"Mean F1 score over stratified {n_splits}-fold cross-validation: {np.mean(f1_scores):.4f}")
    print("*" * 40)
    print(f"Standard deviation of BLS Test score over stratified {n_splits}-fold cross-validation: {np.std(scores):.4f}")
    print(f"Standard deviation of accuracies over stratified {n_splits}-fold cross-validation: {np.std(accuracies):.4f}")
    print(f"Standard deviation of precision over stratified {n_splits}-fold cross-validation: {np.std(precisions):.4f}")
    print(f"Standard deviation of recall over stratified {n_splits}-fold cross-validation: {np.std(recalls):.4f}")
    print(f"Standard deviation of F1 score over stratified {n_splits}-fold cross-validation: {np.std(f1_scores):.4f}")
    
    return np.mean(scores)

In [51]:
n_z=12
n_z_features=50
n_h=100
alpha=0.0001

perform_bls_cv(X_filtered,y,n_z, n_z_features, n_h, alpha)

Fold: 1
Splitting the data
X_train shape: (1518, 270)
X_test shape: (380, 270)
y_train shape: (1518,)
y_test shape: (380,)
Precision: 0.7508
Recall: 0.7414
F1 Score: 0.7419
Fold 1, BLS Test Score: 0.7500
Fold: 2
Splitting the data
X_train shape: (1518, 270)
X_test shape: (380, 270)
y_train shape: (1518,)
y_test shape: (380,)
Precision: 0.7587
Recall: 0.7368
F1 Score: 0.7384
Fold 2, BLS Test Score: 0.7474
Fold: 3
Splitting the data
X_train shape: (1518, 270)
X_test shape: (380, 270)
y_train shape: (1518,)
y_test shape: (380,)
Precision: 0.7587
Recall: 0.7407
F1 Score: 0.7452
Fold 3, BLS Test Score: 0.7553
Fold: 4
Splitting the data
X_train shape: (1519, 270)
X_test shape: (380, 270)
y_train shape: (1519,)
y_test shape: (380,)
Precision: 0.7660
Recall: 0.7426
F1 Score: 0.7486
Fold 4, BLS Test Score: 0.7579
Fold: 5
Splitting the data
X_train shape: (1519, 270)
X_test shape: (380, 270)
y_train shape: (1519,)
y_test shape: (380,)
Precision: 0.7386
Recall: 0.7278
F1 Score: 0.7304
Fold 5, BLS

0.7489473684210527

In [29]:
# Create a BLS classifier
bls = BLSClassifier(n_z=12, n_z_features=50, n_h=83, alpha=0.0002)

# Fit the classifier to the training data
bls.fit(X_train, y_train)
print(bls.score(X_test,y_test))
y_pred = np.argmax(bls.predict(X_test), axis=-1)
print(y_pred)

0.7536842105263157
13


In [10]:
# Create a BLS classifier
bls = BLSClassifier(n_z=5, n_z_features=50, n_h=500, alpha=0.1)

# Fit the classifier to the training data
bls.fit(X_train, y_train)
print(bls.score(X_test,y_test))

0.76


n_z is the number of feature nodes in the enhancement node layer. It should ideally be set considering the dimensionality of your input data. A larger n_z means the model can learn more complex representations, but it may also lead to overfitting if set too high. For your data with 270 features, a reasonable starting point might be around 10-50.

n_z_features is the number of random features generated from each feature node. This parameter directly controls the model's complexity and its computational requirements. A higher number will allow the model to learn more complex representations, but it will also increase the computational cost and may lead to overfitting. A reasonable starting point might be 100-500.

n_h is the number of enhancement nodes. This is another parameter controlling the model's complexity. A higher number will allow the model to learn more complex representations, but it will also increase the computational cost and may lead to overfitting. A reasonable starting point might be 500-2000.

alpha is a regularization parameter. It helps prevent overfitting by adding a penalty to the loss function based on the weights' magnitude. It is typically a small positive value. Common choices are 0.1, 0.01, or 0.001.

In [11]:
# Print the scores of the classifier
print(bls.score(X_test,y_test))

0.76


In [12]:
# Create a BLS classifier
bls = BLSClassifier(n_z=10, n_z_features=50, n_h=100, alpha=0.001)

# Fit the classifier to the training data
bls.fit(X_train, y_train)

# Print the scores of the classifier
print(bls.score(X_test,y_test))

0.7642105263157895


In [13]:
# Create a BLS classifier
bls = BLSClassifier(n_z=10, n_z_features=50, n_h=100, alpha=0.0001)

# Fit the classifier to the training data
bls.fit(X_train, y_train)

# Print the scores of the classifier
print(bls.score(X_test,y_test))

0.7663157894736842


In [14]:
# Create a BLS classifier
bls = BLSClassifier(n_z=10, n_z_features=50, n_h=100, alpha=0.0001)

# Fit the classifier to the training data
bls.fit(X_train, y_train)

# Print the scores of the classifier
print(bls.score(X_test,y_test))

0.7305263157894737


In [15]:
from sklearn.model_selection import GridSearchCV
from rvfln.bls import BLSClassifier
import joblib

# Define the BLSClassifier model
bls = BLSClassifier(n_z=10, n_z_features=50, n_h=50, alpha=0.0001)

# Define the hyperparameters and the range of their values for the grid search
param_grid = {
    'n_z': [10, 20, 30, 40],
    'n_z_features': [50, 100, 150, 200],
    'n_h': [50, 100, 150,200],
    'alpha': [0.0001, 0.001, 0.01, 0.1]
}

# Create a GridSearchCV object
grid_search = GridSearchCV(bls, param_grid, cv=5, scoring='accuracy',verbose=3)

# Fit the GridSearchCV object to the data
grid_search.fit(X_train, y_train)

# Print the best parameters and the corresponding score
print("Best parameters: ", grid_search.best_params_)
print("Best score: ", grid_search.best_score_)

# You can also retrieve the best estimator
best_bls = grid_search.best_estimator_

# Save it to a file
joblib.dump(best_bls, 'bls_model.pkl')



Fitting 5 folds for each of 256 candidates, totalling 1280 fits

[CV 1/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=50;, score=0.718 total time=   0.1s

[CV 2/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=50;, score=0.700 total time=   0.1s

[CV 3/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=50;, score=0.734 total time=   0.2s

[CV 4/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=50;, score=0.747 total time=   0.2s

[CV 5/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=50;, score=0.723 total time=   0.2s

[CV 1/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=100;, score=0.629 total time=   0.2s

[CV 2/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=100;, score=0.624 total time=   0.3s

[CV 3/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=100;, score=0.621 total time=   0.2s

[CV 4/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=100;, score=0.607 total time=   0.2s

[CV 5/5] END alpha=0.0001, n_h=50, n_z=10, n_z_features=100;, score=0.609 total time=   0.3s



OSError: [Errno 30] Read-only file system: 'bls_model.pkl'

Best parameters:  {'alpha': 0.001, 'n_h': 50, 'n_z': 10, 'n_z_features': 50}
Best score:  0.7470962366338009

In [None]:
# Create a BLS classifier
bls = BLSClassifier(n_z=10, n_z_features=50, n_h=50, alpha=0.001)

# Fit the classifier to the training data
bls.fit(X_train, y_train)

# Print the scores of the classifier
print(bls.score(X_test,y_test))