In [1]:
%cd /kaggle/input/chemcancer-v2/src/
%mkdir /kaggle/working/CV_BLS_fold_data
%mkdir /kaggle/working/CV_BLS_cm

/kaggle/input/chemcancer-v2/src


In [2]:
pip install rvfln

Collecting rvfln
  Downloading rvfln-0.0.6.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: rvfln
  Building wheel for rvfln (setup.py) ... [?25ldone
[?25h  Created wheel for rvfln: filename=rvfln-0.0.6-py3-none-any.whl size=5114 sha256=298b6019bd575b62e878c2352db2e5fc8ebf89097321112494e33576f2301208
  Stored in directory: /root/.cache/pip/wheels/b3/d7/24/732c199b48ade58f8add6a6273be34bb89a52088c38e4864af
Successfully built rvfln
Installing collected packages: rvfln
Successfully installed rvfln-0.0.6
Note: you may need to restart the kernel to use updated packages.


In [3]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from tensorflow.keras.optimizers import Adam
from data import *
from machine_learning_models import *
from deep_learning_models import *
from vision_transformer import *
from utils_dl_model import *
from utils_ml_model import print_ml_results
from sklearn.model_selection import train_test_split
from rvfln.bls import BLSClassifier
from sklearn.metrics import accuracy_score

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [4]:
# --- Reproducibility -------------------------------------------------------
# Single seed used for NumPy's global random generator.
SEED = 7
np.random.seed(SEED)

# --- Dataset splitting -----------------------------------------------------
# Fraction of the full dataset held out as the test set.
TEST_SET = 0.2
# Fraction of the training data held out as the validation set.
VAL_SET = 0.2

# --- Deep-learning training parameters ---------------------------------------
DL_EPOCH, DL_BATCH_SIZE = 500, 32

# Version tags embedded in the saved model file names.
DL_CNN_VERSION = 3
DL_TRANSFORMER_VISION_VERSION = 11
DL_BLS_VERSION = 1

# --- Pipeline switches -------------------------------------------------------
# Which stages to run.
DO_DL, CV_DL, OPT_DL = True, True, False

# Which model family to run.
DO_CNN = False
DO_TRANSFORMER_VISION = False
DO_BLS = True
DO_ML = False

In [5]:
# Folder path associated with deep learning models
dl_models_folder = "/kaggle/working/Deep_Learning_models/"
dl_metrics_folder = "/kaggle/working/Deep_Learning_metrics/"
dl_weights_folder = "/kaggle/working/Deep_Learning_weights/"
dl_cv_models_folder = "/kaggle/working/Deep_Learning_CV/"
dl_cv_results_folder = "/kaggle/working/Deep_Learning_CV_results/"

# Folder path associated with machine learning models
ml_models_folder = "/kaggle/working/Machine_Learning_models/"
ml_models_results_folder = "/kaggle/working/Machine_Learning_models_results/"

# Model names (Saved in h5 format)
cnn_model_name = f"cnn_v{DL_CNN_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"
transformer_vis_model_name = f"transformer_vision_v{DL_TRANSFORMER_VISION_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"
bls_model_name = f"bls_v{DL_BLS_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"

# Metric filenames
cnn_metrics_filename = f"metrics_{cnn_model_name}.json"
transformer_vis_metrics_filename = f"metrics_{transformer_vis_model_name}.json"
bls_metrics_filename = f"metrics_{bls_model_name}.json"

# Weight filenames
cnn_weights_filename = f"weights_{cnn_model_name}.json"
transformer_vis_weights_filename = f"weights_{transformer_vis_model_name}.json"
bls_weights_filename = f"weights_{bls_model_name}.json"

# Pick the file names of the active model once, instead of repeating the same
# if/elif chain for every path. Precedence is unchanged (CNN first, then the vision
# transformer); BLS is added as a third option so the paths are also defined when
# only DO_BLS is enabled.
if DO_CNN:
    _active_dl_files = (cnn_model_name, cnn_metrics_filename, cnn_weights_filename)
elif DO_TRANSFORMER_VISION:
    _active_dl_files = (transformer_vis_model_name,
                        transformer_vis_metrics_filename,
                        transformer_vis_weights_filename)
elif DO_BLS:
    _active_dl_files = (bls_model_name, bls_metrics_filename, bls_weights_filename)
else:
    _active_dl_files = None

# Deep Learning model / metrics / weights paths.
# They are always bound: None means "no deep-learning model selected", which is
# easier to diagnose downstream than a NameError on an unassigned variable.
if _active_dl_files is None:
    dl_model_path = None
    dl_metrics_path = None
    dl_weights_path = None
else:
    dl_model_path = os.path.join(dl_models_folder, _active_dl_files[0])
    dl_metrics_path = os.path.join(dl_metrics_folder, _active_dl_files[1])
    dl_weights_path = os.path.join(dl_weights_folder, _active_dl_files[2])

In [6]:
# Path of the dataset CSV. It is relative to the current working directory, which the
# first cell sets to .../chemcancer-v2/src/ via %cd. Nothing is loaded in this cell.
data_file = "../Data/HC05_HC07.csv"

In [7]:
# Load the data
# load_data comes from the project's modules (star-imported above); the result exposes
# a .shape attribute. In the recorded run it printed (2373, 274).
print("Load the data")
data = load_data(data_file)
print(f"Data shape : {data.shape}")

# Extract the feature and target data
# extract_data splits the loaded table into the feature matrix X and the label vector y.
# In the recorded run X was (2373, 270) and y was (2373,).
print("Extract the feature and target data")
X, y = extract_data(data)
print(f"X shaped: {X.shape}")
print(f"y shaped: {y.shape}")

Load the data
Data shape : (2373, 274)
Extract the feature and target data
X shaped: (2373, 270)
y shaped: (2373,)


In [8]:
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from rvfln.bls import BLSClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def perform_bls_cv(X, y, n_z, n_z_features, n_h, alpha, n_splits=5, random_state=None,
                   fold_data_dir="/kaggle/working/CV_BLS_fold_data",
                   cm_dir="/kaggle/working/CV_BLS_cm"):
    """Run stratified k-fold cross-validation of a BLSClassifier and report metrics.

    For every fold the data is split, passed through ``standardize_data``, pickled to
    ``fold_data_dir``, and used to fit and evaluate a ``BLSClassifier``. Macro-averaged
    precision/recall/F1, accuracy, the classifier's own ``score`` and the confusion
    matrix are collected per fold; their mean and standard deviation are printed.

    Parameters
    ----------
    X : array-like
        Feature matrix, indexable by integer row arrays after ``np.asarray``.
    y : array-like
        Class labels, one per row of ``X``.
    n_z, n_z_features, n_h, alpha
        Forwarded unchanged to ``BLSClassifier``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` falls back to the notebook-level ``SEED``.
    fold_data_dir : str
        Directory receiving ``fold_<k>_data.pkl`` (created if missing).
    cm_dir : str
        Directory receiving ``BLS_cm_fold_<k>.pkl`` (created if missing).

    Returns
    -------
    float
        Mean of ``BLSClassifier.score`` over the folds.
    """
    # Imported locally so the function does not rely on these names leaking into the
    # notebook namespace through one of the star imports.
    import os
    import pickle

    # The notebook's configuration cell defines SEED; the previously referenced
    # `seed_value` is not defined anywhere in the visible notebook.
    if random_state is None:
        random_state = SEED

    # Do not depend on the directories having been created by an earlier cell.
    os.makedirs(fold_data_dir, exist_ok=True)
    os.makedirs(cm_dir, exist_ok=True)

    # Positional index arrays from the splitter are applied directly below.
    X = np.asarray(X)
    y = np.asarray(y)

    skf = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Labels double as the wording used in the summary printout.
    metric_labels = ("BLS Test score", "accuracies", "precision", "recall", "F1 score")
    per_fold = {label: [] for label in metric_labels}
    confusion_matrices_bls = []

    for fold, (train_index, test_index) in enumerate(skf.split(X, y), start=1):
        # Split the data into training and held-out sets for this fold
        print("=" * 40)
        print(f"Fold: {fold}")
        print("Splitting the data")
        X_train, X_test = X[train_index], X[test_index]
        y_train, y_test = y[train_index], y[test_index]

        # Standardize the data (project helper; called per fold on the split data)
        print("Standardizing the data")
        X_train, X_test = standardize_data(X_train, X_test)

        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        print("Saving the data")
        fold_data = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        }
        with open(os.path.join(fold_data_dir, f"fold_{fold}_data.pkl"), 'wb') as f:
            pickle.dump(fold_data, f)

        # A fresh estimator per fold guarantees no fitted state carries over from the
        # previous fold.
        bls = BLSClassifier(n_z=n_z, n_z_features=n_z_features, n_h=n_h, alpha=alpha)
        bls.fit(X_train, y_train)
        score = bls.score(X_test, y_test)
        y_pred = bls.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Macro averaging weights every class equally regardless of its support.
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        cm = confusion_matrix(y_test, y_pred)

        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        with open(os.path.join(cm_dir, f"BLS_cm_fold_{fold}.pkl"), 'wb') as cm_file:
            pickle.dump(cm, cm_file)

        # Record this fold's results
        per_fold["BLS Test score"].append(score)
        per_fold["accuracies"].append(accuracy)
        per_fold["precision"].append(precision)
        per_fold["recall"].append(recall)
        per_fold["F1 score"].append(f1)
        confusion_matrices_bls.append(cm)

        print(f"Fold {fold}, BLS Test Score: {score:.4f}")

    # Summary: means first, then standard deviations, then the raw confusion matrices
    for label in metric_labels:
        print(f"Mean {label} over stratified {n_splits}-fold cross-validation: {np.mean(per_fold[label]):.4f}")
    print("*" * 40)
    for label in metric_labels:
        print(f"Standard deviation of {label} over stratified {n_splits}-fold cross-validation: {np.std(per_fold[label]):.4f}")
    print(confusion_matrices_bls)

    return np.mean(per_fold["BLS Test score"])

In [16]:
# BLS hyperparameters for this cross-validation run.
n_z, n_z_features, n_h, alpha = 12, 50, 100, 1e-4

# The cell's value (mean fold score) is displayed as the cell output.
perform_bls_cv(X, y, n_z=n_z, n_z_features=n_z_features, n_h=n_h, alpha=alpha)

Fold: 1
Splitting the data
Standardizing the data
X_train shape: (1898, 270)
X_test shape: (475, 270)
y_train shape: (1898,)
y_test shape: (475,)
Saving the data
Precision: 0.7337
Recall: 0.6997
F1 Score: 0.7027
Fold 1, BLS Test Score: 0.7158
Fold: 2
Splitting the data
Standardizing the data
X_train shape: (1898, 270)
X_test shape: (475, 270)
y_train shape: (1898,)
y_test shape: (475,)
Saving the data
Precision: 0.7151
Recall: 0.6903
F1 Score: 0.6931
Fold 2, BLS Test Score: 0.7053
Fold: 3
Splitting the data
Standardizing the data
X_train shape: (1898, 270)
X_test shape: (475, 270)
y_train shape: (1898,)
y_test shape: (475,)
Saving the data
Precision: 0.7413
Recall: 0.6918
F1 Score: 0.6953
Fold 3, BLS Test Score: 0.7095
Fold: 4
Splitting the data
Standardizing the data
X_train shape: (1899, 270)
X_test shape: (474, 270)
y_train shape: (1899,)
y_test shape: (474,)
Saving the data
Precision: 0.6744
Recall: 0.6563
F1 Score: 0.6555
Fold 4, BLS Test Score: 0.6751
Fold: 5
Splitting the data
S

0.7033204530313125

n_z is the number of feature-mapping groups (sets of feature nodes) in the feature-mapping layer; the enhancement layer is controlled separately by n_h. It should ideally be set considering the dimensionality of your input data. A larger n_z means the model can learn more complex representations, but it may also lead to overfitting if set too high. For this data with 270 features, a reasonable starting point might be around 10-50.

n_z_features is the number of random features generated within each feature-mapping group. This parameter directly controls the model's complexity and its computational requirements. A higher number will allow the model to learn more complex representations, but it will also increase the computational cost and may lead to overfitting. A reasonable starting point might be 100-500.

n_h is the number of enhancement nodes. This is another parameter controlling the model's complexity. A higher number will allow the model to learn more complex representations, but it will also increase the computational cost and may lead to overfitting. A reasonable starting point might be 500-2000.

alpha is a regularization parameter. It helps prevent overfitting by adding a penalty to the loss function based on the weights' magnitude. It is typically a small positive value. Common choices are 0.1, 0.01, or 0.001.

Best parameters:  {'alpha': 0.001, 'n_h': 50, 'n_z': 10, 'n_z_features': 50}
Best score:  0.7470962366338009