In [1]:
# Work from the project sources so the local modules (data, utils_*, ...) are importable.
%cd /kaggle/input/chemcancer-v2/src/
# Output folders for the per-fold data and confusion matrices.
# `-p` keeps the cell re-runnable: a plain mkdir errors out when the folder already exists.
%mkdir -p /kaggle/working/CV_BLS_fold_data
%mkdir -p /kaggle/working/CV_BLS_cm

/kaggle/input/chemcancer-v2/src


In [2]:
# Use the %pip magic so the package lands in the running kernel's environment,
# and pin the version that produced the results recorded in this notebook.
%pip install rvfln==0.0.6

Collecting rvfln
  Downloading rvfln-0.0.6.tar.gz (3.7 kB)
  Preparing metadata (setup.py) ... [?25ldone
[?25hBuilding wheels for collected packages: rvfln
  Building wheel for rvfln (setup.py) ... [?25ldone
[?25h  Created wheel for rvfln: filename=rvfln-0.0.6-py3-none-any.whl size=5083 sha256=18fbb8be8410da71554b141d341e3b21fa481fc22661dd85e36616a66dd1e7a9
  Stored in directory: /root/.cache/pip/wheels/b3/d7/24/732c199b48ade58f8add6a6273be34bb89a52088c38e4864af
Successfully built rvfln
Installing collected packages: rvfln
Successfully installed rvfln-0.0.6
Note: you may need to restart the kernel to use updated packages.


In [3]:
# Standard library
import os
import pickle
import time

# Third-party
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from tensorflow.keras.optimizers import Adam

# Project modules (star-import order preserved: later modules shadow earlier ones)
from data import *
from machine_learning_models import *
from deep_learning_models import *
from vision_transformer import *
from utils_dl_model import *
from utils_ml_model import print_ml_results

# Kept after the star imports so these explicit names are the ones left in scope
from sklearn.model_selection import train_test_split
from rvfln.bls import BLSClassifier
from sklearn.metrics import accuracy_score



In [4]:
# --- Reproducibility ---------------------------------------------------------
# Single seed for the notebook; NumPy's global RNG is seeded right away.
SEED = 7
np.random.seed(SEED)

# --- Which pipelines to run --------------------------------------------------
DO_DL = True
CV_DL = True
OPT_DL = False

# --- Which model families are enabled ----------------------------------------
DO_CNN = False
DO_TRANSFORMER_VISION = False
DO_BLS = True
DO_ML = False

# --- Deep learning hyper-parameters and model versions -----------------------
DL_EPOCH = 500
DL_BATCH_SIZE = 32
DL_CNN_VERSION = 3
DL_TRANSFORMER_VISION_VERSION = 11
DL_BLS_VERSION = 1

# --- Split ratios ------------------------------------------------------------
# Fraction of the dataset held out as the test set.
TEST_SET = 0.2
# Fraction of the training data held out as the validation set.
VAL_SET = 0.2

In [5]:
# Folder path associated with deep learning models
dl_models_folder = "/kaggle/working/Deep_Learning_models/"
dl_metrics_folder = "/kaggle/working/Deep_Learning_metrics/"
dl_weights_folder = "/kaggle/working/Deep_Learning_weights/"
dl_cv_models_folder = "/kaggle/working/Deep_Learning_CV/"
dl_cv_results_folder = "/kaggle/working/Deep_Learning_CV_results/"

# Folder path associated with machine learning models
ml_models_folder = "/kaggle/working/Machine_Learning_models/"
ml_models_results_folder = "/kaggle/working/Machine_Learning_models_results/"

# Model names (Saved in h5 format)
cnn_model_name = f"cnn_v{DL_CNN_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"
transformer_vis_model_name = f"transformer_vision_v{DL_TRANSFORMER_VISION_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"
bls_model_name = f"bls_v{DL_BLS_VERSION}_{DL_BATCH_SIZE}_{DL_EPOCH}_seed_{SEED}.h5"

# Metric filenames
cnn_metrics_filename = f"metrics_{cnn_model_name}.json"
transformer_vis_metrics_filename = f"metrics_{transformer_vis_model_name}.json"
bls_metrics_filename = f"metrics_{bls_model_name}.json"

# Weight filenames
cnn_weights_filename = f"weights_{cnn_model_name}.json"
transformer_vis_weights_filename = f"weights_{transformer_vis_model_name}.json"
bls_weights_filename = f"weights_{bls_model_name}.json"

# Pick the (model, metrics, weights) filenames of the enabled model once, instead of
# repeating the same flag test three times. The CNN > transformer priority of the
# original chains is kept; BLS is added so that a BLS-only configuration no longer
# leaves the dl_*_path variables undefined.
if DO_CNN:
    _dl_filenames = (cnn_model_name, cnn_metrics_filename, cnn_weights_filename)
elif DO_TRANSFORMER_VISION:
    _dl_filenames = (
        transformer_vis_model_name,
        transformer_vis_metrics_filename,
        transformer_vis_weights_filename,
    )
elif DO_BLS:
    _dl_filenames = (bls_model_name, bls_metrics_filename, bls_weights_filename)
else:
    _dl_filenames = None

if _dl_filenames is None:
    # No model enabled: define the names anyway so later cells can test for None
    # instead of hitting a NameError.
    dl_model_path = dl_metrics_path = dl_weights_path = None
else:
    # Deep Learning models / metrics / weights paths
    dl_model_path = os.path.join(dl_models_folder, _dl_filenames[0])
    dl_metrics_path = os.path.join(dl_metrics_folder, _dl_filenames[1])
    dl_weights_path = os.path.join(dl_weights_folder, _dl_filenames[2])

In [6]:
def extract_data_from_csv(filename="generated_data.csv"):
    """Load a CSV file and separate it into a feature matrix and a label vector.

    Parameters
    ----------
    filename : str
        Path of the CSV file. It must contain a column named ``labels``.

    Returns
    -------
    tuple
        ``(X, y)`` where ``X`` holds the values of every column except
        ``labels`` and ``y`` holds the values of the ``labels`` column.
    """
    table = pd.read_csv(filename)

    # The label column is the target; all remaining columns are features.
    targets = table["labels"].values
    features = table.drop(columns=["labels"]).values

    return features, targets

In [7]:
# Training CSV: generated without bksb, with slope and roll set to false.
train_data = "/kaggle/input/gen-cc-10x-v4/gen-cc-10x-v4.csv"

# Evaluation CSV: generated with bksb, with slope and roll set to true.
test_data = "/kaggle/input/test-gen-cc-10x-v4/test-gen-cc-10x-v4.csv"

In [8]:
# Augmented data: used for training.
X_augmented, y_augmented = extract_data_from_csv(train_data)
print(
    f"X_augmented shaped: {X_augmented.shape}",
    f"y_augmented shaped: {y_augmented.shape}",
    sep="\n",
)

# Original data: used for evaluation.
X_original, y_original = extract_data_from_csv(test_data)
print(
    f"X_original shaped: {X_original.shape}",
    f"y_original shaped: {y_original.shape}",
    sep="\n",
)

X_augmented shaped: (18980, 270)
y_augmented shaped: (18980,)
X_original shaped: (475, 270)
y_original shaped: (475,)


In [9]:
import optuna
from sklearn.model_selection import StratifiedKFold, train_test_split
from sklearn.preprocessing import StandardScaler
from rvfln.bls import BLSClassifier
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def perform_bls_cv(X_original, y_original, X_augmented, y_augmented,n_z, n_z_features, n_h, alpha,
                   n_splits=5, random_state=None,
                   fold_data_dir="/kaggle/working/CV_BLS_fold_data",
                   cm_dir="/kaggle/working/CV_BLS_cm"):
    """Fit a BLS classifier on stratified subsets of the augmented data and
    score every fit on the full original dataset.

    For each of the ``n_splits`` stratified folds of the augmented data, only
    the *training* indices are used; the held-out augmented part is discarded
    and the model is always evaluated on ``X_original`` / ``y_original``.
    The standardized fold data and the confusion matrix of each fold are
    pickled to ``fold_data_dir`` and ``cm_dir``.

    Parameters
    ----------
    X_original, y_original : array-like
        Evaluation features and labels (the same set is used for every fold).
    X_augmented, y_augmented : array-like
        Training pool that is split into stratified folds.
    n_z, n_z_features, n_h, alpha
        Forwarded unchanged to ``BLSClassifier``.
    n_splits : int, default 5
        Number of stratified folds.
    random_state : int or None, default None
        Seed for the fold shuffling. ``None`` falls back to the notebook-level
        ``SEED`` constant.
    fold_data_dir, cm_dir : str
        Output directories; created if they do not exist.

    Returns
    -------
    float
        Mean of ``bls.score`` over the folds.
    """
    if random_state is None:
        # The notebook configures its seed as SEED; `seed_value` is not defined
        # anywhere in the notebook itself.
        random_state = SEED

    # Make the function self-sufficient: do not depend on a setup cell having
    # created the output folders.
    os.makedirs(fold_data_dir, exist_ok=True)
    os.makedirs(cm_dir, exist_ok=True)

    # NOTE(review): one estimator instance is re-fitted on every fold, as in the
    # original code; this presumes `fit` re-initialises the model - confirm.
    bls = BLSClassifier(n_z=n_z, n_z_features=n_z_features, n_h=n_h, alpha=alpha)

    # Per-fold metric values, keyed by the label used in the printed summary.
    metrics = {
        "BLS Test score": [],
        "accuracies": [],
        "precision": [],
        "recall": [],
        "F1 score": [],
    }
    confusion_matrices_bls = []

    skf_augmented = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=random_state)

    # Only the train indices of each split are of interest for the augmented data.
    for fold, (train_index, _) in enumerate(skf_augmented.split(X_augmented, y_augmented), start=1):
        print("=" * 40)
        print(f"Fold: {fold}")
        print("Selecting the augmented data")

        X_train = X_augmented[train_index]
        y_train = y_augmented[train_index]

        # Standardize the data
        # `standardize_data` comes from the project modules; presumably it fits the
        # scaler on the first argument and applies it to both - verify.
        print("Standardizing the data")
        X_train, X_test = standardize_data(X_train, X_original)
        y_test = y_original

        print(f"X_train shape: {X_train.shape}")
        print(f"X_test shape: {X_test.shape}")
        print(f"y_train shape: {y_train.shape}")
        print(f"y_test shape: {y_test.shape}")

        print("Saving the data")
        fold_data = {
            'X_train': X_train,
            'X_test': X_test,
            'y_train': y_train,
            'y_test': y_test
        }
        with open(os.path.join(fold_data_dir, f"fold_{fold}_data.pkl"), 'wb') as f:
            pickle.dump(fold_data, f)

        # Fit the bls and compute the accuracy
        bls.fit(X_train, y_train)
        score = bls.score(X_test, y_test)
        y_pred = bls.predict(X_test)
        accuracy = accuracy_score(y_test, y_pred)

        # Macro-averaged precision, recall and F1, plus the confusion matrix
        precision = precision_score(y_test, y_pred, average='macro')
        recall = recall_score(y_test, y_pred, average='macro')
        f1 = f1_score(y_test, y_pred, average='macro')
        cm = confusion_matrix(y_test, y_pred)

        print(f"Precision: {precision:.4f}")
        print(f"Recall: {recall:.4f}")
        print(f"F1 Score: {f1:.4f}")

        with open(os.path.join(cm_dir, f"BLS_cm_fold_{fold}.pkl"), 'wb') as cm_file:
            pickle.dump(cm, cm_file)

        # Record this fold's results
        metrics["BLS Test score"].append(score)
        metrics["accuracies"].append(accuracy)
        metrics["precision"].append(precision)
        metrics["recall"].append(recall)
        metrics["F1 score"].append(f1)
        confusion_matrices_bls.append(cm)

        print(f"Fold {fold}, BLS Test Score: {score:.4f}")

    metrics = {label: np.array(values) for label, values in metrics.items()}

    # Print the mean, then the standard deviation, of every metric
    for label, values in metrics.items():
        print(f"Mean {label} over stratified {n_splits}-fold cross-validation: {np.mean(values):.4f}")
    print("*" * 40)
    for label, values in metrics.items():
        print(f"Standard deviation of {label} over stratified {n_splits}-fold cross-validation: {np.std(values):.4f}")
    print(confusion_matrices_bls)

    return np.mean(metrics["BLS Test score"])

In [18]:
# BLS hyper-parameters for this run.
n_z, n_z_features, n_h, alpha = 12, 50, 100, 0.0001

# The returned mean test score is the cell's displayed output.
perform_bls_cv(
    X_original,
    y_original,
    X_augmented,
    y_augmented,
    n_z=n_z,
    n_z_features=n_z_features,
    n_h=n_h,
    alpha=alpha,
)

Fold: 1
Selecting the augmented data
Standardizing the data
X_train shape: (15184, 270)
X_test shape: (475, 270)
y_train shape: (15184,)
y_test shape: (475,)
Saving the data
Precision: 0.7177
Recall: 0.6938
F1 Score: 0.6950
Fold 1, BLS Test Score: 0.7200
Fold: 2
Selecting the augmented data
Standardizing the data
X_train shape: (15184, 270)
X_test shape: (475, 270)
y_train shape: (15184,)
y_test shape: (475,)
Saving the data
Precision: 0.7270
Recall: 0.7007
F1 Score: 0.7046
Fold 2, BLS Test Score: 0.7242
Fold: 3
Selecting the augmented data
Standardizing the data
X_train shape: (15184, 270)
X_test shape: (475, 270)
y_train shape: (15184,)
y_test shape: (475,)
Saving the data
Precision: 0.7430
Recall: 0.7076
F1 Score: 0.7122
Fold 3, BLS Test Score: 0.7305
Fold: 4
Selecting the augmented data
Standardizing the data
X_train shape: (15184, 270)
X_test shape: (475, 270)
y_train shape: (15184,)
y_test shape: (475,)
Saving the data
Precision: 0.7300
Recall: 0.6936
F1 Score: 0.6987
Fold 4, BLS

0.7250526315789474

n_z is the number of feature-node groups in the mapped-feature layer (the enhancement layer is sized separately by n_h, described below). It should ideally be set considering the dimensionality of your input data. A larger n_z means the model can learn more complex representations, but it may also lead to overfitting if set too high. For your data with 270 features, a reasonable starting point might be around 10-50.

n_z_features is the number of random features generated in each feature-node group. This parameter directly controls the model's complexity and its computational requirements. A higher number will allow the model to learn more complex representations, but it will also increase the computational cost and may lead to overfitting. A reasonable starting point might be 100-500, although smaller values can work as well: the run above and the best parameters reported below both use 50.

n_h is the number of enhancement nodes. This is another parameter controlling the model's complexity. A higher number will allow the model to learn more complex representations, but it will also increase the computational cost and may lead to overfitting. A reasonable starting point might be 500-2000.

alpha is a regularization parameter. It helps prevent overfitting by adding a penalty to the loss function based on the weights' magnitude. It is typically a small positive value. Common choices are 0.1, 0.01, or 0.001.

Best parameters:  {'alpha': 0.001, 'n_h': 50, 'n_z': 10, 'n_z_features': 50}
Best score:  0.7470962366338009