In [3]:
%cd /kaggle/input/chemcancer-v2/src/
%mkdir /kaggle/working/Machine_Learning_models/
%mkdir /kaggle/working/Machine_Learning_models_results/

/kaggle/input/chemcancer-v2/src


In [4]:
import numpy as np
import matplotlib.pyplot as plt
import os
import time
from tensorflow.keras.optimizers import Adam
from data import *
from machine_learning_models import *
from deep_learning_models import *
from vision_transformer import *
from utils_dl_model import *
from utils_ml_model import print_ml_results
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io_plugins.so: undefined symbol: _ZN3tsl6StatusC1EN10tensorflow5error4CodeESt17basic_string_viewIcSt11char_traitsIcEENS_14SourceLocationE']
caused by: ['/opt/conda/lib/python3.10/site-packages/tensorflow_io/python/ops/libtensorflow_io.so: undefined symbol: _ZTVN10tensorflow13GcsFileSystemE']


In [5]:
# Set the seed value.
SEED = 7
np.random.seed(SEED)

# Deep Learning parameters
DL_EPOCH = 500
DL_BATCH_SIZE = 32
DL_CNN_VERSION = 3
DL_TRANSFORMER_VISION_VERSION = 11
DL_BLS_VERSION = 1

DO_DL = False
CV_DL = True
OPT_DL = False

DO_CNN = False
DO_TRANSFORMER_VISION = False
DO_BLS = False
DO_ML = True

# Percentage of test set out of the dataset.
TEST_SET = 0.2

# Percentage of validation set out of the training dataset.
VAL_SET = 0.2

# Folder path associated with machine learning models
ml_models_folder = "/kaggle/working/Machine_Learning_models/"
ml_models_results_folder = "/kaggle/working/Machine_Learning_models_results/"

In [6]:
import matplotlib.pyplot as plt
import seaborn as sns

def plot_confusion_matrices(results):
    """
    Function to plot confusion matrices one by one.
    
    Parameters:
    results (dict): Dictionary containing the results of the machine learning models
    """
    for name, result in results.items():
        # Create a new figure for each model
        plt.figure(figsize=(5, 5))
        
        # Generate a confusion matrix heatmap
        confusion_matrix = np.array(result['Confusion Matrix'])
        sns.heatmap(confusion_matrix, annot=True, fmt='d', cmap="Blues")
        
        # Set the plot labels
        plt.title(f'{name}')
        plt.xlabel('Predicted Label')
        plt.ylabel('True Label')
        
        # Display the plot
        plt.show()



In [7]:
def preprocess_with_pca(X, variance_threshold=0.95):
    """
    Preprocesses the data using PCA, selecting the number of components
    such that the specified variance threshold is retained.

    :param X: The already standardized input data.
    :param variance_threshold: The threshold for explained variance.
    :return: Data transformed by PCA and the PCA model.
    """
    pca_temp = PCA()
    pca_temp.fit(X)
    cumulative_variance_ratio = np.cumsum(pca_temp.explained_variance_ratio_)
    n_components = np.argmax(cumulative_variance_ratio >= variance_threshold) + 1
#
    pca = PCA(n_components)
    X_pca = pca.fit_transform(X)

    print(f"PCA with {n_components} components retaining {variance_threshold * 100}% of variance")
    
    return X_pca, pca


In [8]:

def load_extract_data(data_file):
    # Load the data
    print("Load the data")
    data = load_data(data_file)
    print(f"Data shape : {data.shape}")

    # Extract the feature and target data
    print("Extract the feature and target data")
    X, y = extract_data(data)
    print(f"X shaped: {X.shape}")
    print(f"y shaped: {y.shape}")

    return X, y

def extract_data_from_csv(filename="generated_data.csv"):
    # Read the CSV file into a DataFrame
    df_extracted = pd.read_csv(filename)
    
    # Split the DataFrame into features and labels
    X_extracted = df_extracted.drop(columns=["labels"]).values
    y_extracted = df_extracted["labels"].values
    
    return X_extracted, y_extracted

In [9]:
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix

def compute_tp_tn_fp_fn_percentage(y_test, y_pred, class_label):
    """
    Function to compute the True Positives (TP), True Negatives (TN),
    False Positives (FP), and False Negatives (FN) for a specific class as percentages.
    """
    cm = confusion_matrix(y_test, y_pred)
    total_samples = np.sum(cm)

    tp = cm[class_label, class_label] / total_samples
    fp = (sum(cm[:, class_label]) - cm[class_label, class_label]) / total_samples
    fn = (sum(cm[class_label, :]) - cm[class_label, class_label]) / total_samples
    tn = (total_samples - (tp + fp + fn)) / total_samples

    return tp, tn, fp, fn

# Replace compute_tp_tn_fp_fn with compute_tp_tn_fp_fn_percentage in the function train_and_evaluate_ml_models
def train_and_evaluate_ml_models(models, X, y, apply_pca=False, pca_variance_threshold=0.95, apply_filters_bg_subtraction=False, cv=5, standardize_data_func=None):
    results = {}

    # Create a StratifiedKFold object
    skf = StratifiedKFold(n_splits=cv, shuffle=True, random_state=42)
    
    for name, model in models.items():
        fold_metrics = {
            'accuracy': [],
            'precision': [],
            'recall': [],
            'f1_score': [],
            'confusion_matrix': [],
            'tp_tn_fp_fn': [],
            'tp': [],
            'tn': [],
            'fp': [],
            'fn': []
        }

        # Perform stratified cross-validation
        for train_index, test_index in skf.split(X, y):
            X_train, X_test = X[train_index], X[test_index]
            y_train, y_test = y[train_index], y[test_index]
            
            # Apply filters and background subtraction if enabled
            if apply_filters_bg_subtraction:
                print("Apply filters and background subtraction")
                X_train = apply_filters_and_background_subtraction(X_train)
                X_test = apply_filters_and_background_subtraction(X_test)
            
           # Standardize data if a standardization function is provided
            if standardize_data_func is not None:
                print("Standardize the data")
                X_train, X_test = standardize_data_func(X_train, X_test)

            # Apply PCA if enabled
            if apply_pca:
                print("Apply PCA")
                pca = PCA(n_components=pca_variance_threshold)
                X_train = pca.fit_transform(X_train)
                X_test = pca.transform(X_test)
            
            # Train the model
            print(f"Training {name} model...")
            model.fit(X_train, y_train)

            # Evaluate the model
            print(f"Evaluating {name} model...")
            y_pred = model.predict(X_test)
            accuracy = accuracy_score(y_test, y_pred)
            precision = precision_score(y_test, y_pred, average='macro')
            recall = recall_score(y_test, y_pred, average='macro')
            f1 = f1_score(y_test, y_pred, average='macro')
            confusion = confusion_matrix(y_test, y_pred)

            # Compute TP, TN, FP, FN for each class
            tp_tn_fp_fn = {class_label: compute_tp_tn_fp_fn_percentage(y_test, y_pred, class_label) for class_label in range(len(np.unique(y)))}

            # Append the tp, tn, fp, fn values to the corresponding lists
            for class_label, (tp, tn, fp, fn) in tp_tn_fp_fn.items():
                fold_metrics['tp'].append(tp)
                fold_metrics['tn'].append(tn)
                fold_metrics['fp'].append(fp)
                fold_metrics['fn'].append(fn)

            fold_metrics['accuracy'].append(accuracy)
            fold_metrics['precision'].append(precision)
            fold_metrics['recall'].append(recall)
            fold_metrics['f1_score'].append(f1)
            fold_metrics['confusion_matrix'].append(confusion)
            fold_metrics['tp_tn_fp_fn'].append(tp_tn_fp_fn)

        # Calculate the mean and standard deviation of the metrics from the CV folds
        results[name] = {
            'CV': cv,
            'Accuracy': np.mean(fold_metrics['accuracy']),
            'Precision': np.mean(fold_metrics['precision']),
            'Recall': np.mean(fold_metrics['recall']),
            'F1 Score': np.mean(fold_metrics['f1_score']),
            'Confusion Matrix': np.mean(fold_metrics['confusion_matrix'], axis=0).tolist(),  # average confusion matrix across folds
            'TP_TN_FP_FN': fold_metrics['tp_tn_fp_fn'],  # TP, TN, FP, FN for each fold
            'Avg TP': np.mean(fold_metrics['tp']),
            'Avg TN': np.mean(fold_metrics['tn']),
            'Avg FP': np.mean(fold_metrics['fp']),
            'Avg FN': np.mean(fold_metrics['fn']),
            'Std Accuracy': np.std(fold_metrics['accuracy']),
            'Std Precision': np.std(fold_metrics['precision']),
            'Std Recall': np.std(fold_metrics['recall']),
            'Std F1 Score': np.std(fold_metrics['f1_score'])
        }

        # Print the results for the current model
        print(f"\n{name} Model Results:")
        print(f"CV: {results[name]['CV']}")
        print(f"Accuracy: {results[name]['Accuracy']}")
        print(f"Precision: {results[name]['Precision']}")
        print(f"Recall: {results[name]['Recall']}")
        print(f"F1 Score: {results[name]['F1 Score']}")
        print(f"STD Accuracy: {results[name]['Std Accuracy']}")
        print(f"STD Precision: {results[name]['Std Precision']}")
        print(f"STD Recall: {results[name]['Std Recall']}")
        print(f"STD F1 Score: {results[name]['Std F1 Score']}")
        print(f"Confusion Matrix: \n{np.array(results[name]['Confusion Matrix'])}\n")
        print(f"TP_TN_FP_FN: \n{results[name]['TP_TN_FP_FN']}\n")
        print(f"Avg TP: {results[name]['Avg TP']}")
        print(f"Avg TN: {results[name]['Avg TN']}")
        print(f"Avg FP: {results[name]['Avg FP']}")
        print(f"Avg FN: {results[name]['Avg FN']}\n")

    return results

In [10]:
# With bksb, slope and roll set to true
data_file= "/kaggle/input/chemcancer-v2/Data/HC05_HC07.csv"

In [32]:
# Apply median filter and background subtraction to the data
def apply_filters_and_background_subtraction(X):
    # Reshape the input data to a 2D array if necessary
    if len(X.shape) == 1:
        X = X.reshape((1, -1))
    
    # Apply median filter
    datamedfilt = scipy.ndimage.median_filter(X, size=(1, 5))
    
    # Apply airPLS for background subtraction
    baseline = np.zeros_like(datamedfilt)
    cols = baseline.shape[1]
    for col in range(cols):
        baseline[:, col] = airPLS(datamedfilt[:, col], lambda_=150)
    
    data_bksb = datamedfilt - baseline
    return datamedfilt

In [33]:
# Load the data
print("Load the data")
data = load_data(data_file)
print(f"Data shape : {data.shape}")

# Extract the feature and target data
print("Extract the feature and target data")
X, y = extract_data(data)
print(f"X shaped: {X.shape}")
print(f"y shaped: {y.shape}")

Load the data
Data shape : (2373, 274)
Extract the feature and target data
X shaped: (2373, 270)
y shaped: (2373,)


## (PCA) Not Filtered - Original was used

In [None]:
if DO_ML:
    print("Building machine learning models...")
    ml_models = build_ml_models()
    results = train_and_evaluate_ml_models(
        models=ml_models,
        X=X, 
        y=y, 
        apply_pca=True, 
        pca_variance_threshold=0.95, 
        apply_filters_bg_subtraction=False,
        cv=5, 
        standardize_data_func=standardize_data
    )


## (No PCA) Not filtered - Original data was used

In [None]:
if DO_ML:
    print("Building machine learning models...")
    ml_models = build_ml_models()
    results = train_and_evaluate_ml_models(
    models=ml_models,
    X=X, 
    y=y, 
    apply_pca=False, 
    pca_variance_threshold=0.95, 
    apply_filters_bg_subtraction=False,
    cv=5, 
    standardize_data_func=standardize_data
)

## (PCA) Filtered - Original Dataset was used

In [None]:
  if DO_ML:
    print("Building machine learning models...")
    ml_models = build_ml_models()
    results = train_and_evaluate_ml_models(
    models=ml_models,
    X=X, 
    y=y, 
    apply_pca=True, 
    pca_variance_threshold=0.95, 
    apply_filters_bg_subtraction=True,
    cv=5, 
    standardize_data_func=standardize_data
)

 ## (No PCA) Filtered - Original Dataset was used

In [35]:
  if DO_ML:
    print("Building machine learning models...")
    ml_models = build_ml_models()
    results = train_and_evaluate_ml_models(
    models=ml_models,
    X=X, 
    y=y, 
    apply_pca=False, 
    pca_variance_threshold=0.95, 
    apply_filters_bg_subtraction=True,
    cv=5, 
    standardize_data_func=standardize_data
)

Building machine learning models...
Apply filters and background subtraction
Standardize the data
Training SVM model...
Evaluating SVM model...
Apply filters and background subtraction
Standardize the data
Training SVM model...
Evaluating SVM model...
Apply filters and background subtraction
Standardize the data
Training SVM model...
Evaluating SVM model...
Apply filters and background subtraction
Standardize the data
Training SVM model...
Evaluating SVM model...
Apply filters and background subtraction
Standardize the data
Training SVM model...
Evaluating SVM model...

SVM Model Results:
CV: 5
Accuracy: 0.5697392849211637
Precision: 0.5845534317170837
Recall: 0.5544415176093183
F1 Score: 0.5325799566014701
STD Accuracy: 0.004998797108884473
STD Precision: 0.007861299951589996
STD Recall: 0.004794989355675116
STD F1 Score: 0.007421322962028428
Confusion Matrix: 
[[145.2  30.    7.8]
 [ 39.6  89.   13. ]
 [ 70.2  43.6  36.2]]

TP_TN_FP_FN: 
[{0: (0.3178947368421053, 0.9987102493074793, 

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Logistic Regression model...
Apply filters and background subtraction
Standardize the data
Training Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Logistic Regression model...
Apply filters and background subtraction
Standardize the data
Training Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Logistic Regression model...
Apply filters and background subtraction
Standardize the data
Training Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Logistic Regression model...
Apply filters and background subtraction
Standardize the data
Training Logistic Regression model...


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


Evaluating Logistic Regression model...

Logistic Regression Model Results:
CV: 5
Accuracy: 0.6818307794803464
Precision: 0.6716952185836872
Recall: 0.6686716065879444
F1 Score: 0.667241770187357
STD Accuracy: 0.027150256208366096
STD Precision: 0.02741469771764335
STD Recall: 0.026747825567832114
STD F1 Score: 0.026505670346666253
Confusion Matrix: 
[[154.2  12.8  16. ]
 [ 24.8  86.   30.8]
 [ 26.2  40.4  83.4]]

TP_TN_FP_FN: 
[{0: (0.32842105263157895, 0.9989229916897506, 0.12631578947368421, 0.056842105263157895), 1: (0.15789473684210525, 0.9992066481994459, 0.07789473684210527, 0.14105263157894737), 2: (0.18526315789473685, 0.9990736842105263, 0.12421052631578948, 0.13052631578947368)}, {0: (0.33473684210526317, 0.9989983379501386, 0.09052631578947369, 0.05052631578947368), 1: (0.19157894736842104, 0.9991180055401662, 0.12, 0.10736842105263159), 2: (0.17473684210526316, 0.9991490304709141, 0.08842105263157894, 0.14105263157894737)}, {0: (0.3389473684210526, 0.9989540166204985, 0.11