# Cohort 60x60 Neural Networks AUCs analysis

### Imports and environment setup

- Date of run: 2024-12-30
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib, tensorflow, keras

In [1]:
# Include in the environment the code directory with the utils function
import sys
sys.path.append('../code/')

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# ML imports
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Utils imports
import cohort_analysis_utils as utils

2024-12-30 11:54:57.831999: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-30 11:54:57.834765: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2024-12-30 11:54:57.843277: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1735559697.858178  519496 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1735559697.862546  519496 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2024-12-30 11:54:57.876787: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [3]:
# Remove warnings for readability
import warnings
warnings.filterwarnings('ignore')

# Remove cell printing limits
pd.set_option('display.max_rows', None)


# Data loading and preprosessing

In [4]:
# Load the uploaded dataset
file_path_mmk = '../data/60x60_dr_ruo_20241209.csv'
data_mmk = pd.read_csv(file_path_mmk, delimiter='\t')

# Rename column with trailing space
data_mmk.rename(columns={'KPYM ng/mL ': 'KPYM ng/mL'}, inplace=True)

# Define features and target
features = ['MMP9 ng/mL', 'HSPB1 ng/mL', 'PERM ng/mL', 'ADIPOQ ng/mL', 'TIMP-2 ng/mL']
target = 'Pathology'

# Preprocess data
data_mmk[features] = data_mmk[features].apply(pd.to_numeric, errors='coerce')

data_clean = data_mmk[features + [target]].dropna()

# Encode the target variable
label_encoder = LabelEncoder()
data_clean[target] = label_encoder.fit_transform(data_clean[target])

# Split into features and target
X = data_clean[features]
y = data_clean[target]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Divide train and test data

In [5]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42, stratify=y)

# Build Model func

In [6]:
def build_model(input_dim, nodes=[], dropout_rate=0.5):
    # Define the layers of the neural network based on the input parameters
    layers= [Dense(nodes[0], input_dim=input_dim, activation='relu')]
    layers.append(Dropout(dropout_rate))
    for node in nodes[1:]:
        layers.append(Dense(node, activation='relu'))
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(1, activation='sigmoid'))

    # Build the neural network model
    model = Sequential(layers)

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

    return model

# Fit model func

In [7]:
def fit_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, verbose=0):
    # Define early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history = model.fit(
                        X_train, 
                        y_train, 
                        validation_data=(X_test, y_test), 
                        epochs=epochs, 
                        batch_size=batch_size, 
                        callbacks=[early_stopping],
                        verbose=verbose
                        )

    return history

# Training an overfitted model over the BMKs readout

In [8]:

model = build_model(input_dim=X_train.shape[1], nodes=[128, 64, 32], dropout_rate=0.0)

# replace the code above with the train_model function
history = fit_model(model, X_train, y_train, X_test, y_test, epochs=400, batch_size=32)


# Predict probabilities for the test set
y_pred_proba_nn = model.predict(X_test).flatten()

# Calculate the AUC
auc_nn = roc_auc_score(y_test, y_pred_proba_nn)

print(f'Neural Network AUC: {auc_nn:.4f}')


2024-12-30 11:54:59.765552: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2024-12-30 11:54:59.765577: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:137] retrieving CUDA diagnostic information for host: lradusky
2024-12-30 11:54:59.765582: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:144] hostname: lradusky
2024-12-30 11:54:59.765650: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:168] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2024-12-30 11:54:59.765671: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:172] kernel reported version is: 470.223.2


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 53ms/step
Neural Network AUC: 0.9500


# Create Biomarkers Ratios

In [10]:
# Define the biomarkers to use for ratios
biomarkers = ['MMP9 ng/mL', 'TIMP-2 ng/mL', 'PERM ng/mL', 'ADIPOQ ng/mL', 'HSPB1 ng/mL']

# Create all possible ratios
ratios = []
for biomarker1, biomarker2 in itertools.combinations(biomarkers, 2):
    ratio_name = f"{biomarker1}_to_{biomarker2}"
    data_clean[ratio_name] = np.where(data_clean[biomarker2] != 0, 
                                      data_clean[biomarker1] / data_clean[biomarker2], 
                                      np.nan)
    ratios.append(ratio_name)

# Drop rows with NaN values resulting from invalid ratios
data_ratios = data_clean.dropna()

# Prepare features and target
X_ratios = data_ratios[ratios]
y_ratios = data_ratios[target]

# Normalize the features
X_scaled_ratios = scaler.fit_transform(X_ratios)

# Split into training and testing sets
X_train_ratios, X_test_ratios, y_train_ratios, y_test_ratios = train_test_split(
    X_scaled_ratios, y_ratios, test_size=0.1, random_state=42, stratify=y_ratios
)

# Create a model for the ratios

In [11]:
# Build the neural network model using the build_model function
model_ratios = build_model(input_dim=X_train_ratios.shape[1], nodes=[64, 32], dropout_rate=0.1)


# Train the model with early stopping
history_ratios = fit_model(
                            model_ratios, 
                            X_train_ratios, y_train_ratios, 
                            X_test_ratios, y_test_ratios, 
                            epochs=400, batch_size=32
                            )

# Predict probabilities for the test set
y_pred_proba_ratios = model_ratios.predict(X_test_ratios).flatten()

# Calculate the AUC
auc_ratios = roc_auc_score(y_test_ratios, y_pred_proba_ratios)

print(f"AUC with Ratios: {auc_ratios}")


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
AUC with Ratios: 0.3


# Model Cross-validation

In [12]:
def cross_validate_model(X, y, nodes, dropout_rate=0.5, epochs=100, batch_size=16, n_splits=10):
    # Convert `y` to a NumPy array for compatibility with KFold
    y_array = y.to_numpy()

    # Initialize k-fold cross-validator
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_auc_scores = []

    # Perform k-fold cross-validation
    for train_idx, val_idx in kfold.split(X, y_array):
        # Split the data into training and validation sets
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y_array[train_idx], y_array[val_idx]

        # Build the neural network model
        model = build_model(input_dim=X_train_fold.shape[1], nodes=nodes, dropout_rate=dropout_rate)

        # Train the model
        history = fit_model(model, X_train_fold, y_train_fold, X_val_fold, y_val_fold, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict probabilities for the validation fold
        y_val_pred = model.predict(X_val_fold).flatten()

        # Calculate AUC for the current fold
        fold_auc = roc_auc_score(y_val_fold, y_val_pred)
        fold_auc_scores.append(fold_auc)
        print(f"AUC for Fold: {fold_auc}")

    # Compute the average AUC across all folds
    average_auc = np.mean(fold_auc_scores)
    print(f"Average AUC across all folds: {average_auc}")

    return average_auc


## Direct readouts (not ratios)

In [13]:
average_auc = cross_validate_model(
                                    X_scaled, y, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.2, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=10
                                  )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.9500000000000001
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.55
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 54ms/step
AUC for Fold: 0.3
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 1.0
Average AUC across all folds: 0.8


## Ratios with the most-common parameters

In [14]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.2, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=10
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.9500000000000001
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
AUC for Fold: 0.55
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.85
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.9375
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.75
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 1.0
Average AUC across all folds: 0.84375


## Ratios with a bigger dropout (to ensure no overfitting)

In [15]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=10
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
AUC for Fold: 0.44999999999999996
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.7500000000000001
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.6
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.75
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
AUC for Fold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 1.0
Average AUC across all folds: 0.7849999999999999


## Ratios with more split (to observe if its more unstable)

In [16]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.2, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=20
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
AUC for Fold: 0.6666666666666666
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 44ms/step
AUC for Fold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.8333333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.25
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/st

## Ratios with less split (to see if its more stable)

In [17]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=5
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.691358024691358
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.7654320987654321
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.5972222222222222
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.9027777777777778
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.8055555555555556
Average AUC across all folds: 0.7524691358024691


just let's run again

In [18]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=5
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.8024691358024691
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.7530864197530864
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.5416666666666666
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.9583333333333333
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.8055555555555556
Average AUC across all folds: 0.7722222222222221


# Doing the same just with MMP9, PERM and TIMP-2

In [20]:
# Define the biomarkers to use for ratios
biomarkers = ['MMP9 ng/mL', 'PERM ng/mL', 'TIMP-2 ng/mL']

# Create all possible ratios
ratios = []
for biomarker1, biomarker2 in itertools.combinations(biomarkers, 2):
    ratio_name = f"{biomarker1}_to_{biomarker2}"
    data_clean[ratio_name] = np.where(data_clean[biomarker2] != 0, 
                                      data_clean[biomarker1] / data_clean[biomarker2], 
                                      np.nan)
    ratios.append(ratio_name)

# Drop rows with NaN values resulting from invalid ratios
data_ratios = data_clean.dropna()

# Prepare features and target
X_ratios = data_ratios[ratios]
y_ratios = data_ratios[target]

# Normalize the features
X_scaled_ratios = scaler.fit_transform(X_ratios)

In [21]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=5
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 49ms/step
AUC for Fold: 0.8148148148148149
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
AUC for Fold: 0.7654320987654322
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.6944444444444444
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 50ms/step
AUC for Fold: 0.9027777777777778
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 48ms/step
AUC for Fold: 0.5277777777777778
Average AUC across all folds: 0.7410493827160494


In [26]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=10
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.19999999999999998
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.55
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.6499999999999999
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.4
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 1.0
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.9375
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.5
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.375
Average AUC across all folds: 0.63125


# Checking the same does not happens with other biomarkers

In [23]:
# Define the biomarkers to use for ratios
biomarkers = ['ADIPOQ ng/mL', 'HSPB1 ng/mL',]

# Create all possible ratios
ratios = []
for biomarker1, biomarker2 in itertools.combinations(biomarkers, 2):
    ratio_name = f"{biomarker1}_to_{biomarker2}"
    data_clean[ratio_name] = np.where(data_clean[biomarker2] != 0, 
                                      data_clean[biomarker1] / data_clean[biomarker2], 
                                      np.nan)
    ratios.append(ratio_name)

# Drop rows with NaN values resulting from invalid ratios
data_ratios = data_clean.dropna()

# Prepare features and target
X_ratios = data_ratios[ratios]
y_ratios = data_ratios[target]

# Normalize the features
X_scaled_ratios = scaler.fit_transform(X_ratios)

In [24]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=5
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.5925925925925926
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
AUC for Fold: 0.4814814814814815
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.5138888888888888
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
AUC for Fold: 0.6388888888888888
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 51ms/step
AUC for Fold: 0.7361111111111112
Average AUC across all folds: 0.5925925925925926


In [25]:
average_auc = cross_validate_model(
                                    X_scaled_ratios, y_ratios, 
                                    nodes=[32, 16], 
                                    dropout_rate=0.5, 
                                    epochs=100, 
                                    batch_size=16, 
                                    n_splits=10
                                )

[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.44999999999999996
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.9
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.5499999999999999
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.8
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 45ms/step
AUC for Fold: 0.6499999999999999
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.8500000000000001
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.7
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.5625
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 47ms/step
AUC for Fold: 0.6875
[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 46ms/step
AUC for Fold: 0.875
Average AUC across all folds: 0