# Cohort 60x60 Neural Networks AUCs analysis

### Imports and environment setup

- Date of run: 2024-12-30
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib, tensorflow, keras

In [1]:
# Include in the environment the code directory with the utils function
import sys
sys.path.append('../code/')

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import itertools

# ML imports
from sklearn.model_selection import train_test_split, KFold, StratifiedKFold
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import roc_auc_score

import tensorflow as tf
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, BatchNormalization
from tensorflow.keras.regularizers import l2
from tensorflow.keras.callbacks import EarlyStopping, ReduceLROnPlateau

# Utils imports
import cohort_analysis_utils as utils

2025-01-24 08:53:16.032404: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-24 08:53:16.096556: I external/local_xla/xla/tsl/cuda/cudart_stub.cc:32] Could not find cuda drivers on your machine, GPU will not be used.
2025-01-24 08:53:16.146549: E external/local_xla/xla/stream_executor/cuda/cuda_fft.cc:477] Unable to register cuFFT factory: Attempting to register factory for plugin cuFFT when one has already been registered
E0000 00:00:1737708796.196450   12467 cuda_dnn.cc:8310] Unable to register cuDNN factory: Attempting to register factory for plugin cuDNN when one has already been registered
E0000 00:00:1737708796.212776   12467 cuda_blas.cc:1418] Unable to register cuBLAS factory: Attempting to register factory for plugin cuBLAS when one has already been registered
2025-01-24 08:53:16.325105: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU ins

In [3]:
# Remove warnings for readability
import warnings
warnings.filterwarnings('ignore')

# Remove cell printing limits
pd.set_option('display.max_rows', None)


# Data loading and preprosessing

In [4]:
# Load the uploaded dataset
file_path_mmk = '../data/60x60_dr_mmk_20241209.csv'
data_mmk = pd.read_csv(file_path_mmk, delimiter='\t', index_col=0)

# Rename column with trailing space
data_mmk.rename(columns={'KPYM ng/mL ': 'KPYM ng/mL'}, inplace=True)

In [5]:
df_pipelle = pd.read_csv('../data/pipelle_results_20250121.csv' , sep='\t', index_col=0, header=0)

In [6]:
# in df_120, merge the columns from df_pipelle named 'Result', using the index to match the rows, 
# make the "Material Insuficiente o ausencia de diagnóstico" to have a value of 2
# make the NaN values to have a value of 1
# make the "Cáncer" to have a value of 8
# and all the rest to have a value of 4
#remove duplicate indexes in df_pipelle
df_pipelle = df_pipelle[~df_pipelle.index.duplicated(keep='first')]
data_mmk['Result'] = data_mmk.index.map(df_pipelle['Result'])
data_mmk['Result'] = data_mmk['Result'].replace({"Material Insuficiente o ausencia de diagnóstico": 1, "Cáncer": 2})
data_mmk['Result'] = data_mmk['Result'].fillna(1)
# Now all the remaining string values are replaced by 4
data_mmk['Result'] = data_mmk['Result'].replace({value: 0 for value in data_mmk['Result'].unique() if type(value) == str})



In [7]:
data_mmk.columns

Index(['Collection center', 'Age', 'Pathology', 'Hystology grade',
       'Hystology type', 'FIGO stage 2009', 'TCGA',
       'Time between collection and processing (h)', 'Group time',
       'Collected volume (mL)', 'Sample visual description', 'Hemolysis', 'pH',
       'Collected at', 'MMP9 ng/mL', 'HSPB1 ng/mL', 'PERM ng/mL',
       'ADIPOQ ng/mL', 'TIMP-2 ng/mL', 'AGRIN ng/mL', 'KPYM ng/mL',
       'Total protein BCA mg/mL', 'Result'],
      dtype='object')

In [8]:

# Rename column with trailing space
data_mmk.rename(columns={'KPYM ng/mL ': 'KPYM ng/mL'}, inplace=True)

# Define features and target
features = ['MMP9 ng/mL', 'HSPB1 ng/mL', 'PERM ng/mL', 'ADIPOQ ng/mL', 'TIMP-2 ng/mL', 'AGRIN ng/mL', 'KPYM ng/mL', "Result"]
target = 'Pathology'

# Preprocess data
data_mmk[features] = data_mmk[features].apply(pd.to_numeric, errors='coerce')

data_clean = data_mmk[features + [target]].dropna()

# Encode the target variable
label_encoder = LabelEncoder()
data_clean[target] = label_encoder.fit_transform(data_clean[target])

# Split into features and target
X = data_clean[features]
y = data_clean[target]

# Normalize the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

# Divide train and test data

In [9]:
# Split into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.1, random_state=42, stratify=y)

# Build Model func

In [10]:
def build_model(input_dim, nodes=[], dropout_rate=0.5):
    # Define the layers of the neural network based on the input parameters
    layers= [Dense(nodes[0], input_dim=input_dim, activation='relu')]
    layers.append(Dropout(dropout_rate))
    for node in nodes[1:]:
        layers.append(Dense(node, activation='relu'))
        layers.append(Dropout(dropout_rate))
    layers.append(Dense(1, activation='sigmoid'))

    # Build the neural network model
    model = Sequential(layers)

    # Compile the model
    model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['AUC'])

    return model

# Fit model func

In [11]:
def fit_model(model, X_train, y_train, X_test, y_test, epochs=100, batch_size=32, verbose=0):
    # Define early stopping callback
    early_stopping = EarlyStopping(monitor='val_loss', patience=10, restore_best_weights=True)

    # Train the model
    history = model.fit(
                        X_train, 
                        y_train, 
                        validation_data=(X_test, y_test), 
                        epochs=epochs, 
                        batch_size=batch_size, 
                        callbacks=[early_stopping],
                        verbose=verbose
                        )

    return history

# Training an overfitted model over the BMKs readout

In [12]:

model = build_model(input_dim=X_train.shape[1], nodes=[128, 64, 32], dropout_rate=0.0)

# replace the code above with the train_model function
history = fit_model(model, X_train, y_train, X_test, y_test, epochs=400, batch_size=32)


# Predict probabilities for the test set
y_pred_proba_nn = model.predict(X_test).flatten()

# Calculate the AUC
auc_nn = roc_auc_score(y_test, y_pred_proba_nn)

print(f'Neural Network AUC: {auc_nn:.4f}')


2025-01-24 08:53:18.736869: E external/local_xla/xla/stream_executor/cuda/cuda_driver.cc:152] failed call to cuInit: INTERNAL: CUDA error: Failed call to cuInit: UNKNOWN ERROR (303)
2025-01-24 08:53:18.736900: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:137] retrieving CUDA diagnostic information for host: lradusky
2025-01-24 08:53:18.736906: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:144] hostname: lradusky
2025-01-24 08:53:18.736988: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:168] libcuda reported version is: NOT_FOUND: was unable to find libcuda.so DSO loaded into this program
2025-01-24 08:53:18.737011: I external/local_xla/xla/stream_executor/cuda/cuda_diagnostics.cc:172] kernel reported version is: 470.223.2


[1m1/1[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 52ms/step
Neural Network AUC: 1.0000


# Create Biomarkers Ratios

In [13]:
# Define the biomarkers to use for ratios
biomarkers = ['MMP9 ng/mL', 'AGRIN ng/mL', 'TIMP-2 ng/mL', 'PERM ng/mL', 'KPYM ng/mL', 'ADIPOQ ng/mL', 'HSPB1 ng/mL']

# Create all possible ratios
ratios = []
for biomarker1, biomarker2 in itertools.combinations(biomarkers, 2):
    ratio_name = f"{biomarker1}_to_{biomarker2}"
    data_clean[ratio_name] = np.where(data_clean[biomarker2] != 0, 
                                      data_clean[biomarker1] / data_clean[biomarker2], 
                                      np.nan)
    ratios.append(ratio_name)

# Drop rows with NaN values resulting from invalid ratios
data_ratios = data_clean.dropna()

# Prepare features and target
X_ratios = data_ratios[ratios + ["Result"]]
y_ratios = data_ratios[target]

# Normalize the features
X_scaled_ratios = scaler.fit_transform(X_ratios)

# Split into training and testing sets
X_train_ratios, X_test_ratios, y_train_ratios, y_test_ratios = train_test_split(
    X_scaled_ratios, y_ratios, test_size=0.1, random_state=42, stratify=y_ratios
)

# Model Cross-validation

In [14]:
def cross_validate_model(X, y, nodes, dropout_rate=0.5, epochs=100, batch_size=16, n_splits=10):
    # Convert `y` to a NumPy array for compatibility with KFold
    y_array = y.to_numpy()

    # Initialize k-fold cross-validator
    kfold = StratifiedKFold(n_splits=n_splits, shuffle=True, random_state=42)
    fold_auc_scores = []

    # Perform k-fold cross-validation
    for train_idx, val_idx in kfold.split(X, y_array):
        # Split the data into training and validation sets
        X_train_fold, X_val_fold = X[train_idx], X[val_idx]
        y_train_fold, y_val_fold = y_array[train_idx], y_array[val_idx]

        # Build the neural network model
        model = build_model(input_dim=X_train_fold.shape[1], nodes=nodes, dropout_rate=dropout_rate)

        # Train the model
        history = fit_model(model, X_train_fold, y_train_fold, X_val_fold, y_val_fold, epochs=epochs, batch_size=batch_size, verbose=0)

        # Predict probabilities for the validation fold
        y_val_pred = model.predict(X_val_fold).flatten()

        # Calculate AUC for the current fold
        fold_auc = roc_auc_score(y_val_fold, y_val_pred)
        fold_auc_scores.append(fold_auc)
        print(f"AUC for Fold: {fold_auc}")

    # Compute the average AUC across all folds
    average_auc = np.mean(fold_auc_scores)
    print(f"Average AUC across all folds: {average_auc}")

    return average_auc


# Doing the same just with MMP9, AGRIN and TIMP-2

In [15]:
# Define the biomarkers to use for ratios
biomarkers = ['AGRIN ng/mL', 'TIMP-2 ng/mL']

# Create all possible ratios
ratios = []
for biomarker1, biomarker2 in itertools.combinations(biomarkers, 2):
    ratio_name = f"{biomarker1}_to_{biomarker2}"
    data_clean[ratio_name] = np.where(data_clean[biomarker2] != 0, 
                                      data_clean[biomarker1] / data_clean[biomarker2], 
                                      np.nan)
    ratios.append(ratio_name)

# Drop rows with NaN values resulting from invalid ratios
data_ratios = data_clean.dropna()

# Prepare features and target
X_ratios = data_ratios[ratios +["Result"]]
y_ratios = data_ratios[target]

# Normalize the features
X_scaled_ratios = scaler.fit_transform(X_ratios)

In [16]:
df_2mL = pd.read_csv('../data/2mL.csv' , sep='\t', index_col=0, header=0)
df_2mL = utils.normalize_column_names(df_2mL)
# Ensure numeric columns are treated as such
cols_2mL_to_num = ['TIMP-2', 'ADIPOQ', 'MMP9', 
                    'KPYM', 'AGRIN', 'PERM', 'HSPB1',
                    'Total_protein_BCA']
df_2mL = utils.cols_as_numbers(df_2mL, cols_2mL_to_num)
# Ensure categorical columns are treated as such
df_2mL = utils.cols_as_category(df_2mL, {'Pathology':{
                                            'Benigna': 0, 
                                            'Adenocarcinoma de endometrio': 1,
                                            'Otros': np.nan,
                                            'Hiperplasia atípica endometrial': np.nan,
                                        }})

# Columns to be considered as biomarkers
BIOMARKERS_2mL = ['TIMP-2', 'ADIPOQ', 'MMP9', 'KPYM', 'AGRIN', 'PERM', 'HSPB1']
# Create new columns with the ratios between the biomarkers
for biomarker1 in BIOMARKERS_2mL:
    for biomarker2 in BIOMARKERS_2mL:
        if biomarker1 != biomarker2:
            df_2mL[f'{biomarker1}_{biomarker2}'] = df_2mL[biomarker1].div(df_2mL[biomarker2], axis=0)
            # Make infinite values NaN
            df_2mL[f'{biomarker1}_{biomarker2}'] = df_2mL[f'{biomarker1}_{biomarker2}'].replace([np.inf, -np.inf], np.nan)

RATIOS_2mL = [f'{biomarker1}_{biomarker2}' for biomarker1 in BIOMARKERS_2mL for biomarker2 in BIOMARKERS_2mL if biomarker1 != biomarker2]

In [17]:
df_2mL['Result'] = df_2mL.index.map(df_pipelle['Result'])
df_2mL['Result'] = df_2mL['Result'].replace({"Material Insuficiente o ausencia de diagnóstico": 1, "Cáncer": 2})
df_2mL['Result'] = df_2mL['Result'].fillna(1)
# Now all the remaining string values are replaced by 4
df_2mL['Result'] = df_2mL['Result'].replace({value: 0 for value in df_2mL['Result'].unique() if type(value) == str})



In [18]:
data_clean.columns

Index(['MMP9 ng/mL', 'HSPB1 ng/mL', 'PERM ng/mL', 'ADIPOQ ng/mL',
       'TIMP-2 ng/mL', 'AGRIN ng/mL', 'KPYM ng/mL', 'Result', 'Pathology',
       'MMP9 ng/mL_to_AGRIN ng/mL', 'MMP9 ng/mL_to_TIMP-2 ng/mL',
       'MMP9 ng/mL_to_PERM ng/mL', 'MMP9 ng/mL_to_KPYM ng/mL',
       'MMP9 ng/mL_to_ADIPOQ ng/mL', 'MMP9 ng/mL_to_HSPB1 ng/mL',
       'AGRIN ng/mL_to_TIMP-2 ng/mL', 'AGRIN ng/mL_to_PERM ng/mL',
       'AGRIN ng/mL_to_KPYM ng/mL', 'AGRIN ng/mL_to_ADIPOQ ng/mL',
       'AGRIN ng/mL_to_HSPB1 ng/mL', 'TIMP-2 ng/mL_to_PERM ng/mL',
       'TIMP-2 ng/mL_to_KPYM ng/mL', 'TIMP-2 ng/mL_to_ADIPOQ ng/mL',
       'TIMP-2 ng/mL_to_HSPB1 ng/mL', 'PERM ng/mL_to_KPYM ng/mL',
       'PERM ng/mL_to_ADIPOQ ng/mL', 'PERM ng/mL_to_HSPB1 ng/mL',
       'KPYM ng/mL_to_ADIPOQ ng/mL', 'KPYM ng/mL_to_HSPB1 ng/mL',
       'ADIPOQ ng/mL_to_HSPB1 ng/mL'],
      dtype='object')

In [19]:
# replace " ng/mL" with "" in the column names
data_clean.columns = data_clean.columns.str.replace(" ng/mL", "")
data_clean.columns = data_clean.columns.str.replace("_to_", "_")


In [20]:
df_2mL.columns

Index(['TIMP-2', 'ADIPOQ', 'MMP9', 'KPYM', 'AGRIN', 'PERM', 'HSPB1',
       'Total_protein_BCA', 'Pathology', 'TIMP-2_ADIPOQ', 'TIMP-2_MMP9',
       'TIMP-2_KPYM', 'TIMP-2_AGRIN', 'TIMP-2_PERM', 'TIMP-2_HSPB1',
       'ADIPOQ_TIMP-2', 'ADIPOQ_MMP9', 'ADIPOQ_KPYM', 'ADIPOQ_AGRIN',
       'ADIPOQ_PERM', 'ADIPOQ_HSPB1', 'MMP9_TIMP-2', 'MMP9_ADIPOQ',
       'MMP9_KPYM', 'MMP9_AGRIN', 'MMP9_PERM', 'MMP9_HSPB1', 'KPYM_TIMP-2',
       'KPYM_ADIPOQ', 'KPYM_MMP9', 'KPYM_AGRIN', 'KPYM_PERM', 'KPYM_HSPB1',
       'AGRIN_TIMP-2', 'AGRIN_ADIPOQ', 'AGRIN_MMP9', 'AGRIN_KPYM',
       'AGRIN_PERM', 'AGRIN_HSPB1', 'PERM_TIMP-2', 'PERM_ADIPOQ', 'PERM_MMP9',
       'PERM_KPYM', 'PERM_AGRIN', 'PERM_HSPB1', 'HSPB1_TIMP-2', 'HSPB1_ADIPOQ',
       'HSPB1_MMP9', 'HSPB1_KPYM', 'HSPB1_AGRIN', 'HSPB1_PERM', 'Result'],
      dtype='object')

In [36]:
# Build a model using "AGRIN_TIMP-2" and "Result" as features
# Train it over df_mmk and evaluate it over df_2mL
features = ['AGRIN_TIMP-2', 'Result']
target = 'Pathology'

# Preprocess data
df_2mL[features] = df_2mL[features].apply(pd.to_numeric, errors='coerce')
data_clean[features] = data_clean[features].apply(pd.to_numeric, errors='coerce')

df_2mL_clean = df_2mL[features + [target]].dropna()
df_mmk_clean = data_clean[features + [target]].dropna()

# Encode the target variable
label_encoder = LabelEncoder()
df_2mL_clean[target] = label_encoder.fit_transform(df_2mL_clean[target])
df_mmk_clean[target] = label_encoder.fit_transform(df_mmk_clean[target])

# Split into features and target
X_train = df_mmk_clean[features]
y_train = df_mmk_clean[target]
X_test = df_2mL_clean[features]
y_test = df_2mL_clean[target]


# Normalize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)


# Build the neural network model using the build_model function
model = build_model(input_dim=X_train.shape[1], nodes=[64, 32], dropout_rate=0.1)

# Train the model with early stopping
history = fit_model(model, X_train, y_train, X_test, y_test, epochs=200, batch_size=16)

# Predict probabilities for the test set
y_pred_proba = model.predict(X_test).flatten()

# Calculate the AUC
auc = roc_auc_score(y_test, y_pred_proba)

print(f"AUC with AGRIN_TIMP-2: {auc}")




[1m2/2[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 35ms/step
AUC with AGRIN_TIMP-2: 0.9704433497536945


In [37]:
# make a df of the sensitivity, specificity, NPV, PPV, TN, FP, FN, TP, TNR, TPR, FNR, FPR
# for all the possible thresholds
import numpy as np
from sklearn.metrics import confusion_matrix


thresholds = np.linspace(0, 1, 100)

sensitivity = []
specificity = []
npv = []
ppv = []
tn = []
fp = []
fn = []
tp = []
tnr = []
tpr = []
fnr = []
fpr = []

for threshold in thresholds:
    y_pred_thresholded = y_pred_proba > threshold
    tn_, fp_, fn_, tp_ = confusion_matrix(y_test, y_pred_thresholded).ravel()
    tn.append(tn_)
    fp.append(fp_)
    fn.append(fn_)
    tp.append(tp_)
    sensitivity.append(tp_ / (tp_ + fn_))
    specificity.append(tn_ / (tn_ + fp_))
    npv.append(tn_ / (tn_ + fn_))
    ppv.append(tp_ / (tp_ + fp_))
    tnr.append(tn_ / (tn_ + fp_))
    tpr.append(tp_ / (tp_ + fn_))
    fnr.append(fn_ / (fn_ + tp_))
    fpr.append(fp_ / (fp_ + tn_))


df_metrics = pd.DataFrame({
    'Threshold': thresholds,
    'Sensitivity': sensitivity,
    'Specificity': specificity,
    'NPV': npv,
    'PPV': ppv,
    'TN': tn,
    'FP': fp,
    'FN': fn,
    'TP': tp,
    'TNR': tnr,
    'TPR': tpr,
    'FNR': fnr,
    'FPR': fpr
})

# Remove repeated sensitivity values taking the last
df_metrics = df_metrics.drop_duplicates(subset=['Specificity'], keep='last')

# Round to 3 decimal places
df_metrics = df_metrics.round(3)

df_metrics



Unnamed: 0,Threshold,Sensitivity,Specificity,NPV,PPV,TN,FP,FN,TP,TNR,TPR,FNR,FPR
3,0.03,1.0,0.0,,0.58,0,21,0,29,0.0,1.0,0.0,1.0
4,0.04,1.0,0.429,1.0,0.707,9,12,0,29,0.429,1.0,0.0,0.571
5,0.051,1.0,0.524,1.0,0.744,11,10,0,29,0.524,1.0,0.0,0.476
21,0.212,1.0,0.571,1.0,0.763,12,9,0,29,0.571,1.0,0.0,0.429
23,0.232,1.0,0.619,1.0,0.784,13,8,0,29,0.619,1.0,0.0,0.381
24,0.242,1.0,0.667,1.0,0.806,14,7,0,29,0.667,1.0,0.0,0.333
25,0.253,0.966,0.762,0.941,0.848,16,5,1,28,0.762,0.966,0.034,0.238
26,0.263,0.966,0.81,0.944,0.875,17,4,1,28,0.81,0.966,0.034,0.19
35,0.354,0.931,0.857,0.9,0.9,18,3,2,27,0.857,0.931,0.069,0.143
49,0.495,0.828,0.905,0.792,0.923,19,2,5,24,0.905,0.828,0.172,0.095


In [39]:
df_2mL[["Result","Pathology"]].value_counts()

Result  Pathology
2       1.0          16
1       1.0          14
0       0.0          13
1       0.0           9
Name: count, dtype: int64