# Cohort 240 RUO 11 biomarkers AUCs analysis

### Imports and environment setup

- Date of run: 2025-01-23
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib

In [13]:
# Include in the environment the code directory with the utils function
import sys
sys.path.append('../code/')

In [14]:
# Library imports
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# Utils imports
import cohort_analysis_utils as utils

In [15]:
# Silence library warnings so that cell outputs stay readable
import warnings
warnings.filterwarnings(action='ignore')

# Lift the row limit so DataFrames are displayed in full
pd.options.display.max_rows = None


# Data loading and preprocessing

The original Excel file (available [here](<https://mimarkdx.sharepoint.com/sites/Scientific/Documentos compartidos/General/PHASE 6 - SOFTWARE DEVELOPMENT/DATA/../../../../../../:x:/s/Scientific/Eaw9d-fa2BREg_iZB1SL02YBG4mfVaJtoylG46bROmXVJA?e=8chcN7>)) was exported to a CSV file in the data folder of this repository, using TABs as the field separator.

In [26]:
# Load the TAB-separated cohort file; the first row holds the column names
# and the first column is used as the row index.
df_240 = pd.read_csv(
    '../data/ruo_240_11bmk.csv',
    sep='\t',
    header=0,
    index_col=0,
)

In [27]:
# Harmonize the column names with the project helper.
# NOTE(review): the exact renaming rules live in cohort_analysis_utils and are
# not visible from this notebook — check there before relying on a column name.
df_240 = utils.normalize_column_names(df_240)

In [29]:
# Columns that must be handled as numbers: the collected volume, the total
# protein and every biomarker measurement.
cols_240_to_num = [
    'collected_volume', 'total_protein',
    'HSPB1', 'MDK', 'MMP9', 'OSTP', 'PERM',
    'AGRIN', 'NGAL', 'MVP', 'FABP5',
    'PIGR', 'BCAM', 'MIF',
]
df_240 = utils.cols_as_numbers(df_240, cols_240_to_num)


In [30]:
# Encode the pathology labels as categories: benign -> 0, endometrial
# adenocarcinoma -> 1 (the label texts must match the source data exactly).
pathology_codes = {'Benigna': 0, 'Adenocarcinoma de endometrio': 1}
df_240 = utils.cols_as_category(df_240, {'Pathology': pathology_codes})

In [31]:
# Select the columns with "final_result" in their names
final_result_columns = [col for col in df_240.columns if "final_result" in col]

# Convert the selected columns to binary flags: 1 when the cell text contains
# "yes" (case-insensitive), 0 otherwise (missing values included).
# Column-wise string operations are used instead of DataFrame.applymap, which
# is deprecated since pandas 2.1; the deprecation warning would otherwise be
# hidden by the global warnings filter set earlier in this notebook.
df_240[final_result_columns] = df_240[final_result_columns].apply(
    lambda flags: flags.astype(str)
                       .str.lower()
                       .str.contains("yes", regex=False, na=False)
                       .astype("int64")
)


In [32]:
# Blank out every biomarker measurement whose "final_result" flag is 0
for final_col in final_result_columns:
    # Derive the measurement column name (assuming format "xxx_final_result")
    base_col = final_col.replace("_final_result", "")

    # Skip flags without a distinct measurement column. The equality check
    # protects flag columns whose name contains "final_result" but not
    # "_final_result": for those the replace is a no-op and the flag column
    # itself would otherwise be overwritten.
    if base_col == final_col or base_col not in df_240.columns:
        continue

    # Set the measurement to missing wherever the final result is 0
    df_240.loc[df_240[final_col] == 0, base_col] = None


In [33]:
# Preview the first five rows of the preprocessed cohort
df_240.head(n=5)

Unnamed: 0_level_0,Pathology,grado_histologico,tipo_histologico,tipo_grado,figo,clasificacion_tcga,menopausia,grosor_endometrial_eco_mm,fecha_nacimiento,fecha_recogida,...,MVP,MVP_final_result,FABP5,FABP5_final_result,PIGR,PIGR_final_result,BCAM,BCAM_final_result,MIF,MIF_final_result
paciente,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
40,0,0,0,Benign,0,0,No,,23/12/1977,22/09/2011,...,109.204346,1,553.500577,1,3632.017564,1,5.345615,1,6.471135,1
46,0,0,0,Benign,0,0,No,,18/09/1985,09/11/2011,...,93.488111,1,918.957451,1,17855.68497,1,16.808486,1,4.656249,1
53,0,0,0,Benign,0,0,No,,12/03/1971,23/11/2011,...,159.747441,1,897.813783,1,9854.623736,1,4.789904,1,5.645072,1
54,0,0,0,Benign,0,0,Si,,03/08/1955,01/12/2011,...,111.921434,1,246.403476,1,2256.654812,1,2.715494,1,3.967058,1
148,0,0,0,Benign,0,0,Si,,06/10/1959,08/10/2012,...,,0,12.57721,1,4467.187709,1,10.171978,1,0.067939,1


# Execution parameters

In [50]:
# Settings forwarded to the model computation
PLOT_ROCS = True
MAX_BIOMARKER_COUNT = 1
RESULTS_PATH = '../data/results/240_11bmk'

# Columns to be considered as biomarkers
BIOMARKERS_240 = [
    'HSPB1', 'MDK', 'MMP9', 'OSTP', 'PERM',
    'AGRIN', 'NGAL', 'MVP', 'FABP5',
    'PIGR', 'BCAM', 'MIF',
]

# Column to be used for normalizing the biomarkers
NORMALIZING_COL_240 = 'total_protein'
# Column to be used as volume for scatters and undoing the dilution
VOLUME_COL = 'collected_volume'

In [51]:
# Method names forwarded as `methods` to the model computation
METHODS = [
    'direct',
    'normalized',
]

# Computing the models

All the functions to generate the models are included in the [cohort_analysis_utils.py](../code/cohort_analysis_utils.py) file.

In [52]:
# Compute the models for the cohort with the project helper.
# NOTE(review): the result is named `models_120` although the data is the
# 240 cohort; the name is kept because later cells may reference it.
# The meaning of each option is defined in cohort_analysis_utils.
models_120 = utils.compute_all_models_and_save(
    # Input data and the columns to use
    df=df_240,
    biomarkers=BIOMARKERS_240,
    normalizing_col=NORMALIZING_COL_240,
    volume_col=VOLUME_COL,
    # Modelling options
    volume_added=0.5,
    apply_log=True,
    avoid_same_biomarker=True,
    methods=METHODS,
    max_biomarker_count=MAX_BIOMARKER_COUNT,
    auc_threshold=0.3,
    # Output options
    folder_name=RESULTS_PATH,
    plot_rocs=PLOT_ROCS,
)