# Cohort 60x60 AUCs analysis

### Imports and environment setup

- Date of run: 2024-12-28
- Environment: python 3.12
- Packages required: pandas, numpy, scikit-learn (imported as `sklearn`), statsmodels, seaborn, matplotlib

In [1]:
# Include in the environment the code directory with the utils function
import sys
sys.path.append('../code/')

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# Utils imports
import cohort_analysis_utils as utils

In [3]:
# Silence every warning so the rendered notebook stays readable
import warnings
warnings.simplefilter('ignore')

# Lift the row limit so displayed tables are never truncated
pd.options.display.max_rows = None


# Data loading and preprocessing

The original Excel file was saved as a tab-separated text file (with a `.csv` extension) in the data folder of this repository.

In [4]:
# Load the cohort table: tab-separated file, first row as header,
# first column as the row index.
df_120 = pd.read_csv(
    '../data/60x60_enlarged_20250313.csv',
    sep='\t',
    header=0,
    index_col=0,
)


In [5]:
df_120['BMI'].isna().sum()

np.int64(22)

In [6]:
#df_120 = df_120.iloc[:120, :]

In [7]:
# Harmonization of column names through the project helper. The resulting
# names (listed by the `df_120.columns` cell that follows) are the ones the
# rest of the notebook refers to.
df_120 = utils.normalize_column_names(df_120)

In [8]:
df_120.columns

Index(['Collection_center', 'Age', 'BMI', 'Pathology', 'Hystology_grade',
       'Hystology_type', 'FIGO_stage_2009', 'TCGA',
       'Time_between_collection_and_processing_h', 'Group_time',
       'Collected_volume_mL', 'Sample_visual_description', 'Hemolysis', 'pH',
       'Collected_at', 'MMP9_RUO', 'HSPB1_RUO', 'PERM_RUO', 'MMP9_MMK',
       'HSPB1_MMK', 'AGRIN_MMK', 'KPYM_MMK', 'PERM_MMK', 'Total_protein_UCFP',
       'Total_protein_BCA', 'ADIPOQ_RUO', 'TIMP-2_RUO', 'Unnamed:_28'],
      dtype='object')

In [9]:
# Columns that must be handled as numbers; the project helper performs the
# conversion on df_120.
cols_120_to_num = [
    'MMP9_RUO', 'HSPB1_RUO', 'PERM_RUO',
    'MMP9_MMK', 'HSPB1_MMK', 'AGRIN_MMK', 'KPYM_MMK', 'PERM_MMK',
    'Total_protein_UCFP', 'Total_protein_BCA',
    'ADIPOQ_RUO', 'TIMP-2_RUO',
]
df_120 = utils.cols_as_numbers(df_120, cols_120_to_num)

In [10]:
# Columns to be considered as biomarkers. The order of this list also fixes
# the order in which the ratio columns are generated later on.
BIOMARKERS_120 = [
    'MMP9_MMK',
    'PERM_MMK',
    'TIMP-2_RUO',
    'AGRIN_MMK',
    'KPYM_MMK',
]



In [11]:
# Shorten each biomarker name to the text before its first underscore
# (e.g. 'MMP9_MMK' -> 'MMP9'), both as DataFrame column and in BIOMARKERS_120.
short_names = {name: name.split('_')[0] for name in BIOMARKERS_120}
df_120 = df_120.rename(columns=short_names)

BIOMARKERS_120 = [short_names[name] for name in BIOMARKERS_120]

### Create new columns for biomarker ratios

In [12]:
# Add one column per ordered pair of distinct biomarkers, named
# '<numerator>_<denominator>' and holding the element-wise ratio of the two.
for numerator in BIOMARKERS_120:
    for denominator in BIOMARKERS_120:
        if numerator == denominator:
            continue
        ratio = df_120[numerator] / df_120[denominator]
        # Infinite results (division by zero) are stored as missing values
        df_120[f'{numerator}_{denominator}'] = ratio.replace([np.inf, -np.inf], np.nan)
    

In [13]:
# Treat the diagnosis as a category, pairing each label with a numeric code
# (the later value_counts output shows the column holding 0 and 1).
pathology_codes_120 = {'Benign': 0, 'Endometrial cancer': 1}
df_120 = utils.cols_as_category(df_120, {'Pathology': pathology_codes_120})

In [14]:
# Load the diagnosis-certainty table: tab-separated file, first row as header,
# first column as the row index.
df_120_criteria = pd.read_csv(
    '../data/60x60_enlarged_dx_certainty.csv',
    sep='\t',
    header=0,
    index_col=0,
)


In [15]:
# Same categorical treatment as for df_120; this table carries the diagnosis
# labels in Spanish, paired with the same 0/1 codes.
pathology_codes_criteria = {'Benigna': 0, 'Adenocarcinoma de endometrio': 1}
df_120_criteria = utils.cols_as_category(df_120_criteria, {'Pathology': pathology_codes_criteria})

In [None]:
# Select the samples whose 'criteria' value in df_120_criteria is one of
# B1, T1, T2 or B2.
query = (
    '(criteria == "B1" or criteria == "T1" '
    'or criteria == "T2" or criteria == "B2" )'
)
# Sample exclusions tried earlier, kept disabled for reference:
#query += ' and not(sample == "003-0028" or sample == "006-0030" or sample == "012-0029") '
selected = df_120_criteria.query(query)
idx = selected.index

# Keep only samples with BMI <= 30 (the bound is inclusive). A missing BMI
# fails the comparison, so samples without a recorded BMI are excluded too.
bmi_ok = df_120.index[df_120['BMI'] <= 30]
idx = idx.intersection(bmi_ok)


In [17]:
# Labels selected so far that do not exist in df_120 (kept for inspection).
to_remove_from_idx = set(idx) - set(df_120.index)

# Drop those labels while preserving the order of `idx`. Round-tripping through
# set() and back to a list would give an arbitrary order (string hashing is
# randomised per Python process), so the row order of df_120 after the
# following .loc[idx] selection could change from one kernel session to the next.
idx = pd.Index(idx).intersection(df_120.index)

In [18]:
# keep only the rows of df_120 whose index label is in idx (the samples that
# passed the criteria and BMI selection), in the order given by idx
df_120 = df_120.loc[idx]

In [19]:
# Overwrite the Pathology column of df_120 with the one from df_120_criteria.
# The assigned Series is aligned on the index ("sample"), so every row of
# df_120 receives the label recorded for the same sample.
df_120 = df_120.assign(Pathology=df_120_criteria['Pathology'])


In [20]:
df_120.shape

(55, 48)

In [21]:
df_120.Pathology.value_counts()

Pathology
1    32
0    23
Name: count, dtype: int64

# Execution parameters

In [None]:
# Where the results of this run are written
RESULTS_PATH = '../data/results/60x60_dr_ratios_dx_criteria/b12t12_bmi30/'

# NOTE(review): the compute_all_models_and_save call further down passes
# literal values for max_biomarker_count and plot_rocs, so changing these two
# constants has no effect on that call -- confirm which values are intended.
MAX_BIOMARKER_COUNT = 3
PLOT_ROCS = False

# The biomarkers now include the ratios: one '<numerator>_<denominator>' name
# per ordered pair of distinct biomarkers.
# NOTE: BIOMARKERS_120 is rebuilt from its own previous value, so running this
# cell a second time would derive names from the ratio names themselves.
BIOMARKERS_120 = [
    f"{numerator}_{denominator}"
    for numerator in BIOMARKERS_120
    for denominator in BIOMARKERS_120
    if numerator != denominator
]

VOLUME_COL = 'Collected_volume_mL'  # Column to be used as volume for scatters and undoing the dilution
NORMALIZING_COL_120 = 'Total_protein_BCA'  # Column to be used for normalizing the biomarkers

Since we are using ratios, we are interested only in the `direct` method.

In [23]:
# Methods passed to the model computation; only 'direct' is used here
METHODS = ['direct']

# Computing the models

All the functions to generate the models are included in the [cohort_analysis_utils.py](../code/cohort_analysis_utils.py) file.

In [24]:
# Fit and store every model for the ratio biomarkers through the project
# helper; outputs go to RESULTS_PATH.
# NOTE(review): max_biomarker_count and plot_rocs are given as literals (2 and
# True) and therefore differ from MAX_BIOMARKER_COUNT = 3 and PLOT_ROCS = False
# declared under "Execution parameters" -- confirm which values are intended
# before wiring the constants in, since doing so changes what is computed.
models_120 = utils.compute_all_models_and_save(
    df=df_120,
    biomarkers=BIOMARKERS_120,
    normalizing_col=NORMALIZING_COL_120,
    volume_col=VOLUME_COL,
    volume_added=0.5,
    apply_log=True,
    avoid_same_biomarker=True,
    methods=METHODS,
    max_biomarker_count=2,
    folder_name=RESULTS_PATH,
    plot_rocs=True,
    auc_threshold=0.7,
)