# Cohort 2mL AUCs analysis

### Imports and environment setup

- Date of run: 2025-01-09
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib

In [1]:
# Make the project's helper modules (the ../code/ directory, relative to the
# notebook's working directory) importable from this notebook.
import sys

CODE_DIR = '../code/'
# Guard the append so that re-running this cell does not pile up duplicate
# entries in sys.path.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# Utils imports
import cohort_analysis_utils as utils

In [3]:
# Silence every warning so the rendered notebook stays readable
import warnings
warnings.simplefilter('ignore')

# Lift the row limit so displayed DataFrames are never truncated
pd.options.display.max_rows = None


# Data loading and preprocessing

The original Excel file was exported to a tab-separated CSV file, stored in the `data` folder of this repository.

In [4]:
# Read the tab-separated export: the first row holds the column names and the
# first column is used as the row index.
df_2mL = pd.read_csv(
    '../data/2mL.csv',
    sep='\t',
    index_col=0,
    header=0,
)


In [5]:
# Harmonize the column names with the project helper (the exact renaming rules
# live in cohort_analysis_utils); the frame is rebound to the helper's result.
df_2mL = utils.normalize_column_names(df_2mL)

In [6]:
# Inspect the column names after harmonization
df_2mL.columns

Index(['TIMP-2', 'ADIPOQ', 'MMP9', 'KPYM', 'AGRIN', 'PERM', 'HSPB1',
       'Total_protein_BCA', 'Pathology'],
      dtype='object')

In [7]:
# Measurement columns (the seven biomarkers plus total protein) that must be
# handled as numbers; the conversion itself is delegated to the utils helper.
cols_2mL_to_num = [
    'TIMP-2', 'ADIPOQ', 'MMP9', 'KPYM', 'AGRIN', 'PERM', 'HSPB1',
    'Total_protein_BCA',
]
df_2mL = utils.cols_as_numbers(df_2mL, cols_2mL_to_num)

In [8]:
# Unique raw labels in the Pathology column (before recoding)
df_2mL['Pathology'].unique()

array([nan, 'Benigna', 'Adenocarcinoma de endometrio', 'Otros',
       'Hiperplasia atípica endometrial'], dtype=object)

In [9]:
# Recode Pathology for the binary comparison: benign -> 0, endometrial
# adenocarcinoma -> 1. 'Otros' and atypical hyperplasia are mapped to NaN,
# i.e. they are left without a class label.
pathology_codes = {
    'Benigna': 0,
    'Adenocarcinoma de endometrio': 1,
    'Otros': np.nan,
    'Hiperplasia atípica endometrial': np.nan,
}
df_2mL = utils.cols_as_category(df_2mL, {'Pathology': pathology_codes})

In [10]:
# Unique values in Pathology after recoding: only 0, 1 and NaN should remain
df_2mL['Pathology'].unique()

array([nan,  0.,  1.])

# Execution parameters

In [11]:
# --- Output settings ---
RESULTS_PATH = '../data/results/2mL/'  # Results folder handed to the model runner
PLOT_ROCS = False                      # ROC plotting flag

# --- Model search settings ---
MAX_BIOMARKER_COUNT = 3  # Maximum number of biomarkers per model

# Columns treated as biomarkers
BIOMARKERS_2mL = [
    'TIMP-2',
    'ADIPOQ',
    'MMP9',
    'KPYM',
    'AGRIN',
    'PERM',
    'HSPB1',
]

# Column used for normalizing the biomarkers
NORMALIZING_COL_2mL = 'Total_protein_BCA'

### Create new columns for biomarker ratios

In [12]:
# Every ordered (numerator, denominator) pair of distinct biomarkers
ratio_pairs = [
    (numerator, denominator)
    for numerator in BIOMARKERS_2mL
    for denominator in BIOMARKERS_2mL
    if numerator != denominator
]

# Add one '<numerator>_<denominator>' column per pair
for numerator, denominator in ratio_pairs:
    ratio = df_2mL[numerator] / df_2mL[denominator]
    # Infinite results are stored as NaN instead
    df_2mL[f'{numerator}_{denominator}'] = ratio.replace([np.inf, -np.inf], np.nan)

# Names of the ratio columns, in the same order in which they were created
RATIOS_2mL = [f'{numerator}_{denominator}' for numerator, denominator in ratio_pairs]

In [13]:
# Methods forwarded as `methods=` to utils.compute_all_models_and_save
METHODS = ['direct']

# Computing the models

All the functions to generate the models are included in the [cohort_analysis_utils.py](../code/cohort_analysis_utils.py) file.

In [14]:
# Compute and save the models for every panel size from 1 up to
# MAX_BIOMARKER_COUNT, using the ratio columns as candidate biomarkers.
#
# The loop uses its own variable so the MAX_BIOMARKER_COUNT constant from the
# parameters cell is no longer overwritten, and ROC plotting follows PLOT_ROCS
# instead of a hard-coded value.
models_2mL = {}  # panel size -> value returned by compute_all_models_and_save
for biomarker_count in range(1, MAX_BIOMARKER_COUNT + 1):
    print(f"Computing models with {biomarker_count} biomarkers")
    models_2mL[biomarker_count] = utils.compute_all_models_and_save(
        df=df_2mL,
        biomarkers=RATIOS_2mL,
        normalizing_col=NORMALIZING_COL_2mL,
        volume_col='',
        volume_added=2.,
        apply_log=True,
        avoid_same_biomarker=True,
        methods=METHODS,
        max_biomarker_count=biomarker_count,
        folder_name=RESULTS_PATH,
        plot_rocs=PLOT_ROCS,
        auc_threshold=0.6,
    )

# Legacy name kept in case other cells still reference it: as before, it holds
# the result of the largest panel size (None if no model was computed).
models_120 = models_2mL.get(MAX_BIOMARKER_COUNT)

Computing models with 1 biomarkers
Computing models with 2 biomarkers
Computing models with 3 biomarkers


In [15]:
# Per-pathology number of rows in which TIMP-2, MMP9, AGRIN and Pathology
# are all defined
complete_case_cols = ['TIMP-2', 'MMP9', 'AGRIN', 'Pathology']
(
    df_2mL[complete_case_cols]
    .dropna()
    .groupby('Pathology')
    .count()
)

Unnamed: 0_level_0,TIMP-2,MMP9,AGRIN
Pathology,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
0.0,21,21,21
1.0,28,28,28
