# Cohort 152 AUCs analysis

### Imports and environment setup

- Date of run: 2025-05-06
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib

In [1]:
# Make the project's code directory importable so the local helper module
# (imported in the next cell) can be found. The path is relative, so it is
# resolved against the directory the kernel was started from.
import sys
sys.path.extend(['../code/'])

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# Local helper module (../code/cohort_analysis_utils.py), referenced below as `utils`
import cohort_analysis_utils as utils

In [3]:
# Remove warnings for readability
import warnings
warnings.filterwarnings('ignore')

# Remove cell printing limits
pd.set_option('display.max_rows', None)

# Data loading and preprocessing

## Samples data

In [4]:
# Sample-level table. The file carries a .csv extension but is read as
# tab-separated; the first row supplies the column names and the first
# column becomes the index.
df = pd.read_csv(
    '../data/60x60_dr_mmk_20250224.csv',
    sep='\t',
    header=0,
    index_col=0,
)

In [5]:
# Pass the frame through the project helper that harmonizes column names
# (the exact renaming rules live in cohort_analysis_utils).
df = df.pipe(utils.normalize_column_names)

In [6]:
# Keep an independent snapshot of the full table (all columns) so the newly
# loaded readouts can later be compared against the original values.
df_copy = df.copy(deep=True)

In [7]:
# Restrict the working frame to the sample metadata columns used below;
# every other column of the source table survives only in `df_copy`.
sample_columns = [
    'Collection_center',
    'Age',
    'Pathology',
    'Hystology_grade',
    'Hystology_type',
    'Time_between_collection_and_processing_h',
    'Group_time',
    'Collected_volume_mL',
    'Total_protein_BCA',
]
df = df[sample_columns]

In [8]:
# Encode the outcome column through the project helper: the mapping passed in
# assigns 0 to 'Benign' and 1 to 'Endometrial cancer'.
# NOTE(review): how the helper treats labels outside this mapping is not
# visible from here -- confirm in cohort_analysis_utils.
df = utils.cols_as_category(
    df,
    {
        'Pathology': {
            'Benign': 0,
            'Endometrial cancer': 1,
        },
    },
)

## Readouts

In [9]:
# Biomarkers analysed in this notebook. Each name is used both as an Excel
# sheet name when loading readouts and as a column-name prefix afterwards.
BIOMARKERS = [
    "MMP9",
    "KPYM",
    "AGRIN",
    "HSPB1",
    "PERM",
    "PIGR",
    "TIMP2",
    "CLIC1",
]

In [10]:
# Read all biomarker sheets in one pass: passing a list as `sheet_name` makes
# read_excel return a dict {sheet name: DataFrame}, so the workbook is opened
# and parsed once instead of once per biomarker.
readout_sheets = pd.read_excel(
    "../data/152_20250507.xlsx",
    sheet_name=BIOMARKERS,
    header=0,
    index_col=0,
)

for biomarker in BIOMARKERS:
    df_readouts = utils.normalize_column_names(readout_sheets[biomarker])
    # Every sheet has its own 'Repeat' column; tag it with the biomarker name
    # so the columns do not collide once the sheets are merged side by side.
    df_readouts = df_readouts.rename(columns={'Repeat': f'Repeat_{biomarker}'})
    # Left-join on the index so every row of the samples frame is kept even
    # when a sheet has no readout for it.
    # NOTE(review): this cell is not idempotent -- running it a second time
    # merges the same columns again (pandas then appends _x/_y suffixes).
    df = pd.merge(df, df_readouts, left_index=True, right_index=True, how='left')




# Model computation

## Execution parameters

In [11]:
# NOTE(review): the model-computation cells in this notebook pass
# `plot_rocs=True` literally, so this flag is not consulted there.
PLOT_ROCS = False

# Columns handed to the model helper as `normalizing_col` and `volume_col`
NORMALIZING_COL, VOLUME_COL = 'Total_protein_BCA', 'Collected_volume_mL'

# Handed to the model helper as `methods`
METHODS = ['normalized']

In [None]:
# Run the full model computation once per readout variant. Each variant is
# identified by the suffix of its readout columns.
suffixes = ['_G1_repeats', '_G2', '_G3']

# Single-biomarker models (the only ones computed with AUC confidence
# intervals), kept per suffix. `models_with_auc_ci` on its own is rebound on
# every pass and therefore only ever holds the last suffix's models.
models_with_auc_ci_by_suffix = {}

for suffix in suffixes:
    RESULTS_PATH = f'../data/results/152/{suffix}/'

    # Expose the current variant under the bare biomarker names; values that
    # cannot be parsed as numbers become NaN.
    # NOTE(review): this overwrites columns of `df` in place, so after the
    # loop the bare biomarker columns hold the last suffix's values.
    for biomarker in BIOMARKERS:
        df[biomarker] = pd.to_numeric(df[f'{biomarker}{suffix}'], errors='coerce')

    for biomarker_count in range(1, 4):
        print(f'Number of biomarkers: {biomarker_count}')

        models = utils.compute_all_models_and_save(
            df=df,
            biomarkers=BIOMARKERS,
            normalizing_col=NORMALIZING_COL,
            volume_col=VOLUME_COL,
            volume_added=0.5,
            apply_log=True,
            avoid_same_biomarker=True,
            methods=METHODS,
            max_biomarker_count=biomarker_count,
            folder_name=RESULTS_PATH,
            # NOTE(review): hard-coded; the PLOT_ROCS parameter is not used here.
            plot_rocs=True,
            # Threshold grows with the panel size: 0.54, 0.59, 0.64
            auc_threshold=0.49 + 0.05 * biomarker_count,
            # Confidence intervals are requested for single-biomarker models only
            compute_auc_ci=biomarker_count == 1,
        )

        if biomarker_count == 1:
            # Reserve the models with confidence intervals for the final analysis
            models_with_auc_ci = models
            models_with_auc_ci_by_suffix[suffix] = models

        print(f'Number of biomarkers: {biomarker_count} done')
    

Number of biomarkers: 1
Number of biomarkers: 1 done
Number of biomarkers: 2
Number of biomarkers: 2 done
Number of biomarkers: 3
Number of biomarkers: 3 done
Number of biomarkers: 1
Number of biomarkers: 1 done
Number of biomarkers: 2
Number of biomarkers: 2 done
Number of biomarkers: 3
Number of biomarkers: 3 done
Number of biomarkers: 1
Number of biomarkers: 1 done
Number of biomarkers: 2
Number of biomarkers: 2 done
Number of biomarkers: 3
Number of biomarkers: 3 done


## Recompute the same models after removing problematic samples

In [14]:
# Same computation as for the full cohort, on a frame with a fixed list of
# sample IDs dropped. Results go to '<suffix>_bad_removed' folders.
suffixes = ['_G1_repeats', '_G2', '_G3']

# `index=` already selects the row axis, so no separate `axis` argument is
# needed. drop() returns a new frame; `df` itself is left untouched.
df_bad_removed = df.drop(index=[
    '003-0044', '003-0035', '003-0007', '002-0007', '012-0005',
    '001-0034', '012-0026', '012-0029', '002-0006',
])

# Single-biomarker models (computed with AUC confidence intervals) for this
# reduced cohort, kept per suffix under a name of their own.
models_with_auc_ci_bad_removed_by_suffix = {}

for suffix in suffixes:
    RESULTS_PATH = f'../data/results/152/{suffix}_bad_removed/'

    # Expose the current variant under the bare biomarker names; values that
    # cannot be parsed as numbers become NaN.
    for biomarker in BIOMARKERS:
        df_bad_removed[biomarker] = pd.to_numeric(df_bad_removed[f'{biomarker}{suffix}'], errors='coerce')

    for biomarker_count in range(1, 4):
        print(f'Number of biomarkers: {biomarker_count}')

        models = utils.compute_all_models_and_save(
            df=df_bad_removed,
            biomarkers=BIOMARKERS,
            normalizing_col=NORMALIZING_COL,
            volume_col=VOLUME_COL,
            volume_added=0.5,
            apply_log=True,
            avoid_same_biomarker=True,
            methods=METHODS,
            max_biomarker_count=biomarker_count,
            folder_name=RESULTS_PATH,
            # NOTE(review): hard-coded; the PLOT_ROCS parameter is not used here.
            plot_rocs=True,
            # Threshold grows with the panel size: 0.54, 0.59, 0.64
            auc_threshold=0.49 + 0.05 * biomarker_count,
            # Confidence intervals are requested for single-biomarker models only
            compute_auc_ci=biomarker_count == 1,
        )

        if biomarker_count == 1:
            # Reserve the models with confidence intervals for the final analysis.
            # NOTE(review): this rebinds `models_with_auc_ci`, replacing any value
            # an earlier cell stored under that name; the dict keeps this
            # cohort's models separately for every suffix.
            models_with_auc_ci = models
            models_with_auc_ci_bad_removed_by_suffix[suffix] = models

        print(f'Number of biomarkers: {biomarker_count} done')

Number of biomarkers: 1
Number of biomarkers: 1 done
Number of biomarkers: 2
Number of biomarkers: 2 done
Number of biomarkers: 3
Number of biomarkers: 3 done
Number of biomarkers: 1
Number of biomarkers: 1 done
Number of biomarkers: 2
Number of biomarkers: 2 done
Number of biomarkers: 3
Number of biomarkers: 3 done
Number of biomarkers: 1
Number of biomarkers: 1 done
Number of biomarkers: 2
Number of biomarkers: 2 done
Number of biomarkers: 3
Number of biomarkers: 3 done


## Data comparison

In [24]:
# Align the original table's 'TIMP-2' column with the 'TIMP2' spelling used in BIOMARKERS
df_copy = df_copy.rename(columns={'TIMP-2': 'TIMP2'})

In [35]:
# Bring both tables to a comparable form: numeric values (unparseable entries
# become NaN) rounded to 1 decimal place. The new readouts are taken from the
# first suffix in `suffixes`, which must already be defined by an earlier cell.
# CLIC1 is left out -- presumably it has no counterpart in the original
# table (TODO confirm).
for biomarker in BIOMARKERS:
    if biomarker != 'CLIC1':
        df[biomarker] = pd.to_numeric(df[f'{biomarker}{suffixes[0]}'], errors='coerce').round(1)
        df_copy[biomarker] = pd.to_numeric(df_copy[biomarker], errors='coerce').round(1)
    


In [38]:
# Report, per biomarker, the samples whose new value differs from the original.
for biomarker in BIOMARKERS:
    if biomarker == 'CLIC1':
        continue

    # compare() lists only the positions whose values differ ('self' = new
    # readout, 'other' = original table) and treats NaN on both sides as equal.
    # Branching on its emptiness, rather than on Series.equals, avoids
    # announcing "Differences" over an empty table when the two columns hold
    # equal values under different dtypes.
    differences = df[biomarker].compare(df_copy[biomarker])
    if differences.empty:
        print(f'No differences in {biomarker}')
    else:
        print(f'Differences in {biomarker}:')
        print(differences)

Differences in MMP9:
            self   other
Sample                  
001-0006     NaN     0.2
001-0009     NaN     6.1
001-0011     NaN    14.9
001-0013     NaN     0.0
001-0014     NaN    16.6
001-0040   296.5   296.6
002-0001     NaN    23.2
002-0012  2112.6  2396.7
002-0013     NaN     4.8
002-0016     NaN    11.0
002-0027     NaN     8.5
004-0008     NaN     4.4
012-0013     NaN     7.9
012-0028     NaN    14.6
012-0029  2695.6     0.0
Differences in KPYM:
            self    other
Sample                   
001-0005   103.9    105.9
001-0008     NaN    117.2
001-0011    39.1    117.2
001-0014   538.1    623.8
002-0001     NaN     13.0
002-0005     NaN     35.6
002-0006   300.8    300.7
002-0008     NaN      0.0
002-0012   555.9    831.5
002-0013     NaN      0.0
002-0016     NaN      0.0
002-0018     1.0     39.0
002-0019     NaN     39.0
002-0020     NaN     39.0
002-0023  1879.1   2182.0
002-0027    24.4     39.0
002-0028     NaN      1.6
003-0035  2404.7   2404.8
003-0039  308