# Cohort 152 AUCs analysis

### Imports and environment setup

- Date of run: 2025-05-06
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib

In [1]:
# Include in the environment the code directory with the utils function
import sys
sys.path.append('../code/')

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns

# import all functions from ../code/cohort_analysis_utils.py
import cohort_analysis_utils as utils

In [3]:
# Remove warnings for readability
import warnings
warnings.filterwarnings('ignore')

# Remove cell printing limits
pd.set_option('display.max_rows', None)

# Data loading and preprosessing

## Samples data

In [4]:
df = pd.read_csv('../data/60x60_dr_mmk_20250224.csv' , sep='\t', index_col=0, header=0)

In [5]:
# Harmonization of column names
df_samples = utils.normalize_column_names(df)

In [6]:
df = df[['Collection_center', 'Age', 'Pathology', 'Hystology_grade',
       'Hystology_type', 'Time_between_collection_and_processing_h', 'Group_time',
       'Collected_volume_mL', 'Total_protein_BCA']]

In [7]:
df = utils.cols_as_category(df, {'Pathology':{'Benign': 0, 'Endometrial cancer': 1}})

## Readouts

In [8]:
BIOMARKERS = ["MMP9", "KPYM", "AGRIN", "HSPB1", "PERM", "PIGR", "TIMP2", "CLIC1"]

In [9]:
for biomarker in BIOMARKERS:
    df_readouts = pd.read_excel("../data/152_20250423.xlsx", sheet_name=biomarker, header=0, index_col=0)
    df_readouts = utils.normalize_column_names(df_readouts)
    df_readouts = df_readouts.rename(columns={'Repeat': f'Repeat_{biomarker}'})
    # merge with df_samples
    df = pd.merge(df, df_readouts, left_index=True, right_index=True, how='left')




# Model computation

In [10]:
# Which column take into account for the readout
READOUT_COLUMN_SUFFIX = '_G1_repeats'
# READOUT_COLUMN_SUFFIX = '_G2'
# READOUT_COLUMN_SUFFIX = '_G3'

#RESULTS_PATH = f'../data/results/152_ratios/{READOUT_COLUMN_SUFFIX}_bad_removed/'
RESULTS_PATH = f'../data/results/152/{READOUT_COLUMN_SUFFIX}/'

In [11]:
#df = df.drop(index=['002-0006', '012-0009', '012-0026', '012-0029'], axis=0)
#df = df.drop(index=['003-0044', '003-0035', '003-0007', '002-0007', '012-0005','001-0034', '012-0026', '012-0029', '002-0006'], axis=0)

## Compute ratios

In [12]:
for biomarker in BIOMARKERS:
    df[f'{biomarker}'] =  pd.to_numeric(df[f'{biomarker}{READOUT_COLUMN_SUFFIX}'], errors='coerce')

## Execution parameters

In [None]:
PLOT_ROCS = False

NORMALIZING_COL = 'Total_protein_BCA' 
VOLUME_COL = 'Collected_volume_mL'

METHODS = ['direct', 'normalized']

## Models creation

In [None]:
for biomarker_count in range(1, 4):
        print(f'Number of biomarkers: {biomarker_count}')

        models = utils.compute_all_models_and_save(
                                    df=df,
                                    biomarkers=BIOMARKERS,
                                    normalizing_col=NORMALIZING_COL, 
                                    volume_col= VOLUME_COL,
                                    volume_added=0.5,
                                    apply_log=False,
                                    avoid_same_biomarker=True,
                                    methods=METHODS,
                                    max_biomarker_count=biomarker_count,
                                    folder_name=RESULTS_PATH,
                                    plot_rocs=True,
                                    auc_threshold=0.6,
                                    compute_auc_ci=True if biomarker_count == 1 else False,
                                    )

        if biomarker_count == 1:
            # Reserve the models with confidence intervals for the final analysis
            models_with_auc_ci = models
        
        print(f'Number of biomarkers: {biomarker_count} done')


In [16]:
models = utils.compute_all_models_and_save(
                            df=df,
                            biomarkers=RATIOS_COLUMNS,
                            normalizing_col=NORMALIZING_COL, 
                            volume_col= VOLUME_COL,
                            volume_added=0.5,
                            apply_log=False,
                            avoid_same_biomarker=True,
                            methods=METHODS,
                            max_biomarker_count=2,
                            folder_name=RESULTS_PATH,
                            plot_rocs=True,
                            auc_threshold=0.8,
                            compute_auc_ci=False,
                            )

In [17]:
tk = list(models['direct'].keys())[0]
ks = list(models['direct'][tk].keys())

bad_systematic = set(df.index)
print(bad_systematic)
for k in ks:
    #print(models['direct'][tk][k]['y_true'])
    y_pred = models['direct'][tk][k]['y_pred']
    y_true = models['direct'][tk][k]['y_true']

    indices = y_true[(y_true == 0) & (y_pred > 0.10)].index
    #indices = y_true[(y_true == 1) & (y_pred < 0.30)].index

    if len(indices)>5:
        #print(f'Number of bad systematic: {len(bad_systematic)}')

        bad_systematic = bad_systematic.intersection(set(indices))

print(f'Number of bad systematic: {len(bad_systematic)}')
print(f'Bad systematic: {bad_systematic}')
    

{'002-0015', '001-0004', '004-0003', '002-0008', '001-0005', '001-0009', '010-0007', '001-0012', '001-0007', '005-0003', '004-0006', '002-0028', '002-0027', '002-0004', '013-0003', '012-0018', '012-0001', '008-0035', '012-0030', '013-0006', '012-0028', '020-0049', '007-0004', '002-0020', '012-0013', '003-0062', '012-0008', '003-0012', '006-0009', '020-0047', '012-0027', '001-0017', '001-0006', '002-0026', '020-0034', '020-0050', '020-0006', '002-0024', '020-0001', '003-0050', '002-0019', '001-0008', '001-0025', '020-0014', '003-0013', '016-0033', '002-0013', '012-0014', '003-0011', '003-0043', '002-0016', '016-0043', '001-0024', '006-0001', '020-0015', '012-0012', '003-0028', '002-0001', '020-0008', '002-0025', '011-0002', '003-0047', '012-0003', '007-0002', '018-0006', '012-0015', '018-0016', '003-0002', '001-0040', '018-0002', '002-0005', '001-0013', '001-0011', '007-0006', '005-0008', '012-0025', '012-0002', '020-0037', '020-0023', '012-0009', '003-0049', '020-0007', '010-0012', '01