# Cohort 7.3 Mass Spec cancer type analysis

### Imports and environment setup

- Date of run: 2025-06-19
- Environment: python 3.12
- Packages required: pandas, numpy, scipy, sklearn, statsmodels, seaborn, matplotlib

In [1]:
# Include in the environment the code directory with the utils function
import sys
sys.path.append('../code/')

In [2]:
# Library imports
import pandas as pd
import numpy as np
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm

# import all functions from ../code/cohort_analysis_utils.py
import cohort_analysis_utils as utils

In [3]:
# Silence every warning so the notebook output stays readable
import warnings
warnings.simplefilter('ignore')

# Lift the row limit so displayed DataFrames are never truncated
pd.options.display.max_rows = None

# Data loading and preprocessing

## PG matrix

In [4]:
# Load the PG matrix: the file is tab-separated despite its .csv extension, and
# the 'Protein.Names' column becomes the row index
df = pd.read_csv('../data/ms/cohort_7_3_MS_pg.csv', index_col='Protein.Names', header=0, sep='\t')

In [5]:
# Name harmonization: the first 3 column names are kept unchanged; every other
# column name has the raw-file directory prefix removed and is then truncated to
# the length of a sample id such as "2025NK011_EVCO_001"
df.columns = list(df.columns[0:3]) + \
             [ c.replace('/users/pr/rawstream/2025NK011/mzml/', '')[0:len("2025NK011_EVCO_001")] for c in df.columns[3:] ]

In [6]:
# Convert every column after the first 3 to numeric, then treat zeros as missing.
# Coercing BEFORE replacing zeros ensures zeros stored as text ("0") also become
# NaN instead of turning into -inf under the later log2 transform.
# The frame is rebuilt with concat rather than assigned through .iloc, because
# an in-place .iloc assignment can keep an existing object dtype on the columns.
df = pd.concat(
    [
        df.iloc[:, :3],
        df.iloc[:, 3:].apply(pd.to_numeric, errors='coerce').replace(0, np.nan),
    ],
    axis=1,
)

In [7]:
# Drop the first 3 columns, keeping only the remaining (renamed) columns;
# the protein names stay as the row index until the transpose further below
df = df.iloc[:, 3:]

In [8]:
# Replace the values by their log2 (zeros were set to NaN in an earlier cell)
df = np.log2(df)

In [9]:
# Transpose so that rows are samples and columns are proteins (same layout as ELISA)
df = df.T

In [10]:
######## Keep only the rows of df that have at least 5000 non-NaN values ########
# df_116 keeps an unfiltered copy of all rows
# (presumably the full set of 116 samples - TODO confirm)
df_116 = df.copy()
df = df.dropna(thresh=5000)


In [11]:
######## NORMALIZATION TO THE MEDIAN ########
# Subtract each row's own median from that row. The values are already
# log2-transformed, so every row ends up with a median of 0.
df = df.subtract(df.median(axis=1), axis=0)
df_116 = df_116.subtract(df_116.median(axis=1), axis=0)

In [12]:
# The biomarkers are now the columns of the dataframe.
# This list is captured before the metadata columns are appended to df, so it
# contains only the protein columns.
BIOMARKERS = list(df.columns)

### Metadata

In [13]:
# Load the cohort metadata (tab-separated), using the 'id_ms' column as the row index
df_metadata = pd.read_csv('../data/cohort_7.3_metadata.csv', index_col='id_ms', header=0, sep='\t')

In [14]:
# Copy the clinical metadata columns into both frames. pandas aligns the values
# on the row index, so rows whose index is absent from df_metadata receive NaN.
for col in ['Pathology', 'Endometrial_thickness','Histology_grade','Grade','Histology_type','Type','FIGO_2009','Stage','Molecular_classification']:
    df[col] = df_metadata[col]
    df_116[col] = df_metadata[col]

In [15]:
# Preview the first rows (protein columns followed by the metadata columns)
df.head()

Protein.Names,NUD4B_HUMAN,KV37_HUMAN,LV469_HUMAN,LV861_HUMAN,LV460_HUMAN,LVX54_HUMAN,LV548_HUMAN,LV746_HUMAN,LV537_HUMAN,LV322_HUMAN,...,COL10_HUMAN,Pathology,Endometrial_thickness,Histology_grade,Grade,Histology_type,Type,FIGO_2009,Stage,Molecular_classification
2025NK011_EVCO_001,,8.503412,8.191344,6.744697,3.968654,,-0.711087,8.261095,0.667425,,...,-2.354596,Benign,21.0,,,,,,,
2025NK011_EVCO_002,,7.387199,6.012501,6.191824,-0.756886,,-1.932774,6.088268,-1.509365,,...,-2.946345,EC,6.0,G1,Low,Endometrioid,EEC,Ia,Early,Unclassifiable
2025NK011_EVCO_004,,8.186114,6.491853,3.839204,2.611763,,,6.204571,-0.715212,,...,-3.673083,Benign,6.0,,,,,,,
2025NK011_EVCO_005,-2.730646,7.809866,6.949269,5.861213,5.173829,,,7.590085,0.982298,,...,-2.858068,Benign,4.0,,,,,,,
2025NK011_EVCO_006,,7.701804,6.862866,5.804819,5.217739,,,7.378455,1.512206,,...,-1.051796,EC,11.0,G3,High,Serous,NEEC,Ia,Early,HCN


### Compute biomarker AUCs and Log2FC 

This allows us to check that our results agree with the CRG computations

In [24]:
import numpy as np
import pandas as pd
from sklearn.metrics import roc_auc_score, roc_curve
from scipy.stats import mannwhitneyu

def compute_auc_metrics(df, biomarkers, target_col='Pathology'):
    """Compute ROC/AUC statistics of each biomarker against a binary target.

    For every biomarker, only the rows where the biomarker is not null are
    used. If the raw AUC is below 0.5 the score is negated, so the reported
    AUC is always >= 0.5 and sensitivity/specificity refer to the negated
    score. The direction of the effect is still visible in ``Log2FC``.

    Parameters
    ----------
    df : pandas.DataFrame
        One row per sample, containing the biomarker columns and
        ``target_col``.
    biomarkers : iterable of str
        Names of the columns of ``df`` to evaluate.
    target_col : str, default 'Pathology'
        Column holding the binary labels. Group sizes and the Mann-Whitney
        test select the groups with ``== 0`` and ``== 1``, so the labels are
        expected to be encoded as 0 (negative) and 1 (positive).

    Returns
    -------
    pandas.DataFrame
        One row per evaluated biomarker, sorted by decreasing AUC, with the
        columns ``Biomarker``, ``AUC``, ``Sensitivity (max sum)``,
        ``Specificity (max sum)``, ``Sensitivity (sens > 95%)``,
        ``Specificity (sens > 95%)``, ``Log2FC``, ``p-value``, ``n_pos`` and
        ``n_neg``. Biomarkers that have no observed value, or whose observed
        values all belong to a single class (ROC undefined), are omitted.
    """
    columns = [
        'Biomarker',
        'AUC',
        'Sensitivity (max sum)',
        'Specificity (max sum)',
        'Sensitivity (sens > 95%)',
        'Specificity (sens > 95%)',
        'Log2FC',
        'p-value',
        'n_pos',
        'n_neg',
    ]

    # The class masks do not depend on the biomarker: build them once instead
    # of re-filtering the whole DataFrame several times per biomarker.
    target = df[target_col]
    is_neg = target == 0
    is_pos = target == 1

    # Keyed by biomarker so that a name listed twice yields a single row.
    rows = {}

    for biomarker in biomarkers:
        values = df[biomarker]
        observed = values.notnull()
        y_true = target[observed]
        y_score = values[observed]

        # A ROC curve needs both classes among the observed samples. This also
        # covers the case of a biomarker with no observed value at all.
        if y_true.nunique() < 2:
            continue

        auc = roc_auc_score(y_true, y_score)

        # Invert the score if AUC < 0.5 so the biomarker is evaluated in its
        # discriminating direction.
        if auc < 0.5:
            auc = 1 - auc
            y_score = -y_score

        fpr, tpr, _ = roc_curve(y_true, y_score)
        specificity = 1 - fpr

        # Criterion 1: maximize (sensitivity + specificity)
        best_sum_idx = np.argmax(tpr + specificity)

        # Criterion 2: max specificity among points with sensitivity >= 0.95
        high_sens_idx = np.flatnonzero(tpr >= 0.95)
        if high_sens_idx.size > 0:
            best_high_idx = high_sens_idx[np.argmax(specificity[high_sens_idx])]
            sens_high = tpr[best_high_idx]
            spec_high = specificity[best_high_idx]
        else:
            sens_high = np.nan
            spec_high = np.nan

        # Mann-Whitney U test between the two classes.
        # NOTE(review): rounding to 6 decimals reports very small p-values as
        # 0.0; keep more precision if these values feed a multiple-testing
        # correction.
        group_neg = values[is_neg].dropna()
        group_pos = values[is_pos].dropna()
        try:
            _, p_value = mannwhitneyu(group_neg, group_pos)
            p_value = round(float(p_value), 6)
        except ValueError:
            # Best effort: the test could not be computed for these groups.
            p_value = np.nan

        # Group means are ordered by label, so the last difference is
        # mean(label 1) - mean(label 0) for 0/1 labels.
        class_means = values.groupby(target).mean()

        rows[biomarker] = {
            'Biomarker': biomarker,
            'AUC': round(float(auc), 3),
            'Sensitivity (max sum)': round(float(tpr[best_sum_idx]), 3),
            'Specificity (max sum)': round(float(specificity[best_sum_idx]), 3),
            'Sensitivity (sens > 95%)': round(float(sens_high), 3),
            'Specificity (sens > 95%)': round(float(spec_high), 3),
            'Log2FC': round(float(class_means.diff().iloc[-1]), 3),
            'p-value': p_value,
            'n_pos': group_pos.count(),
            'n_neg': group_neg.count(),
        }

    # Sort by AUC, highest first (stable: ties keep the input order)
    ordered = sorted(rows.values(), key=lambda row: row['AUC'], reverse=True)

    return pd.DataFrame(ordered, columns=columns)


In [25]:
# Mimark biomarker panel, used to check that the results are consistent with the CRG data
MMK_BIOMARKERS = [
    "AGRIN_HUMAN",
    "MMP9_HUMAN",
    "TIMP2_HUMAN",
    "KPYM_HUMAN",
    "PERM_HUMAN",
    "MVP_HUMAN",
    "NGAL_HUMAN",
    "CLIC1_HUMAN",
    "PIGR_HUMAN",
]

# Using Pathology as the target column

In [26]:
# Encode Pathology as a binary target in df itself: 0 for Benign, 1 for EC.
# Rows without a Pathology value are dropped before computing the metrics.
df['Pathology'] = df['Pathology'].replace({'Benign': 0, 'EC': 1})
df_copy = df.dropna(subset=['Pathology'])
result_df = compute_auc_metrics(df_copy, BIOMARKERS, target_col='Pathology')


In [27]:
# Rank the biomarkers: largest number of quantified samples first, then highest AUC
result_df['n_pos+n_neg'] = result_df['n_pos'].add(result_df['n_neg'])
result_df = result_df.sort_values(by=['n_pos+n_neg', 'AUC'], ascending=False)
result_df.head(10)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
7624,CNRP1_HUMAN,0.967,0.885,0.925,0.962,0.83,-2.723,0.0,52,53,105
1448,LDHA_HUMAN,0.958,0.885,0.962,0.962,0.717,1.412,0.0,52,53,105
1219,AL1A2_HUMAN,0.954,0.942,0.868,0.962,0.792,-3.117,0.0,52,53,105
3243,FAS_HUMAN,0.945,0.923,0.925,0.962,0.358,1.805,0.0,52,53,105
1973,PTPRF_HUMAN,0.943,0.865,0.981,0.962,0.453,1.703,0.0,52,53,105
1718,PGS2_HUMAN,0.938,0.885,0.83,0.962,0.679,-2.498,0.0,52,53,105
3883,MD2L1_HUMAN,0.937,0.865,0.981,0.962,0.396,1.964,0.0,52,53,105
9851,DPP3_HUMAN,0.937,0.846,0.943,0.962,0.34,1.207,0.0,52,53,105
3772,UBE2H_HUMAN,0.925,0.885,0.943,0.962,0.453,1.413,0.0,52,53,105
9251,CYBP_HUMAN,0.925,0.865,0.962,0.962,0.208,1.557,0.0,52,53,105


In [28]:
# Show only the biomarkers listed in MMK_BIOMARKERS, ordered by decreasing AUC
result_df[result_df['Biomarker'].isin(MMK_BIOMARKERS)].sort_values(by='AUC', ascending=False)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
2107,KPYM_HUMAN,0.876,0.923,0.679,0.962,0.415,0.963,0.0,52,53,105
350,CLIC1_HUMAN,0.81,0.615,0.887,0.962,0.208,0.659,0.0,52,53,105
352,AGRIN_HUMAN,0.797,0.615,0.887,0.962,0.226,1.158,0.0,52,53,105
2249,MMP9_HUMAN,0.727,0.75,0.642,0.962,0.264,1.376,6.3e-05,52,53,105
1766,PERM_HUMAN,0.698,0.731,0.679,0.981,0.17,1.377,0.000483,52,53,105
2122,TIMP2_HUMAN,0.679,0.635,0.642,0.981,0.264,-0.681,0.001597,52,53,105
4159,MVP_HUMAN,0.667,0.615,0.717,0.962,0.038,0.744,0.00323,52,53,105
1588,PIGR_HUMAN,0.655,0.75,0.528,0.962,0.113,-1.074,0.00639,52,53,105
5419,NGAL_HUMAN,0.586,0.635,0.585,1.0,0.0,-0.523,0.13121,52,53,105


In [29]:
# Save the pathology results to a CSV file (the row index is not written)
result_df.to_csv('../data/results/cohort_7_3_MS/bmk_aucs/by_pathology.csv', index=False)

## Using Low and high grade

In [31]:
# Encode Grade as a binary target in df itself: 0 for Low, 1 for High.
# Rows without a Grade value are dropped before computing the metrics.
df['Grade'] = df['Grade'].replace({'Low': 0, 'High': 1})
df_copy = df.dropna(subset=['Grade'])
result_df_grade = compute_auc_metrics(df_copy, BIOMARKERS, target_col='Grade')

In [32]:
# Rank the biomarkers: largest number of quantified samples first, then highest AUC
result_df_grade['n_pos+n_neg'] = result_df_grade['n_pos'].add(result_df_grade['n_neg'])
result_df_grade = result_df_grade.sort_values(by=['n_pos+n_neg', 'AUC'], ascending=False)
result_df_grade.head(10)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
7157,FAM3C_HUMAN,0.887,1.0,0.658,1.0,0.658,-1.256,2.2e-05,14,38,52
1375,CD44_HUMAN,0.868,0.786,0.895,1.0,0.395,-1.241,5.5e-05,14,38,52
5116,T132A_HUMAN,0.868,0.857,0.789,1.0,0.5,-1.954,5.5e-05,14,38,52
401,PODXL_HUMAN,0.859,1.0,0.632,1.0,0.632,-2.424,8.5e-05,14,38,52
1376,B4GT1_HUMAN,0.859,0.929,0.737,1.0,0.605,-2.056,8.5e-05,14,38,52
8626,TPPP3_HUMAN,0.855,1.0,0.605,1.0,0.605,-2.628,0.000101,14,38,52
4413,CAYP1_HUMAN,0.846,0.929,0.763,1.0,0.395,-2.304,0.000153,14,38,52
6909,GOLM1_HUMAN,0.834,0.929,0.684,1.0,0.5,-1.694,0.000261,14,38,52
6256,GALT7_HUMAN,0.831,0.786,0.789,1.0,0.5,-1.35,0.000294,14,38,52
6119,ASGL1_HUMAN,0.814,0.929,0.737,1.0,0.132,-1.784,0.000593,14,38,52


In [33]:
# Show only the biomarkers listed in MMK_BIOMARKERS, ordered by decreasing AUC
result_df_grade[result_df_grade['Biomarker'].isin(MMK_BIOMARKERS)].sort_values(by='AUC', ascending=False)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
1663,PIGR_HUMAN,0.746,0.857,0.605,1.0,0.395,-1.785,0.007098,14,38,52
3822,NGAL_HUMAN,0.712,1.0,0.395,1.0,0.395,-1.453,0.020294,14,38,52
19,AGRIN_HUMAN,0.711,0.571,0.895,1.0,0.026,-1.113,0.021435,14,38,52
1432,MMP9_HUMAN,0.652,0.857,0.447,1.0,0.237,-0.956,0.096773,14,38,52
1459,PERM_HUMAN,0.602,0.929,0.316,1.0,0.079,-0.755,0.269724,14,38,52
4730,MVP_HUMAN,0.549,0.286,0.947,1.0,0.079,0.385,0.598845,14,38,52
2177,KPYM_HUMAN,0.545,0.429,0.789,1.0,0.0,-0.043,0.627817,14,38,52
380,CLIC1_HUMAN,0.526,0.5,0.737,1.0,0.0,0.004,0.780626,14,38,52
2395,TIMP2_HUMAN,0.519,0.357,0.842,1.0,0.053,0.084,0.844623,14,38,52


In [34]:
# Save the grade results to a CSV file (the row index is not written)
result_df_grade.to_csv('../data/results/cohort_7_3_MS/bmk_aucs/by_grade.csv', index=False)

## Using Type

In [35]:
# Encode Type as a binary target in df itself: 0 for NEEC, 1 for EEC.
# Rows without a Type value are dropped before computing the metrics.
df['Type'] = df['Type'].replace({'NEEC': 0, 'EEC': 1})
df_copy = df.dropna(subset=['Type'])
result_df_type = compute_auc_metrics(df_copy, BIOMARKERS, target_col='Type')

In [36]:
# Rank the biomarkers: largest number of quantified samples first, then highest AUC
result_df_type['n_pos+n_neg'] = result_df_type['n_pos'].add(result_df_type['n_neg'])
result_df_type = result_df_type.sort_values(by=['n_pos+n_neg', 'AUC'], ascending=False)
result_df_type.head(10)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
5109,T132A_HUMAN,0.916,0.829,0.909,0.951,0.727,2.212,2.8e-05,41,11,52
7396,FAM3C_HUMAN,0.891,0.659,1.0,0.976,0.636,1.305,8e-05,41,11,52
2245,CD44_HUMAN,0.885,0.78,0.909,0.976,0.545,1.365,0.000106,41,11,52
7397,CTL1_HUMAN,0.885,0.78,1.0,1.0,0.091,1.274,0.000106,41,11,52
858,GPC4_HUMAN,0.854,0.854,0.727,0.951,0.545,1.813,0.000367,41,11,52
324,PODXL_HUMAN,0.845,0.585,1.0,0.951,0.545,2.327,0.000515,41,11,52
8629,TPPP3_HUMAN,0.845,0.878,0.727,0.951,0.455,2.547,0.000515,41,11,52
1031,MATN2_HUMAN,0.843,0.707,0.909,0.976,0.364,1.822,0.000559,41,11,52
2001,CATH_HUMAN,0.84,0.634,0.909,0.951,0.455,1.633,0.000608,41,11,52
4535,CAYP1_HUMAN,0.838,0.732,0.909,0.951,0.364,2.505,0.00066,41,11,52


In [37]:
# Show only the biomarkers listed in MMK_BIOMARKERS, ordered by decreasing AUC
result_df_type[result_df_type['Biomarker'].isin(MMK_BIOMARKERS)].sort_values(by='AUC', ascending=False)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
1659,PIGR_HUMAN,0.749,0.488,0.909,0.951,0.364,1.951,0.012091,41,11,52
3807,NGAL_HUMAN,0.705,0.463,0.909,1.0,0.091,1.48,0.039269,41,11,52
1428,MMP9_HUMAN,0.634,0.537,0.727,1.0,0.091,0.932,0.178832,41,11,52
293,AGRIN_HUMAN,0.619,0.854,0.455,0.951,0.273,0.821,0.235023,41,11,52
1466,PERM_HUMAN,0.592,0.659,0.545,1.0,0.182,0.682,0.358281,41,11,52
4756,MVP_HUMAN,0.574,0.951,0.364,0.951,0.364,-0.541,0.459665,41,11,52
2270,TIMP2_HUMAN,0.563,0.61,0.636,0.951,0.091,-0.165,0.530418,41,11,52
2168,KPYM_HUMAN,0.554,1.0,0.182,1.0,0.182,-0.14,0.590754,41,11,52
1294,CLIC1_HUMAN,0.532,0.317,0.818,0.951,0.182,-0.127,0.753761,41,11,52


In [38]:
# Save the type results to a CSV file (the row index is not written)
result_df_type.to_csv('../data/results/cohort_7_3_MS/bmk_aucs/by_type.csv', index=False)

## Using Stage

In [39]:
# Encode Stage as a binary target in df itself: 0 for Early, 1 for Advanced.
# Rows without a Stage value are dropped before computing the metrics.
df['Stage'] = df['Stage'].replace({'Early': 0, 'Advanced': 1})
df_copy = df.dropna(subset=['Stage'])
result_df_stage = compute_auc_metrics(df_copy, BIOMARKERS, target_col='Stage')

In [40]:
# Rank the biomarkers: largest number of quantified samples first, then highest AUC
result_df_stage['n_pos+n_neg'] = result_df_stage['n_pos'].add(result_df_stage['n_neg'])
result_df_stage = result_df_stage.sort_values(by=['n_pos+n_neg', 'AUC'], ascending=False)
result_df_stage.head(10)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
2288,PSB5_HUMAN,0.826,1.0,0.529,1.0,0.529,0.982,0.001333,11,34,45
3287,SERPH_HUMAN,0.807,0.818,0.765,1.0,0.324,1.523,0.002495,11,34,45
2763,BASI_HUMAN,0.791,0.727,0.735,1.0,0.441,0.951,0.004163,11,34,45
5099,DRB5_HUMAN,0.789,0.727,0.853,1.0,0.206,-1.181,0.004524,11,34,45
9877,ADA2_HUMAN,0.786,0.909,0.706,1.0,0.029,-0.541,0.004913,11,34,45
1583,APOA_HUMAN,0.781,0.818,0.735,1.0,0.206,1.738,0.005782,11,34,45
4136,LG3BP_HUMAN,0.778,0.818,0.735,1.0,0.441,-1.116,0.006267,11,34,45
7554,NEO1_HUMAN,0.775,0.727,0.794,1.0,0.382,-0.599,0.006788,11,34,45
1930,P4HA1_HUMAN,0.77,0.545,0.912,1.0,0.294,1.406,0.007948,11,34,45
5870,CSPG4_HUMAN,0.77,0.909,0.676,1.0,0.353,0.873,0.007948,11,34,45


In [42]:
# Show only the biomarkers listed in MMK_BIOMARKERS, ordered by decreasing AUC
result_df_stage[result_df_stage['Biomarker'].isin(MMK_BIOMARKERS)].sort_values(by='AUC', ascending=False)

Unnamed: 0,Biomarker,AUC,Sensitivity (max sum),Specificity (max sum),Sensitivity (sens > 95%),Specificity (sens > 95%),Log2FC,p-value,n_pos,n_neg,n_pos+n_neg
3921,NGAL_HUMAN,0.663,0.545,0.794,1.0,0.206,-1.243,0.11008,11,34,45
1406,PIGR_HUMAN,0.652,0.727,0.588,1.0,0.206,-1.108,0.135649,11,34,45
368,AGRIN_HUMAN,0.639,0.364,0.912,1.0,0.029,-0.881,0.173786,11,34,45
2021,KPYM_HUMAN,0.591,0.636,0.647,1.0,0.0,0.134,0.376291,11,34,45
407,CLIC1_HUMAN,0.535,0.273,0.912,1.0,0.059,0.117,0.741301,11,34,45
2094,TIMP2_HUMAN,0.524,0.545,0.676,1.0,0.118,0.133,0.822377,11,34,45
4340,MVP_HUMAN,0.519,0.273,0.912,1.0,0.059,0.234,0.863699,11,34,45
1868,PERM_HUMAN,0.508,0.818,0.412,1.0,0.0,-0.211,0.947357,11,34,45
2051,MMP9_HUMAN,0.505,1.0,0.206,1.0,0.206,-0.32,0.9684,11,34,45


In [41]:
# Save the stage results to a CSV file (the row index is not written)
result_df_stage.to_csv('../data/results/cohort_7_3_MS/bmk_aucs/by_stage.csv', index=False)