# Cohort 38 AUCs analysis 

### Imports and environment setup

- Date of run: 2024-09-26
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib

In [1]:
# Make the project's code directory importable so the utils modules can be imported
# (the path is relative, so it is resolved against the current working directory)
import sys
sys.path.append('../code/')

In [2]:
# Library imports
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Utils imports
import cohort_analysis_utils as utils
import ms_utils

In [3]:
# Silence every warning category so the cell outputs stay readable
import warnings
warnings.simplefilter('ignore')

# Lift the pandas cap on displayed rows so DataFrames are shown in full
pd.options.display.max_rows = None

# Data loading and preprocessing

For this analysis, we are going to work with the MS matrix, located at ["data/ms/MS_verification_38.csv"](<../data/ms/MS_verification_38.csv>). So far we have no metadata for this dataset: only the peptide values and the pathology condition for each of the 38 samples.

In [4]:
# Location of the MS verification matrix for the 38-sample cohort (relative path)
MS_38_PATH = "../data/ms/MS_verification_38.csv"

In [5]:
# The file is parsed as tab-separated despite its .csv extension. The first two rows
# are read as a two-level column header and the first column becomes the row index.
df_38 = pd.read_csv(MS_38_PATH, sep="\t", header=[0,1], index_col=0)

In [11]:
# Flatten the two-level header: join the non-empty labels of each column with "_"
df_38.columns = [
    "_".join(label for label in col if label != '').strip()
    for col in df_38.columns.values
]


In [18]:
# Give the first column the name "Pathology", then hand the frame to the helper that
# makes it categorical, passing the C2 -> 0 / T1 -> 1 mapping
df_38 = utils.cols_as_category(
    df_38.rename(columns={df_38.columns[0]: "Pathology"}),
    {'Pathology': {'C2': 0, 'T1': 1}},
)

# Computing the models

In [24]:
# General parameters
RESULTS_PATH = '../data/results/38'  # folder the model results are written to and read back from
BIOMARKERS_38 = df_38.columns[1:]  # every column after the first one ("Pathology")
METHODS = ['direct']  # passed as `methods` to the model computation

In [32]:
# NOTE: this call is costly and is currently active, so it runs on every execution of
# the notebook. Its outputs go to RESULTS_PATH (passed as folder_name) and the result
# tables are read back from CSV afterwards, so it can be commented out once the
# results exist, to avoid recomputing them.

models_38 = utils.compute_all_models_and_save(df=df_38, 
                                              biomarkers=BIOMARKERS_38, 
                                              target_col='Pathology', 
                                              methods=METHODS, 
                                              max_biomarker_count=1,
                                              folder_name=RESULTS_PATH,
                                              auc_threshold=0.0, 
                                              plot_rocs=True)

In [31]:
# Load the results
df_38_results_direct_max_1 = pd.read_csv(f'{RESULTS_PATH}/direct/max_1.csv', sep=',', index_col=0, header=0)
df_38_results_direct_max_1[["AUC"]].head(10)

Unnamed: 0_level_0,AUC
Biomarker_1,Unnamed: 1_level_1
PERM_IANVFTNAFR,0.96667
PERM_VVLEGGIDPILR,0.95
CADH1_VFYSITGQGADTPPVGVFIIER,0.94444
SPIT1_SFVYGGCLGNK,0.93333
CADH1_NLVQIK,0.925
SPIT1_WYYDPTEQICK,0.925
ENOA_YISPDQLADLYK,0.91944
KPYM_NTGIICTIGPASR,0.91389
CASP3_SGTDVDAANLR,0.91389
MMP9_SLGPALLLLQK,0.91389


In [34]:
# Embed the saved ROC image for PERM_IANVFTNAFR in the notebook output
roc_image_path = f"{RESULTS_PATH}/direct/max_1/rocs/PERM_IANVFTNAFR.png"
display(HTML(f"<img src='{roc_image_path}'>"))

In [35]:
# Load the table stored next to the ROC image for PERM_IANVFTNAFR, indexed by its first column
roc_csv_path = f"{RESULTS_PATH}/direct/max_1/rocs/PERM_IANVFTNAFR.csv"
roc_df = pd.read_csv(
    roc_csv_path,
    index_col=0,  # comma separator and first-row header are the pandas defaults
)
roc_df

Unnamed: 0_level_0,Sensitivity,Specificity,NPV,PPV
Threshold,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
inf,0.0,1.0,0.473684,0.0
0.999554,0.05,1.0,0.486486,1.0
0.78899,0.75,1.0,0.782609,1.0
0.78462,0.75,0.944444,0.772727,0.9375
0.780393,0.8,0.944444,0.809524,0.941176
0.77781,0.8,0.888889,0.8,0.888889
0.699762,0.95,0.888889,0.941176,0.904762
0.595593,0.95,0.722222,0.928571,0.791667
0.592341,1.0,0.722222,1.0,0.8
0.578491,1.0,0.666667,1.0,0.769231
