# Cohort 215 AUCs analysis 

### Imports and environment setup

- Date of run: 2024-10-04
- Environment: python 3.12
- Packages required: pandas, numpy, sklearn, statsmodels, seaborn, matplotlib

In [1]:
# Make the project's code directory importable so the utils modules can be loaded.
import sys

CODE_DIR = '../code/'
# Guard the append so that re-running this cell does not add duplicate entries.
if CODE_DIR not in sys.path:
    sys.path.append(CODE_DIR)

In [2]:
# Library imports
import pandas as pd
from IPython.display import display, HTML
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np

# Utils imports
import cohort_analysis_utils as utils
import ms_utils

In [3]:
# Silence every warning so the rendered outputs stay readable
import warnings
warnings.simplefilter('ignore')

# Lift the pandas row limit so displayed frames are never truncated
pd.options.display.max_rows = None

# Data loading and preprocessing

For this analysis, we are going to work with the MS matrix — located at ["data/ms/MS_215_ratios.csv"](<../data/ms/MS_215_ratios.csv>), the file loaded in the next cell. So far we have no metadata for this dataset, only the peptide values and the pathology condition of the 215 samples.

In [4]:
# CSV file (path relative to this notebook) that is read as the cohort-215 MS matrix
MS_215_PATH = "../data/ms/MS_215_ratios.csv"

In [5]:
# Parse the matrix: the first two CSV rows form a two-level column header
# and the first CSV column becomes the row index.
df_215 = pd.read_csv(
    MS_215_PATH,
    index_col=0,
    header=[0, 1],
    sep=",",
)

In [6]:
# Flatten the two-level header into single labels of the form "<level0>_<level1>",
# skipping empty level names.
# The MultiIndex guard keeps the cell idempotent: on a second run the labels are
# already plain strings, and joining over them would insert "_" between every
# character of each label.
if isinstance(df_215.columns, pd.MultiIndex):
    df_215.columns = [
        "_".join(level for level in col if level != '').strip()
        for col in df_215.columns
    ]


In [7]:
# Display the flattened column labels to check the result of the header merge
df_215.columns

Index(['Unnamed: 1_level_0_DIAGNOSIS', 'AGR2_LPQTLSR', 'AGR2_LPQTLSR.1',
       'AGR2_LAEQFVLLNLVYETTDK', 'AGR2_LAEQFVLLNLVYETTDK.1',
       'AGRIN_VLGAPVPAFEGR', 'AGRIN_VLGAPVPAFEGR.1', 'AGRIN_LELGIGPGAATR',
       'AGRIN_LELGIGPGAATR.1', 'BCAM_AGAAGTAEATAR',
       ...
       'PRDX1_LVQAFQFTDK', 'PRDX1_LVQAFQFTDK.1', 'CAYP1_EAVIAAAFAK',
       'CAYP1_EAVIAAAFAK.1', 'CAYP1_SGDGVVTVDDLR', 'CAYP1_SGDGVVTVDDLR.1',
       'MIF_LLCGLLAER', 'MIF_LLCGLLAER.1', 'MIF_VYINYYDMNAANVGWNNSTFA',
       'MIF_VYINYYDMNAANVGWNNSTFA.1'],
      dtype='object', length=199)

In [8]:
# Rename the first column to "Pathology" and encode it as a binary label:
# 1 when the diagnosis string starts with "T", 0 otherwise.
df_215 = df_215.rename(columns={df_215.columns[0]: "Pathology"})

# Encode only while the column still holds the raw labels, so that re-running
# this cell does not call string methods on the already-encoded integers.
if not pd.api.types.is_numeric_dtype(df_215['Pathology']):
    # Fail with a clear message on missing diagnoses rather than with an
    # AttributeError raised from inside a per-row lambda.
    if df_215['Pathology'].isna().any():
        raise ValueError("Missing values found in the 'Pathology' column")
    df_215['Pathology'] = df_215['Pathology'].str.startswith("T").astype(int)

# Computing the models

In [9]:
# General parameters
RESULTS_PATH = '../data/results/215'  # passed as folder_name to the model computation
METHODS = ['direct']
BIOMARKERS_215 = df_215.columns[1:]   # every column after the first one ("Pathology")

In [10]:
# Compute the models for the configured methods over the cohort's biomarkers and
# save them under RESULTS_PATH (presumably one biomarker per model, given
# max_biomarker_count=1 — confirm against cohort_analysis_utils).
# NOTE(review): the recorded output of this cell ends with
# "ValueError: Only one class present in y_true", and the results file read in the
# following cell was not found afterwards — investigate before relying on the
# downstream cells.
models_215 = utils.compute_all_models_and_save(
    df=df_215,
    biomarkers=BIOMARKERS_215,
    target_col='Pathology',
    methods=METHODS,
    max_biomarker_count=1,
    folder_name=RESULTS_PATH,
    auc_threshold=0.0,
    plot_rocs=False,
    compute_auc_ci=True,
)

Could not fit the model for biomarkers: ['LAT1_SADGSAPAGEGEGVTLQR']
zero-size array to reduction operation maximum which has no identity


ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.

In [None]:
# Read back the results table written for the 'direct' method (max_1.csv) and
# preview its first ten rows.
df_215_results_direct_max_1 = pd.read_csv(
    f'{RESULTS_PATH}/direct/max_1.csv',
    header=0,
    index_col=0,
    sep=',',
)
df_215_results_direct_max_1.head(10)

FileNotFoundError: [Errno 2] No such file or directory: '../data/results/215/direct/max_1.csv'

In [None]:
# Plot AUCs with confidence intervals for the biomarkers in the first 20 rows
# of the results table.
BIOMARKERS_215_cis = df_215_results_direct_max_1.index[:20]
utils.plot_aucs_with_confidence_intervals(models_215, 'direct', BIOMARKERS_215_cis)

In [None]:
# _ = utils.compute_all_models_and_save(df=df_215, 
#                                         biomarkers=BIOMARKERS_215, 
#                                         target_col='Pathology', 
#                                         methods=METHODS, 
#                                         max_biomarker_count=3,
#                                         folder_name=RESULTS_PATH,
#                                         auc_threshold=0.0, 
#                                         plot_rocs=True)

In [None]:
# df_215_results_direct_max_3 = pd.read_csv(f'{RESULTS_PATH}/direct/max_3.csv', sep=',', index_col=0, header=0)
# df_215_results_direct_max_3.head(10)