## Imports

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

module_path = str(Path("../src/data").resolve())
if module_path not in sys.path:
    sys.path.append(module_path)

import preprocessing as prep
import feature_selection as fs
import model_fit as mf

project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))

In [67]:
import importlib
importlib.reload(prep)


<module 'preprocessing' from '/home/lestrada/tumor_type_prediction/src/data/preprocessing.py'>

## Data Import

In [84]:
#Proteins quantification intensities file
processed_data = '2024.10.23_CJ_pancancer_250/'
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp_with_ref.csv'
intensity_path_file = folder_path + processed_data + PREPROCESSED_FP_INTENSITY
input_quantifications = pd.read_csv(intensity_path_file)

#--------------------------------------------------------------------------------

# Proteins quantification z-scores file
preprocessed_fp_z_scores = 'full_proteome_measures_z.tsv'
z_scores_path_file = folder_path + processed_data + preprocessed_fp_z_scores
df_z_scores = pd.read_csv(z_scores_path_file, sep='\t')


#--------------------------------------------------------------------------------

#Samples metadata (oncotree classification) file.
metadata_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/'
metadata_file = 'METADATA_PAN_CANCER_Batch300.xlsx'
the_metadata_file = metadata_path + metadata_file
input_metadata = pd.read_excel(the_metadata_file)



In [86]:
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
output_dir = os.path.join(project_root,'tumor_type_prediction', 'data', 'data_output')
output_dir

'/home/lestrada/tumor_type_prediction/tumor_type_prediction/data/data_output'

## Data Preprocessing

In [3]:
# Protein and peptides quantification intensities post-processing
input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])

peptides_quant_info = prep.post_process_meta_intensities(input_quantifications.iloc[:,int(input_quantifications.shape[1]/2):].T ) #clean dataframe from regex characers
proteins_quant = input_quantifications.iloc[:,:int(input_quantifications.shape[1]/2)].T #subset protein measurements from dataset

#Imputation
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant) #Imputation of missing values in protein intensities using normal distribution down-shift method
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns[na_columns].index.tolist()
print("Proteins with  empty values:", na_columns_true)

#Cleaning sample names
prot_quant_imputed.reset_index(inplace=True)
prot_quant_imputed.rename(columns={'index': 'Sample name'}, inplace=True)
prot_quant_imputed['Sample name'] = prot_quant_imputed['Sample name'].str.replace('pat_', '')

#Dataset with protein intensities and metadata
samples_metadata = input_metadata[["Sample name", "code_oncotree",]] #sample metadata e.g. class, TCC, tissue of origin, etc.
initial_df = samples_metadata.merge(prot_quant_imputed, left_on='Sample name', right_on='Sample name')

#Peptides quantification to binary dataset
peptides_df_binary = pd.DataFrame(
    np.where(peptides_quant_info > 1, 1, 0), #if the # of peptides > 1, then turns to 1, otherwise 0. 
    index=peptides_quant_info.index,
    columns=peptides_quant_info.columns  
)
peptides_df_binary.reset_index(inplace=True) #Moves the index to a column. Allows to obtain patient id
peptides_df_binary.replace('Identification metadata ','',regex=True, inplace=True) #Removes text from id's
peptides_df_binary = samples_metadata.merge(peptides_df_binary, left_on='Sample name', right_on='index') #merging both data sets by Sample Name, ontaining a dataset with sample, classification and peptide binary count
peptides_df_binary.drop('index', axis=1, inplace=True)

peptides_df_binary

(2135, 13017)


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  temp_mean = np.nanmean(temp)


Proteins with  empty values: ['PTGER4', 'CD19', 'FOXO4', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']


Unnamed: 0,Sample name,code_oncotree,MSH6,PCLAF,UTP18,SEC16A,IPO7,EIF3L,RPAP3,INTS3,...,ROPN1L,CARD10,ZNF804A,ZNF503,HHEX,STK40,FAM214A,WNT10B,VMO1,CCDC152
0,H021-3RLVZS-T1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,H021-VFM3B1-T1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,H021-3RLVZS-T1-Q1-R2,AASTR,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,H021-XBLS3R-M1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,H021-M2MSRE-M1-Q1,ACBC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,H021-25HCGP-M2-Q1,VMM,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1656,H021-VYS51F-M1-Q1,VSC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1657,H021-1B7R18-M1-Q1,VSC,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1658,H021-FUFZFT-T1-Q1,VSC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [80]:
# Transforming Z-scores file to obtain values and info
z_scores_df = df_z_scores.transpose(copy=True) 
z_scores_df = z_scores_df.reset_index()
z_scores_df = z_scores_df.replace('zscore_','', regex=True) 
z_scores_df.rename(columns = z_scores_df.iloc[0], inplace=True)
z_scores_df.drop(axis=0, index=0, inplace=True)
z_scores_df['Gene names'] = z_scores_df['Gene names'].str.replace('pat_', '')
z_scores_df = z_scores_df.set_index('Gene names') 

z_scores_imputed = prep.impute_normal_down_shift_distribution(z_scores_df)
z_scores_imputed.reset_index(inplace=True)
z_scores_imputed.rename(columns={'Gene names': 'Sample name'}, inplace=True)

z_scores_initial_df = samples_metadata.merge(z_scores_imputed, left_on='Sample name', right_on='Sample name')
z_scores_initial_df

(1667, 13017)


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  temp_sd = np.nanstd(temp)


Unnamed: 0,Sample name,code_oncotree,MSH6,PCLAF,UTP18,SEC16A,IPO7,EIF3L,RPAP3,INTS3,...,ROPN1L,CARD10,ZNF804A,ZNF503,HHEX,STK40,FAM214A,WNT10B,VMO1,CCDC152
0,H021-3RLVZS-T1-Q1,AASTR,-1.539061,-1.237403,-1.857498,-1.239718,-0.457340,-0.716951,-0.751463,-1.595253,...,0.831971,-2.124029,-2.222392,-2.271782,-1.123408,-1.988682,0.085773,-2.217128,-1.350153,1.297413
1,H021-VFM3B1-T1-Q1,AASTR,-0.647630,-2.067226,-0.568514,-0.668665,1.527020,-0.184709,0.464970,-0.776060,...,-0.110166,-1.946678,-1.731386,-2.714793,-1.691071,-1.805054,0.309919,-2.145422,-1.232119,2.146928
2,H021-3RLVZS-T1-Q1-R2,AASTR,-1.858234,-1.541744,-2.353572,-1.408264,-0.410396,-1.626688,-1.615083,-1.828735,...,1.467501,-2.030740,0.079840,-2.385696,-1.866041,-2.441151,0.841328,-2.051718,-0.819176,2.050525
3,H021-XBLS3R-M1-Q1,AASTR,-3.592338,0.166180,-1.593462,-0.487806,0.189687,-1.370391,-0.659636,-1.561716,...,0.705113,-2.153836,-1.862338,-1.692220,-1.584527,-1.933245,-0.535493,-2.223340,-1.455259,1.062251
4,H021-M2MSRE-M1-Q1,ACBC,0.068480,0.411428,1.072526,-0.243170,-1.609448,-1.803969,-0.323886,0.369503,...,-1.984753,-2.050096,-2.315219,-2.509352,-1.893514,1.841045,-1.906512,-2.311553,-1.485464,-1.030180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,H021-25HCGP-M2-Q1,VMM,0.274847,0.427466,0.497179,-0.544787,-0.309454,0.336908,0.006479,0.388220,...,1.056990,-1.262168,-1.658615,-2.917329,-1.850001,-2.012641,-2.293452,-2.358545,-1.336805,-1.705622
1656,H021-VYS51F-M1-Q1,VSC,-0.540518,0.084253,0.142285,0.773620,0.511044,1.040700,0.523064,0.105094,...,1.573210,-1.320724,-1.716318,-2.027592,-1.475648,-1.606586,-1.729860,-2.774147,-1.479486,1.438808
1657,H021-1B7R18-M1-Q1,VSC,-0.523526,-1.822627,-0.221382,-2.251962,0.148048,-0.427163,-3.937086,-1.901229,...,-0.715383,-2.072916,-2.241513,-2.272900,-2.239329,-1.788503,-1.080520,-2.149362,-1.613503,0.513564
1658,H021-FUFZFT-T1-Q1,VSC,0.123631,0.501403,0.753356,1.073038,2.083629,0.797588,0.799687,0.510565,...,0.881698,-0.564836,-2.201696,-2.701125,-1.952934,-2.021265,-1.456877,-2.886111,-1.362231,0.096910


## Data Set Split

In [5]:
#Removing samples not part of the Oncotree classification
NOS_cases = initial_df[initial_df['code_oncotree'].str.endswith('NOS', na=False)]['code_oncotree'].unique().tolist()
other_cases = ['missing']
cases_to_remove = NOS_cases + other_cases
ml_initial_df = prep.remove_class(initial_df, cases_to_remove, 'code_oncotree')

# Splitting dataset into training and held-out sets1
training_df, held_out_df = prep.data_split(ml_initial_df, split_size=0.25, classified_by='code_oncotree', export=False)

#Z_scores dataset
z_scores_training_df = z_scores_initial_df.iloc[training_df.index]

Removed samples: 203
Remaining samples: 1457
Classes with only one sample: 68
Training set samples: 1109
Held-out set samples: 348


# Class Specific Worflow

In [6]:
#Set Classification Parameters
target_class = ['CHDM'] 
classified_by = 'code_oncotree'
samples_column = 'Sample name'


In [7]:

#Obtaining high confidence proteins by peptides
entity_proteins_by_peptides = fs.get_high_confidence_proteins(peptides_df_binary, target_class, classified_by, threshold=0.7)

# Binary labeling for specific class classification 
entity_training_df = fs.binary_labeling(training_df, classified_by=classified_by, true_class=target_class)
entity_ho_df = fs.binary_labeling(held_out_df, classified_by=classified_by, true_class=target_class)

entity_z_scores_train_df = fs.binary_labeling(z_scores_training_df, classified_by=classified_by, true_class=target_class)


# 1st Filter - Filtering entity training and held-out dataframes by proteins with peptides
entity_training_df = entity_training_df.filter(items=[samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides)
entity_ho_df = entity_ho_df.filter(items=[samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides)

entity_z_scores_train_df = entity_z_scores_train_df.filter(items=[samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides)

 5721 proteins identified in 70.0% of ['CHDM'] samples

Number of samples per class:
Classifier
0    1030
1      79
Name: count, dtype: int64


Number of samples per class:
Classifier
0    321
1     27
Name: count, dtype: int64


Number of samples per class:
Classifier
0    1030
1      79
Name: count, dtype: int64



## Feature Selection

### Hyperparametes for ElasticNet
Calculated in Z-scores

In [8]:
entity_cv_results, entity_best_params, entity_best_score, entity_grid_search_obj = fs.hparameter_grid_search(entity_z_scores_train_df, 4, l1_ratio_list=[0.7,0.5], C_list=[0.1,1], classified_by='code_oncotree')

Exception ignored in: <function ResourceTracker.__del__ at 0x7ff016f8dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7be6d2985ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/

Grid search completed in 1110.92 seconds
Best parameters: {'C': 1, 'l1_ratio': 0.5, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best score: 0.9586090511280881


### Feature Selection by ElasticNet Cross-Validation

In [9]:
entity_cross_val_coeffs = fs.elnet_wrapper(entity_z_scores_train_df,classified_by=classified_by, tumor_type_name='CHDM', l1_ratio=entity_best_params['l1_ratio'], C=entity_best_params['C'], n_splits=4, n_repeats=25,n_jobs=16, export=True)

Running Logistic Regression: 100%|██████████| 25/25 [00:01<00:00, 15.33iteration/s]
Exception ignored in: <function ResourceTracker.__del__ at 0x777b9158dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7ef38ff89ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py"

DataFrame exported to: /home/lestrada/tumor_type_prediction/data/data_output/CHDM_coefficients.xlsx


In [10]:
entity_stats_try, entity_proteins = fs.statistic_from_coefficients(entity_cross_val_coeffs, target_class)

With  1923  folds, the following statistics were obtained, from feature selection:
• Mean MCC score: 0.9577 ± 0.0279

----------------------------------------
• Top 3 proteins with highest coefficients:
             mean       std  Freq  Wald Chi-Square   p-value_corrected  \
SCARA5   0.495922  0.027767   1.0       318.982815                 0.0   
OLFML2A  0.380725  0.030626   1.0       154.539331                 0.0   
PDE1A    0.366939  0.038534   1.0        90.678292                 0.0   

         Significant  
SCARA5           1.0  
OLFML2A          1.0  
PDE1A            1.0  

----------------------------------------
• List of significant proteins: ['SCARA5', 'OLFML2A', 'PDE1A', 'SUSD5', 'XYLT1', 'TRIL', 'KRT80', 'ITGBL1', 'CHST3', 'MGARP', 'AKR1B10', 'TLR3', 'GALNT3', 'COL2A1', 'LYST', 'TUBA8', 'CDK18', 'ABLIM3', 'TCAF2', 'LGALSL', 'SLPI', 'MGAT5', 'PON3', 'MUC1', 'ABCG2', 'HAPLN1', 'TUBB1', 'CRISP3', 'ZNRD2', 'RAB3B', 'KRT8', 'CD109', 'PPM1H', 'ETFDH', 'DCTN3', 'PODXL', 'EPP

## Model Fitting
Calculated on intensities

### Reshaping dataset for training and test

In [11]:
entity_training_fs = fs.reshape_df_for_fitting(entity_training_df, entity_proteins)
entity_test_fs = fs.reshape_df_for_fitting(entity_ho_df, entity_proteins)


### Hyperparameter Selection for Logistic Regression

In [12]:
entity_to_nest_cv_results = mf.wrapper_nested_cv(entity_training_fs, random_state_tries=5, n_splits=3, classified_by=classified_by)
entity_to_nest_hp = mf.nested_cv_hparameters_selection(entity_to_nest_cv_results)

• Running for random_state=0
1 Inner fold best parameter={'C': 1}, Score=0.9398, Outer Validation MCC Score: 0.9580

2 Inner fold best parameter={'C': 10}, Score=0.9276, Outer Validation MCC Score: 0.9445

3 Inner fold best parameter={'C': 1}, Score=0.9408, Outer Validation MCC Score: 0.9208

Average MCC across all outer folds: 0.9411

--------------------------------------------------
• Running for random_state=1
1 Inner fold best parameter={'C': 0.1}, Score=0.9409, Outer Validation MCC Score: 0.9255

2 Inner fold best parameter={'C': 1}, Score=0.9494, Outer Validation MCC Score: 0.8986

3 Inner fold best parameter={'C': 0.1}, Score=0.9442, Outer Validation MCC Score: 0.9392

Average MCC across all outer folds: 0.9211

--------------------------------------------------
• Running for random_state=2
1 Inner fold best parameter={'C': 0.1}, Score=0.9403, Outer Validation MCC Score: 0.9034

2 Inner fold best parameter={'C': 0.1}, Score=0.9423, Outer Validation MCC Score: 0.9595

3 Inner fo

In [13]:
hyperparameter_C = pd.DataFrame(entity_to_nest_hp).T.sort_values(by='avg', ascending=False).index.tolist()[0]

In [26]:
hyperparameter_C

10.0

### Model Fit

In [14]:
entity_log_reg_model = mf.logistic_regression_ridge( entity_training_fs, hyperparameter_C, target_class, classified_by=classified_by) 

Model saved as CHDM_log_reg_ridge_model.pkl


In [15]:
entity_coefficients, entity_train_probabilities, entity_test_probabilities = mf.logistic_regression_results(entity_log_reg_model, entity_training_fs, entity_ho_df, target_class, classified_by=classified_by)

# of Iterations: [2333]
MCC train: 0.9932478352032398
MCC test: 0.9797818097694013
F1 Positive: 0.9811320754716981


In [16]:
test_entity_scores = mf.classification_scores(entity_test_probabilities)

------------------------------------
•General Scores:
MCC Score: 0.9797818097694013
F1 Macro: 0.9897884327591773
F1 Micro: 0.9971264367816092
F1 Entity Score: 0.9811320754716981

------------------------------------
•Confusion Matrix:
  TN | FP
[[321   0]
 [  1  26]]
  FN | TP

------------------------------------
•False Positives:
No False Positives detected.

------------------------------------
•False Negatives:
          Sample name code_oncotree  Classifier  Probability  Predicted
97  H021-YQWZ88-M3-Q1          CHDM           1     0.044115        0.0


Exception ignored in: <function ResourceTracker.__del__ at 0x77ab9fd8dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x76e9f1f8dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/

In [23]:
from entity_model_settings import *

In [25]:
SAMPLES_COLUMN

'Sample name'