## Imports

In [1]:
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

module_path = str(Path("../src/data").resolve())
if module_path not in sys.path:
    sys.path.append(module_path)

import preprocessing as prep
import feature_selection as fs
import model_fit as mf
import graphs as grph



  from .autonotebook import tqdm as notebook_tqdm


In [3]:
import importlib
importlib.reload(grph)


<module 'graphs' from '/home/lestrada/tumor_type_prediction/src/data/graphs.py'>

## Data Import

In [6]:
#Proteins quantification intensities file
processed_data = '2025.07.14_CJ_pancancer_334/'
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp.csv'
intensity_path_file = folder_path + processed_data + PREPROCESSED_FP_INTENSITY
input_quantifications = pd.read_csv(intensity_path_file)

#--------------------------------------------------------------------------------

# Proteins quantification z-scores file
preprocessed_fp_z_scores = 'full_proteome_measures_z.tsv'
z_scores_path_file = folder_path + processed_data + preprocessed_fp_z_scores
df_z_scores = pd.read_csv(z_scores_path_file, sep='\t')


#--------------------------------------------------------------------------------

#Samples metadata (oncotree classification) file.
METADATA_PATH = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/LE_PROdict/paper_freeze_versions_22_08/'
metadata_file = 'METADATA_PANCANCER_PAPER_final.xlsx'
the_metadata_file = metadata_path + metadata_file
input_metadata = pd.read_excel(the_metadata_file,
                                        usecols=['Sample name', 'code_oncotree', 'Tumor cell content', 'TCC_Bioinfo', 'TCC GROUP'],
                                        dtype={'Sample name': 'string', 'code_oncotree': 'string', 'Tumor cell content': 'float64', 'TCC_Bioinfo': 'float64', 'TCC GROUP': 'string'},
                                        na_values=['', 'NA', 'NaN', 'nan', 'N/A', 'n/a', 'None', 'TBD', 'notavailable', 'missing'])



In [151]:
project_root = os.path.abspath(os.path.join(os.getcwd(), '..'))
output_dir = os.path.join(project_root, 'data', 'data_output', 'Notebook_output')
os.makedirs(output_dir, exist_ok=True)


## Data Preprocessing

In [7]:
# Protein and peptides quantification intensities post-processing
input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])
peptides_quant_info = prep.post_process_meta_intensities(input_quantifications.iloc[:,int(input_quantifications.shape[1]/2):].T ) #clean dataframe from regex characers
proteins_quant = input_quantifications.iloc[:,:int(input_quantifications.shape[1]/2)].T #subset protein measurements from dataset

#Imputation
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant) #Imputation of missing values in protein intensities using normal distribution down-shift method
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns[na_columns].index.tolist()
print("Proteins with  empty values:", na_columns_true)

#Cleaning sample names
prot_quant_imputed.reset_index(inplace=True)
prot_quant_imputed.rename(columns={'index': 'Sample name'}, inplace=True)
prot_quant_imputed['Sample name'] = prot_quant_imputed['Sample name'].str.replace('pat_', '')

#Dataset with protein intensities and metadata
input_metadata['TCC'] = input_metadata['TCC_Bioinfo'].fillna(input_metadata['Tumor cell content'])
samples_metadata = input_metadata[["Sample name", "code_oncotree", 'TCC', "TCC GROUP"]] #sample metadata e.g. class, TCC, tissue of origin, etc.
initial_df = samples_metadata.merge(prot_quant_imputed, left_on='Sample name', right_on='Sample name')

#Peptides quantification to binary dataset
peptides_df_binary = pd.DataFrame(
    np.where(peptides_quant_info > 1, 1, 0), #if the # of peptides > 1, then turns to 1, otherwise 0. 
    index=peptides_quant_info.index,
    columns=peptides_quant_info.columns  
)
peptides_df_binary.reset_index(inplace=True) #Moves the index to a column. Allows to obtain patient id
peptides_df_binary.replace('Identification metadata ','',regex=True, inplace=True) #Removes text from id's
peptides_df_binary = samples_metadata.merge(peptides_df_binary, left_on='Sample name', right_on='index') #merging both data sets by Sample Name, ontaining a dataset with sample, classification and peptide binary count
peptides_df_binary.drop('index', axis=1, inplace=True)


(2567, 13063)
Proteins with  empty values: ['ENPP7', 'SHOX2', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']


In [8]:
# Transforming Z-scores file to obtain values and info
z_scores_df = df_z_scores.transpose(copy=True) 
z_scores_df = z_scores_df.reset_index()
z_scores_df = z_scores_df.replace('zscore_','', regex=True) 
z_scores_df.rename(columns = z_scores_df.iloc[0], inplace=True)
z_scores_df.drop(axis=0, index=0, inplace=True)
z_scores_df['Gene names'] = z_scores_df['Gene names'].str.replace('pat_', '')
z_scores_df = z_scores_df.set_index('Gene names') 

z_scores_imputed = prep.impute_normal_down_shift_distribution(z_scores_df)
z_scores_imputed.reset_index(inplace=True)
z_scores_imputed.rename(columns={'Gene names': 'Sample name'}, inplace=True)

z_scores_initial_df = samples_metadata.merge(z_scores_imputed, left_on='Sample name', right_on='Sample name')
z_scores_initial_df

(1987, 13063)


Unnamed: 0,Sample name,code_oncotree,TCC,TCC GROUP,FN1,HMBS,CDKN1B,DENR,CTNNA2,INA,...,TMPRSS11F,IL34,FREM3,LRRC2,ZP4,TRIML2,TBX19,DNAI3,CSF3R,MSANTD3
0,A26K-5SXQR3-T11-Q1,BRCA,30.0,low,0.419207,0.064605,-1.517050,0.992052,0.530773,-0.405181,...,-1.721890,-2.551214,-1.592601,-1.298179,-1.945411,-3.737643,-3.227579,-1.365518,0.996891,-1.534333
1,A26K-5SXQR3-T24-Q1,BRCA,42.0,intermediate,0.241614,0.607330,-1.941840,0.922715,0.507636,0.110441,...,-1.524969,-2.780512,-1.904598,-2.136195,-1.721556,-2.674478,-2.634187,-1.435020,1.019020,-1.744125
2,A26K-9TET1N-T11-Q1,BRCA,52.0,intermediate,0.182117,-0.452370,0.494543,0.321400,-2.411723,0.711820,...,-1.724645,-3.301664,-0.902085,-1.714741,-2.095086,-3.001014,-3.006426,-1.800624,-1.810894,-1.757925
3,A26K-ADUQXR-T11-Q1,BRCA,53.0,intermediate,1.249130,-0.082325,0.535330,0.671047,-1.937163,1.283470,...,-1.257435,-3.376623,-1.515989,-1.919593,-1.937873,-3.606769,-3.984241,-1.650024,0.173736,-1.607774
4,A26K-HS3BDB-T14-Q1,BRCA,80.0,high,0.332858,-0.019439,-0.109984,-1.077440,-1.417420,-0.026247,...,-1.689900,-3.364729,-2.308590,-2.014349,-1.631705,-2.809192,-2.875324,-1.701577,-2.059592,-1.651387
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1938,S033-33+029-T1-Q1,CHDM,70.0,high,1.489240,0.753119,-0.088520,0.273985,0.911104,1.047210,...,-1.917155,-3.008746,-1.383428,-1.689841,-2.338460,-2.520990,-3.945086,-1.201763,2.061710,-1.596457
1939,S033-33+030-T1-Q1,CHDM,80.0,high,0.464197,0.232156,0.113310,0.892083,-1.400324,-1.039850,...,-1.812466,-3.222541,-1.380890,-1.251379,-1.980979,-3.255978,-3.107019,-1.971122,-0.264134,-1.608001
1940,S033-33+035-T1-Q1,CHDM,,,0.109016,-0.870740,-0.866849,0.309826,0.327108,2.762510,...,-1.971065,-2.843725,-0.219780,-1.752959,-1.898055,-3.027615,-3.732931,-1.851137,0.325024,-1.436321
1941,S033-33+036-T1-Q1,CHDM,,,1.311050,-0.652790,-0.035056,1.623510,0.349962,-0.129080,...,-1.382845,-4.168036,0.297935,-1.943607,-2.466811,-3.918682,-3.185430,-2.132708,0.328319,-1.416544


In [9]:
z_scores_initial_df.shape

(1943, 13067)

In [13]:
initial_df['code_oncotree'].value_counts()

code_oncotree
BRCA      231
CUPNOS    111
CHDM       97
SYNS       79
LMS        69
         ... 
TSTAD       1
MPT         1
PCM         1
SCSRMS      1
UCEC        1
Name: count, Length: 195, dtype: Int64

In [14]:
z_scores_initial_df['code_oncotree'].value_counts()

code_oncotree
BRCA      231
CUPNOS    111
CHDM       97
SYNS       79
LMS        69
         ... 
TSTAD       1
MPT         1
PCM         1
SCSRMS      1
UCEC        1
Name: count, Length: 195, dtype: Int64

## Data Set Split

In [153]:
#Removing samples not part of the Oncotree classification
NOS_cases = initial_df[initial_df['code_oncotree'].str.endswith('NOS', na=False)]['code_oncotree'].unique().tolist()
other_cases = ['missing']
cases_to_remove = NOS_cases + other_cases

ml_initial_df = (
    initial_df
    .pipe(prep.remove_class, cases_to_remove, 'code_oncotree')
    .pipe(prep.remove_class, ['very low', 'missing'], 'TCC GROUP')
    .loc[lambda df: df['TCC GROUP'].notna()]
)

# Splitting dataset into training and held-out sets1
training_df, held_out_df = prep.data_split(ml_initial_df, split_size=0.25, classified_by='code_oncotree', export=False)

#Z_scores dataset
z_scores_training_df = z_scores_initial_df.iloc[training_df.index]

Removed samples: 204
Remaining samples: 1739
Removed samples: 168
Remaining samples: 1571
Classes with only one sample: 67
Training set samples: 1184
Held-out set samples: 373


In [154]:
test_samples_def = pd.DataFrame(ml_initial_df['code_oncotree'].value_counts())

In [159]:
test_samples_def[test_samples_def['count']>8]

Unnamed: 0_level_0,count
code_oncotree,Unnamed: 1_level_1
BRCA,200
CHDM,84
SYNS,71
LMS,65
ACYC,59
SFT,50
MFH,44
ES,43
ARMS,43
ACC,37


In [None]:
test_samples_def[test_samples_def['count']>3]

Unnamed: 0_level_0,count
code_oncotree,Unnamed: 1_level_1
BRCA,127
CHDM,79
SYNS,58
LMS,49
ARMS,44
MFH,39
ES,38
SFT,35
ERMS,29
ACC,28


In [None]:
test_samples_def[test_samples_def['count']>3]

Unnamed: 0_level_0,count
code_oncotree,Unnamed: 1_level_1
BRCA,32
CHDM,20
SYNS,15
LMS,12
ARMS,11
ES,10
MFH,10
SFT,9
ACC,7
OS,7


In [None]:
print(ml_initial_df.shape)
print(training_df.shape)
print(held_out_df.shape)

(1193, 13078)
(911, 13079)
(282, 13079)


# Class Specific Worflow

In [52]:
#Set Classification Parameters
target_class = ['CHDM'] 
classified_by = 'code_oncotree'
samples_column = 'Sample name'


In [53]:

#Obtaining high confidence proteins by peptides
entity_proteins_by_peptides = fs.get_high_confidence_proteins(peptides_df_binary, target_class, classified_by, threshold=0.7)

# Binary labeling for specific class classification 
entity_training_df = fs.binary_labeling(training_df, classified_by=classified_by, true_class=target_class)
entity_ho_df = fs.binary_labeling(held_out_df, classified_by=classified_by, true_class=target_class)

entity_z_scores_train_df = fs.binary_labeling(z_scores_training_df, classified_by=classified_by, true_class=target_class)


# 1st Filter - Filtering entity training and held-out dataframes by proteins with peptides
entity_training_df = entity_training_df.filter(items=[samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides)
entity_ho_df = entity_ho_df.filter(items=[samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides)

entity_z_scores_train_df = entity_z_scores_train_df.filter(items=[samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides)

 5838 proteins identified in 70.0% of ['CHDM'] samples

Number of samples per class:
Classifier
0    852
1     59
Name: count, dtype: int64


Number of samples per class:
Classifier
0    262
1     20
Name: count, dtype: int64


Number of samples per class:
Classifier
0    859
1     52
Name: count, dtype: int64



## Feature Selection

### Hyperparametes for ElasticNet
Calculated in Z-scores

In [73]:
entity_cv_results, entity_best_params, entity_best_score, entity_grid_search_obj = fs.hparameter_grid_search(entity_z_scores_train_df.iloc[:,:30], 4, l1_ratio_list=[0.7,0.5], C_list=[0.1,1], classified_by='code_oncotree')

Grid search completed in 2.36 seconds
Best parameters: {'C': 1, 'l1_ratio': 0.5, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best score: 0.8870251743579333


### Feature Selection by ElasticNet Cross-Validation

In [74]:
entity_cross_val_coeffs = fs.elnet_wrapper(entity_z_scores_train_df.iloc[:,:30],classified_by=classified_by, tumor_type_name='CHDM', l1_ratio=entity_best_params['l1_ratio'], C=entity_best_params['C'], n_splits=4, n_repeats=25,n_jobs=16, export=True)

Running Logistic Regression:   0%|          | 0/25 [00:00<?, ?iteration/s]Exception ignored in: <function ResourceTracker.__del__ at 0x71d40909e020>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7618c419e020>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91,

DataFrame exported to: /home/lestrada/tumor_type_prediction/notebooks/data/data_output/BRCA_250817_1230_results/feature_selection/CHDM_coefficients.xlsx


In [None]:
entity_stats_try, entity_proteins = fs.statistic_from_coefficients(entity_cross_val_coeffs, target_class)

With  26  folds, the following statistics were obtained, from feature selection:
• Mean MCC score: 0.8063 ± 0.0667

----------------------------------------
• Top 3 proteins with highest coefficients:
           mean       std  Freq  Wald Chi-Square   p-value_corrected  \
KRT19  2.018405  0.531257   1.0        14.434680            0.000409   
SCRN1  1.649494  0.571543   1.0         8.329203            0.009303   
TBL2   0.817797  0.439555   1.0         3.461513            0.108178   

       Significant  
KRT19          1.0  
SCRN1          1.0  
TBL2           0.0  

----------------------------------------
• List of significant proteins: ['KRT19', 'SCRN1', 'VCP', 'HSD17B4', 'VDAC2', 'PHGDH', 'SEPTIN9', 'SNRNP70', 'CPSF7']
• Number of significant proteins: 9

----------------------------------------
★ The mean MCC score is above 0.8, indicating reliable model performance. ★


## Model Fitting
Calculated on intensities

### Reshaping dataset for training and test

In [82]:
entity_training_fs = fs.reshape_df_for_fitting(entity_z_scores_train_df.iloc[:,:30], entity_proteins)
entity_test_fs = fs.reshape_df_for_fitting(entity_ho_df.iloc[:,:30], entity_proteins)


### Hyperparameter Selection for Logistic Regression

In [83]:
entity_to_nest_cv_results = mf.wrapper_nested_cv(entity_training_fs, random_state_tries=5, n_splits=3, classified_by=classified_by)
entity_to_nest_hp = mf.nested_cv_hparameters_selection(entity_to_nest_cv_results)

• Running for random_state=0
1 Inner fold best parameter={'C': 1}, Score=0.7406, Outer Validation MCC Score: 0.7795

2 Inner fold best parameter={'C': 10}, Score=0.9271, Outer Validation MCC Score: 0.7047

3 Inner fold best parameter={'C': 10}, Score=0.7543, Outer Validation MCC Score: 0.7641

Average MCC across all outer folds: 0.7494

--------------------------------------------------
• Running for random_state=1
1 Inner fold best parameter={'C': 0.1}, Score=0.7743, Outer Validation MCC Score: 0.7222

2 Inner fold best parameter={'C': 10}, Score=0.8194, Outer Validation MCC Score: 0.7114

3 Inner fold best parameter={'C': 10}, Score=0.8049, Outer Validation MCC Score: 0.8834

Average MCC across all outer folds: 0.7724

--------------------------------------------------
• Running for random_state=2
1 Inner fold best parameter={'C': 1}, Score=0.7637, Outer Validation MCC Score: 0.7222

2 Inner fold best parameter={'C': 1}, Score=0.7455, Outer Validation MCC Score: 0.7189

3 Inner fold 

In [84]:
hyperparameter_C = pd.DataFrame(entity_to_nest_hp).T.sort_values(by='avg', ascending=False).index.tolist()[0]

In [85]:
hyperparameter_C

10.0

### Model Fit

In [86]:
entity_log_reg_model = mf.logistic_regression_ridge( entity_training_fs, hyperparameter_C, target_class, classified_by=classified_by) 

Model saved as CHDM_log_reg_ridge_model.pkl


In [87]:
entity_coefficients, entity_train_probabilities, entity_test_probabilities = mf.logistic_regression_results(entity_log_reg_model, entity_training_fs, entity_ho_df, target_class, classified_by=classified_by)

# of Iterations: [115]
MCC train: 0.77096848314381
MCC test: 0.0
F1 Positive: 0.0


In [88]:
test_entity_scores = mf.classification_scores(entity_test_probabilities)

------------------------------------
•General Scores:
MCC Score: 0.0
F1 Macro: 0.48161764705882354
F1 Micro: 0.9290780141843972
F1 Entity Score: 0.0

------------------------------------
•Confusion Matrix:
  TN | FP
[[262   0]
 [ 20   0]]
  FN | TP

------------------------------------
•False Positives:
No False Positives detected.

------------------------------------
•False Negatives:
           Sample name code_oncotree  Classifier   Probability  Predicted
16      H021-7J2J7X-M1          CHDM           1  1.436548e-20        0.0
87   H021-V6BV2M-M2-Q1          CHDM           1  1.718825e-19        0.0
134  S033-33+005-T2-Q1          CHDM           1  8.806193e-20        0.0
135  S033-33+006-T1-Q1          CHDM           1  1.581967e-20        0.0
136  S033-33+030-T1-Q1          CHDM           1  6.068903e-19        0.0
137  H021-6A9ZQ6-T2-Q1          CHDM           1  6.686990e-21        0.0
145  S033-33+001-T3-Q1          CHDM           1  2.839905e-19        0.0
146  S033-33+008-T

# Graphs

### 3D UMAP with selected features

In [89]:
grph.create_umap_plot(df=initial_df, 
                       feature_columns= entity_proteins, 
                       color_column='code_oncotree', 
                       metadata_cols=['Sample name', 'code_oncotree', 'TCC GROUP'],
                       n_neighbors=5)

Original dataframe shape: (1479, 13078)
Feature data shape: (1479, 9)
Standardizing features...
Applying UMAP...
UMAP(n_components=3, n_jobs=1, n_neighbors=5, random_state=93, verbose=True)
Sun Aug 17 15:55:50 2025 Construct fuzzy simplicial set
Sun Aug 17 15:55:52 2025 Finding Nearest Neighbors
Sun Aug 17 15:55:56 2025 Finished Nearest Neighbor Search
Sun Aug 17 15:55:58 2025 Construct embedding


Epochs completed:   7%| ▋          34/500 [00:00]

	completed  0  /  500 epochs
	completed  50  /  500 epochs


Epochs completed:  31%| ███▏       157/500 [00:01]

	completed  100  /  500 epochs
	completed  150  /  500 epochs


Epochs completed:  50%| ████▉      248/500 [00:01]

	completed  200  /  500 epochs
	completed  250  /  500 epochs


Epochs completed:  68%| ██████▊    341/500 [00:01]

	completed  300  /  500 epochs
	completed  350  /  500 epochs


Epochs completed:  87%| ████████▋  433/500 [00:02]

	completed  400  /  500 epochs
	completed  450  /  500 epochs


Epochs completed: 100%| ██████████ 500/500 [00:02]


Sun Aug 17 15:56:01 2025 Finished embedding
Number of unique categories in code_oncotree: 177
Plot saved as: /home/lestrada/tumor_type_prediction/notebooks/data/data_output/BRCA_250817_1230_results/model_fit/UMAP_of_class.html


In [116]:
grph.plot_tcc_vs_probability(initial_df, test_entity_scores)

ValueError: 
Image export using the "kaleido" engine requires the Kaleido package,
which can be installed using pip:

    $ pip install --upgrade kaleido
