## Imports

In [1]:
import sys
import os
import importlib
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, kstest
from scipy import stats  

from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV,ElasticNetCV, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, classification_report, f1_score, matthews_corrcoef, mean_squared_error,r2_score, roc_auc_score, roc_curve, auc, confusion_matrix, log_loss
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.diagnostic import kstest_normal
from timeit import default_timer as timer
from tqdm import tqdm  
from typing import Optional
from joblib import Parallel, delayed
import pickle

# Register the project's local module directory (../src/data, resolved to an
# absolute path) on the import search path, adding it only if it is absent.
module_path = str((Path("..") / "src" / "data").resolve())
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import LogRegFxF as LR
import preprocessing as prep

# Absolute path of the directory one level above the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [2]:
# Display the interpreter's module search path, to confirm that the local
# module directory appended in the imports cell is present.
sys.path

['/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python313.zip',
 '/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13',
 '/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/lib-dynload',
 '',
 '/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/site-packages',
 '/home/lestrada/tumor_type_prediction/src/data']

In [236]:
import LogRegFxF as LR
import preprocessing as prep
import feature_selection as fs
import model_fit as mf

In [237]:
# Re-import the `mf` (model_fit) module so the already-running kernel uses
# the module's current source instead of the previously imported version.
importlib.reload(mf)

# List the names exposed by the reloaded module (development aid).
dir(mf)

 'ElasticNet',
 'ElasticNetCV',
 'GridSearchCV',
 'Lasso',
 'LassoCV',
 'LogisticRegression',
 'LogisticRegressionCV',
 'MinMaxScaler',
 'Parallel',
 'Ridge',
 'RidgeCV',
 'StandardScaler',
 'StratifiedKFold',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'accuracy_score',
 'auc',
 'chi2',
 'classification_report',
 'classification_scores',
 'confusion_matrix',
 'cross_val_score',
 'delayed',
 'dump',
 'f1_score',
 'fdrcorrection',
 'importlib',
 'kstest',
 'kstest_normal',
 'load',
 'log_likelihood',
 'logistic_regression_results',
 'logistic_regression_ridge',
 'make_classification',
 'make_scorer',
 'matthews_corrcoef',
 'mean_squared_error',
 'nested_cross_validation_logistic_regression',
 'nested_cv_hparameters_selection',
 'np',
 'os',
 'output_dir',
 'pd',
 'precision_recall_curve',
 'project_root',
 'r2_score',
 'roc_auc_score',
 'roc_curve',
 'shapiro',
 'timer',
 'tqdm',
 'train_test_split',
 'wrapper_nested

## Data Import

In [5]:
# --- Protein quantification intensities ----------------------------------------
# NOTE(review): the locations below are absolute, machine-specific paths;
# consider moving them to a configurable data directory.
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
processed_data = '2024.10.23_CJ_pancancer_250/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp_with_ref.csv'
intensity_path_file = f"{folder_path}{processed_data}{PREPROCESSED_FP_INTENSITY}"
input_quantifications = pd.read_csv(intensity_path_file)

# --- Sample metadata (OncoTree classification) ---------------------------------
metadata_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/'
metadata_file = 'METADATA_PAN_CANCER_Batch300.xlsx'
the_metadata_file = "".join([metadata_path, metadata_file])
input_metadata = pd.read_excel(the_metadata_file)

# --- Disabled alternative input: protein quantification z-scores ---------------
# processed_data = '2024.10.23_CJ_pancancer_250/'
# folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
# PREPROCESSED_FP_INTENSITY = 'full_proteome_measures_z.tsv'
# intensity_path_file = folder_path + processed_data + PREPROCESSED_FP_INTENSITY
# df_Z_scores = pd.read_csv(intensity_path_file, sep='\t')

## Data Preprocessing

In [6]:
# Post-processing of the quantification table into (a) imputed protein
# intensities joined with sample metadata and (b) a binary peptide-evidence table.
# The code splits the columns in half: the first half is used as protein
# measurements, the second half as identification metadata.
# NOTE(review): assumes the export always has that half/half layout - confirm.

# Promote the first column to the index exactly once. Without the RangeIndex
# guard, re-running this cell would promote the first *data* column to the
# index and silently shift the half/half split below.
if isinstance(input_quantifications.index, pd.RangeIndex):
    input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])

n_measurement_columns = input_quantifications.shape[1] // 2  # integer split point between the two halves
peptides_quant_info = prep.post_process_meta_intensities(
    input_quantifications.iloc[:, n_measurement_columns:].T
)  # clean dataframe from regex characters
proteins_quant = input_quantifications.iloc[:, :n_measurement_columns].T  # subset protein measurements from dataset

# Imputation of missing values in protein intensities (normal distribution down-shift method)
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant)
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns[na_columns].index.tolist()
print("Proteins with  empty values:", na_columns_true)

# Cleaning sample names: move the index into a 'Sample name' column and drop the 'pat_' marker
prot_quant_imputed.reset_index(inplace=True)
prot_quant_imputed.rename(columns={'index': 'Sample name'}, inplace=True)
prot_quant_imputed['Sample name'] = prot_quant_imputed['Sample name'].str.replace('pat_', '')

# Dataset with protein intensities and metadata (sample name + OncoTree code)
samples_metadata = input_metadata[["Sample name", "code_oncotree"]]
initial_df = samples_metadata.merge(prot_quant_imputed, on='Sample name')

# Peptide quantification to binary dataset: 1 when the value is > 1, otherwise 0.
# The index is moved to an 'index' column so it can be matched against the
# sample names, after stripping the 'Identification metadata ' text from the ids.
peptides_df_binary = (
    pd.DataFrame(
        np.where(peptides_quant_info > 1, 1, 0),
        index=peptides_quant_info.index,
        columns=peptides_quant_info.columns,
    )
    .reset_index()
    .replace('Identification metadata ', '', regex=True)
)

# Merge with the metadata by sample name, giving sample, classification and binary peptide evidence
peptides_df_binary = (
    samples_metadata
    .merge(peptides_df_binary, left_on='Sample name', right_on='index')
    .drop(columns='index')
)

peptides_df_binary

(2135, 13017)


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  temp_mean = np.nanmean(temp)


Proteins with  empty values: ['PTGER4', 'CD19', 'FOXO4', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']


Unnamed: 0,Sample name,code_oncotree,MSH6,PCLAF,UTP18,SEC16A,IPO7,EIF3L,RPAP3,INTS3,...,ROPN1L,CARD10,ZNF804A,ZNF503,HHEX,STK40,FAM214A,WNT10B,VMO1,CCDC152
0,H021-3RLVZS-T1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,H021-VFM3B1-T1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,H021-3RLVZS-T1-Q1-R2,AASTR,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,H021-XBLS3R-M1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,H021-M2MSRE-M1-Q1,ACBC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,H021-25HCGP-M2-Q1,VMM,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1656,H021-VYS51F-M1-Q1,VSC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1657,H021-1B7R18-M1-Q1,VSC,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1658,H021-FUFZFT-T1-Q1,VSC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


## Data Set Split

In [7]:
# Exclude samples whose code is not part of the OncoTree classification
# (the *NOS codes and 'missing').
NOS_cases = [
    'CUPNOS', 'ADNOS', 'SARCNOS', 'SCCNOS',
    'missing', 'SOLIDNOS', 'RCSNOS', 'GCTNOS',
]
ml_initial_df = prep.remove_class(initial_df, NOS_cases, 'code_oncotree')

# Split the remaining samples into training and held-out sets.
# NOTE(review): no seed is passed here - confirm that prep.data_split fixes its
# random state internally so the split is reproducible.
training_df, held_out_df = prep.data_split(
    ml_initial_df,
    split_size=0.25,
    classified_by='code_oncotree',
    export=False,
)


Removed samples: 191
Remaining samples: 1469
Classes with only one sample: 70
Training set samples: 1119
Held-out set samples: 350


# Class-Specific Workflow

In [206]:
# Target class and the column names shared by the steps below
classified_by = 'code_oncotree'
samples_column = 'Sample name'
ARMS_class = ['ARMS']

# High-confidence proteins for the target class, taken from the binary peptide table
arms_proteins_by_peptides = fs.get_high_confidence_proteins(
    peptides_df_binary,
    ARMS_class,
    classified_by,
    threshold=0.7,
)

# Binary labeling for the class-specific classifier (training set first, then held-out set).
# TODO: wrap this and the filtering step below in a function or class.
ARMS_training_df, ARMS_ho_df = (
    fs.binary_labeling(source_df, classified_by=classified_by, true_class=ARMS_class)
    for source_df in (training_df, held_out_df)
)

# 1st filter - keep only the identifier/label columns plus the proteins selected by peptides
arms_columns_to_keep = [samples_column, classified_by, 'Classifier'] + arms_proteins_by_peptides
ARMS_training_df = ARMS_training_df.filter(items=arms_columns_to_keep)
ARMS_ho_df = ARMS_ho_df.filter(items=arms_columns_to_keep)

 6336 proteins identified in 70.0% of ['ARMS'] samples

Number of samples per class:
Classifier
0    1076
1      43
Name: count, dtype: int64


Number of samples per class:
Classifier
0    336
1     14
Name: count, dtype: int64



## Feature Selection

### Hyperparameters for ElasticNet

In [9]:
# Hyperparameter grid search on the first 20 columns of the ARMS training frame.
# NOTE(review): the meaning of the positional arguments 4, [0.5] and [1] is
# defined by fs.hparameter_grid_search - confirm against its signature.
(
    ARMS_cv_results,
    ARMS_best_params,
    ARMS_best_score,
    ARMS_grid_search_obj,
) = fs.hparameter_grid_search(
    ARMS_training_df.iloc[:, :20],
    4,
    [0.5],
    [1],
    classified_by='code_oncotree',
)

Grid search completed in 2.64 seconds
Best parameters: {'C': 1, 'l1_ratio': 0.5, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best score: 0.5665255820147777


### Feature Selection by ElasticNet Cross-Validation

In [10]:
# ElasticNet cross-validation wrapper on the first 20 columns of the ARMS training frame.
# NOTE(review): 'ARSM_try' looks like a typo for 'ARMS'; it is left unchanged because it
# is a runtime string. l1_ratio=0.1 and C=100 are set by hand here - confirm they are
# intended rather than the values reported by the grid search.
arms_cross_val_coeffs = fs.elnet_wrapper(
    ARMS_training_df.iloc[:, :20],
    classified_by='code_oncotree',
    tumor_type_name='ARSM_try',
    l1_ratio=0.1,
    C=100,
    n_splits=3,
    n_repeats=3,
    n_jobs=3,
    export=False,
)

Running Logistic Regression:   0%|          | 0/3 [00:00<?, ?iteration/s]Exception ignored in: <function ResourceTracker.__del__ at 0x7ce56b589ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7f60e3191ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, 

In [126]:
# Summarise the cross-validation coefficients for the ARMS class; the second
# returned value is the protein selection reused by the model-fitting cells.
arms_stats_try, arms_proteins = fs.statistic_from_coefficients(
    arms_cross_val_coeffs,
    ARMS_class,
)

With  17  folds, the following statistics were obtained, from feature selection:
• Mean MCC score: 0.5161 ± 0.0784

----------------------------------------
• Top 3 proteins with highest coefficients:
            mean       std  Freq  Wald Chi-Square   p-value_corrected  \
CCT2   27.200252  2.995860   1.0        82.433345            0.000000   
LBR    18.542922  3.796345   1.0        23.857503            0.000002   
EIF3L  12.923744  2.987112   1.0        18.718607            0.000026   

       Significant  
CCT2           1.0  
LBR            1.0  
EIF3L          1.0  

----------------------------------------
• List of significant proteins: ['CCT2', 'LBR', 'EIF3L', 'MSH6', 'BIN1', 'LAMB1', 'DYNC1I2', 'PPA1', 'CUL4B', 'AMPD2']
• Number of significant proteins: 10

----------------------------------------


## Model Fitting

### Reshaping dataset for training and test

In [None]:
# Reshape the training and held-out frames for fitting, using the proteins
# returned by the feature-selection step.
arms_training_fs = fs.reshape_df_for_fitting(ARMS_training_df, arms_proteins)
# The held-out frame produced by the labeling step is `ARMS_ho_df`; the previously
# referenced `ARMS_testing_df` is not defined anywhere in the visible notebook and
# raises NameError on a fresh kernel.
arms_test_fs = fs.reshape_df_for_fitting(ARMS_ho_df, arms_proteins)


### Hyperparameter Selection for Logistic Regression

In [None]:
# Nested cross-validation on the feature-selected training frame, followed by
# hyperparameter selection from the collected results.
arms_to_nest_cv_results = mf.wrapper_nested_cv(
    arms_training_fs,
    random_state_tries=4,
    n_splits=2,
    classified_by='code_oncotree',
)
arms_to_nest_hp = mf.nested_cv_hparameters_selection(arms_to_nest_cv_results)

• Running for random_state=0
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
1 Inner fold best parameter={'C': 10}, Score=0.4690, Outer MCC Score(held-out): 0.5182

Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
2 Inner fold best parameter={'C': 10}, Score=0.5701, Outer MCC Score(held-out): 0.5820

Average MCC across all outer folds: 0.5501

--------------------------------------------------
• Running for random_state=1
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did no

### Model Fit

In [241]:
# Fit the ridge logistic-regression model on the feature-selected training frame.
# NOTE(review): the second positional argument (1) is presumably the regularisation
# strength - confirm against mf.logistic_regression_ridge and against the value
# chosen by the nested cross-validation.
ARMS_log_reg_model = mf.logistic_regression_ridge(
    arms_training_fs,
    1,
    ARMS_class,
    classified_by='code_oncotree',
)

Model saved as ARMS_log_reg_ridge_model.pkl




In [242]:
# Collect coefficients plus train/test probabilities from the fitted model.
# NOTE(review): the held-out frame passed here is ARMS_ho_df, not the reshaped
# arms_test_fs - confirm that mf.logistic_regression_results selects the model's
# feature columns itself.
(
    arms_coefficients,
    arms_train_probabilities,
    arms_test_probabilities,
) = mf.logistic_regression_results(
    ARMS_log_reg_model,
    arms_training_fs,
    ARMS_ho_df,
    ARMS_class,
    classified_by='code_oncotree',
)

# of Iterations: [10000]
MCC train: 0.5264873771676347
MCC test: 0.4233094220713288
F1 Positive: 0.36619718309859156


In [243]:
# Classification scores computed from the held-out (test) probabilities.
test_arms_scores = mf.classification_scores(
    arms_test_probabilities,
)

------------------------------------
•General Scores:
MCC Score: 0.4233094220713288
F1 Macro: 0.6473275263664658
F1 Micro: 0.8714285714285714
F1 Entity Score: 0.36619718309859156

------------------------------------
•Confusion Matrix:
  TN | FP
[[292  44]
 [  1  13]]
  FN | TP

------------------------------------
•False Positives:
            Sample name code_oncotree  Classifier  Probability  Predicted
7     H021-ADE19T-M1-Q1           ACC           0     0.973863        1.0
13    H021-CL1T8C-T1-Q1           AML           0     0.999730        1.0
15    H021-U5PZDV-M2-Q1          ANGS           0     0.975011        1.0
38    H021-5YFB63-T4-Q1          ASPS           0     0.628426        1.0
49   A26K-5SXQR3-T24-Q1          BRCA           0     0.667595        1.0
78      I007-029-108166           CCS           0     0.906130        1.0
79       H021-UQYD3U-M3           CCS           0     0.661490        1.0
81    H021-4X511D-M2-Q1          CEAD           0     0.605939        1.0