## Imports

In [1]:
import sys
import os
import importlib
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, kstest
from scipy import stats  

from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV,ElasticNetCV, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, classification_report, f1_score, matthews_corrcoef, mean_squared_error,r2_score, roc_auc_score, roc_curve, auc, confusion_matrix, log_loss
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.diagnostic import kstest_normal
from timeit import default_timer as timer
from tqdm import tqdm  
from typing import Optional
from joblib import Parallel, delayed
import pickle

# Make the project's local helper modules importable: resolve ../src/data
# against the current working directory and add it to sys.path only when it
# is not already listed there.
module_path = str(Path("..", "src", "data").resolve())
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import LogRegFxF as LR
import preprocessing as prep

# Absolute path of the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

In [2]:
# Show the current module search path; the resolved ../src/data directory is
# appended to it in the imports cell.
sys.path

['/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python313.zip',
 '/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13',
 '/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/lib-dynload',
 '',
 '/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/site-packages',
 '/home/lestrada/tumor_type_prediction/src/data']

In [3]:
import LogRegFxF as LR
import preprocessing as prep
import feature_selection as fs
import model_fit as mf

In [4]:
# Re-import the model_fit module so that edits made to its source file are
# picked up without restarting the kernel.
importlib.reload(mf)

# List every name exposed by the reloaded module.
dir(mf)

 'ElasticNet',
 'ElasticNetCV',
 'GridSearchCV',
 'Lasso',
 'LassoCV',
 'LogisticRegression',
 'LogisticRegressionCV',
 'MinMaxScaler',
 'Parallel',
 'Ridge',
 'RidgeCV',
 'StandardScaler',
 'StratifiedKFold',
 '__builtins__',
 '__cached__',
 '__doc__',
 '__file__',
 '__loader__',
 '__name__',
 '__package__',
 '__spec__',
 'accuracy_score',
 'auc',
 'chi2',
 'classification_report',
 'classification_scores',
 'confusion_matrix',
 'cross_val_score',
 'delayed',
 'dump',
 'f1_score',
 'fdrcorrection',
 'importlib',
 'kstest',
 'kstest_normal',
 'load',
 'log_likelihood',
 'logistic_regression_results',
 'logistic_regression_ridge',
 'make_classification',
 'make_scorer',
 'matthews_corrcoef',
 'mean_squared_error',
 'nested_cross_validation_logistic_regression',
 'nested_cv_hparameters_selection',
 'np',
 'os',
 'output_dir',
 'pd',
 'precision_recall_curve',
 'project_root',
 'r2_score',
 'roc_auc_score',
 'roc_curve',
 'shapiro',
 'timer',
 'tqdm',
 'train_test_split',
 'wrapper_nested

## Data Import

In [5]:
# Load the three input tables used by the notebook.
# NOTE: every directory constant below ends with '/', so the file paths are
# built by plain string concatenation.

# --- Protein quantification intensities --------------------------------------
processed_data = '2024.10.23_CJ_pancancer_250/'
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp_with_ref.csv'
intensity_path_file = folder_path + processed_data + PREPROCESSED_FP_INTENSITY
input_quantifications = pd.read_csv(intensity_path_file)

# --- Protein quantification z-scores (tab-separated) --------------------------
preprocessed_fp_z_scores = 'full_proteome_measures_z.tsv'
z_scores_path_file = folder_path + processed_data + preprocessed_fp_z_scores
df_z_scores = pd.read_csv(z_scores_path_file, sep='\t')

# --- Sample metadata (Oncotree classification) --------------------------------
METADATA_PATH = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/LE_PROdict/paper_freeze_versions_22_08/'
metadata_file = 'METADATA_PAN_CANCER_Batch300.xlsx'
# Use the constant defined just above; the previous lowercase `metadata_path`
# was never assigned in this notebook and raised NameError on a fresh kernel.
the_metadata_file = METADATA_PATH + metadata_file
input_metadata = pd.read_excel(the_metadata_file)



## Data Preprocessing

In [6]:
# Protein / peptide quantification post-processing.
# The first column of the intensity table becomes the row index. The remaining
# columns are split down the middle: the right half is transposed and cleaned
# by prep.post_process_meta_intensities (peptide identification info), the
# left half is transposed into the protein measurements.
# NOTE(review): `input_quantifications` is rebound here, so re-running this
# cell without re-running the data import would index by a further column.
input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])
n_half_columns = input_quantifications.shape[1] // 2
peptides_quant_info = prep.post_process_meta_intensities(
    input_quantifications.iloc[:, n_half_columns:].T
)
proteins_quant = input_quantifications.iloc[:, :n_half_columns].T

# Imputation of missing protein intensities (normal distribution down-shift),
# followed by a report of the proteins that still contain missing values.
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant)
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns.loc[na_columns].index.tolist()
print("Proteins with  empty values:", na_columns_true)

# Clean sample names: move the index into a 'Sample name' column and strip the
# 'pat_' prefix. The in-place calls are kept deliberately, because the frame
# returned by the prep helper may be shared with its input.
prot_quant_imputed.reset_index(inplace=True)
prot_quant_imputed.rename(columns={'index': 'Sample name'}, inplace=True)
prot_quant_imputed['Sample name'] = prot_quant_imputed['Sample name'].str.replace('pat_', '')

# Dataset with sample metadata (name + Oncotree code) and protein intensities.
samples_metadata = input_metadata[["Sample name", "code_oncotree"]]
initial_df = samples_metadata.merge(prot_quant_imputed, on='Sample name')

# Binary peptide table: 1 where more than one peptide was counted, else 0.
peptide_flags = np.where(peptides_quant_info > 1, 1, 0)
peptides_df_binary = (
    pd.DataFrame(
        peptide_flags,
        index=peptides_quant_info.index,
        columns=peptides_quant_info.columns,
    )
    .reset_index()  # exposes the sample identifier as the 'index' column
    .replace('Identification metadata ', '', regex=True)  # strip text from the ids
)
# Attach the sample metadata by sample name, then discard the helper column.
peptides_df_binary = (
    samples_metadata
    .merge(peptides_df_binary, left_on='Sample name', right_on='index')
    .drop('index', axis=1)
)

peptides_df_binary

(2135, 13017)


  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  temp_mean = np.nanmean(temp)


Proteins with  empty values: ['PTGER4', 'CD19', 'FOXO4', 'CRYGA', 'HNRNPCL3;HNRNPCL4', 'MYBPHL']


Unnamed: 0,Sample name,code_oncotree,MSH6,PCLAF,UTP18,SEC16A,IPO7,EIF3L,RPAP3,INTS3,...,ROPN1L,CARD10,ZNF804A,ZNF503,HHEX,STK40,FAM214A,WNT10B,VMO1,CCDC152
0,H021-3RLVZS-T1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1,H021-VFM3B1-T1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
2,H021-3RLVZS-T1-Q1-R2,AASTR,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
3,H021-XBLS3R-M1-Q1,AASTR,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
4,H021-M2MSRE-M1-Q1,ACBC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,H021-25HCGP-M2-Q1,VMM,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1656,H021-VYS51F-M1-Q1,VSC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1657,H021-1B7R18-M1-Q1,VSC,1,0,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
1658,H021-FUFZFT-T1-Q1,VSC,1,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0


In [7]:
# Reshape the z-score table so that rows are samples and columns are genes.
# After transposing and resetting the index, row 0 carries the header values
# (including 'Gene names'), so it is promoted to column labels and then removed.
z_scores_df = (
    df_z_scores
    .transpose(copy=True)
    .reset_index()
    .replace('zscore_', '', regex=True)
)
z_scores_df = z_scores_df.rename(columns=z_scores_df.iloc[0]).drop(index=0)
z_scores_df['Gene names'] = z_scores_df['Gene names'].str.replace('pat_', '')
z_scores_df = z_scores_df.set_index('Gene names')

# Impute missing z-scores, then expose the sample identifier as 'Sample name'.
# The in-place calls are kept because the frame returned by the prep helper
# may be shared with its input.
z_scores_imputed = prep.impute_normal_down_shift_distribution(z_scores_df)
z_scores_imputed.reset_index(inplace=True)
z_scores_imputed.rename(columns={'Gene names': 'Sample name'}, inplace=True)

# Join the sample metadata with the imputed z-scores.
z_scores_initial_df = samples_metadata.merge(z_scores_imputed, on='Sample name')

(1667, 13017)


  unimputerd_matrix = unimputerd_df.replace({pd.NA: np.nan}, inplace=True) #Added to modify pandas's NAN values into  numpy NAN values
  var = nanvar(a, axis=axis, dtype=dtype, out=out, ddof=ddof,
  temp_mean = np.nanmean(temp)


In [8]:
# Display the merged table of sample metadata and imputed z-scores.
z_scores_initial_df

Unnamed: 0,Sample name,code_oncotree,MSH6,PCLAF,UTP18,SEC16A,IPO7,EIF3L,RPAP3,INTS3,...,ROPN1L,CARD10,ZNF804A,ZNF503,HHEX,STK40,FAM214A,WNT10B,VMO1,CCDC152
0,H021-3RLVZS-T1-Q1,AASTR,-1.539061,-1.237403,-1.857498,-1.239718,-0.457340,-0.716951,-0.751463,-1.595253,...,0.831971,-2.124029,-2.222392,-2.271782,-1.123408,-1.988682,0.085773,-2.217128,-1.350153,1.297413
1,H021-VFM3B1-T1-Q1,AASTR,-0.647630,-2.067226,-0.568514,-0.668665,1.527020,-0.184709,0.464970,-0.776060,...,-0.110166,-1.946678,-1.731386,-2.714793,-1.691071,-1.805054,0.309919,-2.145422,-1.232119,2.146928
2,H021-3RLVZS-T1-Q1-R2,AASTR,-1.858234,-1.541744,-2.353572,-1.408264,-0.410396,-1.626688,-1.615083,-1.828735,...,1.467501,-2.030740,0.079840,-2.385696,-1.866041,-2.441151,0.841328,-2.051718,-0.819176,2.050525
3,H021-XBLS3R-M1-Q1,AASTR,-3.592338,0.166180,-1.593462,-0.487806,0.189687,-1.370391,-0.659636,-1.561716,...,0.705113,-2.153836,-1.862338,-1.692220,-1.584527,-1.933245,-0.535493,-2.223340,-1.455259,1.062251
4,H021-M2MSRE-M1-Q1,ACBC,0.068480,0.411428,1.072526,-0.243170,-1.609448,-1.803969,-0.323886,0.369503,...,-1.984753,-2.050096,-2.315219,-2.509352,-1.893514,1.841045,-1.906512,-2.311553,-1.485464,-1.030180
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1655,H021-25HCGP-M2-Q1,VMM,0.274847,0.427466,0.497179,-0.544787,-0.309454,0.336908,0.006479,0.388220,...,1.056990,-1.262168,-1.658615,-2.917329,-1.850001,-2.012641,-2.293452,-2.358545,-1.336805,-1.705622
1656,H021-VYS51F-M1-Q1,VSC,-0.540518,0.084253,0.142285,0.773620,0.511044,1.040700,0.523064,0.105094,...,1.573210,-1.320724,-1.716318,-2.027592,-1.475648,-1.606586,-1.729860,-2.774147,-1.479486,1.438808
1657,H021-1B7R18-M1-Q1,VSC,-0.523526,-1.822627,-0.221382,-2.251962,0.148048,-0.427163,-3.937086,-1.901229,...,-0.715383,-2.072916,-2.241513,-2.272900,-2.239329,-1.788503,-1.080520,-2.149362,-1.613503,0.513564
1658,H021-FUFZFT-T1-Q1,VSC,0.123631,0.501403,0.753356,1.073038,2.083629,0.797588,0.799687,0.510565,...,0.881698,-0.564836,-2.201696,-2.701125,-1.952934,-2.021265,-1.456877,-2.886111,-1.362231,0.096910


## Data Set Split

In [9]:
# Remove samples that are not part of the Oncotree classification: every code
# ending in 'NOS' plus the explicit 'missing' label.
oncotree_codes = initial_df['code_oncotree']
nos_mask = oncotree_codes.str.endswith('NOS', na=False)
NOS_cases = oncotree_codes[nos_mask].unique().tolist()
other_cases = ['missing']
cases_to_remove = [*NOS_cases, *other_cases]
ml_initial_df = prep.remove_class(initial_df, cases_to_remove, 'code_oncotree')

# Split the remaining samples into training and held-out sets.
training_df, held_out_df = prep.data_split(
    ml_initial_df,
    split_size=0.25,
    classified_by='code_oncotree',
    export=False,
)


Removed samples: 191
Remaining samples: 1469
Classes with only one sample: 70
Training set samples: 1119
Held-out set samples: 350


In [None]:
# Z-scores dataset: keep only the samples that belong to the training split of
# the intensity dataset.
z_scores_train_df = z_scores_initial_df[z_scores_initial_df['Sample name'].isin(training_df['Sample name'])]

# Sanity check: both training sets must contain exactly the same sample names.
# (This statement was previously indented, which made the cell fail with an
# IndentationError.)
print(f"Samples match between Z-score and intesntity dataset: {set(training_df['Sample name']) == set(z_scores_train_df['Sample name'])}")

# Class-Specific Workflow

In [11]:
#Set Class Parameters
# Settings shared by the ARMS-specific cells.
ARMS_class = ["ARMS"]            # class code(s) passed as `true_class` when labelling
classified_by = "code_oncotree"  # metadata column that holds the class code
samples_column = "Sample name"   # metadata column that holds the sample identifier


In [12]:

# High-confidence proteins for the ARMS class, derived from the binary peptide
# table with threshold=0.7.
arms_proteins_by_peptides = fs.get_high_confidence_proteins(
    peptides_df_binary,
    ARMS_class,
    classified_by,
    threshold=0.7,
)

# Binary labelling for the class-specific classifier (intensity training set,
# intensity held-out set, z-score training set).
arms_training_df = fs.binary_labeling(training_df, classified_by=classified_by, true_class=ARMS_class)
arms_ho_df = fs.binary_labeling(held_out_df, classified_by=classified_by, true_class=ARMS_class)

arms_z_scores_train_df = fs.binary_labeling(z_scores_train_df, classified_by=classified_by, true_class=ARMS_class)

# 1st filter: keep the identifier/label columns plus the proteins supported by
# peptide evidence, applied identically to all three frames.
arms_columns_to_keep = [samples_column, classified_by, 'Classifier'] + arms_proteins_by_peptides
arms_training_df = arms_training_df.filter(items=arms_columns_to_keep)
arms_ho_df = arms_ho_df.filter(items=arms_columns_to_keep)

arms_z_scores_train_df = arms_z_scores_train_df.filter(items=arms_columns_to_keep)

 6336 proteins identified in 70.0% of ['ARMS'] samples

Number of samples per class:
Classifier
0    1076
1      43
Name: count, dtype: int64


Number of samples per class:
Classifier
0    336
1     14
Name: count, dtype: int64


Number of samples per class:
Classifier
0    1076
1      43
Name: count, dtype: int64



## Feature Selection

### Hyperparameters for ElasticNet
Performed on the z-score dataset.

In [13]:
# Hyperparameter grid search on the z-score training frame.
# NOTE(review): only the first 20 columns are passed and each grid holds a
# single value ([0.5] and [1]) — presumably a quick smoke test; confirm before
# relying on the selected parameters.
arms_cv_results, arms_best_params, arms_best_score, arms_grid_search_obj = fs.hparameter_grid_search(
    arms_z_scores_train_df.iloc[:, 0:20],
    4,
    [0.5],
    [1],
    classified_by='code_oncotree',
)

Grid search completed in 1.96 seconds
Best parameters: {'C': 1, 'l1_ratio': 0.5, 'max_iter': 10000, 'penalty': 'elasticnet', 'solver': 'saga'}
Best score: 0.5858800674476102


In [29]:
# Show the parameter set chosen by the grid search.
# NOTE(review): this cell's execution count (29) is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
arms_best_params

{'C': 1,
 'l1_ratio': 0.5,
 'max_iter': 10000,
 'penalty': 'elasticnet',
 'solver': 'saga'}

In [30]:
# Show only the selected value stored under the 'C' key.
# NOTE(review): this cell's execution count (30) is out of sequence with its
# neighbours — re-run the notebook top to bottom before sharing.
arms_best_params['C']

1

### Feature Selection by ElasticNet Cross-Validation

In [14]:
# ElasticNet feature selection with repeated cross-validation on the z-score
# training frame.
# NOTE(review): l1_ratio=0.1 and C=100 are hard-coded and differ from the
# values reported by the grid search (arms_best_params); confirm intended.
# NOTE(review): only the first 20 columns are used, and the run label
# 'ARSM_try' looks like a misspelling of 'ARMS' — confirm before exporting.
arms_cross_val_coeffs = fs.elnet_wrapper(
    arms_z_scores_train_df.iloc[:, 0:20],
    classified_by='code_oncotree',
    tumor_type_name='ARSM_try',
    l1_ratio=0.1,
    C=100,
    n_splits=3,
    n_repeats=3,
    n_jobs=3,
    export=False,
)

Running Logistic Regression:   0%|          | 0/3 [00:00<?, ?iteration/s]Exception ignored in: <function ResourceTracker.__del__ at 0x7afaaed99ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x7296c398dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, 

In [15]:
# Summarise the cross-validated coefficients for the ARMS class. The second
# returned value (arms_proteins) is later passed to fs.reshape_df_for_fitting
# as the protein selection.
arms_stats_try, arms_proteins = fs.statistic_from_coefficients(arms_cross_val_coeffs, ARMS_class)

With  17  folds, the following statistics were obtained, from feature selection:
• Mean MCC score: 0.5415 ± 0.0568

----------------------------------------
• Top 3 proteins with highest coefficients:
           mean       std  Freq  Wald Chi-Square   p-value_corrected  \
CCT2   3.783380  1.171051   1.0        10.437785            0.002716   
EIF3L  2.860172  1.008055   1.0         8.050364            0.008341   
MSH6   2.411365  0.981066   1.0         6.041289            0.020497   

       Significant  
CCT2           1.0  
EIF3L          1.0  
MSH6           0.0  

----------------------------------------
• List of significant proteins: ['CCT2', 'EIF3L', 'BIN1', 'LAMB1', 'PCLAF', 'AMPD2', 'CUL4B']
• Number of significant proteins: 7

----------------------------------------


## Model Fitting
Done on intensities

### Reshaping dataset for training and test

In [16]:
# Reshape the intensity training and held-out frames for model fitting using
# the selected proteins (presumably restricting the columns to arms_proteins —
# verify against fs.reshape_df_for_fitting).
arms_training_fs = fs.reshape_df_for_fitting(arms_training_df, arms_proteins)
arms_test_fs = fs.reshape_df_for_fitting(arms_ho_df, arms_proteins)


### Hyperparameter Selection for Logistic Regression

In [17]:
# Nested cross-validation over several random states on the reshaped training
# frame, followed by aggregation of the hyperparameters selected per run.
arms_to_nest_cv_results = mf.wrapper_nested_cv(
    arms_training_fs,
    random_state_tries=4,
    n_splits=2,
    classified_by='code_oncotree',
)
arms_to_nest_hp = mf.nested_cv_hparameters_selection(arms_to_nest_cv_results)

• Running for random_state=0
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
1 Inner fold best parameter={'C': 10}, Score=0.3494, Outer Validation MCC Score: 0.4446

Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
2 Inner fold best parameter={'C': 1}, Score=0.5005, Outer Validation MCC Score: 0.3695

Average MCC across all outer folds: 0.4070

--------------------------------------------------
• Running for random_state=1
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did not converged.
Inner fold model did n

In [18]:
# Pick the hyperparameter whose 'avg' entry is highest: tabulate the nested-CV
# summary with one row per candidate, sort by 'avg' in descending order and
# take the label of the first row.
nested_hp_table = pd.DataFrame(arms_to_nest_hp).T
nested_hp_ranked = nested_hp_table.sort_values(by='avg', ascending=False)
hyperparameter_C = nested_hp_ranked.index.tolist()[0]

### Model Fit

In [19]:
# Fit the ridge logistic regression for the ARMS class with the selected C.
# The call also writes the fitted model to disk (it reports
# "Model saved as ARMS_log_reg_ridge_model.pkl").
ARMS_log_reg_model = mf.logistic_regression_ridge(
    arms_training_fs,
    hyperparameter_C,
    ARMS_class,
    classified_by='code_oncotree',
)

Model saved as ARMS_log_reg_ridge_model.pkl




In [20]:
# Evaluate the fitted model on the training and held-out data.
# NOTE(review): the held-out frame passed here is arms_ho_df, not the
# arms_test_fs frame built with fs.reshape_df_for_fitting, which is otherwise
# unused — confirm this is intended.
arms_coefficients, arms_train_probabilities, arms_test_probabilities = mf.logistic_regression_results(
    ARMS_log_reg_model,
    arms_training_fs,
    arms_ho_df,
    ARMS_class,
    classified_by='code_oncotree',
)

# of Iterations: [10000]
MCC train: 0.24284087647124855
MCC test: 0.24128650621655529
F1 Positive: 0.175


In [21]:
# Compute and print the classification scores for the held-out probabilities
# (the call reports MCC, F1 scores and a confusion matrix).
test_arms_scores = mf.classification_scores(arms_test_probabilities)

------------------------------------
•General Scores:
MCC Score: 0.24128650621655529
F1 Macro: 0.4652777777777778
F1 Micro: 0.6228571428571429
F1 Entity Score: 0.175

------------------------------------
•Confusion Matrix:
  TN | FP
[[204 132]
 [  0  14]]
  FN | TP

------------------------------------
•False Positives:
           Sample name code_oncotree  Classifier  Probability  Predicted
0    H021-3RLVZS-T1-Q1         AASTR           0     0.997831        1.0
1       H021-MQ3B2C-M5           ACC           0     1.000000        1.0
3    H021-ZHVK6R-M3-Q1           ACC           0     0.997964        1.0
5    H021-VD45DC-M2-Q1           ACC           0     0.998486        1.0
7    H021-ADE19T-M1-Q1           ACC           0     0.999917        1.0
..                 ...           ...         ...          ...        ...
344  H021-VMUZN8-M3-Q1            UM           0     1.000000        1.0
346  H021-PAQVNC-M1-Q1            UM           0     0.592781        1.0
347  H021-HPVSB4-M1-Q

Exception ignored in: <function ResourceTracker.__del__ at 0x787d80791ee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 116, in _stop_locked
ChildProcessError: [Errno 10] No child processes
Exception ignored in: <function ResourceTracker.__del__ at 0x733b85f8dee0>
Traceback (most recent call last):
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 82, in __del__
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/python3.13/multiprocessing/resource_tracker.py", line 91, in _stop
  File "/home/lestrada/miniconda3/envs/tumor_type_clasifier/lib/