## Imports

In [None]:
import sys
import os
import importlib
from pathlib import Path

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from scipy.stats import shapiro, kstest
from scipy import stats  

from sklearn.linear_model import LogisticRegression, Ridge, Lasso, ElasticNet, RidgeCV, LassoCV,ElasticNetCV, LogisticRegressionCV
from sklearn.model_selection import train_test_split, cross_val_score, StratifiedKFold, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import make_scorer, accuracy_score, classification_report, f1_score, matthews_corrcoef, mean_squared_error,r2_score, roc_auc_score, roc_curve, auc, confusion_matrix, log_loss
from sklearn.datasets import make_classification
from sklearn.preprocessing import MinMaxScaler
from statsmodels.stats.diagnostic import kstest_normal
from timeit import default_timer as timer
from tqdm import tqdm  
from typing import Optional
from joblib import Parallel, delayed
import pickle

# Resolve the local source directory to an absolute path and register it on
# the import path only when it is not already listed, so that re-running this
# cell does not append duplicates.
module_path = str(Path("../src/data").resolve())
if sys.path.count(module_path) == 0:
    sys.path.append(module_path)

import LogRegFxF as LR
import preprocessing as prep

# Absolute path of the parent of the current working directory.
_notebook_dir = os.getcwd()
project_root = os.path.abspath(os.path.join(_notebook_dir, '..'))

In [None]:
# Display the interpreter's module search path (sanity check - presumably to confirm that the local source directory was registered).
sys.path

In [None]:
import LogRegFxF as LR
import preprocessing as prep
import feature_selection as fs
import model_fit as mf

In [None]:
# Re-import model_fit so that edits made to the module on disk are picked up without restarting the kernel.
importlib.reload(mf)

# List the names currently exposed by the module.
dir(mf)

## Data Import

In [None]:
# --- Protein quantification intensities (CSV) --------------------------------
# The directory pieces below already end with a trailing slash, so the full
# path is the plain concatenation of the pieces.
processed_data = '2024.10.23_CJ_pancancer_250/'
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp_with_ref.csv'
intensity_path_file = f"{folder_path}{processed_data}{PREPROCESSED_FP_INTENSITY}"
input_quantifications = pd.read_csv(intensity_path_file)

# --- Protein quantification z-scores (tab-separated) -------------------------
preprocessed_fp_z_scores = 'full_proteome_measures_z.tsv'
z_scores_path_file = f"{folder_path}{processed_data}{preprocessed_fp_z_scores}"
df_z_scores = pd.read_csv(z_scores_path_file, sep='\t')

# --- Samples metadata (oncotree classification), Excel workbook --------------
metadata_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/'
metadata_file = 'METADATA_PAN_CANCER_Batch300.xlsx'
the_metadata_file = f"{metadata_path}{metadata_file}"
input_metadata = pd.read_excel(the_metadata_file)



## Data Preprocessing

In [None]:
# Protein and peptide quantification post-processing.
#
# The intensity table is used with its first column as the index. The guard
# keeps this cell idempotent: without it, re-running the cell would promote a
# second column to the index and shift the half/half column split below.
if isinstance(input_quantifications.index, pd.RangeIndex):
    input_quantifications = input_quantifications.set_index(input_quantifications.columns[0])

# The first half of the columns holds the protein measurements; the second
# half holds the per-sample identification metadata (peptide counts).
n_measurement_cols = input_quantifications.shape[1] // 2
peptides_quant_info = prep.post_process_meta_intensities(
    input_quantifications.iloc[:, n_measurement_cols:].T
)  # clean dataframe from regex characters
proteins_quant = input_quantifications.iloc[:, :n_measurement_cols].T  # subset protein measurements; samples become rows

# Imputation of missing values in protein intensities using the normal
# distribution down-shift method, then report proteins that still contain NaN.
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant)
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns[na_columns].index.tolist()
print("Proteins with  empty values:", na_columns_true)

# Cleaning sample names: move the index into a 'Sample name' column and strip
# the literal 'pat_' text. New frames are built instead of mutating in place.
prot_quant_imputed = (
    prot_quant_imputed
    .reset_index()
    .rename(columns={'index': 'Sample name'})
)
prot_quant_imputed['Sample name'] = prot_quant_imputed['Sample name'].str.replace('pat_', '', regex=False)

# Dataset with protein intensities and metadata (inner join on sample name).
samples_metadata = input_metadata[["Sample name", "code_oncotree"]]  # sample metadata, e.g. class
initial_df = samples_metadata.merge(prot_quant_imputed, on='Sample name')

# Peptides quantification to binary dataset: 1 where the peptide count is > 1, otherwise 0.
peptides_df_binary = pd.DataFrame(
    np.where(peptides_quant_info > 1, 1, 0),
    index=peptides_quant_info.index,
    columns=peptides_quant_info.columns,
)
# Move the index to an 'index' column (gives access to the patient id) and
# remove the 'Identification metadata ' text from the ids.
peptides_df_binary = (
    peptides_df_binary
    .reset_index()
    .replace('Identification metadata ', '', regex=True)
)
# Merge with the metadata by sample name, yielding sample, classification and
# binary peptide indicators; the helper 'index' column is dropped afterwards.
peptides_df_binary = (
    samples_metadata
    .merge(peptides_df_binary, left_on='Sample name', right_on='index')
    .drop('index', axis=1)
)

peptides_df_binary

In [21]:
# Display the z-score table loaded from the TSV file.
# NOTE(review): the stored execution count of this cell is out of sequence with the cells around it - confirm the output below comes from a fresh top-to-bottom run.
df_z_scores

Unnamed: 0,Gene names,zscore_pat_I007-031-108742,zscore_pat_I043-005-95540,zscore_pat_I007-020-1007541,zscore_pat_I007-039-130734,zscore_pat_I043-001-80842,zscore_pat_I043-005-130270,zscore_pat_H021-7AAYWW-T1,zscore_pat_H021-7AAYWW-T2,zscore_pat_H021-ENQC15-T1,...,zscore_pat_K26K-SNFUYE-M11-Q1,zscore_pat_PLAGL-S651.18-T2,zscore_pat_PLAGL-1062-029-T2,zscore_pat_PLAGL-1070-010-T2,zscore_pat_PLAGL-18H01607-T2,zscore_pat_PLAGL-15H01681-T2,zscore_pat_PLAGL-T20-93173-T1,zscore_pat_PLAGL-H0067496-T1,zscore_pat_PLAGL-S30470010-REA-T2,zscore_pat_PLAGL-1139-018-T2
0,MSH6,-1.736677,1.275205,-1.014028,-0.976851,1.181702,0.399403,0.925327,1.095705,-0.654436,...,-0.189504,-1.186537,0.705940,-3.018350,-1.528356,-1.495828,-1.015592,-0.106627,-2.297350,-1.169946
1,PCLAF,-0.181733,0.186393,0.503325,-0.637646,0.428013,-0.027356,0.376571,-0.076594,-0.763392,...,0.616997,,,,,,,,,
2,UTP18,-1.009879,0.766796,-0.354145,-0.671638,0.581857,0.440137,0.281841,-0.181134,0.701161,...,0.007499,-1.465479,-0.254084,-0.269556,-1.457134,-3.780978,-1.770047,-0.076073,-2.836637,-1.653923
3,SEC16A,-0.703658,-0.240586,-0.703176,-0.671051,-0.103367,-0.702204,0.240134,-0.114688,-0.111614,...,1.274612,-1.215534,1.511328,-1.792839,-1.427330,-2.489280,-0.903802,-0.581286,-1.374406,-1.306951
4,IPO7,-0.948150,0.352164,-0.942567,-0.866113,-0.201395,0.036018,0.050517,0.266625,1.856548,...,-0.192219,-0.574418,0.738095,-1.377923,-0.427334,0.068179,0.025539,0.230214,0.403533,-0.585712
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
13012,STK40,,,,,,,,,-0.611393,...,,,,,,,,,,
13013,FAM214A,,,,,,,,,,...,,-0.174521,-1.600007,0.238428,0.629419,1.711126,0.900724,0.318236,1.629722,0.998803
13014,WNT10B,,,,,,,,,,...,,,,,,,,,,
13015,VMO1,,,,,,,,,,...,,,,,,,,,,


## Data Set Split

In [None]:
# Labels treated as not being part of the Oncotree classification; they are
# handed to prep.remove_class together with the label column name.
NOS_cases = [
    'CUPNOS',
    'ADNOS',
    'SARCNOS',
    'SCCNOS',
    'missing',
    'SOLIDNOS',
    'RCSNOS',
    'GCTNOS',
]
ml_initial_df = prep.remove_class(initial_df, NOS_cases, 'code_oncotree')

# Split the remaining samples into training and held-out sets.
training_df, held_out_df = prep.data_split(
    ml_initial_df,
    split_size=0.25,
    classified_by='code_oncotree',
    export=False,
)


In [19]:
# Display the training split.
# NOTE(review): the stored output below includes a 'Classifier' column, which is not among the columns selected when the merged frame is built - confirm whether a later step mutated training_df in place or whether this output is stale.
training_df

Unnamed: 0,Sample name,code_oncotree,Classifier,MSH6,PCLAF,UTP18,SEC16A,IPO7,EIF3L,RPAP3,...,ROPN1L,CARD10,ZNF804A,ZNF503,HHEX,STK40,FAM214A,WNT10B,VMO1,CCDC152
1,H021-VFM3B1-T1-Q1,AASTR,0,8.680371,5.933663,7.645139,8.361822,9.226800,8.919425,8.317408,...,7.695090,5.873703,5.996388,2.326346,6.582798,5.034075,6.910751,5.523846,4.900420,7.982770
2,H021-3RLVZS-T1-Q1-R2,AASTR,0,8.303137,6.181595,7.214161,8.232569,8.877661,8.676085,7.951519,...,8.229571,5.770071,6.838024,3.103242,6.644451,4.008825,7.121844,5.267685,5.007503,7.937410
3,H021-XBLS3R-M1-Q1,AASTR,0,7.765713,6.878360,7.397420,8.393453,8.985775,8.719291,8.119543,...,7.971207,5.736151,6.088030,2.168346,6.685648,4.227623,6.577688,5.039773,5.279435,7.471511
4,H021-M2MSRE-M1-Q1,ACBC,0,8.902576,6.982327,8.042001,8.436245,8.661617,8.646217,8.178647,...,7.059671,5.803157,6.005662,2.950181,6.586981,8.230956,5.807881,5.320014,4.769868,6.483862
6,H021-XKP7ZN-M1,ACC,0,9.129199,7.109279,7.851145,8.589051,9.202412,9.057259,8.421119,...,8.190473,5.652991,6.145269,2.354371,6.644244,4.432015,5.864944,5.106300,5.073612,5.770285
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1463,H021-9L41LB9-M3-Q1,VA,0,9.550614,7.126274,8.023580,8.699227,9.246176,8.887176,8.391796,...,7.953213,5.904273,6.053612,2.067367,6.705999,5.588218,6.128643,5.509291,4.932100,6.859698
1464,H021-25HCGP-M2-Q1,VMM,0,8.966982,6.989125,7.902856,8.383487,8.895865,9.007164,8.236685,...,8.090508,5.723962,6.243462,2.297622,6.694029,4.188066,5.721522,5.086781,4.888286,6.165193
1466,H021-1B7R18-M1-Q1,VSC,0,8.719091,6.026785,7.729123,8.085433,8.978265,8.878481,7.545643,...,7.489733,5.815132,6.094122,3.265360,6.690222,4.748220,6.106756,5.119893,5.249331,7.212354
1467,H021-FUFZFT-T1-Q1,VSC,0,8.919788,7.020464,7.964825,8.666295,9.326903,9.084935,8.376320,...,8.031086,6.525668,6.153420,3.348079,6.726562,4.248906,5.997883,5.266783,4.882664,7.015448


# Class Specific Workflow

In [None]:
# Target class and the column names shared by the steps in this section.
ARMS_class = ['ARMS']
classified_by = 'code_oncotree'
samples_column = 'Sample name'

# High-confidence proteins for the target class, derived from the binary
# peptide table (threshold semantics are defined in feature_selection).
arms_proteins_by_peptides = fs.get_high_confidence_proteins(
    peptides_df_binary,
    ARMS_class,
    classified_by,
    threshold=0.7,
)

# Binary labelling of the training and held-out frames for the target class.
# TODO: wrap this and the filtering below into a single function or class.
ARMS_training_df = fs.binary_labeling(
    training_df,
    classified_by=classified_by,
    true_class=ARMS_class,
)
ARMS_ho_df = fs.binary_labeling(
    held_out_df,
    classified_by=classified_by,
    true_class=ARMS_class,
)

# 1st filter: keep the identifier/label columns plus the proteins backed by peptides.
_arms_kept_columns = [samples_column, classified_by, 'Classifier'] + arms_proteins_by_peptides
ARMS_training_df = ARMS_training_df.filter(items=_arms_kept_columns)
ARMS_ho_df = ARMS_ho_df.filter(items=_arms_kept_columns)

## Feature Selection

### Hyperparameters for ElasticNet

In [None]:
# Hyperparameter grid search on the labelled training frame.
# NOTE(review): only the first 20 columns are passed and the two list
# arguments each hold a single value - this looks like a quick trial
# configuration; confirm before relying on the selected parameters.
(
    ARMS_cv_results,
    ARMS_best_params,
    ARMS_best_score,
    ARMS_grid_search_obj,
) = fs.hparameter_grid_search(
    ARMS_training_df.iloc[:, :20],
    4,
    [0.5],
    [1],
    classified_by='code_oncotree',
)

### Feature Selection by ElasticNet Cross-Validation

In [None]:
# Cross-validated ElasticNet run used for feature selection.
# NOTE(review): l1_ratio and C are hard-coded here rather than taken from the
# grid-search result, only the first 20 columns are used, and the tumor type
# name 'ARSM_try' may be a typo of the class code - confirm all three.
arms_cross_val_coeffs = fs.elnet_wrapper(
    ARMS_training_df.iloc[:, :20],
    classified_by='code_oncotree',
    tumor_type_name='ARSM_try',
    l1_ratio=0.1,
    C=100,
    n_splits=3,
    n_repeats=3,
    n_jobs=3,
    export=False,
)

In [None]:
# Derive statistics from the cross-validated coefficients for the target
# class; the second return value (arms_proteins) is what the reshaping step
# in the model-fitting section receives.
arms_stats_try, arms_proteins = fs.statistic_from_coefficients(
    arms_cross_val_coeffs,
    ARMS_class,
)

## Model Fitting

### Reshaping dataset for training and test

In [None]:
# Reshape the training and held-out frames with the proteins coming out of
# feature selection (presumably restricting the columns to those proteins -
# see fs.reshape_df_for_fitting).
arms_training_fs = fs.reshape_df_for_fitting(
    ARMS_training_df,
    arms_proteins,
)
arms_test_fs = fs.reshape_df_for_fitting(
    ARMS_ho_df,
    arms_proteins,
)


### Hyperparameter Selection for Logistic Regression

In [None]:
# Nested cross-validation on the feature-selected training frame, followed by
# the helper that turns its results into per-hyperparameter summaries.
arms_to_nest_cv_results = mf.wrapper_nested_cv(
    arms_training_fs,
    random_state_tries=4,
    n_splits=2,
    classified_by='code_oncotree',
)
arms_to_nest_hp = mf.nested_cv_hparameters_selection(
    arms_to_nest_cv_results,
)

In [None]:
# Build a table from the nested-CV summaries, transpose it, rank the rows by
# the 'avg' column (highest first) and take the index label of the top row as C.
_hp_summary = pd.DataFrame(arms_to_nest_hp).T
_hp_ranked = _hp_summary.sort_values(by='avg', ascending=False)
hyperparameter_C = _hp_ranked.index.tolist()[0]

### Model Fit

In [None]:
# Fit the model with mf.logistic_regression_ridge on the feature-selected
# training frame, using the selected value of C.
ARMS_log_reg_model = mf.logistic_regression_ridge(
    arms_training_fs,
    hyperparameter_C,
    ARMS_class,
    classified_by='code_oncotree',
)

In [None]:
# Collect the model coefficients and the predicted probabilities for the
# training and test data.
# The test frame passed here is the feature-selected held-out set
# (arms_test_fs), i.e. it has been through the same
# fs.reshape_df_for_fitting step as the training frame the model was fitted
# on. Previously the un-reshaped ARMS_ho_df was passed and arms_test_fs was
# never used.
# NOTE(review): confirm that mf.logistic_regression_results does not itself
# expect the un-reshaped held-out frame.
arms_coefficients, arms_train_probabilities, arms_test_probabilities = mf.logistic_regression_results(
    ARMS_log_reg_model,
    arms_training_fs,
    arms_test_fs,
    ARMS_class,
    classified_by='code_oncotree',
)

In [None]:
# Compute classification scores from the test-set probabilities returned by the previous step.
test_arms_scores = mf.classification_scores(arms_test_probabilities)