## Imports

In [None]:
import sys
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Make the project's local modules under ../src/data importable from this
# notebook. The relative path is resolved against the current working directory.
module_path = str((Path("..") / "src" / "data").resolve())
if sys.path.count(module_path) == 0:
    # Append only once so that re-running the cell does not add duplicates.
    sys.path.append(module_path)

import preprocessing as prep
import feature_selection as fs
import model_fit as mf

# Absolute path of the parent of the current working directory.
project_root = os.path.abspath(os.path.join(os.getcwd(), os.pardir))

## Data Import

In [None]:
# Protein quantification intensities (CSV file).
# Both directory components end in '/', so they are combined by plain
# concatenation; the resulting path string is kept for later reference.
processed_data = '2024.10.23_CJ_pancancer_250/'
folder_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/'
PREPROCESSED_FP_INTENSITY = 'preprocessed_fp_with_ref.csv'
intensity_path_file = f"{folder_path}{processed_data}{PREPROCESSED_FP_INTENSITY}"
input_quantifications = pd.read_csv(intensity_path_file)

#--------------------------------------------------------------------------------

# Protein quantification z-scores (tab-separated file), read from the same
# processed-data folder as the intensities.
preprocessed_fp_z_scores = 'full_proteome_measures_z.tsv'
z_scores_path_file = f"{folder_path}{processed_data}{preprocessed_fp_z_scores}"
df_z_scores = pd.read_csv(z_scores_path_file, sep='\t')


#--------------------------------------------------------------------------------

# Sample metadata (Oncotree classification), stored as an Excel workbook.
metadata_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/'
metadata_file = 'METADATA_PAN_CANCER_Batch300.xlsx'
the_metadata_file = f"{metadata_path}{metadata_file}"
input_metadata = pd.read_excel(the_metadata_file)



## Data Preprocessing

In [None]:
# Post-processing of the protein / peptide quantification table.
# The first column becomes the row index.
# NOTE(review): re-running this cell indexes by the next column, because
# input_quantifications is overwritten here.
id_column = input_quantifications.columns[0]
input_quantifications = input_quantifications.set_index(id_column)

# The remaining columns are split at the midpoint. The second half is handed
# to the project's post-processing helper (per the original note it cleans
# regex characters from the frame); the first half holds the protein
# measurements. Both halves are transposed.
split_at = input_quantifications.shape[1] // 2
second_half = input_quantifications.iloc[:, split_at:]
first_half = input_quantifications.iloc[:, :split_at]

peptides_quant_info = prep.post_process_meta_intensities(second_half.T)
proteins_quant = first_half.T

# Imputation of missing protein intensities via the project's
# normal-distribution down-shift helper.
prot_quant_imputed = prep.impute_normal_down_shift_distribution(proteins_quant)

# Report any column that still contains missing values after imputation.
na_columns = prot_quant_imputed.isna().any()
na_columns_true = na_columns.index[na_columns.to_numpy()].tolist()
print("Proteins with  empty values:", na_columns_true)

# Sample-name cleanup: move the index into a 'Sample name' column and strip
# the 'pat_' marker from the identifiers.
prot_quant_imputed.reset_index(inplace=True)
prot_quant_imputed.rename(columns={'index': 'Sample name'}, inplace=True)
raw_sample_names = prot_quant_imputed['Sample name']
prot_quant_imputed['Sample name'] = raw_sample_names.str.replace('pat_', '')

# Protein intensities joined with the sample metadata (sample identifier and
# Oncotree code). The default inner merge keeps only samples present in both.
metadata_columns = ["Sample name", "code_oncotree"]
samples_metadata = input_metadata[metadata_columns]
initial_df = samples_metadata.merge(prot_quant_imputed, on='Sample name')

# Binary peptide table: 1 where the value is greater than 1, otherwise 0.
has_multiple_peptides = peptides_quant_info > 1
binary_values = np.where(has_multiple_peptides, 1, 0)
peptides_df_binary = pd.DataFrame(
    binary_values,
    index=peptides_quant_info.index,
    columns=peptides_quant_info.columns,
)

# Expose the index as an 'index' column and strip the metadata prefix from it,
# so that the identifiers can be matched against 'Sample name'.
peptides_df_binary.reset_index(inplace=True)
peptides_df_binary.replace('Identification metadata ', '', regex=True, inplace=True)

# Attach sample name and classification; the helper 'index' column is no
# longer needed after the merge.
peptides_df_binary = samples_metadata.merge(
    peptides_df_binary, left_on='Sample name', right_on='index'
).drop(columns='index')

peptides_df_binary

In [None]:
# Reshape the z-scores table: transpose it, expose the former column labels as
# a regular column and strip the 'zscore_' marker from all string cells.
z_scores_df = (
    df_z_scores.transpose(copy=True)
    .reset_index()
    .replace('zscore_', '', regex=True)
)

# After the transpose, row 0 carries the values that serve as column headers;
# promote it to the header and remove it from the data.
header_row = z_scores_df.iloc[0]
z_scores_df.rename(columns=header_row, inplace=True)
z_scores_df.drop(index=0, inplace=True)

# The 'Gene names' column is cleaned of the 'pat_' marker and used as index.
# NOTE(review): after the transpose this column presumably holds sample
# identifiers (it is renamed to 'Sample name' later) - confirm.
z_scores_df['Gene names'] = z_scores_df['Gene names'].str.replace('pat_', '')
z_scores_df = z_scores_df.set_index('Gene names')

# Impute missing z-scores with the same project helper used for intensities.
z_scores_imputed = prep.impute_normal_down_shift_distribution(z_scores_df)

# Bring the identifiers back as a column named like the metadata key.
z_scores_imputed.reset_index(inplace=True)
z_scores_imputed.rename(columns={'Gene names': 'Sample name'}, inplace=True)

# Z-scores joined with sample metadata (inner merge on the sample identifier).
z_scores_initial_df = samples_metadata.merge(z_scores_imputed, on='Sample name')

In [None]:
# Display the merged metadata + z-scores table for visual inspection.
z_scores_initial_df

## Data Set Split

In [None]:
# Classification parameters shared by the cells below.
samples_column = 'Sample name'   # column holding the sample identifiers
classified_by = 'code_oncotree'  # metadata column that carries the class label
target_class = ['ARMS']          # class code(s) passed on as the "true" class


In [None]:
# Classes excluded from modelling: every class code ending in 'NOS' plus the
# explicit 'missing' label.
nos_mask = initial_df[classified_by].str.endswith('NOS', na=False)
NOS_cases = initial_df.loc[nos_mask, classified_by].unique().tolist()
other_cases = ['missing']
cases_to_remove = [*NOS_cases, *other_cases]
ml_initial_df = prep.remove_class(initial_df, cases_to_remove, classified_by)

# Split the remaining samples into training and held-out sets.
training_df, held_out_df = prep.data_split(
    ml_initial_df,
    split_size=0.25,
    classified_by=classified_by,
    export=False,
)

# Z-scores for the training samples, selected by sample identifier.
# The previous positional lookup (.iloc with training_df.index) treated index
# labels of the intensity frame as row positions in the z-score frame, which is
# only correct when both frames are row-aligned and the index is left untouched
# by the class-removal and split helpers. Matching on the identifier column
# does not depend on either assumption.
# Rows keep the order and index labels of z_scores_initial_df.
in_training_set = z_scores_initial_df[samples_column].isin(training_df[samples_column])
z_scores_training_df = z_scores_initial_df[in_training_set]

# Class Specific Workflow

In [None]:

# Proteins considered high-confidence for the target class, derived from the
# binary peptide table with a threshold of 0.7.
entity_proteins_by_peptides = fs.get_high_confidence_proteins(
    peptides_df_binary, target_class, classified_by, threshold=0.7
)

# Binary labelling (target class vs. rest) of the intensity training and
# held-out sets and of the z-score training set.
labeling_kwargs = dict(classified_by=classified_by, true_class=target_class)
entity_training_df = fs.binary_labeling(training_df, **labeling_kwargs)
entity_ho_df = fs.binary_labeling(held_out_df, **labeling_kwargs)
entity_z_scores_train_df = fs.binary_labeling(z_scores_training_df, **labeling_kwargs)

# 1st filter - keep the identifier, class and 'Classifier' columns plus the
# high-confidence proteins; labels absent from a frame are silently skipped
# by DataFrame.filter.
columns_to_keep = [samples_column, classified_by, 'Classifier'] + entity_proteins_by_peptides
entity_training_df = entity_training_df.filter(items=columns_to_keep)
entity_ho_df = entity_ho_df.filter(items=columns_to_keep)
entity_z_scores_train_df = entity_z_scores_train_df.filter(items=columns_to_keep)

## Feature Selection

### Hyperparameters for ElasticNet
Done on z-scores

In [None]:
# Hyperparameter grid search on the z-score training frame.
# NOTE(review): only the first 20 columns are passed, which looks like a
# trial-run restriction - confirm before using the results.
# The positional arguments 4, [0.5] and [1] are forwarded unchanged; their
# meaning is defined by fs.hparameter_grid_search.
grid_search_input = entity_z_scores_train_df.iloc[:, 0:20]
(
    entity_cv_results,
    entity_best_params,
    entity_best_score,
    entity_grid_search_obj,
) = fs.hparameter_grid_search(
    grid_search_input, 4, [0.5], [1], classified_by=classified_by
)

### Feature Selection by ElasticNet Cross-Validation

In [None]:
# Repeated cross-validated elastic-net fit on the z-score training frame.
# NOTE(review): l1_ratio and C are hard-coded here instead of being taken from
# the grid-search results, and only the first 20 columns are used - confirm.
# NOTE(review): 'ARSM_try' may be a transposition of 'ARMS'; it is left as-is
# because it is a runtime label.
elnet_input = entity_z_scores_train_df.iloc[:, 0:20]
entity_cross_val_coeffs = fs.elnet_wrapper(
    elnet_input,
    classified_by=classified_by,
    tumor_type_name='ARSM_try',
    l1_ratio=0.1,
    C=100,
    n_splits=3,
    n_repeats=3,
    n_jobs=3,
    export=False,
)

In [None]:
# Summarise the cross-validation coefficients; the second return value is the
# protein selection used for model fitting below.
(entity_stats_try, entity_proteins) = fs.statistic_from_coefficients(
    entity_cross_val_coeffs,
    target_class,
)

## Model Fitting
Done on intensities

### Reshaping dataset for training and test

In [None]:
# Reshape the intensity training and held-out frames for fitting, using the
# proteins chosen during feature selection.
entity_training_fs = fs.reshape_df_for_fitting(
    entity_training_df,
    entity_proteins,
)
entity_test_fs = fs.reshape_df_for_fitting(
    entity_ho_df,
    entity_proteins,
)


### Hyperparameter Selection for Logistic Regression

In [None]:
# Nested cross-validation on the reshaped training frame, followed by the
# project helper that condenses its results into hyperparameter candidates.
entity_to_nest_cv_results = mf.wrapper_nested_cv(
    entity_training_fs,
    random_state_tries=4,
    n_splits=2,
    classified_by=classified_by,
)
entity_to_nest_hp = mf.nested_cv_hparameters_selection(
    entity_to_nest_cv_results
)

In [None]:
# Pick the candidate with the highest 'avg' value: the candidates become rows
# after the transpose, are sorted in descending order of 'avg', and the label
# of the first row is taken.
ranked_candidates = pd.DataFrame(entity_to_nest_hp).T.sort_values(
    by='avg', ascending=False
)
hyperparameter_C = ranked_candidates.index.tolist()[0]

### Model Fit

In [None]:
# Fit the ridge logistic-regression model on the reshaped training frame with
# the selected hyperparameter.
entity_log_reg_model = mf.logistic_regression_ridge(
    entity_training_fs,
    hyperparameter_C,
    target_class,
    classified_by=classified_by,
)

In [None]:
# Coefficients and class probabilities of the fitted model.
# The held-out data is passed in its reshaped form (entity_test_fs), i.e. with
# the same protein selection as the training frame the model was fitted on.
# Previously the unreshaped entity_ho_df was passed and entity_test_fs was
# never used.
# NOTE(review): confirm that mf.logistic_regression_results expects the
# reshaped frame rather than selecting the features itself.
(
    entity_coefficients,
    entity_train_probabilities,
    entity_test_probabilities,
) = mf.logistic_regression_results(
    entity_log_reg_model,
    entity_training_fs,
    entity_test_fs,
    target_class,
    classified_by=classified_by,
)

In [None]:
# Classification scores computed from the held-out probabilities.
test_entity_scores = mf.classification_scores(
    entity_test_probabilities
)