In [None]:
# needed functions and libraries
import pandas as pd
import seaborn as sns
from pathlib import Path
import signature_functions as sf

def signature_pipeline(entity,input_df,list_proteins,report_dir,level='FP',
                       k_folds=5,repeats=1,threshold=0.7):
    """Run the univariate CV analysis and the one-vs-all t-test, merge and save them.

    Parameters
    ----------
    entity
        Label handed to both ``sf`` analyses; also part of the output file name.
    input_df
        Table handed unchanged to both ``sf`` analyses.
    list_proteins
        Feature identifiers handed unchanged to both ``sf`` analyses.
    report_dir
        Directory (str or Path) in which the Excel report is written.
    level
        Tag used as the prefix of the output file name (default ``'FP'``).
    k_folds, repeats, threshold
        Forwarded to ``sf.univariate_analysis_with_CV_for_all_proteins`` as
        ``K_folds``, ``repeats`` and ``threshold``. The defaults reproduce the
        values that used to be hard-coded here.

    Returns
    -------
    The t-test result merged with the univariate result. The same object is
    written to ``<report_dir>/<level>_signatures_<entity>.xlsx``.
    """
    # Univariate analysis with cross-validation.
    roc_univariate_df = sf.univariate_analysis_with_CV_for_all_proteins(
        input_df,
        list_proteins,
        entity,
        K_folds=k_folds,
        repeats=repeats,
        threshold=threshold,
    )

    # One-vs-all t-test; the grouping column is read from the `sf` module
    # attribute `entity_subtypes`, so it must be set before calling this function.
    t_test_df = sf.one_vs_all_t_test(
        input_df,
        list_proteins,
        entity,
        sf.entity_subtypes,
    )

    # Join on the feature name: 'Gene Names' on the t-test side, 'names' on the
    # univariate side. No `how` is given, so merge's default join type applies.
    t_test_univariate_df = t_test_df.merge(
        roc_univariate_df,
        right_on='names',
        left_on='Gene Names',
    )

    output_path = Path(report_dir) / f'{level}_signatures_{entity}.xlsx'
    t_test_univariate_df.to_excel(output_path)
    print(f'{entity}_Finished')
    return t_test_univariate_df

# Inputs: metadata, FP/PP intensities, and phospho-protein, kinase and basket scores

In [None]:

# Locations of the cohort report folder and of the sample metadata sheet.
report_dir = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.06.22_AhS_PAPER_COHORT'
meta_file_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/Metadata_Papercohort_230801.xlsx'

# Comparison setup. The grouping column is stored as an attribute on the `sf`
# module because signature_pipeline reads it back from sf.entity_subtypes.
sf.entity_subtypes = 'Input Material'
entity = 'Lysate'
list_interesting = ['Lysate', 'Kryotissue']

# Metadata, restricted to samples whose "Input Material" is one of the values of interest.
meta_df = pd.read_excel(meta_file_path)
meta_df = meta_df.loc[meta_df["Input Material"].isin(list_interesting)]

# Intensity tables, read through the project helper with their identifier column.
fp_intensity_file = sf.read_intensity_file(f'{report_dir}/preprocessed_fp.csv', 'Gene names')
pp_intensity_file = sf.read_intensity_file(f'{report_dir}/preprocessed_pp.csv', 'Modified sequence')

# Score tables (tab-separated), indexed by their identifier column.
phosphorylation_intensity_file = (
    pd.read_csv(f'{report_dir}/protein_results/protein_scores.tsv', sep='\t')
    .set_index('Gene names')
)
kinase_intensity_file = (
    pd.read_csv(f'{report_dir}/kinase_results/kinase_scores.tsv', sep='\t')
    .set_index('PSP Kinases')
)

# The basket score file is indexed by 'Sample' and then transposed.
# `path_to_file` keeps pointing at the last file read, as before.
path_to_file = f'{report_dir}/basket_scores_4th_gen.tsv'
basket_intensity_file = pd.read_csv(path_to_file, sep='\t').set_index('Sample').T

In [None]:
# Run the signature pipeline once per data level. The dict keys are the level
# tags that end up in the output file names; insertion order fixes the run order.
for level, intensity_file in {
    'FP': fp_intensity_file,
    'PP': pp_intensity_file,
    'Phsopho_scores': phosphorylation_intensity_file,
    'kinase_score': kinase_intensity_file,
    'basket_score': basket_intensity_file,
}.items():
    # Build the test input from the table and the metadata, using the
    # project helper's patient-count and expression-percentage settings.
    input_df, list_proteins = sf.prepareDataframeforTest(
        intensity_file,
        meta=meta_df,
        minimum_patients_per_entity=8,
        protein_expressed_in_at_least_percent=70,
    )

    # Signature by t-test plus univariate analysis; writes one Excel file per level.
    res = signature_pipeline(entity, input_df, list_proteins, report_dir, level=level)

# Removing the protocol effect:
- Remove the proteins at FP level that are significantly differentially expressed between Kryo tissue and Lysate samples.

In [None]:
# Load the FP-level signature table written by signature_pipeline for 'Lysate'.
path_to_the_file = Path(report_dir).joinpath('FP_signatures_Lysate.xlsx')
signature_df = pd.read_excel(path_to_the_file)
print(f'Number of the proteins before removing:{len(signature_df)}')

# Keep only the rows whose 'fdr' is above 1e-16; rows at or below it are dropped.
# NOTE(review): the earlier comment described this as "less than 0.01% FDR",
# which does not match the 1e-16 cutoff in the code — confirm the intended threshold.
filtered_data = signature_df.loc[signature_df['fdr'].gt(1e-16)]

# Write the remaining gene names, one per line, without index or header.
filtered_data["Gene Names"].to_csv('~/Desktop/final_protein.txt',index=False,header=False)
print(f'Number of the proteins after removing:{len(filtered_data)}')