In [None]:
# For every full-proteome (FP) protein: basket annotation, occurrence, max number of peptides, median number of peptides, max intensity, max fold change


import pandas as pd
import re

FP_KEY = 'Gene names'  # column holding the protein identifier; used as the row index
# Regex patterns for the patient ID columns
REGEX = r'(^\S+-.+-\S)|(Reporter intensity corrected)|(^P\d{6}$)'  # patient identifiers
# Same three identifier families, restricted to columns that end in ' Z-score'.
# The suffix is spelled out inside every alternative because `|` binds looser
# than concatenation: a single trailing ' Z-score' would attach only to the last
# alternative and let the first two match plain (non Z-score) columns.
# NOTE(review): assumes Z-score columns are named '<patient id> Z-score' - confirm.
Z_SCORE_REGEX = r'(^\S+-.+-\S+ Z-score$)|(Reporter intensity corrected.* Z-score$)|(^P\d{6} Z-score$)'
REGEX_META = r'^Identification metadata'  # metadata columns carrying the number of identified peptides in the FP intensity file


def load_intensity_meta_data(instensitypath,key,regex = REGEX_META):
    """
    Load only the metadata columns (plus the key column) of an intensity CSV.

    :instensitypath: path of the CSV file; it is read twice (column discovery, then data)
    :key: name of the column whose values become the row index (the column itself is kept)
    :regex: pattern selecting the metadata columns by name
    :return: DataFrame indexed by ``key`` with duplicated column names removed
             (first occurrence kept) and missing cells replaced by 'num_peptides=0;'
    """
    # A 10-row preview is enough to learn which column names match the pattern.
    preview_df = pd.read_csv(instensitypath, low_memory=False, nrows=10)
    wanted_columns = preview_df.filter(regex=regex).columns.tolist() + [key]

    meta_df = pd.read_csv(instensitypath, low_memory=False, usecols=wanted_columns)
    meta_df.index = meta_df[key]

    first_occurrence_mask = ~meta_df.columns.duplicated()
    meta_df = meta_df.loc[:, first_occurrence_mask]

    # Missing cells are filled with the explicit zero-peptide marker string.
    return meta_df.fillna('num_peptides=0;')



def load_intensity_scores(instensitypath,key = FP_KEY, regex = REGEX):
    """
    Load the columns of an intensity CSV whose names match ``regex``.

    :instensitypath: path of the CSV file
    :key: name of the column whose values become the row index
    :regex: pattern selecting the columns to keep by name
    :return: a copy of the selected columns, indexed by ``key``, with duplicated
             column names removed (first occurrence kept)
    """
    full_df = pd.read_csv(instensitypath, low_memory=False)
    full_df.index = full_df[key]

    selected_df = full_df.filter(regex=regex)
    first_occurrence_mask = ~selected_df.columns.duplicated()
    return selected_df.loc[:, first_occurrence_mask].copy()




def get_pep_number_from_protein_name(intensity_df:pd.DataFrame,protein_name:str,regex=REGEX_META) -> pd.DataFrame:
    """
    Return the number of identified peptides of one protein for every sample.

    :intensity_df: intensity table indexed by protein name, containing one
                   metadata column per sample (selected by ``regex``)
    :protein_name: row label of the protein to look up
    :regex: pattern selecting the metadata columns by name
    :return: DataFrame with one row per metadata column and the columns
             'num_pep' (numeric peptide count) and 'Sample name'
             (column name without the 'Identification metadata ' prefix)
    :USAGE :
        get_pep_number_from_protein_name(fp_df,'EGFR')
    """
    print(protein_name)

    metadata_df = pd.DataFrame(intensity_df).filter(regex=regex)

    # The protein's row: one metadata string per sample, minus the marker and separator.
    raw_row = metadata_df.loc[protein_name,:]
    stripped_row = raw_row.str.replace('num_peptides=|;','', regex=True)
    result_df = pd.DataFrame(stripped_row,columns=[protein_name])

    # 'detected in batch' is rewritten to '0' before the numeric conversion.
    as_text = result_df[protein_name].str.replace('detected in batch','0', regex=True)
    result_df[protein_name] = pd.to_numeric(as_text)

    result_df['Sample name'] = result_df.index.str.replace('Identification metadata ','', regex=True)
    result_df.columns = ['num_pep','Sample name']
    return result_df





In [None]:
# Directory holding the report files read by the following cells.
report_directory = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2023.11.21_CJ_PANCANCER_151'

# Patient annotation (CSV) and cohort metadata (Excel).
sample_annotation_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Searches/patient_annotation_231121_PAN_cancer.csv'
meta_data_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/METADATA_PAN CANCER_Batch155_AS_231116.xlsx'

sample_annotation_df = pd.read_csv(sample_annotation_path)
meta_data_df = pd.read_excel(meta_data_path)


In [None]:
# Locations of the intensity table and the fold-change table inside the report directory.
intensity_file = 'preprocessed_fp.csv'
intensity_path = '/'.join([report_directory, intensity_file])
fc_file_path = '/'.join([report_directory, 'full_proteome_measures_fc.tsv'])

# The same intensity file is read twice: once for the metadata columns,
# once for the patient intensity columns.
intensity_meta_df = load_intensity_meta_data(intensity_path, FP_KEY, regex=REGEX_META)
intesity_scores_df = load_intensity_scores(intensity_path, FP_KEY, regex=REGEX)

# Fold changes, indexed by protein; 'fc_' is removed from the column names.
fc_file_df = pd.read_csv(fc_file_path, sep='\t').set_index(FP_KEY)
fc_file_df.columns = fc_file_df.columns.str.replace('fc_','')

In [None]:
# Proteins present in both the fold-change table and the intensity table,
# kept in fold-change-table order.
common_index = [protein for protein in fc_file_df.index if protein in intesity_scores_df.index]

# Restrict every frame to those proteins so that their rows line up.
intesity_scores_df = intesity_scores_df.loc[common_index]
intensity_meta_df = intensity_meta_df.loc[common_index]
fc_file_df = fc_file_df.loc[common_index]

# Row-wise statistics: highest intensity, highest fold change and the
# number of non-missing intensity values per protein.
max_intensitires = intesity_scores_df.max(axis=1, numeric_only=True)
max_fcs = fc_file_df.max(axis=1, numeric_only=True)
counts_df = intesity_scores_df.count(axis=1)

summary_rows = list(zip(max_intensitires, max_fcs, counts_df))
final_df = pd.DataFrame(summary_rows, columns=['max_intensity','max Fold Change','Occurence'])
final_df.index = common_index
final_df['proteins'] = final_df.index

# adding basket annotations

In [None]:
basket_annotation_path = "/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_MTBs_Evaluation/TUPAC_SCORING_4th gen_230628.xlsx"
# basket_annotation_path = "/home/amir/Desktop/basket_annotation.csv" # the excel file was not opened

# `pd.read_` does not exist; choose the reader from the file extension so that
# both the Excel file and the CSV fallback above can be loaded.
if basket_annotation_path.lower().endswith(('.xlsx', '.xls')):
    basket_annotation_df = pd.read_excel(basket_annotation_path)
else:
    basket_annotation_df = pd.read_csv(basket_annotation_path)

# One ';'-joined string of the distinct baskets per gene (order of first
# appearance), computed in a single pass instead of rescanning the table per protein.
baskets_per_gene = (
    basket_annotation_df
    .dropna(subset=['GENE NAME', 'BASKET'])
    .groupby('GENE NAME', sort=False)['BASKET']
    .agg(lambda baskets: ';'.join(baskets.astype(str).unique()))
)

# Look up every protein of final_df; proteins without annotation get ''.
# Assigned positionally (to_numpy) to avoid chained assignment on the column.
final_df['basket_annotation'] = baskets_per_gene.reindex(final_df.index).fillna('').to_numpy()


In [None]:
# Parse the metadata table to numeric peptide counts ONCE (column by column)
# instead of re-filtering and re-parsing the whole table twice per protein.
# The string handling mirrors get_pep_number_from_protein_name: drop
# 'num_peptides=' and ';', turn 'detected in batch' into '0', convert to numbers.
peptide_counts_df = (
    intensity_meta_df
    .filter(regex=REGEX_META)
    .apply(lambda sample_column: pd.to_numeric(
        sample_column
        .str.replace('num_peptides=|;', '', regex=True)
        .str.replace('detected in batch', '0', regex=True)
    ))
)

# Bring the rows into final_df order (raises KeyError for unknown proteins).
peptide_counts_df = peptide_counts_df.loc[final_df['proteins'].tolist()]

# Row-wise statistics across all samples; the median is truncated to int.
final_df['median_peptides'] = peptide_counts_df.median(axis=1).astype(int).to_numpy()
final_df['max_peptides'] = peptide_counts_df.max(axis=1).to_numpy()

In [None]:
# Export the per-protein summary; the row labels are not written as a separate
# column (they are already available in the 'proteins' column).
output_path = '/home/amir/Desktop/all_proteins_statistics.xlsx'
final_df.to_excel(output_path, index=None)