# In [81]:
import pandas as pd
# Column-name pattern for the per-sample (patient) columns. A column is kept
# when its name either starts with "<non-space>-<anything>-<non-space>",
# contains "Reporter intensity corrected", or is exactly "P" plus six digits.
REGEX = r'(^\S+-.+-\S)|(Reporter intensity corrected)|(^P\d{6}$)'  # patient identifiers
# NOTE(review): "|" has the lowest precedence, so the trailing " Z-score" below
# belongs to the third alternative only, and that alternative ("$" followed by
# more text) cannot match without re.MULTILINE. If the intent is "any patient
# column ending in ' Z-score'", the alternatives need a shared group - confirm.
Z_SCORE_REGEX = r'(^\S+-.+-\S)|(Reporter intensity corrected)|(^P\d{6}$) Z-score'
# Column-name prefix of the per-sample metadata columns.
REGEX_META = r'^Identification metadata'  # to get the number of identified peptides in the FP intensity file

def load_intensity_scores(instensitypath, key='Gene names', regex=REGEX):
    """
    Read an intensity CSV and keep only the columns whose name matches `regex`.

    :instensitypath: path of the CSV file to read
    :key: name of the column whose values become the row index
    :regex: pattern passed to `DataFrame.filter` to select the columns to keep
    :return: a copy of the selected columns, indexed by the values of `key`,
             with repeated column names reduced to their first occurrence
    """
    table = pd.read_csv(instensitypath, low_memory=False)
    # Index by the key column before filtering, because the filter usually
    # drops the key column itself.
    table.index = table[key]
    matched = table.filter(regex=regex)
    first_occurrence = ~matched.columns.duplicated()
    return matched.loc[:, first_occurrence].copy()


def unnest_proteingroups(df: pd.DataFrame) -> pd.DataFrame:
    """
    Unnest the protein groups "A;B" into two separate rows with the same values.

    The protein groups are the index of the pandas dataframe `df`. Each index
    label is split on ';' and every member gets its own row carrying a copy of
    the original row's values.

    :df: dataframe whose (string) index holds ';'-separated protein groups
    :return: a new dataframe with one row per group member; its index is named
             'index'. The input dataframe is left untouched.
    """
    # Work on a copy: assigning the helper column directly on `df` would leave
    # a stray 'index' column behind in the caller's dataframe.
    unnested = df.copy()
    unnested['index'] = unnested.index.str.split(';')
    unnested = unnested.explode('index')
    return unnested.set_index('index')



def load_intensity_meta_data(instensitypath, key, regex=REGEX_META):
    """
    Read only the metadata columns (plus the key column) of an intensity CSV.

    The first 10 rows are read once to discover which column names match
    `regex`; the file is then read again restricted to those columns and `key`.

    :instensitypath: path of the CSV file to read
    :key: name of the column whose values become the row index
    :regex: pattern passed to `DataFrame.filter` to select the metadata columns
    :return: dataframe indexed by the values of `key`, with repeated column
             names reduced to their first occurrence and every missing value
             replaced by the string 'num_peptides=0;'
    """
    preview = pd.read_csv(instensitypath, low_memory=False, nrows=10)
    wanted_columns = list(preview.filter(regex=regex).columns) + [key]

    meta = pd.read_csv(instensitypath, low_memory=False, usecols=wanted_columns)
    meta.index = meta[key]
    meta = meta.loc[:, ~meta.columns.duplicated()]
    return meta.fillna('num_peptides=0;')


def get_pep_number_from_protein_name(intensity_df: pd.DataFrame, protein_name: str, regex) -> "float | None":
    """
    Get the median number of identified peptides of a protein across all patients.

    :intensity_df: A pandas dataframe of the intensities with Identification metadata columns for each patient,
                   indexed by protein name
    :protein_name: the name of the protein (a label of the dataframe index)
    :regex: pattern selecting the metadata columns to take into account
    :return: the median of the per-column peptide counts, or None when the
             protein cannot be looked up or its values cannot be parsed
    :USAGE :
        get_pep_number_from_protein_name(fp_df,'EGFR', REGEX_META)

    """
    try:
        # One row of the table, turned into a single-column frame whose index
        # holds the original column names so they can be filtered by regex.
        per_sample = pd.DataFrame(intensity_df.loc[protein_name, :])
        per_sample = per_sample.filter(regex=regex, axis=0)
        # NOTE(review): if `protein_name` occurs more than once in the index,
        # `.loc` yields several rows and the column lookup below raises
        # KeyError, i.e. the result is None - confirm this is acceptable.
        counts = per_sample[protein_name].str.replace('num_peptides=|;', '', regex=True)
        # NOTE(review): 'detected in batch' is counted as 0 peptides; this
        # assumes the text never follows a number in the same cell - confirm.
        counts = counts.str.replace('detected in batch', '0', regex=True)
        return pd.to_numeric(counts).median()
    except (KeyError, ValueError, TypeError, AttributeError):
        # Best effort: unknown protein, non-string cells or unparsable counts
        # give None. Unlike a bare `except`, Ctrl-C still interrupts a long
        # `apply` loop.
        return None


def get_occurence(identifier, intensity_scores):
    """
    Count the non-missing values in the row(s) of `intensity_scores` labelled `identifier`.

    :identifier: a label of the dataframe index (e.g. a gene name)
    :intensity_scores: dataframe with one column per sample
    :return: the number of non-NaN cells in the row; a Series of per-row counts
             when the label occurs more than once in the index; None when the
             label cannot be looked up
    """
    try:
        # Look up only the requested row instead of transposing and masking
        # the whole table on every call.
        selected = intensity_scores.loc[identifier]
    except (KeyError, TypeError):
        return None
    if isinstance(selected, pd.DataFrame):
        # A repeated index label selects several rows: count each separately.
        return selected.count(axis=1)
    return selected.count()


# In [23]:
# Annotate the genes listed in Immunotherapy_FFPE.csv with how many samples
# they were quantified in and their median number of identified peptides.
# Both paths below point at the same preprocessed_fp.csv; it is read once for
# the per-sample intensity columns and once for the metadata columns.
intensitty_path = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2024.01.17_CJ_pancancer_169/preprocessed_fp.csv'
intensity_meta_path =  '/media/kusterlab/internal_projects/active/TOPAS/WP31/Playground/Retrospective_study/2024.01.17_CJ_pancancer_169/preprocessed_fp.csv'
# Intensities indexed by 'Gene names', with ';'-separated groups split into separate rows.
intensity_scores = load_intensity_scores(intensitty_path)
intensity_scores = unnest_proteingroups(intensity_scores)
# Metadata columns (matching REGEX_META) indexed and split the same way.
intensity_meta = load_intensity_meta_data(intensity_meta_path,'Gene names')
intensity_meta = unnest_proteingroups(intensity_meta)
df = pd.read_csv('Immunotherapy_FFPE.csv')
# Number of non-missing intensity values per gene (None when the gene is not in the table).
df['occurence'] = df['Gene names'].apply(lambda x:get_occurence(x,intensity_scores))
# Median peptide count per gene across the metadata columns (None when unavailable).
df['median_num_pep'] = df['Gene names'].apply(lambda x:get_pep_number_from_protein_name(intensity_meta,x,regex=REGEX_META))
# NOTE(review): index=None is presumably meant as index=False (do not write the row index) - confirm.
df.to_excel('Immunotherapy_FFPE.xlsx',index=None)