In [10]:
import os
from typing import Dict, Union

import numpy as np
import pandas as pd
# import sys
# !{sys.executable} -m pip install picked_group_fdr
import picked_group_fdr.digest as digest
# !{sys.executable} -m pip install tables --upgrade

In [20]:
# Input paths: the search's txt output folder and the mixed FASTA database.
# The commented-out lines are alternative paths through a home-directory mount;
# note that the alternative `location` refers to a different batch (BatchPDX1_PP).
location = '/media/kusterlab/internal_projects/active/TOPAS/WP31/Searches/Workflow_Test/BatchPDX2_FP_mixedFASTA/combined/txt'
# location = '/home/cjensen/kusterlab_home/kusterlab/internal_projects/active/TOPAS/WP31/Searches/Workflow_Test/BatchPDX1_PP_mixedFASTA/combined/txt'
mixed_fasta = '/media/kusterlab/internal_projects/active/TOPAS/Databases/uniprotkb_pdx_2023_07_03.fasta'
# mixed_fasta = '/home/cjensen/kusterlab_home/kusterlab/internal_projects/active/TOPAS/Databases/uniprotkb_pdx_2023_07_03.fasta'

# Load the tab-separated evidence table found in `location`
pdx_df = pd.read_csv(
    os.path.join(location, "evidence_mixed.txt"),
    sep='\t',
)


In [12]:
def get_organism_single(name: str) -> str:
    """Extract the organism code from a '|'-separated FASTA header.

    The third '|'-separated field is treated as the entry name (for example
    ``P53_HUMAN``) and the part after its first underscore is returned.
    The codes 'MUSMB', 'MUSMC' and 'MUSMM' are collapsed to 'MOUSE'.

    Args:
        name: Header such as ``sp|P04637|P53_HUMAN``. A description that
            follows the entry name after whitespace is ignored.

    Returns:
        The organism code, e.g. 'HUMAN' or 'MOUSE'.

    Raises:
        IndexError: If the header has fewer than three '|'-separated fields
            or the entry name contains no underscore.
    """
    # Keep only the first whitespace-delimited token of the third field so that a
    # trailing description ("P53_HUMAN Cellular tumor ...") cannot leak into the result.
    entry_name = name.split("|")[2].split(None, 1)[0]
    organism = entry_name.split("_")[1]
    # Several mouse-related codes are reported under one common label.
    if organism in ('MUSMB', 'MUSMC', 'MUSMM'):
        return 'MOUSE'
    return organism

# Function to retrieve organism from FASTA and add information to evidence file
def add_organism(df: pd.DataFrame, fasta: str = '/media/kusterlab/internal_projects/active/TOPAS/Databases/uniprotkb_pdx_2023_07_03.fasta') -> pd.DataFrame:
    """Add an 'Organism' column to an evidence table using a FASTA file.

    Each header yielded by ``digest.readFastaMaxQuant`` is split on '|': the
    second field becomes the lookup key and ``get_organism_single`` supplies
    the organism. Every entry of the 'Proteins' column is then translated by
    ``get_organism``.

    Args:
        df: Table with a 'Proteins' column. It is modified in place.
        fasta: Path to the FASTA file the organisms are read from.

    Returns:
        The same ``df`` object, with the 'Organism' column added or overwritten.
    """
    # Get dict with fasta uniprot keys and organism values
    uniprot_organism = {
        name.split("|")[1]: get_organism_single(name)
        for name, _ in digest.readFastaMaxQuant(fasta, db="concat")
    }
    # Add organism from matching uniprot ids to column. The assignment creates the
    # column if it does not exist yet, so no NaN pre-fill is required.
    df['Organism'] = df['Proteins'].apply(lambda ids: get_organism(ids, uniprot_organism))
    return df

def get_organism(uniprotIds: Union[str, float], uniprot_organism: Dict) -> Union[str, float]:
    """Translate a ';'-separated list of protein identifiers into organisms.

    Args:
        uniprotIds: ';'-separated identifiers, or a non-string value
            (e.g. NaN) when no protein is listed.
        uniprot_organism: Mapping from identifier to organism label.

    Returns:
        The distinct organism labels, sorted and joined with ';'. Identifiers
        missing from ``uniprot_organism`` are labelled 'CON'. For non-string
        input ``np.nan`` is returned.
    """
    # isinstance instead of an exact type comparison, so that str subclasses
    # (numpy.str_, for instance) are processed rather than silently turned into NaN.
    if not isinstance(uniprotIds, str):
        return np.nan

    # NOTE(review): every identifier absent from the mapping falls back to 'CON',
    # whatever the reason it is absent - confirm this is intended for all inputs.
    organisms = {uniprot_organism.get(p, 'CON') for p in uniprotIds.split(';')}
    return ";".join(sorted(organisms))


In [21]:
# Annotate every evidence row with the organism(s) of its listed proteins,
# then print the first 50 rows as a quick check
pdx_df = add_organism(df=pdx_df, fasta=mixed_fasta)
print(pdx_df.head(n=50))


                                     Sequence  Length  \
0                      AAAAAAAAAAAAAAAGAGAGAK      22   
1                AAAAAAAAAAGDSDSWDADTFSMEDPVR      28   
2                                  AAAAAAAAAK      10   
3               AAAAAAAAAPAAAATAPTTAATTAATAAQ      29   
4                                AAAAAAAAAVSR      12   
5                   AAAAAAAGDSDSWDADAFSVEDPVR      25   
6                  AAAAAAAGDSDSWDADAFSVEDPVRK      26   
7                                 AAAAAAALQAK      11   
8                                 AAAAAAALQAK      11   
9                     AAAAAAAPSGGGGGGEEERLEEK      23   
10                AAAAAAAVGGQQPSQPELPAPGLALDK      27   
11                      AAAAAAGAASGLPGPVAQGLK      21   
12        AAAAAAVGPGAGGAGSAVPGGAGPCATVSVFPGAR      35   
13                                 AAAAADLANR      10   
14                  AAAAAEQQQFYLLLGNLLSPDNVVR      25   
15                          AAAAASAAGPGGLVAGK      17   
16                      AAAAASA

In [22]:
# Number of evidence rows per organism combination
pdx_df["Organism"].value_counts()

Organism
HUMAN;MOUSE        59347
HUMAN              44423
MOUSE              13019
CON                  733
CON;HUMAN            530
CON;HUMAN;MOUSE      473
CON;MOUSE             75
Name: count, dtype: int64

In [23]:
# Export the organism-annotated evidence table and two filtered subsets.
# `pdx_df` is intentionally not rebound in this cell: overwriting it with each
# successive filter meant that re-running the cell wrote the already-filtered
# frame to "evidence_organism.txt". Every subset is derived from the unmodified
# frame instead, so the cell can be re-run safely.

# 1) Save new evidence file: all rows, including the 'Organism' column
pdx_df.to_csv(os.path.join(location, "evidence_organism.txt"), sep='\t', index=False)

# 2) Rows labelled 'HUMAN' or 'HUMAN;MOUSE'
human_mouse_df = pdx_df[pdx_df['Organism'].isin(['HUMAN', 'HUMAN;MOUSE'])]
human_mouse_df.to_csv(os.path.join(location, "evidence_human_mouse.txt"), sep='\t', index=False)

# 3) Rows labelled exactly 'HUMAN' (same rows the chained filter used to produce)
human_only_df = pdx_df[pdx_df['Organism'].isin(['HUMAN'])]
human_only_df.to_csv(os.path.join(location, "evidence_human_only.txt"), sep='\t', index=False)



