# Analyze the data from PSORTdb

In [1]:
import pandas as pd
from collections import Counter
import matplotlib.pyplot as plt

Approaches
- df_pbsort_total = select the experimental entries with any localization
- df_epbsort = download the data generated by the database directly from https://db.psort.org/downloads

In [17]:
# Load the Experimental PSORTdb v4.00 table (tab-separated, ISO-8859-1 encoded).
df_epbsort = pd.read_csv(
    "../data/Experimental-PSORTdb-v4.00.tsv",
    sep="\t",
    encoding="ISO-8859-1",
)
# Show the row count together with the first record as a quick sanity check.
len(df_epbsort), df_epbsort.head(1)

(11781,
   SwissProt_ID Refseq_Accession Other_Accession Experimental_Localization  \
 0       P50307              NaN             NaN               Cytoplasmic   
 
   Secondary_Localization  MultipleSCL          ProteinName AltProteinName  \
 0                    NaN            0  Cytoplasmic protein            NaN   
 
   GeneName  TaxID               Organism      Phylum    Class      GramStain  \
 0      NaN   1280  Staphylococcus aureus  Firmicutes  Bacilli  Gram positive   
 
    Comments PMID RefSummary  ePSORTdbVersion  
 0       NaN  NaN        NaN              3.0  )

We use df_epbsort; however, it does not include the protein sequences.

In [18]:
# Work on an independent (deep) copy so the freshly loaded table stays untouched.
df = df_epbsort.copy(deep=True)

# get sequences

## Remove unknown IDs
- Rows without a SwissProt ID are removed, because the ID is needed to get the sequence from the database

In [19]:
# Keep only the rows that have a SwissProt ID (it is used later to fetch the
# sequence) and only the columns used downstream. Columns are selected by name
# rather than dropped by position, so the cell keeps working if the column
# order of the TSV ever changes.
KEPT_COLUMNS = [
    "SwissProt_ID",
    "Experimental_Localization",
    "Secondary_Localization",
    "TaxID",
    "Organism",
    "Phylum",
    "Class",
    "GramStain",
]
# .copy() makes df_filtered an independent frame, so the column assignments in
# later cells do not act on a slice of df.
df_filtered = df.loc[df["SwissProt_ID"].notna(), KEPT_COLUMNS].copy()
len(df_filtered), df_filtered.head(1)

(11355,
   SwissProt_ID Experimental_Localization Secondary_Localization  TaxID  \
 0       P50307               Cytoplasmic                    NaN   1280   
 
                 Organism      Phylum    Class      GramStain  
 0  Staphylococcus aureus  Firmicutes  Bacilli  Gram positive  )

## Uniprot

In [5]:
from bioservices import UniProt
# UniProt client used in the cells below to retrieve entries (and their sequences) by accession.
u = UniProt(verbose=False)

In [20]:
# Drop rows whose SwissProt_ID ends in one of these two-character suffixes
# (presumably versioned accessions that are not plain UniProt IDs — TODO confirm).
SUFFIXES_TO_DROP = (".1", ".2", ".3", ".4", ".6")
# Every suffix is exactly two characters long, so comparing the last two
# characters is the same as testing endswith() for each suffix in turn.
has_dropped_suffix = df_filtered["SwissProt_ID"].str[-2:].isin(SUFFIXES_TO_DROP)
df_filtered = df_filtered[~has_dropped_suffix]
len(df_filtered), df_filtered.head(1)

(10767,
   SwissProt_ID Experimental_Localization Secondary_Localization  TaxID  \
 0       P50307               Cytoplasmic                    NaN   1280   
 
                 Organism      Phylum    Class      GramStain  
 0  Staphylococcus aureus  Firmicutes  Bacilli  Gram positive  )

Localizations:

- Cellwall
- Extracellular
- Cytoplasmic
- CytoplasmicMembrane
- OuterMembrane
- Periplasmic


In [7]:
# List every distinct value of the primary localization column.
df_filtered["Experimental_Localization"].unique()


array(['Cytoplasmic', 'Extracellular', 'CytoplasmicMembrane,Cellwall',
       'CytoplasmicMembrane', 'Cellwall',
       'Periplasmic,CytoplasmicMembrane', 'Periplasmic', 'OuterMembrane',
       'Cytoplasmic,CytoplasmicMembrane', 'OuterMembrane,Extracellular',
       'Cellwall,Extracellular', 'Periplasmic,OuterMembrane',
       'Extracellular,Periplasmic', 'Cytoplasmic,Extracellular',
       'Cytoplasmic,HostAssociated', 'Extracellular,HostAssociated',
       'Cytoplasmic,Periplasmic', 'OuterMembrane,CytoplasmicMembrane',
       'Cytoplasmic,OuterMembrane',
       'Curated for secondary localization only'], dtype=object)

In [24]:
# Primary localization labels that get one indicator column each.
# "CYtoplasmicMembrane" is spelled with a capital Y on purpose: the data is
# renamed the same way so that "Cytoplasmic" is not a substring of it.
locations = [
    "Cellwall",
    "Extracellular",
    "Cytoplasmic",
    "CYtoplasmicMembrane",
    "OuterMembrane",
    "Periplasmic",
]

In [21]:
# Rename CytoplasmicMembrane to CYtoplasmicMembrane so that a substring search
# for "Cytoplasmic" no longer matches membrane entries (avoids false positives).
#
# The result is assigned back instead of calling replace(..., inplace=True) on
# the column accessor: the in-place form works on an intermediate Series and,
# with pandas Copy-on-Write enabled, never updates df_filtered.
# A literal (non-regex) replacement is used so the label is renamed wherever it
# appears in a comma-separated value, not only at the start or the end.
df_filtered = df_filtered.assign(
    Experimental_Localization=df_filtered["Experimental_Localization"].str.replace(
        "CytoplasmicMembrane", "CYtoplasmicMembrane", regex=False
    )
)

In [22]:
def mark_sublocation(df, location, column="Experimental_Localization"):
    """Flag the rows of ``df`` whose ``column`` text contains ``location``.

    The indicator column ``df[location]`` is set to 1 on every matching row.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame holding the text column to search.
    location : str
        Label to look for; also the name of the indicator column that is set.
        It is matched as literal text (substring match), not as a regular
        expression.
    column : str, default "Experimental_Localization"
        Name of the text column that is searched.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # na=False: missing values count as "no match", so the mask is strictly
    # boolean and can be used for indexing even when the column contains NaN.
    mask = df[column].str.contains(location, regex=False, na=False)
    # Index positionally with the boolean mask instead of with the index labels
    # of the matching rows, so that rows sharing an index label with a match
    # are not flagged by accident.
    df.loc[mask.to_numpy(dtype=bool), location] = 1
    return df

In [25]:
# One indicator column per primary localization: start at 0 everywhere,
# then let mark_sublocation set the matching rows to 1.
for location_name in locations:
    df_filtered[location_name] = 0
    df_filtered = mark_sublocation(df_filtered, location_name)

In [26]:
# Continue on a separate copy so df_filtered keeps only the primary-localization columns.
df2 = df_filtered.copy()
# Missing secondary localizations are replaced with the placeholder "pepe" so the
# column holds only strings for the substring search in the following cells.
# The values are taken from df2 itself rather than from the unfiltered df, which
# only gave the right rows through implicit index alignment.
df2["Secondary_Localization"] = df2["Secondary_Localization"].fillna("pepe")

In [27]:
# Secondary localization labels that get one indicator column each.
sublocations = [
    "Flagellar",
    "Fimbrial",
    "HostAssociated",
    "GasVesicle",
    "T3SS",
    "Capsule",
    "S-layer",
    "Spore outer coat",
    "Outer Membrane Vesicle",
]

In [28]:
def mark_sublocation2(df, location):
    """Flag the rows of ``df`` whose ``Secondary_Localization`` contains ``location``.

    The indicator column ``df[location]`` is set to 1 on every matching row.
    ``df`` is modified in place and also returned.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with a ``Secondary_Localization`` text column.
    location : str
        Label to look for; also the name of the indicator column that is set.
        It is matched as literal text (substring match), not as a regular
        expression.

    Returns
    -------
    pandas.DataFrame
        The same ``df`` object that was passed in.
    """
    # na=False: missing values count as "no match", so the mask is strictly
    # boolean even when the column still contains NaN.
    mask = df["Secondary_Localization"].str.contains(location, regex=False, na=False)
    # Index positionally with the boolean mask instead of with the index labels
    # of the matching rows, so that rows sharing an index label with a match
    # are not flagged by accident.
    df.loc[mask.to_numpy(dtype=bool), location] = 1
    return df

In [29]:
# One indicator column per secondary localization: start at 0 everywhere,
# then let mark_sublocation2 set the matching rows to 1.
for secondary_label in sublocations:
    df2[secondary_label] = 0
    df2 = mark_sublocation2(df2, secondary_label)

In [19]:
# Remove one specific entry because it has no sequence (disabled; kept for reference)
# u.get_df(['Q9HVAZ']).set_index('Entry').loc['Q9HVAZ']
# print(u.search("Q9HVAZ", columns="sequence"))
# print("no seq")
# df_filtered = df_filtered[df_filtered['SwissProt_ID'] != 'Q9HVAZ']

Obtain the sequences from UniProt

In [24]:
# Accession IDs in row order; display the first one as a spot check.
refs = df2["SwissProt_ID"].tolist()
refs[0]

'P50307'

In [39]:
# Retrieve a single known entry and display the sequence stored in it.
example_entry = u.retrieve("P50307")
example_entry["sequence"]["value"]

'MLNNKRLFTSESVTEGHPDKIADQVSDAILDAILKDDPNARVACETTVTTGMALIAGEISTTTYVDIPKVVRETIKEIGYTRAKYGYDYETMAILTAIDEQSPDIAQGVDKALEYRDKDSEEEIEATGAGDQGLMFGYATNETETYMPLAIYLSHQLAKRLSDVRKDGTLNYLRPDGKVQVTVEYDENDNPVRIDTIVVSTQHAEDVTLEQIQEDIKAHVIYPTVPENLINEQTKFYINPTGRFVIGGPQGDAGLTGRKIIVDTYGGIARHGGGCFSGKDPTKVDRSAAYAARYVAKNIVAAGLADQCEVQLAYAIGVAEPVSIAIDTFGTGKVSEGQLVEAVRKHFDLRPAGIIKMLDLKQPIYKQTAAYGHFGRTDVLFPWEKLDKVEELKDAVK'

In [None]:
# obtain the seq
# Retrieve the sequence of every accession in df2 through the UniProt client.
# Produces:
#   seqs         - one sequence per row whose accession could be resolved, in row order
#   IDs_no_found - the accessions for which no sequence could be obtained
refs = list(df2["SwissProt_ID"])

# Outcome per accession: the sequence, or None when retrieval failed.
# Caching per accession means a repeated ID is requested only once and always
# gets the same outcome, so len(seqs) matches the number of rows left after
# the rows of the failed accessions are removed.
seq_by_id = {}
for idx, ref in enumerate(refs):
    if ref not in seq_by_id:
        try:
            seq_by_id[ref] = u.retrieve(ref)["sequence"]["value"]
        except Exception:
            # Exception (not a bare except) so that a keyboard / kernel
            # interrupt still stops this long-running loop.
            seq_by_id[ref] = None
    # Report progress occasionally instead of printing one line per row.
    if idx % 500 == 0:
        print(idx, ref)

seqs = [seq_by_id[ref] for ref in refs if seq_by_id[ref] is not None]
IDs_no_found = [ref for ref, seq in seq_by_id.items() if seq is None]
print("done")

In [46]:
# Remove every row whose accession could not be resolved to a sequence.
# A single isin() mask replaces the per-ID loop, which re-filtered the whole
# frame once per failed ID and shadowed the builtin id() with its loop variable.
df2 = df2[~df2["SwissProt_ID"].isin(IDs_no_found)]

In [48]:
# Attach the retrieved sequences as a new "Sequence" column
# (seqs must hold exactly one entry per remaining row, in row order).
df2 = df2.assign(Sequence=seqs)

In [58]:
# Independent (deep) copy of the annotated table for the de-duplication step.
df_seq = df2.copy(deep=True)

In [None]:
# De-duplicate in two passes, keeping the first occurrence each time:
# first by accession, then by identical sequence.
df_seq = (
    df_seq
    .drop_duplicates(subset=["SwissProt_ID"], keep="first")
    .drop_duplicates(subset=["Sequence"], keep="first")
)

In [61]:
# Write the filtered, sequence-annotated table to disk (the index is written as the first column).
output_csv = "../data/PSORTdb_filtered.csv"
df_seq.to_csv(output_csv)