In [1]:
import re
import numpy as np
import pandas as pd
import xlwt
import os
import ete3
ncbi = ete3.NCBITaxa()
#ncbi.update_taxonomy_database()

Downloading taxdump.tar.gz from NCBI FTP site (via HTTP)...
Done. Parsing...


Loading node names...
1677281 names loaded.
195565 synonyms loaded.
Loading nodes...
1677281 nodes loaded.
Linking nodes...
Tree is loaded.
Updating database: /Users/thesis/.etetoolkit/taxa.sqlite ...
 1677000 generating entries... 0 generating entries... generating entries...  generating entries... 274000 generating entries...  generating entries... generating entries...  generating entries...  generating entries... generating entries... 
Uploading to /Users/thesis/.etetoolkit/taxa.sqlite


Inserting synonyms:      10000 




Inserting taxid merges:  15000  




Inserting taxids:       10000  




Inserting taxids:       1675000                     




Choose species

In [62]:
species ="Yersinia pestis"

Read in exported data files

In [63]:
HPIDB = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/HPIDB2.0/Mitabs/" + species + "/" + species + ".txt", names=["Protein_A","Protein_B","Alt_A","Alt_B","D_method","PubmedID","Taxid_A","Taxid_B","Int_type","Source_DB","Conf_score"], usecols=[0,1,2,3,6,8,9,10,11,12,14], header=0)
IntAct = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/IntAct/" + species, names=["Protein_A","Protein_B","Alt_A","Alt_B","D_method","PubmedID","Taxid_A","Taxid_B","Int_type","Source_DB","Conf_score"], usecols=[0,1,2,3,6,8,9,10,11,12,14], header=0)
PHISTO = pd.read_excel("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/PHISTO/" + species + ".xls", names= ["Taxid_A", "Protein_A", "Protein_B", "D_method","PubmedID"], usecols=[1,2,4,6,7])
print(HPIDB.Source_DB.unique())
print (len(HPIDB), len(IntAct),len(PHISTO))

['psi-mi:MI:0469(IntAct)']
1371 1358 1331


IntAct and HPIDB sets are basically the same apart from their taxid columns, thus we'll clean up the taxid column of IntAct, merge the two and then clean up the rest

It should also be noted that the taxid cleaning up has one step that needs to 'supervised' because sometimes the procedures creates NaN values which need to be checked manually.

In [64]:
r = IntAct.Taxid_A
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.Taxid_A = r

r = IntAct.Taxid_B
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.Taxid_B = r

In [65]:
df=pd.concat([HPIDB,IntAct])
df=df.reset_index(drop=True)

In [66]:
len(df)

2729

In [67]:
taxids= list(pd.unique(list(df.Taxid_A.unique())+list(df.Taxid_B.unique())))
parent= ncbi.get_name_translator([species])
taxid_list=[]
for taxid in taxids:
    lineage = ncbi.get_taxid_translator(ncbi.get_lineage(taxid))
    
    if parent.get(species)[0] in lineage:
        taxid_list.append(taxid)
taxid_list.append('9606')

In [68]:
print(ncbi.get_taxid_translator(taxid_list))
sel1 = df.Taxid_A.apply(lambda x: str(x) in taxid_list)
sel2 = df.Taxid_B.apply(lambda x: str(x) in taxid_list)
df = df.loc[sel1 & sel2,:]

{9606: 'Homo sapiens', 119856: 'Francisella tularensis subsp. tularensis', 177416: 'Francisella tularensis subsp. tularensis SCHU S4'}


In [69]:
len(df)

2729

In [70]:
for name in ["D_method","Int_type","Source_DB"]:
    r = df[name]
    r = r.str.extract((r'(?:psi-mi:"?MI:[0-9]*"?[(])([^)]*)(?:[)])') ,expand = False)
    df[name] = r


for name in ["Protein_A","Protein_B"]:
    r = df[name]
    r = r.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
    df[name] = r
    

r = df.PubmedID
r = r.str.extract(r'(?:[^p]*)(?:pubmed:)([0-9]*)(?:.*)',expand=False)
df.PubmedID = r

r = df.Conf_score
r = r.str.extract(r'(?:intact-miscore:)([0-9.]*)', expand=False)
df.Conf_score = r

Some entries don't have uniprot ID
We'll search the alt ID column for a uniprot ID
However, in the particular case of B anthracis, there are no Alt Ids available => entries 714-761 in intact file for protein A
Protein B is spread across the dataset. 

Upon further inspection most of these entries trace back do deleted entries in the uniprotKB database.
    Some of these due to deletion. We don't know (yet) if this deletion is due to being wrong or upon request of original submitters
        Others due to redundancy. Trace back to the non redundant entry might be done by downloading the sequence and finding it in the uniprot database? example entry index 56 of intact file
        EBI-2810906 => a0a1v4b700_bacan => 
The other entries aren't even traceable

In [71]:
sel1 = df.Protein_A.isnull()
subset1 = df.loc[sel1,"Alt_A"]
sel2 = df.Protein_B.isnull()
subset2 = df.loc[sel2,"Alt_B"]

for name in [subset1,subset2]:
    name = name.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
#code to replace empty protein IDs with the alternative ones

In [72]:
sel1 = df.Protein_A.isnull()
sel2 = df.Protein_B.isnull()
df = df.loc[~sel1 & ~sel2,:]
df = df.reset_index(drop=True)

In [73]:
len(df)

2637

In [74]:
df = df.drop(["Alt_A","Alt_B"],axis=1)

Sort columns so pathogen is 'A' and human is 'B'

In [75]:
for i in range(len(df)):
    if df.loc[i,"Taxid_B"] != '9606':
        A = df.loc[i,"Taxid_A"]
        B = df.loc[i,"Taxid_B"]
        C = df.loc[i,"Protein_A"]
        D = df.loc[i,"Protein_B"]
      

        df.loc[i,"Taxid_A"] = B
        df.loc[i,"Taxid_B"] = A
        df.loc[i,"Protein_A"]= D
        df.loc[i,"Protein_B"]= C

Filter out non-HPIs

In [76]:
sel1 = df.Taxid_B == '9606'
df = df.loc[sel1,:]

Make identifier df

In [77]:
ID_all =[]
for i in range (0,len(df)):
    ID_list=[]
    ID_list.append (df.iloc[i,0])
    ID_list.append (df.iloc[i,1])
    ID_list.append (df.iloc[i,3])
    ID_list.append (df.iloc[i,2])
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + str(ID_list[2])
        +ID_list[3]
    )
df=df.assign(ID=ID_all)

In [78]:
print(len(df.ID.unique()))
df.PubmedID.unique()

1307


array(['20711500'], dtype=object)

Make identifier PHISTO and make Taxid_B column

In [79]:
ID_all =[]
for i in range (0,len(PHISTO)):
    ID_list=[]
    ID_list.append (PHISTO.iloc[i,1])
    ID_list.append (PHISTO.iloc[i,2])
    ID_list.append (PHISTO.iloc[i,3])
    ID_list.append (str(PHISTO.iloc[i,4]))
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + ID_list[2]
        + ID_list[3]
    )
PHISTO=PHISTO.assign(ID=ID_all)

PHISTO["Taxid_B"]="9606"
PHISTO["Source_DB"]="PHISTO"

In [80]:
print(len(PHISTO.ID.unique()))
print(len(PHISTO))

1331
1331


merge df and PHISTO

In [81]:
df = pd.concat([df,PHISTO])
len(df)

2638

Remove redundancy

In [82]:
df = df.sort_values(by = "Conf_score", ascending = False)
df = df.drop_duplicates("ID")
print(len(df))

1335


Unnamed: 0,Conf_score,D_method,ID,Int_type,Protein_A,Protein_B,PubmedID,Source_DB,Taxid_A,Taxid_B
1330,0.37,two hybrid pooling approach,20711500P04233Q5NES9two hybrid pooling approach,physical association,Q5NES9,P04233,20711500,IntAct,177416,9606
2188,0.37,two hybrid pooling approach,20711500Q5NHR0Q8IWA4two hybrid pooling approach,physical association,Q5NHR0,Q8IWA4,20711500,IntAct,177416,9606
2206,0.37,two hybrid pooling approach,20711500O60664Q5NGP5two hybrid pooling approach,physical association,Q5NGP5,O60664,20711500,IntAct,177416,9606
2205,0.37,two hybrid pooling approach,20711500Q5NHE7Q96QZ7two hybrid pooling approach,physical association,Q5NHE7,Q96QZ7,20711500,IntAct,177416,9606
2204,0.37,two hybrid pooling approach,20711500Q5NEC0Q96QZ7two hybrid pooling approach,physical association,Q5NEC0,Q96QZ7,20711500,IntAct,177416,9606
2203,0.37,two hybrid pooling approach,20711500P27448Q5NGV3two hybrid pooling approach,physical association,Q5NGV3,P27448,20711500,IntAct,177416,9606
2202,0.37,two hybrid pooling approach,20711500P27816Q5NF74two hybrid pooling approach,physical association,Q5NF74,P27816,20711500,IntAct,177416,9606
2201,0.37,two hybrid pooling approach,20711500P27448Q5NHK6two hybrid pooling approach,physical association,Q5NHK6,P27448,20711500,IntAct,177416,9606
2200,0.37,two hybrid pooling approach,20711500Q5NHR0Q9NR56two hybrid pooling approach,physical association,Q5NHR0,Q9NR56,20711500,IntAct,177416,9606
2199,0.37,two hybrid pooling approach,20711500O60318Q5NIJ3two hybrid pooling approach,physical association,Q5NIJ3,O60318,20711500,IntAct,177416,9606


Count how many times an interaction has been found with different methods, different types of interactions, different publications,...

In [83]:
count=df.groupby(["Protein_A","Protein_B"]).count()
count.sort_values("ID",ascending=False)

Unnamed: 0_level_0,Unnamed: 1_level_0,Conf_score,D_method,ID,Int_type,PubmedID,Source_DB,Taxid_A,Taxid_B
Protein_A,Protein_B,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
Q4VP87,P01877,0,1,1,0,1,1,1,1
Q5NHF2,Q02763,1,1,1,1,1,1,1,1
Q5NHF1,P55011,1,1,1,1,1,1,1,1
Q5NHF0,Q9UJW9,1,1,1,1,1,1,1,1
Q5NHE8,Q9UBK9,1,1,1,1,1,1,1,1
Q5NHE7,Q96QZ7,1,1,1,1,1,1,1,1
Q5NHE7,Q15853,1,1,1,1,1,1,1,1
Q5NHE7,P23588,1,1,1,1,1,1,1,1
Q5NHE7,O43464,1,1,1,1,1,1,1,1
Q5NHD9,O14641,1,1,1,1,1,1,1,1


In [84]:
df.groupby("PubmedID").size()

PubmedID
20711500      28
20711500    1307
dtype: int64

In [85]:
df = df.reset_index()
df = df[["Taxid_A","Taxid_B","Protein_A","Protein_B","Int_type","D_method","PubmedID","Source_DB","Conf_score"]]

In [86]:
df.to_csv("/Users/thesis/Desktop/Python scripts/Test outputs/" + species, sep ="\t")

In [87]:
df

Unnamed: 0,Taxid_A,Taxid_B,Protein_A,Protein_B,Int_type,D_method,PubmedID,Source_DB,Conf_score
0,177416,9606,Q5NES9,P04233,physical association,two hybrid pooling approach,20711500,IntAct,0.37
1,177416,9606,Q5NHR0,Q8IWA4,physical association,two hybrid pooling approach,20711500,IntAct,0.37
2,177416,9606,Q5NGP5,O60664,physical association,two hybrid pooling approach,20711500,IntAct,0.37
3,177416,9606,Q5NHE7,Q96QZ7,physical association,two hybrid pooling approach,20711500,IntAct,0.37
4,177416,9606,Q5NEC0,Q96QZ7,physical association,two hybrid pooling approach,20711500,IntAct,0.37
5,177416,9606,Q5NGV3,P27448,physical association,two hybrid pooling approach,20711500,IntAct,0.37
6,177416,9606,Q5NF74,P27816,physical association,two hybrid pooling approach,20711500,IntAct,0.37
7,177416,9606,Q5NHK6,P27448,physical association,two hybrid pooling approach,20711500,IntAct,0.37
8,177416,9606,Q5NHR0,Q9NR56,physical association,two hybrid pooling approach,20711500,IntAct,0.37
9,177416,9606,Q5NIJ3,O60318,physical association,two hybrid pooling approach,20711500,IntAct,0.37
