In [478]:
import re
import numpy as np
import pandas as pd
import xlwt
import os
import ete3
ncbi = ete3.NCBITaxa()
#ncbi.update_taxonomy_database()

Choose species

In [479]:
species ="Bacillus anthracis"

Read in exported data files

In [480]:
HPIDB = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/HPIDB2.0/Mitabs/" + species + "/" + species + ".txt", names=["Protein_A","Protein_B","Alt_A","Alt_B","D_method","PubmedID","Taxid_A","Taxid_B","Int_type","Source_DB","Conf_score"], usecols=[0,1,2,3,6,8,9,10,11,12,14], header=0)
IntAct = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/IntAct/" + species, names=["Protein_A","Protein_B","Alt_A","Alt_B","D_method","PubmedID","Taxid_A","Taxid_B","Int_type","Source_DB","Conf_score"], usecols=[0,1,2,3,6,8,9,10,11,12,14], header=0)
PHISTO = pd.read_excel("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/PHISTO/" + species + ".xls", names= ["Taxid_A", "Protein_A", "Protein_B", "D_method","PubmedID"], usecols=[1,2,4,6,7])
HPIDB.Source_DB.unique()
print (len(HPIDB), len(IntAct),len(PHISTO))

3059 3173 3091


IntAct and HPIDB sets are basically the same apart from their taxid columns, thus we'll clean up the taxid column of IntAct, merge the two and then clean up the rest

It should also be noted that the taxid cleaning up has one step that needs to 'supervised' because sometimes the procedures creates NaN values which need to be checked manually.

In [481]:
r = IntAct.Taxid_A
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.Taxid_A = r

r = IntAct.Taxid_B
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.Taxid_B = r

In [482]:
df=pd.concat([HPIDB,IntAct])
df=df.reset_index(drop=True)

In [483]:
len(df)

6232

In [484]:
taxids= list(pd.unique(list(df.Taxid_A.unique())+list(df.Taxid_B.unique())))
parent= ncbi.get_name_translator([species])
taxid_list=[]
for taxid in taxids:
    lineage = ncbi.get_taxid_translator(ncbi.get_lineage(taxid))
    
    if parent.get(species)[0] in lineage:
        taxid_list.append(taxid)
taxid_list.append('9606')

In [485]:
print(ncbi.get_taxid_translator(taxid_list))
sel1 = df.Taxid_A.apply(lambda x: str(x) in taxid_list)
sel2 = df.Taxid_B.apply(lambda x: str(x) in taxid_list)
df = df.loc[sel1 & sel2,:]

{1392: 'Bacillus anthracis', 9606: 'Homo sapiens'}


In [486]:
len(df)

6203

In [487]:
for name in ["D_method","Int_type","Source_DB"]:
    r = df[name]
    r = r.str.extract((r'(?:psi-mi:"?MI:[0-9]*"?[(])([^)]*)(?:[)])') ,expand = False)
    df[name] = r


for name in ["Protein_A","Protein_B"]:
    r = df[name]
    r = r.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
    df[name] = r
    

r = df.PubmedID
r = r.str.extract(r'(?:[^p]*)(?:pubmed:)([0-9]*)(?:.*)',expand=False)
df.PubmedID = r

r = df.Conf_score
r = r.str.extract(r'(?:intact-miscore:)([0-9.]*)', expand=False)
df.Conf_score = r

Some entries don't have uniprot ID
We'll search the alt ID column for a uniprot ID
However, in the particular case of B anthracis, there are no Alt Ids available => entries 714-761 in intact file for protein A
Protein B is spread across the dataset. 

Upon further inspection most of these entries trace back do deleted entries in the uniprotKB database.
    Some of these due to deletion. We don't know (yet) if this deletion is due to being wrong or upon request of original submitters
        Others due to redundancy. Trace back to the non redundant entry might be done by downloading the sequence and finding it in the uniprot database? example entry index 56 of intact file
        EBI-2810906 => a0a1v4b700_bacan => 
The other entries aren't even traceable

In [488]:
sel1 = df.Protein_A.isnull()
subset1 = df.loc[sel1,"Alt_A"]
sel2 = df.Protein_B.isnull()
subset2 = df.loc[sel2,"Alt_B"]

for name in [subset1,subset2]:
    name = name.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
#code to replace empty protein IDs with the alternative ones

In [489]:
sel1 = df.Protein_A.isnull()
sel2 = df.Protein_B.isnull()
df = df.loc[~sel1 & ~sel2,:]
df = df.reset_index(drop=True)

In [490]:
len(df)

6072

In [491]:
df = df.drop(["Alt_A","Alt_B"],axis=1)

Sort columns so pathogen is 'A' and human is 'B'

In [492]:
for i in range(len(df)):
    if df.loc[i,"Taxid_B"] != '9606':
        A = df.loc[i,"Taxid_A"]
        B = df.loc[i,"Taxid_B"]
        C = df.loc[i,"Protein_A"]
        D = df.loc[i,"Protein_B"]
      

        df.loc[i,"Taxid_A"] = B
        df.loc[i,"Taxid_B"] = A
        df.loc[i,"Protein_A"]= D
        df.loc[i,"Protein_B"]= C

Filter out non-HPIs

In [493]:
sel1 = df.Taxid_B == '9606'
df = df.loc[sel1,:]

Make identifier df

In [494]:
ID_all =[]
for i in range (0,len(df)):
    ID_list=[]
    ID_list.append (df.iloc[i,0])
    ID_list.append (df.iloc[i,1])
    ID_list.append (df.iloc[i,3])
    ID_list.append (df.iloc[i,2])
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + str(ID_list[2])
        +ID_list[3]
    )
df=df.assign(ID=ID_all)

In [495]:
print(len(df.ID.unique()))
df.PubmedID.unique()

3012


array(['14978283', '15063758', '15719022', '20711500', '19651602',
       '16455799', '18356299', '14507921', '15243628', '15044490',
       '15326297', '11807546'], dtype=object)

Make identifier PHISTO and make Taxid_B column

In [496]:
ID_all =[]
for i in range (0,len(PHISTO)):
    ID_list=[]
    ID_list.append (PHISTO.iloc[i,1])
    ID_list.append (PHISTO.iloc[i,2])
    ID_list.append (PHISTO.iloc[i,3])
    ID_list.append (str(PHISTO.iloc[i,4]))
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + ID_list[2]
        + ID_list[3]
    )
PHISTO=PHISTO.assign(ID=ID_all)

PHISTO["Taxid_B"]="9606"
PHISTO["Source_DB"]="PHISTO"

In [497]:
print(len(PHISTO.ID.unique()))
print(len(PHISTO))

3091
3091


merge df and PHISTO

In [498]:
df = pd.concat([df,PHISTO])
len(df)

6117

Remove redundancy

In [499]:
df = df.sort_values(by = "Conf_score", ascending = False)
df = df.drop_duplicates("ID")
print(len(df))
df

3167


Unnamed: 0,Conf_score,D_method,ID,Int_type,Protein_A,Protein_B,PubmedID,Source_DB,Taxid_A,Taxid_B
3046,0.87,x-ray crystallography,14978283P40136P62158x-ray crystallography,direct interaction,P40136,P62158,14978283,IntAct,1392,9606
3052,0.87,x-ray crystallography,15719022P40136P62158x-ray crystallography,direct interaction,P40136,P62158,15719022,IntAct,1392,9606
6056,0.87,cosedimentation in solution,19651602P40136P62158cosedimentation in solution,direct interaction,P40136,P62158,19651602,DIP,1392,9606
6055,0.87,comigration in non denaturing gel electrophoresis,19651602P40136P62158comigration in non denatur...,direct interaction,P40136,P62158,19651602,DIP,1392,9606
3047,0.87,x-ray crystallography,15063758P40136P62158x-ray crystallography,direct interaction,P40136,P62158,15063758,IntAct,1392,9606
6071,0.87,x-ray crystallography,11807546P40136P62158x-ray crystallography,direct interaction,P40136,P62158,11807546,IntAct,1392,9606
6070,0.79,x-ray crystallography,15326297P13423P58335x-ray crystallography,direct interaction,P13423,P58335,15326297,IntAct,1392,9606
6069,0.79,surface plasmon resonance,15044490P13423P58335surface plasmon resonance,direct interaction,P13423,P58335,15044490,IntAct,1392,9606
6068,0.79,fluorescent resonance energy transfer,15044490P13423P58335fluorescent resonance ener...,direct interaction,P13423,P58335,15044490,IntAct,1392,9606
6064,0.79,isothermal titration calorimetry,15044490P13423P58335isothermal titration calor...,physical association,P13423,P58335,15044490,IntAct,1392,9606


Count how many times an interaction has been found with different methods, different types of interactions, different publications,...

In [501]:
count=df.groupby(["Protein_A","Protein_B"]).count()
count.sort_values("ID",ascending=False)
df.groupby("PubmedID").size()

PubmedID
11807546       1
12485993       1
14978283       1
15063758       1
15131111       1
15719022       1
18356299       2
19651602       2
20711500     145
11807546       1
14507921       3
14978283       1
15044490       3
15063758       1
15243628       1
15326297       1
15719022       1
16455799       1
18356299       2
19651602       2
20711500    2995
dtype: int64