In [70]:
import re
import numpy as np
import pandas as pd
import xlwt
import os
import ete3
ncbi = ete3.NCBITaxa()
#ncbi.update_taxonomy_database()

Choose species

In [71]:
species ="Francisella tularensis"

Read in exported data files

In [72]:
HPIDB = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/HPIDB2.0/Mitabs/" + species + "/" + species + ".txt", names=["Protein_A","Protein_B","Alt_A","Alt_B","D_method","PubmedID","Taxid_A","Taxid_B","Int_type","Source_DB","Conf_score"], usecols=[0,1,2,3,6,8,9,10,11,12,14], header=0)
IntAct = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/IntAct/" + species, names=["Protein_A","Protein_B","Alt_A","Alt_B","D_method","PubmedID","Taxid_A","Taxid_B","Int_type","Source_DB","Conf_score"], usecols=[0,1,2,3,6,8,9,10,11,12,14], header=0)
PHISTO = pd.read_excel("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/PHISTO/" + species + ".xls", names= ["Taxid_A", "Protein_A", "Protein_B", "D_method","PubmedID"], usecols=[1,2,4,6,7])
print(HPIDB.Source_DB.unique())
print (len(HPIDB), len(IntAct),len(PHISTO))

['psi-mi:MI:0469(IntAct)']
1371 1358 1331


IntAct and HPIDB sets are basically the same apart from their taxid columns, thus we'll clean up the taxid column of IntAct, merge the two and then clean up the rest

It should also be noted that the taxid cleaning up has one step that needs to 'supervised' because sometimes the procedures creates NaN values which need to be checked manually.

In [73]:
r = IntAct.Taxid_A
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.Taxid_A = r

r = IntAct.Taxid_B
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.Taxid_B = r

In [74]:
df=pd.concat([HPIDB,IntAct])
df=df.reset_index(drop=True)

In [75]:
len(df)

2729

In [76]:
taxids= list(pd.unique(list(df.Taxid_A.unique())+list(pd.unique(list(df.Taxid_B.unique())))))
parent= ncbi.get_name_translator([species])
taxid_list=[]
for taxid in taxids:
    lineage = ncbi.get_lineage(taxid)
    
    if parent.get(species)[0] in lineage:
        taxid_list.append(taxid)
taxid_list.append('9606')

In [77]:
print(ncbi.get_taxid_translator(taxid_list))
sel1 = df.Taxid_A.apply(lambda x: str(x) in taxid_list)
sel2 = df.Taxid_B.apply(lambda x: str(x) in taxid_list)
df = df.loc[sel1 & sel2,:]

{9606: 'Homo sapiens', 119856: 'Francisella tularensis subsp. tularensis', 177416: 'Francisella tularensis subsp. tularensis SCHU S4'}


In [78]:
len(df)

2729

In [79]:
for name in ["D_method","Int_type","Source_DB"]:
    r = df[name]
    r = r.str.extract((r'(?:psi-mi:"?MI:[0-9]*"?[(])([^)]*)(?:[)])') ,expand = False)
    df[name] = r


for name in ["Protein_A","Protein_B"]:
    r = df[name]
    r = r.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
    df[name] = r
    

r = df.PubmedID
r = r.str.extract(r'(?:[^p]*)(?:pubmed:)([0-9]*)(?:.*)',expand=False)
df.PubmedID = r

r = df.Conf_score
r = r.str.extract(r'(?:intact-miscore:)([0-9.]*)', expand=False)
df.Conf_score = r

Some entries don't have uniprot ID
We'll search the alt ID column for a uniprot ID
However, in the particular case of B anthracis, there are no Alt Ids available => entries 714-761 in intact file for protein A
Protein B is spread across the dataset. 

Upon further inspection most of these entries trace back do deleted entries in the uniprotKB database.
    Some of these due to deletion. We don't know (yet) if this deletion is due to being wrong or upon request of original submitters
        Others due to redundancy. Trace back to the non redundant entry might be done by downloading the sequence and finding it in the uniprot database? example entry index 56 of intact file
        EBI-2810906 => a0a1v4b700_bacan => 
The other entries aren't even traceable

In [80]:
sel1 = df.Protein_A.isnull()
subset1 = df.loc[sel1,"Alt_A"]
sel2 = df.Protein_B.isnull()
subset2 = df.loc[sel2,"Alt_B"]

for name in [subset1,subset2]:
    name = name.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
#code to replace empty protein IDs with the alternative ones

In [81]:
sel1 = df.Protein_A.isnull()
sel2 = df.Protein_B.isnull()
df = df.loc[~sel1 & ~sel2,:]
df = df.reset_index(drop=True)

In [82]:
len(df)

2637

In [83]:
df = df.drop(["Alt_A","Alt_B"],axis=1)

Sort columns so pathogen is 'A' and human is 'B'

In [84]:
df.Taxid_A= df.Taxid_A.apply(lambda x: str(x))
df.Taxid_B = df.Taxid_B.apply(lambda x: str(x))
for i in range(len(df)):
    if df.loc[i,"Taxid_B"] != '9606':
        A = df.loc[i,"Taxid_A"]
        B = df.loc[i,"Taxid_B"]
        C = df.loc[i,"Protein_A"]
        D = df.loc[i,"Protein_B"]
      

        df.loc[i,"Taxid_A"] = B
        df.loc[i,"Taxid_B"] = A
        df.loc[i,"Protein_A"]= D
        df.loc[i,"Protein_B"]= C

Filter out non-HPIs

In [85]:
sel = df.Taxid_B == '9606'

In [86]:
df = df.loc[sel,:]
print(len(df))

2637


Make identifier df

In [87]:
ID_all =[]
for i in range (0,len(df)):
    ID_list=[]
    ID_list.append (df.iloc[i,0])
    ID_list.append (df.iloc[i,1])
    ID_list.append (df.iloc[i,3])
    ID_list.append (df.iloc[i,2])
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + str(ID_list[2])
        +ID_list[3]
    )
df=df.assign(ID=ID_all)

In [88]:
print(len(df.ID.unique()))
df.PubmedID.unique()

1330


array(['20711500'], dtype=object)

Make identifier PHISTO and make Taxid_B column

In [89]:
ID_all =[]
for i in range (0,len(PHISTO)):
    ID_list=[]
    ID_list.append (PHISTO.iloc[i,1])
    ID_list.append (PHISTO.iloc[i,2])
    ID_list.append (PHISTO.iloc[i,3])
    ID_list.append (str(PHISTO.iloc[i,4]))
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + ID_list[2]
        + ID_list[3]
    )
PHISTO=PHISTO.assign(ID=ID_all)

PHISTO["Taxid_B"]="9606"
PHISTO["Source_DB"]="PHISTO"

In [90]:
print(len(PHISTO.ID.unique()))
print(len(PHISTO))

1331
1331


merge df and PHISTO

In [91]:
df = pd.concat([df,PHISTO])
len(df)

3968

Remove redundancy

In [92]:
df = df.sort_values(by = "Conf_score", ascending = False)
df = df.drop_duplicates("ID")
print(len(df))

1335


Count how many times an interaction has been found with different methods, different types of interactions, different publications,...

In [93]:
count=df.groupby(["Source_DB"]).count()
count

Unnamed: 0_level_0,Conf_score,D_method,ID,Int_type,Protein_A,Protein_B,PubmedID,Taxid_A,Taxid_B
Source_DB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1
IntAct,1330,1330,1330,1330,1330,1330,1330,1330,1330
PHISTO,0,5,5,0,5,5,5,5,5


In [94]:
df.groupby("PubmedID").size()

PubmedID
20711500       5
20711500    1330
dtype: int64

In [95]:
df = df.reset_index()
df = df[["Taxid_A","Taxid_B","Protein_A","Protein_B","Int_type","D_method","PubmedID","Source_DB","Conf_score","ID"]]

In [96]:
df.to_csv("/Users/thesis/Desktop/Python scripts/Test outputs/" + species, sep ="\t")

In [97]:
df

Unnamed: 0,Taxid_A,Taxid_B,Protein_A,Protein_B,Int_type,D_method,PubmedID,Source_DB,Conf_score,ID
0,119856,9606,Q8G8T7,P13598,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500P13598Q8G8T7two hybrid pooling approach
1,177416,9606,Q5NID9,Q15650,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500Q15650Q5NID9two hybrid pooling approach
2,177416,9606,Q5NF74,Q96A61,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500Q5NF74Q96A61two hybrid pooling approach
3,177416,9606,Q5NGC7,Q92547,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500Q5NGC7Q92547two hybrid pooling approach
4,177416,9606,Q5NIP6,O14773,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500O14773Q5NIP6two hybrid pooling approach
5,177416,9606,Q5NEV2,O94842,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500O94842Q5NEV2two hybrid pooling approach
6,177416,9606,Q5NES5,Q8IYM9,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500Q5NES5Q8IYM9two hybrid pooling approach
7,177416,9606,Q5NEX4,Q8NDV7,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500Q5NEX4Q8NDV7two hybrid pooling approach
8,177416,9606,Q5NIP6,Q8NDV7,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500Q5NIP6Q8NDV7two hybrid pooling approach
9,177416,9606,Q5NHD1,O94826,physical association,two hybrid pooling approach,20711500,IntAct,0.37,20711500O94826Q5NHD1two hybrid pooling approach
