In [1]:
import re
import numpy as np
import pandas as pd
import xlwt
import os
import ete3
ncbi = ete3.NCBITaxa()
import requests
import time
#ncbi.update_taxonomy_database()

In [2]:
BASE = 'http://www.uniprot.org'
KB_ENDPOINT = '/uniprot/'
TOOL_ENDPOINT = '/uploadlists/'

Choose species

In [3]:
species ="Francisella tularensis"

Read in exported data files

In [4]:
HPIDB = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/HPIDB2.0/Mitabs/" + species + "/" + species + ".txt", names=["protein_a","protein_b","alt_a","alt_b","alias_a","alias_b","d_method","pubmedid","taxid_a","taxid_b","int_type","source_db","conf_score"], usecols=[0,1,2,3,4,5,6,8,9,10,11,12,14], header=0)
IntAct = pd.read_table("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/IntAct/" + species, names=["protein_a","protein_b","alt_a","alt_b","alias_a","alias_b","d_method","pubmedid","taxid_a","taxid_b","int_type","source_db","conf_score"], usecols=[0,1,2,3,4,5,6,8,9,10,11,12,14], header=0)
PHISTO = pd.read_excel("/Users/thesis/Desktop/Google drive/thesis-lorenz/Work in progress.../Databases/PHISTO/" + species + ".xls", names= ["taxid_a", "protein_a", "protein_b", "d_method","pubmedid"], usecols=[1,2,4,6,7])
print(HPIDB.source_db.unique())
print (len(HPIDB), len(IntAct),len(PHISTO))

['psi-mi:MI:0469(IntAct)']
1371 1358 1331


IntAct and HPIDB sets are basically the same apart from their taxid columns, thus we'll clean up the taxid column of IntAct, merge the two and then clean up the rest

It should also be noted that the taxid cleaning up has one step that needs to 'supervised' because sometimes the procedures creates NaN values which need to be checked manually.

In [5]:
r = IntAct.taxid_a
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.taxid_a = r

r = IntAct.taxid_b
r = r.str.lower()
r = r.str.extract(r'(?:taxid:)([0-9]*)',expand = False)
IntAct.taxid_b = r

In [6]:
df=pd.concat([HPIDB,IntAct])
df=df.reset_index(drop=True)

In [7]:
len(df)

2729

In [8]:
taxids= list(pd.unique(list(df.taxid_a.unique())+list(pd.unique(list(df.taxid_b.unique())))))
parent= ncbi.get_name_translator([species])
taxid_list=[]
for taxid in taxids:
    lineage = ncbi.get_lineage(taxid)
    
    if parent.get(species)[0] in lineage:
        taxid_list.append(taxid)
taxid_list.append('9606')

In [9]:
print(ncbi.get_taxid_translator(taxid_list))
sel1 = df.taxid_a.apply(lambda x: str(x) in taxid_list)
sel2 = df.taxid_b.apply(lambda x: str(x) in taxid_list)
df = df.loc[sel1 & sel2,:]

{9606: 'Homo sapiens', 119856: 'Francisella tularensis subsp. tularensis', 177416: 'Francisella tularensis subsp. tularensis SCHU S4'}


In [10]:
len(df)

2729

In [11]:
for name in ["d_method","int_type","source_db"]:
    r = df[name]
    r = r.str.extract((r'(?:psi-mi:"?MI:[0-9]*"?[(])([^)]*)(?:[)])') ,expand = False)
    df[name] = r


for name in ["protein_a","protein_b"]:
    r = df[name]
    r = r.str.extract(r'(?:uniprotkb:)([a-zA-Z0-9]*)', expand=False)
    df[name] = r
    

r = df.pubmedid
r = r.str.extract(r'(?:[^p]*)(?:pubmed:)([0-9]*)(?:.*)',expand=False)
df.pubmedid = r

r = df.conf_score
r = r.str.extract(r'(?:intact-miscore:)([0-9.]*)', expand=False)
df.conf_score = r

Some entries don't have uniprot ID
We'll search the alt ID column for a uniprot ID
However, in the particular case of B anthracis, there are no Alt Ids available => entries 714-761 in intact file for protein A
Protein B is spread across the dataset. 

Upon further inspection most of these entries trace back do deleted entries in the uniprotKB database.
    Some of these due to deletion. We don't know (yet) if this deletion is due to being wrong or upon request of original submitters
        Others due to redundancy. Trace back to the non redundant entry might be done by downloading the sequence and finding it in the uniprot database? example entry index 56 of intact file
        EBI-2810906 => a0a1v4b700_bacan => 
The other entries aren't even traceable

In [12]:
def Find_alt_ID(ID,organism):
    if type(ID) != str:
        return 'not a string'
    time.sleep(0.5)
    payload = {'query': 'gene:' + '\"'+ ID +'\"'+ 'AND organism:' + '\"' + organism + '\"' + 'AND reviewed:yes',
           'format': 'list'}

    result2 = requests.get(BASE + KB_ENDPOINT, params=payload)

    if result2.ok:
        return(result2.text).strip()
    else:
        print('Something went wrong ', result.status_code)

In [14]:
sel1 = df.protein_a.isnull()
subset1 = df.loc[sel1,"alias_a"]
subset1 = subset1.str.extract(r'uniprotkb:([a-zA-Z0-9_-]*)\(gene name\)',expand=False)
subset1 = subset1.apply(lambda x : Find_alt_ID(x,'Human'))
subset1
df.loc[sel1,"protein_a"] = subset1

In [15]:
sel2 = df.protein_b.isnull()
subset2 = df.loc[sel2,"alias_b"]
subset2 = subset2.str.extract(r'uniprotkb:([a-zA-Z0-9_-]*)\(gene name\)',expand=False)
subset2 = subset2.apply(lambda x : Find_alt_ID(x,species))
df.loc[sel1,"protein_b"] = subset2

In [16]:
sel1 = df.protein_a.isnull()
sel2 = df.protein_b.isnull()
df = df.loc[~sel1 & ~sel2,:]
df = df.reset_index(drop=True)

In [17]:
len(df)

2637

In [18]:
df = df.drop(["alt_a","alt_b"],axis=1)

Sort columns so pathogen is 'A' and human is 'B'

In [19]:
df.taxid_a= df.taxid_a.apply(lambda x: str(x))
df.taxid_b = df.taxid_b.apply(lambda x: str(x))
for i in range(len(df)):
    if df.loc[i,"taxid_b"] != '9606':
        A = df.loc[i,"taxid_a"]
        B = df.loc[i,"taxid_b"]
        C = df.loc[i,"protein_a"]
        D = df.loc[i,"protein_b"]
      

        df.loc[i,"taxid_a"] = B
        df.loc[i,"taxid_b"] = A
        df.loc[i,"protein_a"]= D
        df.loc[i,"protein_b"]= C

Filter out non-HPIs

In [20]:
sel = df.taxid_b == '9606'

In [21]:
df = df.loc[sel,:]
print(len(df))

2637


In [28]:
def map_retrieve(ids2map, source_fmt='ACC+ID',
                 target_fmt='ACC', output_fmt='list'):
    
    if hasattr(ids2map, 'pop'):
        ids2map = ' '.join(ids2map)
    payload = {'from': source_fmt,
               'to': target_fmt,
               'format': output_fmt,
               'query': ids2map,
               }
    response = requests.get(BASE + TOOL_ENDPOINT, params=payload)
    if response.ok:
        return response.text
    else:
        response.raise_for_status()

In [24]:
#df.protein_a = df.protein_a.apply(lambda x: map_retrieve(x).split())
for i in range(0,50):
    map_retrieve(df.loc[i,'protein_b'])

Make identifier df

In [29]:
map_retrieve(['P08107','Q8G8T7','Q79RC1'])

HTTPError: 400 Client Error: Bad Request for url: http://www.uniprot.org/uniprot/?query=yourlist:M2018021683C3DD8CE55183C76102DC5D3A26728B0791C6D&sort=yourlist:M2018021683C3DD8CE55183C76102DC5D3A26728B0791C6D&format=noformat

In [26]:
df.protein_a

0       Q8G8T7
1       Q8G8T7
2       Q8G8T7
3       Q8G8T7
4       Q4VP87
5       Q8G8T7
6       Q4VP87
7       Q4VP87
8       Q79RC1
9       Q8G8T7
10      Q8G8T7
11      Q8G8T7
12      Q79RC1
13      Q4VP87
14      Q8G8T7
15      Q4VP87
16      Q4VP87
17      Q4VP87
18      Q4VP87
19      Q4VP87
20      Q79RC4
21      Q79RC1
22      Q8G8T7
23      Q5NFU8
24      Q5NGF9
25      Q5NFB0
26      Q5NFA1
27      Q5NHL7
28      Q5NEX4
29      Q5NEB5
         ...  
2607    Q5NEM9
2608    Q5NHX0
2609    Q5NII1
2610    Q5NF60
2611    Q5NFN3
2612    Q5NGG6
2613    Q5NGC3
2614    Q5NEC0
2615    Q5NG53
2616    Q5NH85
2617    Q5NII1
2618    Q5NFN4
2619    Q5NID2
2620    Q5NGZ2
2621    Q5NIJ2
2622    Q5NID2
2623    Q5NF50
2624    Q5NEC0
2625    Q5NF29
2626    Q5NG68
2627    Q5NF67
2628    Q5NGR3
2629    Q5NG37
2630    Q5NH32
2631    Q5NF37
2632    Q5NET2
2633    Q5NHX9
2634    Q5NEU6
2635    Q5NF36
2636    Q5NEC0
Name: protein_a, Length: 2637, dtype: object

In [21]:
ID_all =[]
for i in range (0,len(df)):
    ID_list=[]
    ID_list.append (df.iloc[i,0])
    ID_list.append (df.iloc[i,1])
    ID_list.append (df.iloc[i,3])
    ID_list.append (df.iloc[i,2])
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + str(ID_list[2])
        +ID_list[3]
    )
df['id']=ID_all

In [22]:
print(len(df.ID.unique()))
df.pubmedid.unique()

2275


array(['20711500'], dtype=object)

Make identifier PHISTO and make Taxid_B column

In [23]:
ID_all =[]
for i in range (0,len(PHISTO)):
    ID_list=[]
    ID_list.append (PHISTO.iloc[i,1])
    ID_list.append (PHISTO.iloc[i,2])
    ID_list.append (PHISTO.iloc[i,3])
    ID_list.append (str(PHISTO.iloc[i,4]))
    ID_list.sort()
    ID_all.append(
        ID_list[0]
        +ID_list[1] 
        + ID_list[2]
        + ID_list[3]
    )
PHISTO=PHISTO.assign(ID=ID_all)

PHISTO["taxid_b"]="9606"
PHISTO["source_db"]="PHISTO"

In [24]:
print(len(PHISTO.ID.unique()))
print(len(PHISTO))

1331
1331


merge df and PHISTO

In [25]:
df = pd.concat([df,PHISTO])
len(df)

3968

Remove redundancy

In [26]:
df = df.sort_values(by = "conf_score", ascending = False)
df = df.drop_duplicates("ID")
print(len(df))

3606


Count how many times an interaction has been found with different methods, different types of interactions, different publications,...

In [27]:
count=df.groupby(["source_db"]).count()
count

Unnamed: 0_level_0,Alias_A,Alias_B,Conf_score,D_method,ID,Int_type,Protein_A,Protein_B,PubmedID,Taxid_A,Taxid_B
Source_DB,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
IntAct,2275,2275,2275,2275,2275,2275,2275,2275,2275,2275,2275
PHISTO,0,0,0,1331,1331,0,1331,1331,1331,1331,1331


In [28]:
df.groupby("pubmedid").size()

PubmedID
20711500    1331
20711500    2275
dtype: int64

In [29]:
df = df.reset_index()
df = df[["taxid_a","taxid_b","protein_a","protein_b","int_type","d_method","pubmedid","source_db","conf_score","ID"]]

In [30]:
df.to_csv("/Users/thesis/Desktop/Python scripts/Test outputs/" + species, sep ="\t")

In [31]:
df

Unnamed: 0,Taxid_A,Taxid_B,Protein_A,Protein_B,Int_type,D_method,PubmedID,Source_DB,Conf_score,ID
0,119856,9606,Q8G8T7,P13598,physical association,two hybrid pooling approach,20711500,IntAct,0.37,P13598Q8G8T7psi-mi:icam2_human(display_long)|u...
1,177416,9606,Q5NID9,Q15650,physical association,two hybrid pooling approach,20711500,IntAct,0.37,Q15650Q5NID9psi-mi:eftu_fratt(display_long)|un...
2,177416,9606,Q5NF74,Q96A61,physical association,two hybrid pooling approach,20711500,IntAct,0.37,Q5NF74Q96A61psi-mi:q5nf74_fratt(display_long)|...
3,177416,9606,Q5NGC7,Q92547,physical association,two hybrid pooling approach,20711500,IntAct,0.37,Q5NGC7Q92547psi-mi:q5ngc7_fratt(display_long)|...
4,177416,9606,Q5NIP6,O14773,physical association,two hybrid pooling approach,20711500,IntAct,0.37,O14773Q5NIP6psi-mi:gata_fratt(display_long)|un...
5,177416,9606,Q5NEV2,O94842,physical association,two hybrid pooling approach,20711500,IntAct,0.37,O94842Q5NEV2psi-mi:q5nev2_fratt(display_long)|...
6,177416,9606,Q5NES5,Q8IYM9,physical association,two hybrid pooling approach,20711500,IntAct,0.37,Q5NES5Q8IYM9psi-mi:q5nes5_fratt(display_long)|...
7,177416,9606,Q5NEX4,Q8NDV7,physical association,two hybrid pooling approach,20711500,IntAct,0.37,Q5NEX4Q8NDV7psi-mi:q5nex4_fratt(display_long)|...
8,177416,9606,Q5NIP6,Q8NDV7,physical association,two hybrid pooling approach,20711500,IntAct,0.37,Q5NIP6Q8NDV7psi-mi:gata_fratt(display_long)|un...
9,177416,9606,Q5NHD1,O94826,physical association,two hybrid pooling approach,20711500,IntAct,0.37,O94826Q5NHD1psi-mi:q5nhd1_fratt(display_long)|...
