In [1]:
import urllib.parse
import urllib.request
import pandas as pd

# Constants
# UniProt endpoint that both ID-mapping requests in this notebook are sent to.
# NOTE(review): UniProt appears to have retired the legacy `uploadlists` service
# in favour of rest.uniprot.org/idmapping — confirm this URL still answers.
URL_UNIPROT = 'https://www.uniprot.org/uploadlists/'
# STRING endpoint called once per protein by string_request().
URL_STRING = 'http://string-db.org/api/tsv/interactorsList?'
# Text file with the original proteins; it is read line by line to build the query.
PATH_ORIGINAL_DB = '../datasets/original.txt'

In [2]:
# Import original dataset: convert to a unique query string.
# Format: every identifier is followed by exactly one space, so a non-empty
# result ends with a single trailing space (the query cell removes it with [:-1]).
with open(PATH_ORIGINAL_DB) as file:
    # strip() rather than line[:-1]: the slice would cut the last character of
    # the final identifier when the file has no trailing newline, and would
    # leave a '\r' behind on files with Windows line endings.
    protein_ids = [line.strip() for line in file]

# Blank lines are skipped so they do not add stray separators to the query.
original_proteins = ''.join(pid + ' ' for pid in protein_ids if pid)

In [3]:
# Map the original UniProt accessions to STRING identifiers.

params_map = {
    'from': 'ACC',
    'to': 'STRING_ID',
    'format': 'tab',
    # [:-1] removes the trailing separator left at the end of the query string
    'query': original_proteins[:-1],
    'columns': 'id',
}

# Supplying a request body makes urllib send the parameters as a POST.
data = urllib.parse.urlencode(params_map).encode('utf-8')
req = urllib.request.Request(URL_UNIPROT, data=data)
with urllib.request.urlopen(req) as f:
    response = f.read()

# Show only the beginning of the reply as a quick sanity check.
print(response.decode('utf-8')[:200])

From	To
Q7KZ85	9606.ENSP00000319104
O75791	9606.ENSP00000339186
P62993	9606.ENSP00000376345
Q13588	9606.ENSP00000284154
Q06124	9606.ENSP00000340944
P29350	9606.ENSP00000391592
A6NKC9	9606.ENSP00000327


In [4]:
# Create dataframe with Swissprot - STRING mapping for original dataset.
# The first response line is the header and is skipped. splitlines() is used
# instead of split('\n')[:-1] so the last row is kept even when the response
# does not end with a newline; blank lines are ignored.
mapping_rows = [
    line.split()
    for line in response.decode('utf-8').splitlines()[1:]
    if line.strip()
]
string_ids = pd.DataFrame(mapping_rows, columns=['id', 'string'])
string_ids.head()

Unnamed: 0,id,string
0,Q7KZ85,9606.ENSP00000319104
1,O75791,9606.ENSP00000339186
2,P62993,9606.ENSP00000376345
3,Q13588,9606.ENSP00000284154
4,Q06124,9606.ENSP00000340944


In [6]:
# Sanity check: True when no accession occurs more than once in the 'id' column.
string_ids['id'].is_unique

True

In [7]:
def string_request(protein):
    '''Retrieve interactors for the query protein (STRING ID).

    Parameters
    ----------
    protein : str
        Identifier sent to URL_STRING as the ``identifier`` form parameter.

    Returns
    -------
    list of str
        The non-blank response lines that follow the first two lines
        (the header and the query protein itself).
    '''
    params = {
        'identifier': protein
    }

    data = urllib.parse.urlencode(params).encode('utf-8')
    req = urllib.request.Request(URL_STRING, data)
    with urllib.request.urlopen(req) as f:
        response = f.read()

    # Header and query protein excluded. splitlines() plus the blank-line
    # filter replaces split('\n')[2:-1], which silently dropped the last
    # interactor whenever the body did not end with a newline.
    rows = response.decode('utf-8').splitlines()[2:]
    return [row for row in rows if row.strip()]

In [8]:
# Query STRING once per mapped protein and flatten every returned row into one list.
interactors = [
    row
    for string_id in string_ids['string'].values
    for row in string_request(string_id)
]

In [9]:
# Are all collected interactor rows distinct? False means some occur more than once.
len(set(interactors)) == len(interactors)

False

In [7]:
# Map the retrieved STRING interactors back to UniProt accessions.
params_str = {
    'from': 'STRING_ID',
    'to': 'ACC',
    'format': 'tab',
    # Duplicates are removed with set(); sorted() makes the query independent
    # of set iteration order, so every re-run sends the identifiers in the
    # same order.
    'query': ','.join(sorted(set(interactors))),
    'columns': 'id',
}

# Supplying a request body makes urllib send the parameters as a POST.
data = urllib.parse.urlencode(params_str).encode('utf-8')
req = urllib.request.Request(URL_UNIPROT, data)
with urllib.request.urlopen(req) as f:
    response = f.read()

# Show only the beginning of the reply as a quick sanity check.
print(response.decode('utf-8')[:200])

Entry	yourlist:M202002116746803381A1F0E0DB47453E0216320D77A3ACT
O75122	9606.ENSP00000417518
P17948	9606.ENSP00000282397
P10747	9606.ENSP00000324890
P15153	9606.ENSP00000249071
Q5T9C2	9606.ENSP00000362


In [8]:
# Create dataframe with Swissprot - STRING mapping for original dataset + interactors

# The first response line is the header and is skipped. splitlines() is used
# instead of split('\n')[:-1] so the last row survives a response that does
# not end with a newline; blank lines are ignored.
interactor_rows = [
    line.split()
    for line in response.decode('utf-8').splitlines()[1:]
    if line.strip()
]
frames = [string_ids, pd.DataFrame(interactor_rows, columns=['id', 'string'])]

# ignore_index=True renumbers the rows 0..n-1. Without it each frame keeps its
# own 0-based index, leaving duplicate labels in the result.
string_dataset = pd.concat(frames, sort=False, ignore_index=True)

In [9]:
# Preview the first rows of the combined mapping table.
string_dataset.head()

Unnamed: 0,id,string
0,Q7KZ85,9606.ENSP00000319104
1,O75791,9606.ENSP00000339186
2,P62993,9606.ENSP00000376345
3,Q13588,9606.ENSP00000284154
4,Q06124,9606.ENSP00000340944


In [10]:
# (number of rows, number of columns) of the combined mapping table.
string_dataset.shape

(491, 2)

In [11]:
# Persist the combined mapping; the row index is written as the first, unnamed column.
# NOTE(review): the '.cvs' extension looks like a typo for '.csv' — confirm that
# nothing reads this exact file name before renaming it.
string_dataset.to_csv('../datasets/string.cvs')