In [94]:
# Disabled sketch of a ReactionManager class. It is wrapped in a bare string
# literal, so nothing inside it is executed and no class is defined here.
'''
class ReactionManager(object):
    #Class to implement a manager of Reaction data.

    def __init__(self):
        #Constructor.
        self.__nodes = {}
        self.__reac_ids = {}
        self.__reac_enz_rels = []
        self.__org_enz_rels = []
        self.__enz_man = EnzymeManager()
'''


def add_uniprot_data(enzyme_ids, source, num_threads=0):
    '''Fetch UniProt records for the given ids and derive an enzyme node from each.

    Args:
        enzyme_ids: sequence of UniProt ids to look up; passed unchanged to
            get_uniprot_values.
        source: provenance label. Currently unused: the relationship
            bookkeeping that consumed it is disabled.
        num_threads: forwarded to get_uniprot_values (0, the default, is falsy
            and selects the sequential path there).

    Returns:
        The dict returned by get_uniprot_values (id -> record). Each record
        has had its 'Organism (ID)' entry removed.
    '''
    print(enzyme_ids)

    fields = ['id', 'protein_name', 'organism_id', 'ec']
    uniprot_values = get_uniprot_values(enzyme_ids, fields, batch_size=128,
                                        verbose=False, num_threads=num_threads)

    print('add_uniprot_data function: added uniprot values: ',len(uniprot_values))

    for uniprot_id, uniprot_value in uniprot_values.items():
        # NOTE(review): the node is built but not stored anywhere; the code that
        # kept it (and the organism -> enzyme relationship) is disabled.
        enzyme_node = {':LABEL': 'Enzyme',
                       'uniprot:ID(Enzyme)': uniprot_id}

        # The organism id is stripped from the record, as before.
        uniprot_value.pop('Organism (ID)', None)

        # The REST response labels this column 'Entry Name'; the older
        # 'Entry name' spelling is still accepted as a fallback.
        entry_name = uniprot_value.get('Entry Name',
                                       uniprot_value.get('Entry name'))
        if entry_name is not None:
            enzyme_node['entry'] = entry_name

        if 'Protein names' in uniprot_value:
            enzyme_node['names'] = uniprot_value['Protein names']

            # First parsed name doubles as the display name.
            if enzyme_node['names']:
                enzyme_node['name'] = enzyme_node['names'][0]

        if 'EC number' in uniprot_value:
            enzyme_node['ec-code'] = uniprot_value['EC number']

    return uniprot_values
            

def get_uniprot_values(uniprot_ids, fields, batch_size, verbose=False, num_threads=0):
    '''Collect UniProt records for uniprot_ids, batch_size ids per request.

    Args:
        uniprot_ids: sequence of ids; sliced into consecutive batches.
        fields: field names forwarded to _get_uniprot_batch.
        batch_size: step between batch start offsets.
        verbose: forwarded to _get_uniprot_batch.
        num_threads: when truthy, batches are submitted to
            thread_utils.ThreadPool(num_threads); otherwise they run in order
            in the calling thread.

    Returns:
        dict mapping each record's 'Entry' value to the record.
    '''
    records = []

    if not num_threads:
        # Sequential path: one batch after another.
        for offset in range(0, len(uniprot_ids), batch_size):
            _get_uniprot_batch(uniprot_ids, offset, batch_size, fields,
                               records, verbose)
    else:
        # NOTE(review): thread_utils is not defined in the visible cells;
        # presumably a project helper that runs tasks on worker threads.
        pool = thread_utils.ThreadPool(num_threads)

        for offset in range(0, len(uniprot_ids), batch_size):
            pool.add_task(_get_uniprot_batch, uniprot_ids, offset,
                          batch_size, fields, records, verbose)

        pool.wait_completion()

    # Later records win if two share the same 'Entry'.
    by_entry = {}
    for record in records:
        by_entry[record['Entry']] = record
    return by_entry



def _get_uniprot_batch(uniprot_ids, i, batch_size, fields, values, verbose):
    '''Get batch of Uniprot data.'''
    if verbose:
        print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +
              str(min(i + batch_size, len(uniprot_ids))) + ' / ' +
              str(len(uniprot_ids)))

    #If getting values in batch Remove 'accession:' +  from start of join([HERE .....]) and accession: from query=HERE
    batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]
    query = '%20OR%20'.join(['accession:' + uniprot_id for uniprot_id in batch])
    url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
        '&format=tsv&fields=accession%2C' + '%2C'.join([parse.quote(field)
                                              for field in fields])

    print(url)

    _parse_uniprot_data(url, values)
    
    
def _parse_uniprot_data(url, values):
    '''Parses Uniprot data.'''
    headers = None

    try:
        resp = requests.get(url, allow_redirects=True)

        for line in resp.iter_lines():
            line = line.decode('utf-8')
            tokens = line.strip().split('\t')

            if headers is None:
                headers = tokens
            else:
                resp = dict(zip(headers, tokens))

                if 'Protein names' in resp:
                    regexp = re.compile(r'(?<=\()[^)]*(?=\))|^[^(][^()]*')
                    names = regexp.findall(resp.pop('Protein names'))
                    resp['Protein names'] = [nme.strip() for nme in names]

                for key in resp:
                    if key.startswith('Cross-reference'):
                        resp[key] = resp[key].split(';')
                values.append(resp)
        print('values from parse_uniprot_data: ',type(values))
        return values
    except Exception as err:
        print(err)
    


In [45]:
### Query by protein ID


from urllib import parse
import requests
import re


# NOTE(review): num_threads is set here but is not passed to the call below,
# so add_uniprot_data runs with its own default for that parameter.
num_threads = 1
# Provenance label passed as the second argument of add_uniprot_data.
source = 'rhea'
# UniProt ids to look up.
enzyme_ids = ['B4RBW1', 'A9BIS7', 'B5Z3E3']

add_uniprot_data(enzyme_ids, source)


['B4RBW1', 'A9BIS7', 'B5Z3E3']
https://rest.uniprot.org/uniprotkb/search?query=accession:B4RBW1%20OR%20accession:A9BIS7%20OR%20accession:B5Z3E3&format=tsv&fields=accession%2Cid%2Cprotein_name%2Corganism_id%2Cec
add_uniprot_data function: added uniprot values:  3


In [102]:

#Download then work with it

def add_uniprot_data_organism(organism_ids, source, num_threads=0):
    '''Fetch UniProt records by organism id and derive an enzyme node from each.

    Args:
        organism_ids: sequence of organism ids to query; passed unchanged to
            get_uniprot_values_organism.
        source: provenance label. Currently unused: the relationship
            bookkeeping that consumed it is disabled.
        num_threads: forwarded to get_uniprot_values_organism.

    Returns:
        The dict returned by get_uniprot_values_organism. Each record has had
        its 'Organism (ID)' entry removed.
    '''
    print(organism_ids)

    fields = ['id', 'protein_name', 'organism_id', 'ec']
    organism_values = get_uniprot_values_organism(organism_ids, fields,
                                                  batch_size=128, verbose=False,
                                                  num_threads=num_threads)

    print('add_uniprot_data function: added uniprot values: ',len(organism_values))

    for uniprot_id, uniprot_value in organism_values.items():
        # NOTE(review): the node is built but not stored anywhere; the code that
        # kept it (and the organism -> enzyme relationship) is disabled.
        enzyme_node = {':LABEL': 'Enzyme',
                       'uniprot:ID(Enzyme)': uniprot_id}

        # The organism id is stripped from the returned record, as before.
        uniprot_value.pop('Organism (ID)', None)

        # The REST response labels this column 'Entry Name'; the older
        # 'Entry name' spelling is still accepted as a fallback.
        entry_name = uniprot_value.get('Entry Name',
                                       uniprot_value.get('Entry name'))
        if entry_name is not None:
            enzyme_node['entry'] = entry_name

        if 'Protein names' in uniprot_value:
            enzyme_node['names'] = uniprot_value['Protein names']

            # First parsed name doubles as the display name.
            if enzyme_node['names']:
                enzyme_node['name'] = enzyme_node['names'][0]

        if 'EC number' in uniprot_value:
            enzyme_node['ec-code'] = uniprot_value['EC number']

    return organism_values

def get_uniprot_values_organism(organism_ids, fields, batch_size, verbose=False, num_threads=0):
    '''Collect UniProt records for organism_ids, batch_size ids per request.

    Args:
        organism_ids: sequence of organism ids; sliced into consecutive batches.
        fields: field names forwarded to _get_uniprot_batch_organism.
        batch_size: step between batch start offsets.
        verbose: forwarded to _get_uniprot_batch_organism.
        num_threads: accepted but not used; batches always run sequentially.

    Returns:
        dict mapping each record's 'Organism (ID)' value to the record.
        NOTE(review): records that share an organism id overwrite one another,
        so at most one record per organism is kept.

    Raises:
        KeyError: if a record has no 'Organism (ID)' key.
    '''
    values = []

    for i in range(0, len(organism_ids), batch_size):
        batch_result = _get_uniprot_batch_organism(organism_ids, i, batch_size,
                                                   fields, values, verbose)
        # A failed batch may hand back None; keep what has been gathered so far
        # instead of letting None replace the accumulator.
        if batch_result is not None:
            values = batch_result

    return {value['Organism (ID)']: value for value in values}


def _get_uniprot_batch_organism(uniprot_ids, i, batch_size, fields, values, verbose):
    '''Get batch of Uniprot data.'''
    if verbose:
        print('seq_utils: getting Uniprot values ' + str(i) + ' - ' +
              str(min(i + batch_size, len(uniprot_ids))) + ' / ' +
              str(len(uniprot_ids)))

    #If getting values in batch Remove 'accession:' +  from start of join([HERE .....]) and accession: from query=HERE
    batch = uniprot_ids[i:min(i + batch_size, len(uniprot_ids))]
    query = '%20OR%20'.join(['organism_id:' + uniprot_id for uniprot_id in batch])
    url = 'https://rest.uniprot.org/uniprotkb/search?query=' + query + \
        '&format=tsv&fields=organism_id%2C' + '%2C'.join([parse.quote(field)
                                              for field in fields])

    print('_get_uniprot_batch_organism url: ',url)

    values = _parse_uniprot_data(url, values)
    return values
    
    


In [103]:
### Query by organism ID

#query = 'https://rest.uniprot.org/uniprotkb/search?query=organism_id:226900'


# Provenance label passed as the second argument of add_uniprot_data_organism.
source = 'rhea'
# Organism ids to query, given as strings.
organism_ids = ['226900','296591']

# Result is keyed by organism id (see the printed dict below).
organism_values = add_uniprot_data_organism(organism_ids, source)

print(organism_values)

['226900', '296591']
_get_uniprot_batch_organism url:  https://rest.uniprot.org/uniprotkb/search?query=organism_id:226900%20OR%20organism_id:296591&format=tsv&fields=organism_id%2Cid%2Cprotein_name%2Corganism_id%2Cec
values from parse_uniprot_data:  <class 'list'>
add_uniprot_data function: added uniprot values:  2
{'226900': {'Entry Name': 'GLMU_BACCR', 'EC number': '2.3.1.157; 2.7.7.23', 'Protein names': ['Bifunctional protein GlmU [Includes: UDP-N-acetylglucosamine pyrophosphorylase', 'EC 2.7.7.23', 'N-acetylglucosamine-1-phosphate uridyltransferase', 'EC 2.3.1.157']}, '296591': {'Entry Name': 'RLMN_POLSJ', 'EC number': '2.1.1.192', 'Protein names': ['Dual-specificity RNA methyltransferase RlmN', 'EC 2.1.1.192', '23S rRNA (adenine(2503', '2', '23S rRNA m2A2503 methyltransferase', 'Ribosomal RNA large subunit methyltransferase N', 'tRNA (adenine(37', '2', 'tRNA m2A37 methyltransferase']}}
