In [1]:
import requests
import xml.etree.ElementTree as xml
from IPython.display import HTML, display
import eutils.client

# Shared Entrez E-utilities client; annotate() uses it to efetch gene records.
ec = eutils.client.Client()

def get_nuccore_id(uid):
    """
    Get the nuccore id linked to an assembly uid.

    Queries the NCBI E-utilities ELink endpoint (dbfrom=assembly,
    db=nuccore). The "assembly_nuccore_refseq" link set is preferred; when
    it is absent (or has no links) the first link set that does contain
    links is used instead.

    :param uid: uid of the record in the assembly database.
    :return: the first linked nuccore id, converted to int.
    :raises Exception: if the HTTP status code is not 200.
    :raises LookupError: if the response carries no nuccore links at all.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    params = {'dbfrom': 'assembly', 'db': 'nuccore', 'retmode': 'json', 'id': uid}
    # Without a timeout a stalled connection would hang the notebook forever.
    r = requests.get(url, params=params, timeout=30)
    if r.status_code != 200:
        raise Exception("Can't get nuccore id for specified assembly")

    linksets = r.json().get('linksets') or [{}]
    # Keep only link sets that actually contain ids, so indexing [0] below
    # cannot fail on an empty "links" list.
    link_dbs = [db for db in linksets[0].get('linksetdbs', []) if db.get('links')]
    if not link_dbs:
        raise LookupError("No nuccore links found for assembly uid %s" % uid)

    for db in link_dbs:
        if db.get('linkname') == "assembly_nuccore_refseq":
            return int(db['links'][0])
    # No RefSeq link set: fall back to the first available one.
    return int(link_dbs[0]['links'][0])

def list_genes(nuc_id):
    """
    List gene ids linked to the specified nuccore id.

    Queries the NCBI E-utilities ELink endpoint (dbfrom=nuccore, db=gene)
    and returns the ids from the first link set of the response.

    :param nuc_id: id of the record in the nuccore database.
    :return: list of linked gene ids exactly as they appear in the JSON
        response; an empty list when the response has no link sets.
    :raises Exception: if the HTTP status code is not 200.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    params = {'dbfrom': 'nuccore', 'db': 'gene', 'retmode': 'json', 'id': nuc_id}
    # Without a timeout a stalled connection would hang the notebook forever.
    r = requests.get(url, params=params, timeout=30)
    if r.status_code != 200:
        raise Exception("Cant find annotation")

    # A record without gene links has no usable "linksetdbs" entry; report
    # that as "no genes" rather than failing with a KeyError/IndexError.
    link_dbs = r.json()['linksets'][0].get('linksetdbs')
    if not link_dbs:
        return []
    return link_dbs[0]['links']

In [2]:
# Assembly accession of the organism to annotate; passed to annotate() below.
accession = 'GCF_000013425.1'  # Organism Assembly

In [3]:
def annotate(accession, max_show=100):
    """
    Display an HTML summary of the genes annotated on an assembly.

    Looks the assembly up by accession, resolves its linked nuccore record
    via get_nuccore_id(), lists the linked genes via list_genes(), and
    renders a table with the accession and label of the first product of
    each gene's first reference. One efetch call is made per gene shown.

    :param accession: assembly accession sent as the ``release`` parameter.
    :param max_show: maximum number of genes to fetch and show (non-negative
        int); any remaining genes are only counted.
    :return: None; the page is rendered with IPython's display().
    :raises Exception: if the assembly service does not answer with HTTP 200.
    """
    # Local import: the fetched text is escaped before it is embedded in
    # the page so that markup characters in it cannot break the HTML.
    from html import escape

    r = requests.get(
        'https://www.ncbi.nlm.nih.gov/projects/r_gencoll/asm4portal/gc_assembly_svc.cgi',
        params={'release': accession},
        # Without a timeout a stalled connection would hang the notebook.
        timeout=30)
    if r.status_code != 200:
        raise Exception("Cant find assembly")

    root = xml.fromstring(r.content.decode('utf-8'))
    assembly_uid = int(root.attrib['uid'])
    organism_name = root.find('header').find('species-organism').text

    nuccore_id = get_nuccore_id(assembly_uid)
    genes = list_genes(nuccore_id)

    # Collect fragments and join once instead of repeated string +=.
    parts = [
        "<h1>" + escape(organism_name) + "</h1>",
        "<h3> Genes: </h3>",
        "<table>",
    ]
    for gene in genes[:max_show]:
        eg = ec.efetch(db='gene', id=gene).entrezgenes[0]
        prot = eg.references[0].products[0]
        parts.append(
            "<tr>"
            "<td>" + escape(prot.acv) + "</td>"
            "<td>" + escape(prot.label) + "</td>"
            "</tr>")
    parts.append("</table>")

    if len(genes) > max_show:
        parts.append("<p>And " + str(len(genes) - max_show) + " more... </p>")

    display(HTML("".join(parts)))

In [4]:
# Render the gene table for the assembly accession chosen above.
annotate(accession)

0,1
YP_008530245.1,SAOUHSC_13820
YP_008530244.1,membrane protein
YP_008530243.1,30S ribosomal protein S10
YP_008530242.1,membrane protein
YP_008530241.1,large-conductance mechanosensitive channel
YP_008530240.1,hypothetical protein
YP_008530239.1,membrane protein
YP_008530238.1,hypothetical protein
YP_500506.1,hypothetical protein
YP_500505.1,hypothetical protein
