In [61]:
import requests
import xml.etree.ElementTree as xml
from IPython.display import HTML, display
from Bio import Entrez

# Contact e-mail that Biopython attaches to Entrez requests (unset by default).
# NOTE(review): this looks like a placeholder address - confirm it is a real contact.
Entrez.email = "great_team@hackathon.ncbi.org"

# Tool name reported with Entrez requests (Biopython's default is "biopython").
Entrez.tool = "hackathon_examples"

def get_nuccore_id(uid, timeout=30):
    """
    Resolve an assembly UID to the UID of a linked nuccore record.

    Queries the E-utilities ``elink`` endpoint (assembly -> nuccore). The
    ``assembly_nuccore_refseq`` link set is preferred; when it is absent the
    first link set that actually carries links is used instead.

    Args:
        uid: UID of the record in the ``assembly`` database.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        int: the first nuccore UID of the selected link set.

    Raises:
        Exception: if the service does not answer with HTTP 200.
        LookupError: if the response carries no nuccore links at all.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    params = {'dbfrom': 'assembly', 'db': 'nuccore', 'retmode': 'json', 'id': uid}
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, params=params, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Can't find nuccore id for specified assembly")

    linksets = r.json().get('linksets') or [{}]
    # Only link sets that really contain ids can produce a result.
    candidates = [db for db in linksets[0].get('linksetdbs') or [] if db.get('links')]
    if not candidates:
        raise LookupError("No nuccore links found for assembly uid %s" % uid)

    for link in candidates:
        if link.get('linkname') == "assembly_nuccore_refseq":
            return int(link['links'][0])
    # No RefSeq-specific link set: fall back to the first one available.
    return int(candidates[0]['links'][0])

def list_genes(nuc_id, timeout=30):
    """
    List the gene UIDs linked to a nuccore record.

    Queries the E-utilities ``elink`` endpoint (nuccore -> gene) and returns
    the links of the first link set in the response.

    Args:
        nuc_id: UID of the record in the ``nuccore`` database.
        timeout: seconds to wait for the HTTP response before giving up.

    Returns:
        list: gene UIDs exactly as they appear in the JSON response; an empty
        list when the response carries no gene links.

    Raises:
        Exception: if the service does not answer with HTTP 200.
    """
    url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/elink.fcgi'
    params = {'dbfrom': 'nuccore', 'db': 'gene', 'retmode': 'json', 'id': nuc_id}
    # Without a timeout a stalled connection would block the notebook forever.
    r = requests.get(url, params=params, timeout=timeout)
    if r.status_code != 200:
        raise Exception("Cant find annotation")

    # A record without linked genes has no 'linksetdbs' entry; report that as
    # "no genes" instead of failing with KeyError/IndexError.
    linksets = r.json().get('linksets') or [{}]
    linksetdbs = linksets[0].get('linksetdbs') or [{}]
    return linksetdbs[0].get('links') or []

In [62]:
accession = 'GCF_000013425.1'  # Assembly accession of the organism to annotate; passed to annotate() below

In [100]:
def _parse_gene_record(record_xml):
    """Extract (locus tag, protein name, product accession.version) from gene XML.

    Any element missing from the record (for example a record without an
    ``Entrezgene_prot`` section) is reported as ``'n/a'`` instead of raising.
    """
    missing = 'n/a'
    gene_node = xml.fromstring(record_xml).find('Entrezgene')
    if gene_node is None:
        return missing, missing, missing

    locus = gene_node.findtext('Entrezgene_gene/Gene-ref/Gene-ref_locus-tag') or missing
    protein = gene_node.findtext(
        'Entrezgene_prot/Prot-ref/Prot-ref_name/Prot-ref_name_E') or missing

    # Accession and version must come from the same product commentary node.
    product = gene_node.find(
        'Entrezgene_locus/Gene-commentary/Gene-commentary_products/Gene-commentary')
    accession = version = None
    if product is not None:
        accession = product.findtext('Gene-commentary_accession')
        version = product.findtext('Gene-commentary_version')
    if accession and version:
        product_id = accession + "." + version
    else:
        product_id = accession or missing
    return locus, protein, product_id


def annotate(accession, max_show=10):
    """
    Display an HTML summary (organism name and gene table) for an assembly.

    Looks the assembly up in the gc_assembly service, resolves it to a
    nuccore record, lists the linked genes and fetches the first ``max_show``
    of them from the Entrez ``gene`` database, one request per gene.

    Args:
        accession: assembly accession, sent as the ``release`` parameter.
        max_show: non-negative number of genes to fetch and show in the table;
            the remaining ones are only counted.

    Returns:
        None. The table is rendered through ``display(HTML(...))``.

    Raises:
        Exception: if the assembly service does not answer with HTTP 200, or
            whatever ``get_nuccore_id`` / ``list_genes`` raise.
        KeyError: if the assembly document has no ``uid`` attribute.
    """
    # Imported locally because the table is built from remote text that must
    # be escaped before it is embedded in HTML.
    from html import escape

    r = requests.get(
        'https://www.ncbi.nlm.nih.gov/projects/r_gencoll/asm4portal/gc_assembly_svc.cgi',
        params={'release': accession},
        timeout=30)
    if r.status_code != 200:
        raise Exception("Cant find assembly")
    assembly = xml.fromstring(r.content.decode('utf-8'))
    assembly_uid = int(assembly.attrib['uid'])
    # Fall back to the accession so the heading is never empty.
    organism_name = assembly.findtext('header/species-organism') or str(accession)

    nuccore_id = get_nuccore_id(assembly_uid)
    genes = list_genes(nuccore_id)

    rows = []
    for gene in genes[0:max_show]:
        handle = Entrez.efetch(db="gene", id=gene, retmode="xml")
        try:
            record_xml = handle.read()
        finally:
            # Release the HTTP connection even if reading fails.
            handle.close()
        rows.append(_parse_gene_record(record_xml))

    parts = [
        "<h1>" + escape(organism_name, quote=False) + "</h1>",
        "<h3> Genes: </h3>",
        "<table>",
        "<tr>",
        "<th> Gene Locus </th>",
        "<th> Protein </th>",
        "<th> Protein sequence </th>",
        "</tr>",
    ]
    for row in rows:
        parts.append("<tr>")
        parts.extend("<td>" + escape(cell, quote=False) + "</td>" for cell in row)
        parts.append("</tr>")
    parts.append("</table>")

    if len(genes) > max_show:
        parts.append("<p>And " + str(len(genes) - max_show) + " more... </p>")

    display(HTML("".join(parts)))

In [101]:
# Render the organism heading and gene table for the assembly chosen above.
annotate(accession)

Gene Locus,Protein,Protein sequence
SAOUHSC_3042a,SAOUHSC_13820,YP_008530245.1
SAOUHSC_03037a,membrane protein,YP_008530244.1
SAOUHSC_02512a,30S ribosomal protein S10,YP_008530243.1
SAOUHSC_01761a,membrane protein,YP_008530242.1
SAOUHSC_1342a,large-conductance mechanosensitive channel,YP_008530241.1
SAOUHSC_1307a,hypothetical protein,YP_008530240.1
SAOUHSC_01055a,membrane protein,YP_008530239.1
SAOUHSC_00381a,hypothetical protein,YP_008530238.1
SAOUHSC_02009,hypothetical protein,YP_500506.1
SAOUHSC_02008,hypothetical protein,YP_500505.1
