# Introduction
A brief overview of basic E-utility functions along with examples in BioPython.

Note: Make sure Biopython is installed (e.g. `pip install biopython`).

In [1]:
from Bio import Entrez

# email: set the Entrez email parameter (not set by default).
Entrez.email = "great_team@hackathon.ncbi.org"

# tool: set the Entrez tool parameter (defaults to "biopython").
Entrez.tool = "hackathon_examples"

## einfo
Provides field index term counts, last update, and available links for each database.

Example: If no database parameter is supplied, einfo will return a list of all valid Entrez databases.

In [13]:
# Call einfo without a database argument and parse the reply; the parsed
# result exposes the database names under the "DbList" key.
handle = Entrez.einfo()
db_info = Entrez.read(handle)
handle.close()

# NOTE(review): the ' ,' separator puts the space before the comma; it is
# kept as-is so the recorded output below still matches.
database_names = ' ,'.join(db_info["DbList"])
print("Available Databases: {}".format(database_names))

Available Databases: pubmed ,protein ,nuccore ,ipg ,nucleotide ,nucgss ,nucest ,structure ,sparcle ,genome ,annotinfo ,assembly ,bioproject ,biosample ,blastdbinfo ,books ,cdd ,clinvar ,clone ,gap ,gapplus ,grasp ,dbvar ,gene ,gds ,geoprofiles ,homologene ,medgen ,mesh ,ncbisearch ,nlmcatalog ,omim ,orgtrack ,pmc ,popset ,probe ,proteinclusters ,pcassay ,biosystems ,pccompound ,pcsubstance ,pubmedhealth ,seqannot ,snp ,sra ,taxonomy ,biocollections ,unigene ,gencoll ,gtr


Example: Find database statistics for Entrez Protein.

In [3]:
# Ask einfo for the statistics of a single database.
db = 'protein'

handle = Entrez.einfo(db=db)
protein_db_info = Entrez.read(handle)
handle.close()

# Report the menu name, record count and build tag found under 'DbInfo'.
dbInfo = protein_db_info['DbInfo']
for label, field in (('DB', 'MenuName'), ('Count', 'Count'), ('DbBuild', 'DbBuild')):
    print('{}: {}'.format(label, dbInfo[field]))

DB: Protein
Count: 456909504
DbBuild: Build171106-1116m.1


## esearch

Searches and retrieves primary IDs (for use in EFetch, ELink, and ESummary) and term translations and optionally retains results for future use in the user's environment.

Example: search Entrez pubmed for breast cancer articles published in Science during 2008.

In [4]:
db = 'pubmed'
query = 'science[journal] AND breast cancer AND 2008[pdat]'

# usehistory='y' is the documented ESearch switch for keeping the result set
# on the server; the reply then carries the QueryKey/WebEnv pair printed below.
# Keyword arguments keep this call consistent with the other esearch call in
# this notebook.
handle = Entrez.esearch(db=db, term=query, usehistory='y')
try:
    search_results = Entrez.read(handle)
finally:
    # Release the connection even if parsing the reply fails.
    handle.close()

print('QueryKey: %s in WebEnv: %s' % ( search_results['QueryKey'], search_results['WebEnv']))
print('Found %s Pubmed IDs: [%s]' % (search_results['Count'], ', '.join(search_results['IdList'])))

QueryKey: 1 in WebEnv: NCID_1_45725515_130.14.18.34_9001_1510159928_560518390_0MetA0_S_MegaStore_F_1
Found 6 Pubmed IDs: [19008416, 18927361, 18787170, 18487186, 18239126, 18239125]


## esummary

Retrieves document summaries from a list of primary IDs or from the user's environment.

Example: retrieve summaries and display titles

In [5]:
# Request the document summaries for every ID found by the previous search
# (uses `db` and `search_results` from the cell above).
id_csv = ','.join(search_results['IdList'])
handle = Entrez.esummary(db=db, id=id_csv)
summaries = Entrez.read(handle)
handle.close()

# Collect all titles first, then print them one per line.
titles = [summary['Title'] for summary in summaries]
for title in titles:
    print("Title: {}".format(title))

Title: Genomic loss of microRNA-101 leads to overexpression of histone methyltransferase EZH2 in cancer.
Title: Genetics. DNA test for breast cancer risk draws criticism.
Title: FBXW7 targets mTOR for degradation and cooperates with PTEN in tumor suppression.
Title: Design logic of a cannabinoid receptor signaling network that triggers neurite outgrowth.
Title: Cancer proliferation gene discovery through functional genomics.
Title: Profiling essential genes in human mammary cells by multiplex RNAi screening.


## elink
Checks for the existence of an external or Related Articles link from a list of one or more primary IDs. 
Retrieves primary IDs and relevancy scores for links to Entrez databases or Related Articles; creates a hyperlink to the primary LinkOut provider for a specific ID and database, or lists LinkOut URLs and Attributes for multiple IDs.

Example: Find items related to the initial article (PMID: 19008416).

In [6]:
db = 'pubmed'
# Named `pmid` rather than `id` so the built-in id() is not shadowed for the
# rest of the session.
pmid = 19008416

handle = Entrez.elink(db=db, id=pmid)
try:
    linksets = Entrez.read(handle)
finally:
    # Release the connection even if parsing the reply fails.
    handle.close()

# Each link set holds one entry per link name; report how many links each has.
for linkset in linksets:
    for links in linkset['LinkSetDb']:
        print('%s has %d links' % (links['LinkName'], len(links['Link'])))


pubmed_pubmed has 267 links
pubmed_pubmed_alsoviewed has 17 links
pubmed_pubmed_citedin has 324 links
pubmed_pubmed_combined has 6 links
pubmed_pubmed_five has 6 links
pubmed_pubmed_refs has 25 links
pubmed_pubmed_reviews has 35 links
pubmed_pubmed_reviews_five has 6 links


## efetch
Retrieves records in the requested format from a list of one or more primary IDs or from the user's environment

In [7]:
# Settings for this cell.
db = 'nuccore'
return_type = 'fasta'
query = 'NM_001126.3'
number_of_rows_to_display = 10

# Step 1: look up the IDs that match the query.
handle = Entrez.esearch(db=db, term=query)
search_results = Entrez.read(handle)
handle.close()

# Step 2: fetch those records in the requested format and split the text
# into lines so only the first few need to be shown.
f = Entrez.efetch(db=db, id=search_results['IdList'], rettype=return_type)
data = f.read().splitlines()
print("Showing {} of {} lines".format(number_of_rows_to_display, len(data)))
preview = data[:number_of_rows_to_display]
for line in preview:
    print(line)
f.close()

Showing 10 of 42 lines
>NM_001126.3 Homo sapiens adenylosuccinate synthase (ADSS), mRNA
ACGGGAGTGGCGCGCCAGGCCGCGGAAGGGGCGTGGCCTCGGTCCGGGGTGGCGGCCGTTGCCGCCACCA
GGGCCTCTTCCTGCGGGCGGTGCTGCCGAGGCCGGCCTGCGCGGGGCAGTCATGGTACCCCCTTGAGCGG
GCTGTGGCGGAGAGCGGGGCGGGGACTGGCTGGAGGGTGGCGGCCCGGCGGGGCGGGGGCGGGGCCGGCC
TCTGGCTCCTTCTTCCTCTGCATGTGGCTGGCGGCCGCAGAGCAGTTCAGTTCGCTCACTCCTCGCCGGC
CGCCTCTCCTTCGGGCTCTCCTCGCGTCACTGGAGCCATGGCGTTCGCCGAGACCTACCCGGCGGCATCC
TCCCTGCCCAACGGCGATTGCGGCCGCCCCAGGGCGCGGCCCGGAGGAAACCGGGTGACGGTGGTGCTCG
GTGCGCAGTGGGGCGACGAAGGCAAAGGGAAGGTGGTGGACCTGCTGGCGCAGGACGCCGACATCGTGTG
CCGCTGCCAGGGAGGAAATAATGCTGGCCATACAGTTGTTGTGGATTCTGTGGAATATGATTTTCATCTC
TTACCCAGTGGAATAATTAATCCAAATGTCACTGCATTCATTGGAAATGGTGTGGTAATTCATCTACCTG


## egquery
Provides Entrez database counts for a single search using Global Query.

In [8]:
query = 'NM_001126.3'

handle = Entrez.egquery(term=query)
try:
    results = Entrez.read(handle)
finally:
    # Release the connection even if parsing the reply fails.
    handle.close()

# Keep only the databases that reported at least one hit; counts are compared
# as strings, matching how they appear in the parsed result.
output = [r for r in results['eGQueryResult'] if r['Count'] != "0"]
for result in output:
    print('DB:%s found %s results' % (result['DbName'], result['Count']))

DB:nuccore found 1 results
DB:snp found 5286 results
DB:gene found 1 results
DB:unigene found 1 results
DB:homologene found 1 results


## espell
Retrieves spelling suggestions.

In [9]:
query = 'geene transxript protien'

handle = Entrez.espell(term=query)
try:
    corrected = Entrez.read(handle)
finally:
    # Release the connection even if parsing the reply fails.
    handle.close()

# The suggested spelling of the whole query is under 'CorrectedQuery'.
print(corrected['CorrectedQuery'])

gene transcript protein


## ecitmatch
Retrieves PubMed IDs (PMIDs) that correspond to a set of input citation strings.

Example: retrieve PMID for multiple citations, and display their titles.

In [10]:
db = 'pubmed'
# NOTE(review): "x" is not one of the citation fields used elsewhere in this
# list — presumably it is ignored by ecitmatch; confirm before removing.
citations = [{"journal_title": "proc natl acad sci u s a", "x": 1, "year": "1991", "author_name": "mann bj", "key": "citation_1"},
             {"journal_title": "PLoS One", "year": "2016", "author_name": "pick l" }]

handle = Entrez.ecitmatch(db=db, bdata=citations)
try:
    # The reply is plain text: one '|'-separated row per citation.
    tmp = handle.read().strip().split('\n')
finally:
    handle.close()
results = [i.split("|") for i in tmp]

for result in results:
    print(result)
    # The PMID is read from the seventh field. Skip the title lookup when the
    # row is too short (e.g. an empty reply) or the field is not a numeric ID,
    # instead of failing with IndexError or sending a bad ID to esummary.
    if len(result) < 7 or not result[6].strip().isdigit():
        continue
    summary_handle = Entrez.esummary(db=db, id=result[6])
    try:
        summary = Entrez.read(summary_handle)
    finally:
        summary_handle.close()
    print(summary[0]['Title'])


['proc natl acad sci u s a', '1991', '', '', 'mann bj', 'citation_1', '2014248']
Sequence of a cysteine-rich galactose-specific lectin of Entamoeba histolytica.
['PLoS One', '2016', '', '', 'pick l', '', '27723822']
Activation of Ftz-F1-Responsive Genes through Ftz/Ftz-F1 Dependent Enhancers.


## epost
Posts a file containing a list of primary IDs to the user's environment for use with subsequent search strategies.

Example: upload a list of nuccore IDs and retrieve summaries and sequences

In [11]:
db = 'nuccore'
# Reuses the ID list left in `search_results` by an earlier cell.
ids = search_results['IdList']
rettype = 'fasta'
number_of_rows_to_display = 20

# Upload the IDs; the parsed reply holds the QueryKey/WebEnv pair that the
# requests below pass back instead of an explicit ID list.
handle = Entrez.epost(db=db, id=','.join(ids))
try:
    env = Entrez.read(handle)
finally:
    handle.close()
print(env)

handle2 = Entrez.esummary(db=db, query_key=env['QueryKey'], WebEnv=env['WebEnv'])
try:
    summaries = Entrez.read(handle2)
finally:
    # Close the summary handle itself once it has been parsed.
    handle2.close()

for title in [summary['Title'] for summary in summaries]:
    print("Title: "+title)

fetch = Entrez.efetch(db=db, rettype=rettype, query_key=env['QueryKey'], WebEnv=env['WebEnv'])
try:
    data = fetch.read().splitlines()
finally:
    fetch.close()

print("Showing %s of %s lines" % (number_of_rows_to_display, len(data)))
for line in data[:number_of_rows_to_display]:
    print(line)

{'WebEnv': 'NCID_1_15171705_130.14.22.215_9001_1510159932_1922208742_0MetA0_S_MegaStore_F_1', 'QueryKey': '1'}
Title: Homo sapiens adenylosuccinate synthase (ADSS), mRNA
Showing 20 of 42 lines
>NM_001126.3 Homo sapiens adenylosuccinate synthase (ADSS), mRNA
ACGGGAGTGGCGCGCCAGGCCGCGGAAGGGGCGTGGCCTCGGTCCGGGGTGGCGGCCGTTGCCGCCACCA
GGGCCTCTTCCTGCGGGCGGTGCTGCCGAGGCCGGCCTGCGCGGGGCAGTCATGGTACCCCCTTGAGCGG
GCTGTGGCGGAGAGCGGGGCGGGGACTGGCTGGAGGGTGGCGGCCCGGCGGGGCGGGGGCGGGGCCGGCC
TCTGGCTCCTTCTTCCTCTGCATGTGGCTGGCGGCCGCAGAGCAGTTCAGTTCGCTCACTCCTCGCCGGC
CGCCTCTCCTTCGGGCTCTCCTCGCGTCACTGGAGCCATGGCGTTCGCCGAGACCTACCCGGCGGCATCC
TCCCTGCCCAACGGCGATTGCGGCCGCCCCAGGGCGCGGCCCGGAGGAAACCGGGTGACGGTGGTGCTCG
GTGCGCAGTGGGGCGACGAAGGCAAAGGGAAGGTGGTGGACCTGCTGGCGCAGGACGCCGACATCGTGTG
CCGCTGCCAGGGAGGAAATAATGCTGGCCATACAGTTGTTGTGGATTCTGTGGAATATGATTTTCATCTC
TTACCCAGTGGAATAATTAATCCAAATGTCACTGCATTCATTGGAAATGGTGTGGTAATTCATCTACCTG
GATTGTTTGAAGAAGCAGAGAAAAATGTTCAAAAAGGAAAAGGACTAGAAGGCTGGGAAAAAAGGCTTAT
TATATCTGACAGAGCTCATATTGTATTTGATT