In [1]:
# download package
# ! pip install biopython

# import package
from Bio import Entrez
from urllib.error import HTTPError
import requests, sys, json, string, pprint
import xml.etree.ElementTree as ET

# Gene

- ENSEMBL

In [2]:
def cafegenetree_search(symbol=""):
    """
    Searches the cafe tree of the gene tree that contains the gene (symbol) in Ensemble (only for Homo sapiens)
    Returns the cafe tree of the gene tree 
    """
    try:
        server = "https://rest.ensembl.org"
        ext = "/cafe/genetree/member/symbol/homo_sapiens/{}?nh_format=simple".format(symbol)
        url = server + ext

        r = requests.get(url, headers={"Content-Type":"application/json"})

        decoded = r.json()

        if list(decoded.keys())[0] != "error":
            return decoded['tree']
        else:
            first_attempt = "error"

        # revise symbol input    
        if first_attempt == "error":
            symbol = symbol.replace(" ", "").replace("-", "")

            ext = "/cafe/genetree/member/symbol/homo_sapiens/{}?nh_format=simple".format(symbol)
            url = server + ext

            r = requests.get(url, headers={"Content-Type":"application/json"})

            decoded = r.json()

            if list(decoded.keys())[0] != "error":
                return decoded['tree']
            else:
                return "Error input: Please enter a gene term."
    
    # avoid no string parameter
    except AttributeError:
        return "Error input: Please enter a gene term."  

    

def genetree_search(symbol="", seq = "cdna"):
    """
    Searches the gene tree that contains the gene (symbol) in Ensemble (only for Homo sapiens)
    Returns the gene tree including protein seq (protein) or cdna seq (cdna)
    """
    try:
        server = "https://rest.ensembl.org"
        ext = "/genetree/member/symbol/homo_sapiens/{}?prune_taxon=9526;prune_species=human;sequence={}".format(symbol, seq)
        url = server + ext

        r = requests.get(url, headers={"Content-Type":"application/json"})

        decoded = r.json()

        if list(decoded.keys())[0] != "error":
            return decoded['tree']
        else:
            first_attempt = "error"

        # revise symbol input    
        if first_attempt == "error":
            symbol = symbol.replace(" ", "").replace("-", "")

            ext = "/genetree/member/symbol/homo_sapiens/{}?prune_taxon=9526;prune_species=human;sequence={}".format(symbol, seq)
            url = server + ext

            r = requests.get(url, headers={"Content-Type":"application/json"})

            decoded = r.json()

            if list(decoded.keys())[0] != "error":
                return decoded['tree']
            else:
                return "Error input: Please enter a gene term." 
        
    # avoid no string parameter
    except AttributeError:
        return "Error input: Please enter a gene term." 


    
def homology_search(symbol="", seq = "cdna"):
    """
    Searches homology information (orthologs) by gene (symbol) in Ensemble (only for Homo sapiens)
    Returns homology information including protein seq (protein) or cdna seq (cdna)
    """
    try:
        server = "https://rest.ensembl.org"
        ext = "/homology/symbol/human/{}?type=orthologues;target_taxon=10090;target_species=mouse;sequence={}".format(symbol, seq)
        url = server + ext

        r = requests.get(url, headers={"Content-Type":"application/json"})

        decoded = r.json()

        if list(decoded.keys())[0] != "error":
            return decoded["data"][0]["homologies"][0]
        else:
            first_attempt = "error"

        # revise symbol input    
        if first_attempt == "error":
            symbol = symbol.replace(" ", "").replace("-", "")

            ext = "/homology/symbol/human/{}?type=orthologues;target_taxon=10090;target_species=mouse;sequence={}".format(symbol, seq)
            url = server + ext

            r = requests.get(url, headers={"Content-Type":"application/json"})

            decoded = r.json()

            if list(decoded.keys())[0] != "error":
                return decoded["data"][0]["homologies"][0]
            else:
                return "Error input: Please enter a gene term." 
            
    # avoid no string parameter
    except AttributeError:
        return "Error input: Please enter a gene term." 


def database_search(symbol=""):
    """
    Searches gene (symbol) in a linked external database in Ensemble (only for Homo sapiens)
    Returns a linked external database
    """
    # remove spaces
    symbol = symbol.replace(" ", "")
    
    # remove all punctuations in symbol
    for i in string.punctuation:
        symbol = symbol.replace(i, "")

    server = "https://rest.ensembl.org"
    ext = "/lookup/symbol/homo_sapiens"
    url = server + ext
    
    data_dic = {}
    data_dic["symbols"] = [symbol]
    data_input = str(data_dic).replace('"', '*').replace("'", '"').replace("*", "'")

    r = requests.post(url, headers={"Content-Type":"application/json", "Accept":"application/json"}, data=data_input)
        
    decoded = r.json()
    
    # check whether the result is empty
    if len(decoded) != 0:
        return decoded
    else:
        return "Error input: Please enter a gene term."


def ex_database_search(symbol=""):
    """
    Searches gene (symbol) in a linked external database in Ensemble (only for Homo sapiens)
    Returns a linked external database
    """
    try:
        # remove spaces
        symbol = symbol.replace(" ", "")

        # remove all punctuations in symbol
        for i in string.punctuation:
            symbol = symbol.replace(i, "")

        if len(symbol) != 0:
            server = "https://rest.ensembl.org"
            ext = "/xrefs/symbol/homo_sapiens/{}?".format(symbol)
            url = server + ext

            r = requests.get(url, headers={ "Content-Type":"application/json"})

            decoded = r.json()

            if len(decoded) != 0 and list(decoded)[0].keys() != "error":
                return decoded[0]
            else:
                first_attempt = "error"

            # revise symbol input    
            if first_attempt == "error":
                symbol = symbol.replace(" ", "").replace("-", "")

                ext = "/xrefs/symbol/homo_sapiens/{}?".format(symbol)
                url = server + ext

                r = requests.get(url, headers={ "Content-Type":"application/json"})

                decoded = r.json()

                if len(decoded) != 0 and list(decoded.keys())[0] != "error":
                    return decoded[0]
                else:
                    return "Lookup found nothing."
        else:
            return "Error input: Please enter a gene term."
        
    # avoid no string parameter
    except AttributeError:
        return "Error input: Please enter a gene term." 

In [3]:
pprint.pprint(cafegenetree_search("myc"))

{'children': [{'children': [{'children': [{'id': 4015400732,
                                           'lambda': 8.82715e-05,
                                           'n_members': 0,
                                           'name': 'Caenorhabditis elegans '
                                                   'strain N2',
                                           'p_value_lim': 0.01,
                                           'tax': {'common_name': 'Caenorhabditis '
                                                                  'elegans',
                                                   'id': 6239,
                                                   'production_name': 'caenorhabditis_elegans',
                                                   'scientific_name': 'Caenorhabditis '
                                                                      'elegans '
                                                                      'strain '
                                        

In [4]:
pprint.pprint(genetree_search("myc", seq = "cdna"))

{'branch_length': 0.01131,
 'children': [{'branch_length': 0.003615,
               'children': [{'branch_length': 0,
                             'children': [{'branch_length': 0.001108,
                                           'children': [{'branch_length': 0,
                                                         'children': [{'branch_length': 0,
                                                                       'children': [{'branch_length': 0.001108,
                                                                                     'confidence': {},
                                                                                     'id': {'accession': 'ENSPTRG00000020581',
                                                                                            'source': 'EnsEMBL'},
                                                                                     'sequence': {'id': [{'accession': 'ENSPTRP00000075568',
                                               

In [5]:
pprint.pprint(homology_search("myc", seq = "cdna"))

{'dn_ds': None,
 'method_link_type': 'ENSEMBL_ORTHOLOGUES',
 'source': {'align_seq': 'CTGGATTTTTTTCGGGTAGTGGAAAACCAGCAGCCTCCCGCGACGATGCCCCTCAACGTTAGCTTCACCAACAGGAACTATGACCTCGACTACGACTCGGTGCAGCCGTATTTCTACTGCGACGAGGAGGAGAACTTCTACCAGCAGCAGCAGCAGAGCGAGCTGCAGCCCCCGGCGCCCAGCGAGGATATCTGGAAGAAATTCGAGCTGCTGCCCACCCCGCCCCTGTCCCCTAGCCGCCGCTCCGGGCTCTGCTCGCCCTCCTACGTTGCGGTC---ACACCCTTCTCCCTTCGGGGAGACAACGACGGCGGTGGCGGGAGCTTCTCCACGGCCGACCAGCTGGAGATGGTGACCGAGCTGCTGGGAGGAGACATGGTGAACCAGAGTTTCATCTGCGACCCGGACGACGAGACCTTCATCAAAAACATCATCATCCAGGACTGTATGTGGAGCGGCTTCTCGGCCGCCGCCAAGCTCGTCTCAGAGAAGCTGGCCTCCTACCAGGCTGCGCGCAAAGACAGCGGCAGCCCGAACCCCGCCCGCGGCCACAGCGTCTGCTCCACCTCCAGCTTGTACCTGCAGGATCTGAGCGCCGCCGCCTCAGAGTGCATCGACCCCTCGGTGGTCTTCCCCTACCCTCTCAACGACAGCAGCTCGCCCAAGTCCTGCGCCTCGCAAGACTCCAGCGCCTTCTCTCCGTCCTCGGATTCTCTGCTCTCCTCGACGGAGTCCTCCCCGCAGGGCAGCCCCGAGCCCCTGGTGCTCCATGAGGAGACACCGCCCACCACCAGCAGCGACTCTGAGGAGGAACAAGAAGATGAGGAAGAAATCGATGTTGTTTCTGTGGAAAAGAGGCAGGCTCCTGGCAAAAGGTCAGAGTCTGGATCACCTTCTGCTGGAGGCCACAGCAA

In [6]:
pprint.pprint(database_search("myc"))

{'myc': {'assembly_name': 'GRCh38',
         'biotype': 'protein_coding',
         'db_type': 'core',
         'description': 'MYC proto-oncogene, bHLH transcription factor '
                        '[Source:HGNC Symbol;Acc:HGNC:7553]',
         'display_name': 'MYC',
         'end': 127742951,
         'id': 'ENSG00000136997',
         'logic_name': 'ensembl_havana_gene_homo_sapiens',
         'object_type': 'Gene',
         'seq_region_name': '8',
         'source': 'ensembl_havana',
         'species': 'homo_sapiens',
         'start': 127735434,
         'strand': 1,
         'version': 21}}


In [7]:
pprint.pprint(ex_database_search("myc"))

{'id': 'ENSG00000136997', 'type': 'gene'}


- NCBI gene

In [5]:
def initialization():
    """
    Configures this module and the environment 
    """
    Entrez.email = "alice810415@gmail.com"
    Entrez.tool = "BiologicalSearch"


def genesearch_ID(symbol, retmax=1):
    """
    Searches a gene term in Gene of NCBI (only for Homo sapiens)
    Returns an Gene ID list
    """
    # remove all punctuations in symbol
    for i in string.punctuation:
        symbol = symbol.replace(i, "")
    
    # search that symbol from the voice panel or text input in Gene of NCBI
    handle = Entrez.esearch(db="gene", term=f"Homo sapiens[Orgn] AND {symbol}[Gene]", idtype="acc", remode="json", retmax=retmax)
    record = Entrez.read(handle)
    id_list = record["IdList"]
    
    if len(id_list) != 0:
        return id_list
    else:
        return "Lookup found nothing."
    #TODO: something wrong in "abc"


def genesearch_res(res_list=[""]):
    """
    Searches the Gene ID list in Gene of NCBI
    Returns gene information
    """
    try:
        # advoid a blank list
        if len(res_list) != 0:
            if res_list != "Lookup found nothing.": 
                my_output = {}
                my_dict = {}

                for i in res_list:
                    handle = Entrez.efetch(db="gene", id=i, rettype="gb", retmode="text")
                    res = handle.read()

                    if "This record was replaced with GeneID" not in res:
                        # set the key 
                        start = res.find("\n1. ")
                        end = res.find("\nOfficial Symbol:")
                        dic_key = res[start:end].replace("\n1. ", "")

                        # set the Gene ID
                        my_dict["Gene ID"] = i

                        # convert text into dict
                        res = res.replace("\n1. {}\n".format(dic_key), "").replace("\n\n", "").replace(" and ", "\n").replace("; Location: ", "\nLocation: ")

                        start = res.find("Other Designations")
                        end = res.find("\nChromosome")
                        output = res[start:end].replace("\n", " ")

                        res = res.replace(res[start:end], output)

                        _dict = dict((x.strip(), y.strip()) for x, y in (element.split(':') for element in res.split('\n')))
                        my_dict["result"] = _dict

                        my_output[dic_key] = my_dict
                        my_dict = {}

                return my_output
            else:
                return "Lookup found nothing."
        else:
            # avoid empty lists.
            return "Error input: Please enter IDs in a list."

    # avoid unacceptable term
    except HTTPError:
        return "Error input: Please enter a gene ID."
    
    except IndexError:
        return "Error input: Please enter a ID list." 

    except TypeError:
        return "Error input: Please enter a list."
    
    # avoid empty string in a list
    except ValueError:
        return "Error input: Please enter a list."


def genesearch(symbol, retmax=1):
    initialization()
    
    return genesearch_res(genesearch_ID(symbol, retmax))

In [6]:
pprint.pprint(genesearch("myc", retmax=1))

{'MYC': {'Gene ID': '4609',
         'result': {'Annotation': 'Chromosome 8 NC_000008.11 '
                                  '(127735434..127742951)',
                    'Chromosome': '8',
                    'ID': '4609',
                    'Location': '8q24.21',
                    'MIM': '190080',
                    'Name': 'MYC proto-oncogene, bHLH transcription factor '
                            '[Homo sapiens (human)]',
                    'Official Symbol': 'MYC',
                    'Other Aliases': 'MRTLC, bHLHe39, c-Myc, MYC',
                    'Other Designations': 'myc proto-oncogene protein; avian '
                                          'myelocytomatosis viral oncogene '
                                          'homolog; class E basic '
                                          'helix-loop-helix protein 39; '
                                          'myc-related '
                                          'translation/localization regulatory '
                

- NCBI nucleotide

In [64]:
def ntsearch_ID(symbol, ret_max=1):
    """
    Searches a gene symbol in Genbank of NCBI (only for Homo sapiens)
    Returns an UID list
    """
    # remove spaces
    symbol = symbol.replace(" ", "")
    
    # remove all punctuations in symbol
    for i in string.punctuation:
        symbol = symbol.replace(i, "")
        
    # set error for an unacceptable symbol
    if symbol == "":
        first_attempt = "error"
    else:
        first_attempt = "pass"
    
    # search that symbol from the voice panel or text input in Genbank of NCBI
    handle = Entrez.esearch(db="nuccore", term=f"Homo sapiens[Orgn] AND {symbol}[Gene]", remode="json", retmax=ret_max)
    record = Entrez.read(handle)
    id_list = record["IdList"]
    
    if len(id_list) != 0 and first_attempt != "error":
        return id_list
    else:
        return "Lookup found nothing."
    #TODO: something wrong in "abc"


def ntsearch_res(res_list=[""]):
    """
    Searches UID list in Genbank of NCBI
    Returns gene information in the whole UID list
    """
    try:
        # advoid a blank list
        if len(res_list) != 0:
            if res_list != "Lookup found nothing.": 
                my_output = {}
                my_dict = {}
                my_list = []
                my_test = []

                for i in res_list:
                    handle = Entrez.efetch(db="nucleotide", id=i, rettype="gb", retmode="txt")
                    res = handle.read()

                    # check whether it exists in the dict
                    start = res.find("VERSION")
                    end = res.find("\nKEYWORDS")
                    output = res[start:end]

                    if my_output and output.split()[1] in my_test:
                        continue

                    my_test.append(output.split()[1])

                    start = res.find("LOCUS")
                    end = res.find("\nDEFINITION")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("DEFINITION")
                    end = res.find("\nACCESSION")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("ACCESSION")
                    end = res.find("\nVERSION")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("VERSION")
                    end = res.find("\nKEYWORDS")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("KEYWORDS")
                    end = res.find("\nSOURCE")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("SOURCE")
                    end = res.find("\n  ORGANISM")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("ORGANISM")
                    end = res.find("\nREFERENCE   1")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("FEATURES")
                    output = res[start:]
                    my_dict["FEATURES".title()] = output[46:]

                    my_output[i] = my_dict
                    my_dict = {}

                return (res, my_output)
            else:
                return "Lookup found nothing."

        else:
            # avoid empty lists.
            return "Error input: Please enter IDs in a list."

    # avoid unacceptable term
    except HTTPError:
        return "Error input: Please enter a gene ID."

    # avoid empty string in a list
    except IndexError:
        return "Error input: Please enter a ID list." 

    except TypeError:
        return "Please key a list." 
        
        
def ntsearch(symbol, retmax=1):
    initialization()
    
    return ntsearch_res(ntsearch_ID(symbol, retmax))

In [67]:
print(ntsearch("myc", retmax=1)[0])

LOCUS       NG_007161              14518 bp    DNA     linear   PRI 15-NOV-2021
DEFINITION  Homo sapiens MYC proto-oncogene, bHLH transcription factor (MYC),
            RefSeqGene (LRG_1397) on chromosome 8.
ACCESSION   NG_007161
VERSION     NG_007161.2
KEYWORDS    RefSeq; RefSeqGene.
SOURCE      Homo sapiens (human)
  ORGANISM  Homo sapiens
            Eukaryota; Metazoa; Chordata; Craniata; Vertebrata; Euteleostomi;
            Mammalia; Eutheria; Euarchontoglires; Primates; Haplorrhini;
            Catarrhini; Hominidae; Homo.
REFERENCE   1  (bases 1 to 14518)
  AUTHORS   Hann SR, King MW, Bentley DL, Anderson CW and Eisenman RN.
  TITLE     A non-AUG translational initiation in c-myc exon 1 generates an
            N-terminally distinct protein whose synthesis is disrupted in
            Burkitt's lymphomas
  JOURNAL   Cell 52 (2), 185-195 (1988)
   PUBMED   3277717
COMMENT     REVIEWED REFSEQ: This record has been curated by NCBI staff in
            collaboration with Zahra Jala

- ENA

In [8]:
def ENAsearch_gene(symbol=""):
    """
    Searches a gene term in ENA
    Returns the whole text
    """
    try:
        # remove spaces
        symbol = symbol.replace(" ", "")

        # remove all punctuations in symbol
        for i in string.punctuation:
            symbol = symbol.replace(i, "")

        if len(symbol) != 0:
            server = "https://www.ebi.ac.uk/ena/browser/api"
            ext = "/embl/textsearch?domain=embl&query={0}%20AND%20human&limit=1".format(symbol)
            url = server + ext

            r = requests.get(url)

            res = r.text
            
            my_dict = {}
            my_list = []
            
            start = res.find("ID")
            end = res.find("\nXX\nAC")
            output = res[start:end]
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = my_list[1]
            my_list = []
            
            start = res.find("AC")
            end = res.find("\nXX\nDT")
            output = res[start:end]
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = my_list[1]
            my_list = []
            
            start = res.find("\nXX\nDT")
            end = res.find("\nXX\nDE")
            output = res[start:end].replace("\nXX\n", "")
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = my_list[1]
            my_list = []
            #TODO check
            
            start = res.find("DE")
            end = res.find("\nXX\nKW")
            output = res[start:end]
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = my_list[1]
            my_list = []
            
            start = res.find("OS")
            end = res.find("\nOC")
            output = res[start:end]
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = my_list[1]
            my_list = []
            
            start = res.find("OC")
            end = res.find("\nXX\nRN")
            output = res[start:end]
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = my_list[1]
            my_list = []
            #TODO check
            
            start = res.find("FH")
            end = res.find("\n//\n")
            output = res[start:end]
            my_list = output.split("   ")
            list_key = my_list[0]
            my_dict[list_key] = output
            my_list = []

            return my_dict
        else:
            return "Error input: Please enter a gene term."
    
    # avoid not string argument
    except AttributeError:
        return "Error input: Please enter a gene term."

    # avoid not string argument
    except SyntaxError:
        return "Error input: Please enter a gene term."
    

def ENAsearch_fasta(symbol=""):
    """
    Searches a gene term in ENA
    Returns gene fasta
    """
    try:
        # remove spaces
        symbol = symbol.replace(" ", "")

        # remove all punctuations in symbol
        for i in string.punctuation:
            symbol = symbol.replace(i, "")

        server = "https://www.ebi.ac.uk/ena/browser/api"
        ext = "/fasta/textsearch?domain=embl&query={0}%20AND%20human&limit=1".format(symbol)
        url = server + ext

        r = requests.get(url)

        decoded = r.text
        if len(decoded) != 0:
            return decoded
        else:
            return "Error input: Please enter a gene term."
    
    # avoid not string argument
    except AttributeError:
        return "Error input: Please enter a gene term."

In [50]:
pprint.pprint(ENAsearch_gene("myc"))

{'AC': 'X00196; K01902-K01903;',
 'DE': 'Homo sapiens partial MYC gene for myc oncogene, exons 1-2 and joined '
       'CDS',
 'DT': '28-JAN-1986 (Rel. 08, Created)\nDT',
 'FH': 'FH   Key             Location/Qualifiers\n'
       'FH\n'
       'FT   source          1..3324\n'
       'FT                   /organism="Homo sapiens"\n'
       'FT                   /mol_type="genomic DNA"\n'
       'FT                   /db_xref="taxon:9606"\n'
       'FT   regulatory      40..43\n'
       'FT                   /gene="MYC"\n'
       'FT                   /regulatory_class="TATA_box"\n'
       'FT   regulatory      343..348\n'
       'FT                   /gene="MYC"\n'
       'FT                   /regulatory_class="TATA_box"\n'
       'FT   exon            360..925\n'
       'FT                   /gene="MYC"\n'
       'FT                   /number=1\n'
       'FT   regulatory      503..509\n'
       'FT                   /gene="MYC"\n'
       'FT                   /regulatory_class="TATA_b

In [14]:
pprint.pprint(ENAsearch_fasta("myc"))

('>ENA|X00196|X00196.1 Homo sapiens partial MYC gene for myc oncogene, exons '
 '1-2 and joined CDS\n'
 'TTCTCGTGTGGAGGGCAGCTGTTCCGCCTGGCGTAGATTTATACTCACAGGATAAGGTAA\n'
 'CGGTTTGTCAAACAGTACTGCTACGGAGGAGCAGCAGAGAAAGGGAGAGGGTTTGAGAGG\n'
 'GAGCGAAAAGAAAATGGTAGGCGCGCGTAGTTAATTCAATGCGGCTCTCTTACTCTGTTT\n'
 'ACATCCTAGAGCTAGAGTGCTCGGCTGCCCGGCTGAGTCTCCTCCCCACCTTCCCCACCC\n'
 'TCCCCACCCTCCCCATAAGCGCCCTCCCGGGTTCCCAAAGCAGAGGGCGTGGGGGAAAAG\n'
 'AAAAAAGATCCTCTCTCGCTAACTCTCCGCCCACCGGCCCTTTATAATGCGAGGGTCTGG\n'
 'ACGGCTGAGGACCCCCGAGCTGTGCTGCTCGCGGCCGCCACCGCCGGGCCCCGGCCGTCC\n'
 'CTGGCTCCCCTCCTGCCTCGAGAAGGGCAGGGCTTCTCAGAGGCTTGGCGGGAAAAAGAA\n'
 'CGGAGGGAGGGATCGCGCTGAGTATAAAAGCCGGTTTTCGGGGCTTTATCTAACTCGCTG\n'
 'TAGTAATTCCAGCGAGAGGCAGAGGGAGCGAGCGGGCGGCCGGCTAGGGTGGAAGAGCCG\n'
 'GGCGAGCAGAGCTGCGCTGCGGGCGTCCTGGGAAGGGAGATCCGGAGCGAATAGGGGGCT\n'
 'TCGCCTCTGGCCCAGCCCTCCCGCTGATCCCCCAGCCAGCGGTCCGCAACCCTTGCCGCA\n'
 'TCCACGAAACTTTGCCCATAGCAGCGGGCGGGCACTTTACGACTGGAACTTACAACACCC\n'
 'GAGCAAGGACGCGACTCTCCGACGCGGGGAGGCTATT

- HGNC

In [9]:
def hgnc_search(symbol):
    """    
    Searches a gene term (only accept approved symbol) in HGNC (only for Homo sapiens)
    Returns the gene information 
    """
    # remove spaces
    symbol = symbol.replace(" ", "")
    
    # remove all punctuations in symbol
    for i in string.punctuation:
        symbol = symbol.replace(i, "")
    
    server = "http://rest.genenames.org"
    ext = "/fetch/symbol/{}".format(symbol)
    url = server + ext
    
    r = requests.get(url, headers={ "Accept" : "application/json"})
    
    try:
        decoded = r.json()
        if len(decoded['response']['docs']) != 0:
            return decoded
        else:
            return "Lookup found nothing." 
    except:
        return "Lookup found nothing." 

In [43]:
pprint.pprint(hgnc_search("myc"))

{'response': {'docs': [{'_version_': 1716661428066189312,
                        'agr': 'HGNC:7553',
                        'alias_symbol': ['c-Myc', 'bHLHe39', 'MYCC'],
                        'ccds_id': ['CCDS6359', 'CCDS87627'],
                        'cosmic': 'MYC',
                        'date_approved_reserved': '2001-06-22T00:00:00Z',
                        'date_modified': '2021-04-13T00:00:00Z',
                        'date_name_changed': '2017-02-27T00:00:00Z',
                        'ensembl_gene_id': 'ENSG00000136997',
                        'entrez_id': '4609',
                        'gencc': 'HGNC:7553',
                        'gene_group': ['Basic helix-loop-helix proteins'],
                        'gene_group_id': [420],
                        'hgnc_id': 'HGNC:7553',
                        'location': '8q24.21',
                        'location_sortable': '08q24.21',
                        'locus_group': 'protein-coding gene',
                        'lo

# gene/protein

- EBI

In [10]:
def ebisearch_gene(symbol=""):
    """
    Searches gene in EBI
    Returns the result
    """
    try:
        # remove all punctuations in symbol
        for i in string.punctuation:
            symbol = symbol.replace(i, " ")

        # remove spaces
        symbol = symbol.replace(" ", "")

        if len(symbol) != 0:
            server = "https://www.ebi.ac.uk"

            # if symbol is gene
            ext = "/proteins/api/proteins?offset=0&size=10&gene={}&organism=Homo%20sapiens".format(symbol)
            url = server + ext

            r = requests.get(url, headers={ "Accept": "application/json"})

            decoded = r.json()
            
            if len(decoded) != 0:
                return decoded[0]
            else:
                return "Error input: Please enter a gene term."

        else:
            return "Error input: Please enter a gene term."
        
    except AttributeError:
        return "Error input: Please enter a gene term."
    # TODO: something wrong in "abc"
    

def ebisearch_protein(symbol=""):
    """
    Searches protein in EBI
    Returns the result
    """
    try:
        # remove all punctuations in symbol
        for i in string.punctuation:
            symbol = symbol.replace(i, " ")

        # remove space at the beginning and end of the symbol string
        symbol = symbol.strip()

        # modify the symbol to follow the instructions of the API
        symbol = symbol.replace(" ", "%20")

        if len(symbol) != 0:
            server = "https://www.ebi.ac.uk"

            ext = "/proteins/api/proteins?offset=0&size=10&protein={}&organism=Homo%20sapiens".format(symbol)
            url = server + ext

            r = requests.get(url, headers={ "Accept": "application/json"})

            decoded = r.json()

            # check whether the result is correct
            if len(decoded[0].keys()) > 2:
                return decoded[0]
            else:            
                return "Error input: Please enter a protein term."

        else:
            return "Error input: Please enter a protein term."
        
    except AttributeError:
        return "Error input: Please enter a protein term."
    # TODO: something wrong in "abc"

In [18]:
pprint.pprint(ebisearch_gene("myc"))

{'accession': 'A0A024R9L7',
 'comments': [{'text': [{'evidences': [{'code': 'ECO:0000256',
                                        'source': {'id': 'PIRNR001705',
                                                   'name': 'PIRNR',
                                                   'url': 'https://www.uniprot.org/unirule/PIRNR001705'}}],
                         'value': 'Transcription factor that binds DNA in a '
                                  'non-specific manner, yet also specifically '
                                  'recognizes the core sequence '
                                  "5'-CAC[GA]TG-3'. Activates the "
                                  'transcription of growth-related genes'}],
               'type': 'FUNCTION'},
              {'text': [{'evidences': [{'code': 'ECO:0000256',
                                        'source': {'id': 'PIRNR001705',
                                                   'name': 'PIRNR',
                                                   'u

In [19]:
pprint.pprint(ebisearch_protein("tyrosine kinase"))

{'accession': 'A0A024QZA8',
 'comments': [{'reaction': {'dbReferences': [{'id': 'RHEA:10596',
                                              'type': 'Rhea'},
                                             {'id': 'RHEA-COMP:10136',
                                              'type': 'Rhea'},
                                             {'id': 'RHEA-COMP:10137',
                                              'type': 'Rhea'},
                                             {'id': 'CHEBI:15378',
                                              'type': 'ChEBI'},
                                             {'id': 'CHEBI:30616',
                                              'type': 'ChEBI'},
                                             {'id': 'CHEBI:46858',
                                              'type': 'ChEBI'},
                                             {'id': 'CHEBI:82620',
                                              'type': 'ChEBI'},
                                             {'id':

# protein

- NCBI

In [11]:
def initialization():
    """
    Configures this module and the environment 
    """
    Entrez.email = "alice810415@gmail.com"
    Entrez.tool = "BiologicalSearch"


def proteinsearch_GI(term, ret_max=1):
    """
    Searches a protein term in Protein of NCBI (only for Homo sapiens)
    Returns an GI list
    """
    # remove all punctuations in symbol
    for i in string.punctuation:
        term = term.replace(i, " ")
    
    # search that term from the voice panel or text input in protein of NCBI
    handle = Entrez.esearch(db="protein", term=f"Homo sapiens[Orgn] AND {term}[Protein]", remode="json", retmax=ret_max)
    record = Entrez.read(handle)
    id_list = record["IdList"]
    
    if len(id_list) == 0 or "2127821799" in id_list:
        return "Lookup found nothing." 
    else:
        return id_list


def proteinsearch_res(res_list=[""]):
    """
    Searches GIs in Protein of NCBI
    Returns the protein informatio in the whole list
    """ 
    try:
        # advoid a blank list
        if len(res_list) != 0:
            
            # searches GIs in Protein of NCBI
            if res_list != "Lookup found nothing.": 
                my_output = {}
                my_dict = {}
                my_list = []
                my_test = []

                for i in res_list:
                    handle = Entrez.efetch(db="protein", id=i, rettype="gb", retmode="text")
                    res = handle.read()

                    # check whether it exists in the dict
                    start = res.find("VERSION")
                    end = res.find("\nKEYWORDS")
                    output = res[start:end]

                    if my_output and output.split()[1] in my_test:
                        continue

                    my_test.append(output.split()[1])

                    start = res.find("LOCUS")
                    end = res.find("\nDEFINITION")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("DEFINITION")
                    end = res.find("\nACCESSION")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("ACCESSION")
                    end = res.find("\nVERSION")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("VERSION")
                    end = res.find("\nDBSOURCE")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("DBSOURCE")
                    end = res.find("\nKEYWORDS")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("KEYWORDS")
                    end = res.find("\nSOURCE")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("SOURCE")
                    end = res.find("\n  ORGANISM")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("ORGANISM")
                    end = res.find("\nREFERENCE   1")
                    output = res[start:end]
                    my_list = output.split()
                    list_key = my_list[0]
                    my_list.pop(0)
                    my_dict[list_key.title()] = " ".join(my_list)
                    my_list = []

                    start = res.find("FEATURES")
                    end = res.find("\nORIGIN")
                    output = res[start:end]
                    my_dict["FEATURES".title()] = output
                    my_list = []

                    start = res.find("ORIGIN")
                    output = res[start:]
                    my_dict["ORIGIN".title()] = output.replace("\n//\n\n", "")
                    my_list = []

                    my_output[i] = my_dict
                    my_dict = {}

                return my_output
            else:
                return "Lookup found nothing." 
        else:
            # avoid a empty list 
            return "Error input: Please enter GIs in a list."

    # avoid unacceptable term
    except HTTPError:
        return "Error input: Please enter a protein term."

    # avoid empty string in a list
    except IndexError:
        return "Error input: Please enter a GI list." 

    except TypeError:
        return "Please key a list." 


def proteinsearch(symbol, retmax=1):
    initialization()
    
    return proteinsearch_res(proteinsearch_GI(symbol, retmax)) 

In [44]:
pprint.pprint(proteinsearch("tyrosine kinase"))

{'1161364': {'Accession': 'AAB60412',
             'Dbsource': 'accession AH005334.2',
             'Definition': 'tyrosine kinase [Homo sapiens].',
             'Features': 'FEATURES             Location/Qualifiers\n'
                         '     source          1..527\n'
                         '                     /organism="Homo sapiens"\n'
                         '                     /db_xref="taxon:9606"\n'
                         '                     /cell_type="leukocyte"\n'
                         '                     /tissue_type="blood"\n'
                         '                     /dev_stage="adult"\n'
                         '     Protein         1..527\n'
                         '                     /product="tyrosine kinase"\n'
                         '     Region          85..139\n'
                         '                     /region_name="SH3_TXK"\n'
                         '                     /note="Src Homology 3 domain of '
                  

- UniProt

In [12]:
def uniprot_search(symbol="", formattype="tab"):
    """
    Searches protein in UniProt
    Returns the result
    """
    try:
        # remove all punctuations in symbol
        for i in string.punctuation:
            symbol = symbol.replace(i, " ")

        # remove space at the beginning and end of the symbol string
        symbol = symbol.strip()

        if len(symbol) != 0:
            server = "https://www.uniprot.org/uniprot"

            # format: fasta, list, txt, xml, tab
            ext = "/?query=organism:9606+AND+{0}&limit=2&columns=id,entry name,reviewed,protein names,genes,length&format={1}".format(symbol, formattype)
            url = server + ext

            r = requests.get(url)

            decoded = r.text

            return decoded
        else:
            return "Error input: Please enter a protein term."
        
    # avoid not string argument
    except AttributeError:
        return "Error input: Please enter a protein term."

In [15]:
# better to use accession from NCBI
print(uniprot_search("P51460", "xml"))

<?xml version='1.0' encoding='UTF-8'?>
<uniprot xmlns="http://uniprot.org/uniprot" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://uniprot.org/uniprot http://www.uniprot.org/support/docs/uniprot.xsd">
<entry dataset="Swiss-Prot" created="2000-05-30" modified="2021-09-29" version="139">
<accession>Q99622</accession>
<accession>B2R4Q6</accession>
<name>C10_HUMAN</name>
<protein>
<recommendedName>
<fullName>Protein C10</fullName>
</recommendedName>
</protein>
<gene>
<name type="primary">C12orf57</name>
<name type="synonym">C10</name>
</gene>
<organism>
<name type="scientific">Homo sapiens</name>
<name type="common">Human</name>
<dbReference type="NCBI Taxonomy" id="9606"/>
<lineage>
<taxon>Eukaryota</taxon>
<taxon>Metazoa</taxon>
<taxon>Chordata</taxon>
<taxon>Craniata</taxon>
<taxon>Vertebrata</taxon>
<taxon>Euteleostomi</taxon>
<taxon>Mammalia</taxon>
<taxon>Eutheria</taxon>
<taxon>Euarchontoglires</taxon>
<taxon>Primates</taxon>
<taxon>Haplorrhini</taxo

In [16]:
# better to use accession from NCBI
xml_string = uniprot_search("P51460", "xml")

<Element '{http://uniprot.org/uniprot}uniprot' at 0x7ffa40c78ef0>

In [204]:
root = ET.fromstring(xml_string)
for c in root.findall('.//{http://uniprot.org/uniprot}title'):
    print(c.text, end="\n")

Large-scale sequencing in human chromosome 12p13: experimental and computational gene structure determination.
Complete sequencing and characterization of 21,243 full-length human cDNAs.
The status, quality, and expansion of the NIH full-length cDNA project: the Mammalian Gene Collection (MGC).
Lys-N and trypsin cover complementary parts of the phosphoproteome in a refined SCX-based approach.
Initial characterization of the human central proteome.
Comparative large-scale characterisation of plant vs. mammal proteins reveals similar and idiosyncratic N-alpha acetylation features.
Whole-exome sequencing identifies mutated c12orf57 in recessive corpus callosum hypoplasia.
A newly recognized autosomal recessive syndrome affecting neurologic function and vision.
Mutations in c12orf57 cause a syndromic form of colobomatous microphthalmia.
A human cDNA coding for the Leydig insulin-like peptide (Ley I-L).
Structural organization of the porcine and human genes coding for a Leydig cell-specific

# Pubmeb

In [217]:
def pubmed_ID(symbol=""):
    """
    Searches articles regarding gene/protein in Pubmed
    Returns a UID list
    """
    # search articles regarding gene/protein which has been published in 180 days
    server = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    ext = "/esearch.fcgi?db=pubmed&term={}&retmode=json&reldate=180&datetype=pdat&retmax=1&usehistory=y".format(symbol)
    url = server + ext

    r = requests.get(url)
    
    decoded = r.text
    
    id_list = json.loads(decoded)["esearchresult"]["idlist"]

    return id_list


def pubmed_res(res_list=[""]):
    """
    Searches a UID list of Pubmed
    Returns article names
    """ 
    server = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
    ext = "/efetch.fcgi?db=pubmed&id=17284678&retmode=xml&rettype=abstract".format(symbol)
    url = server + ext
    

In [219]:
pubmed_ID("myc")

['34799403']

In [50]:
import requests
from lxml import etree
from io import BytesIO
from xml.etree import ElementTree as ET
from bs4 import BeautifulSoup
import lxml.etree as etree
from xml.etree.ElementTree import XML, fromstring
from xml.etree import ElementTree
import xml.dom.minidom

In [49]:
# server = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils"
# ext = "/esearch.fcgi?db=pubmed&term={}&retmode=json&reldate=360&datetype=pdat&retmax=5&usehistory=y".format(symbol)
# url = server + ext

r = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=17284678&retmode=xml&rettype=abstract")
decoded = r.text
#print(decoded)
myxml = ET.fromstring(decoded)

print(ElementTree.dump(myxml))

<PubmedArticleSet><PubmedArticle><MedlineCitation Status="MEDLINE" Owner="NLM"><PMID Version="1">17284678</PMID><DateCompleted><Year>2007</Year><Month>04</Month><Day>05</Day></DateCompleted><DateRevised><Year>2018</Year><Month>11</Month><Day>13</Day></DateRevised><Article PubModel="Print-Electronic"><Journal><ISSN IssnType="Print">1088-9051</ISSN><JournalIssue CitedMedium="Print"><Volume>17</Volume><Issue>3</Issue><PubDate><Year>2007</Year><Month>Mar</Month></PubDate></JournalIssue><Title>Genome research</Title><ISOAbbreviation>Genome Res</ISOAbbreviation></Journal><ArticleTitle>Sequencing and analysis of chromosome 1 of Eimeria tenella reveals a unique segmental organization.</ArticleTitle><Pagination><MedlinePgn>311-9</MedlinePgn></Pagination><Abstract><AbstractText>Eimeria tenella is an intracellular protozoan parasite that infects the intestinal tracts of domestic fowl and causes coccidiosis, a serious and sometimes lethal enteritis. Eimeria falls in the same phylum (Apicomplexa) a

In [53]:
# for Pretty Print XML String
# https://codeblogmoney.com/xml-pretty-print-using-python-with-examples/
import xml.dom.minidom
xml = xml.dom.minidom.parseString(decoded)

xml_pretty_str = xml.toprettyxml()

print(xml_pretty_str)

<?xml version="1.0" ?>
<!DOCTYPE PubmedArticleSet
  PUBLIC '-//NLM//DTD PubMedArticle, 1st January 2019//EN'
  'https://dtd.nlm.nih.gov/ncbi/pubmed/out/pubmed_190101.dtd'>
<PubmedArticleSet>
	<PubmedArticle>
		<MedlineCitation Status="MEDLINE" Owner="NLM">
			<PMID Version="1">17284678</PMID>
			<DateCompleted>
				<Year>2007</Year>
				<Month>04</Month>
				<Day>05</Day>
			</DateCompleted>
			<DateRevised>
				<Year>2018</Year>
				<Month>11</Month>
				<Day>13</Day>
			</DateRevised>
			<Article PubModel="Print-Electronic">
				<Journal>
					<ISSN IssnType="Print">1088-9051</ISSN>
					<JournalIssue CitedMedium="Print">
						<Volume>17</Volume>
						<Issue>3</Issue>
						<PubDate>
							<Year>2007</Year>
							<Month>Mar</Month>
						</PubDate>
					</JournalIssue>
					<Title>Genome research</Title>
					<ISOAbbreviation>Genome Res</ISOAbbreviation>
				</Journal>
				<ArticleTitle>Sequencing and analysis of chromosome 1 of Eimeria tenella reveals a unique segmental organiza

AttributeError: 'xml.etree.ElementTree.Element' object has no attribute 'toprettyxml'

In [262]:
root = ET.fromstring(decoded)
for child in root.findall('.//ArticleTitle'):
    print(child.text)

Sequencing and analysis of chromosome 1 of Eimeria tenella reveals a unique segmental organization.


In [272]:
for c in root.findall('.//ArticleId'):
    print(c.text, end="\n")

17284678
gr.5823007
10.1101/gr.5823007
PMC1800922
11076859
15713233
3543808
15980438
12368867
7971922
12802683
16086015
2325704
12097910
15111065
9862982
12520045
9023104
12745365
16556216
15144565
15875012
12368864
15105014
8559656
12427472
16132816
16332714
11120685
15463458
10961844
11042156
12869580
15145805
12618375
16205714
11237011
16020726


In [142]:
data = json.loads(pubmed("myc"))

In [135]:

idlist = data["esearchresult"]["idlist"]
string = ""
number = len(idlist)
lastone = idlist[number - 1]
for item in idlist:
    if item == lastone:
        string = string + item
    else:
        string = string + item + ","

In [136]:
#決定要回傳的格式
retmode = "xml"
rettype = ""
re = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&retmode=" + retmode + "&id=" + string + "&rettype=" + rettype)
#使用python 內建的xml處理函數
tree = ET.fromstring(re.text.encode("utf-8"))

In [138]:
d = tree.findall("PubmedArticle/")
doi_list = []
n = -1
for item in d:
    temp = len(doi_list)

In [54]:
if item.tag == "PubmedData":
    for doii in item.findall("ArticleIdList"):
        
        for link in doii:
            if link.attrib["IdType"] == "doi":
                doi_list.append(link.text)
            if temp == len(doi_list):
                doi_list.append("No doi")

NameError: name 'item' is not defined

In [140]:
p = tree.findall("PubmedArticle/MedlineCitation/Article/ArticleTitle")


In [141]:
p

[<Element 'ArticleTitle' at 0x7f7fd9992d60>,
 <Element 'ArticleTitle' at 0x7f7fd8ff4a90>,
 <Element 'ArticleTitle' at 0x7f7fd92f8d60>,
 <Element 'ArticleTitle' at 0x7f7fd8faeae0>,
 <Element 'ArticleTitle' at 0x7f7fd8f11db0>]