In [1]:
import re
import requests

In [2]:
base_url = 'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

### esearch

In [3]:
query = 'escherichia+coli[orgn]+AND+biomol+mrna[prop]'

In [4]:
db = 'nucleotide'

In [5]:
esearch_url = f'esearch.fcgi?db={db}&term={query}&usehistory=y'

In [6]:
main_url = base_url + esearch_url

In [7]:
print(
    main_url
)

http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=nucleotide&term=escherichia+coli[orgn]+AND+biomol+mrna[prop]&usehistory=y


In [11]:
def getDataFromNCBI(ncbi_url):
    resp = requests.get(ncbi_url)
    
    return resp.text

In [12]:
ncbi_data = getDataFromNCBI(main_url)

In [20]:
ncbi_data[:2000]

'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>1053</Count><RetMax>20</RetMax><RetStart>0</RetStart><QueryKey>1</QueryKey><WebEnv>MCID_636a1606422503063a0d5131</WebEnv><IdList>\n<Id>2283393694</Id>\n<Id>2283393687</Id>\n<Id>2183382558</Id>\n<Id>1946387296</Id>\n<Id>1946387294</Id>\n<Id>1946387292</Id>\n<Id>1946387290</Id>\n<Id>1946387288</Id>\n<Id>1946387286</Id>\n<Id>1946387284</Id>\n<Id>1946387282</Id>\n<Id>1946387280</Id>\n<Id>1946387278</Id>\n<Id>1946387277</Id>\n<Id>1946387275</Id>\n<Id>1946387273</Id>\n<Id>1946387272</Id>\n<Id>1946387270</Id>\n<Id>1946387268</Id>\n<Id>1946387266</Id>\n</IdList><TranslationSet><Translation>     <From>escherichia coli[orgn]</From>     <To>"Escherichia coli"[Organism]</To>    </Translation></TranslationSet><TranslationStack>   <TermSet>    <Term>"Escherichia coli"[Organism]</Term>    <Field>Organism</

In [10]:
rgx_count   = re.compile(r'<Count>(.+)</Count>') 
rgx_querkey = re.compile(r'<QueryKey>(.+)</QueryKey>')
rgx_webenv  = re.compile(r'<WebEnv>(.+)</WebEnv>')

In [14]:
count_search    = re.search(rgx_count, ncbi_data) 
querykey_search = re.search(rgx_querkey, ncbi_data)
webenv_search   = re.search(rgx_webenv, ncbi_data)

In [15]:
count_search

<re.Match object; span=(188, 207), match='<Count>1053</Count>'>

In [16]:
count    = count_search.group(1) 
querykey = querykey_search.group(1)
webenv   = webenv_search.group(1)

In [17]:
count

'1053'

In [18]:
querykey

'1'

In [19]:
webenv

'MCID_636a1606422503063a0d5131'

### efetch

In [21]:
base_url

'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/'

In [25]:
efetch_1 = f'efetch.fcgi?db={db}&WebEnv={webenv}'
efetch_2 = f'&query_key={querykey}&retstat=0'
efetch_3 = '&retmax=1&rettype=gb&mode=text'

In [26]:
main_url = base_url + efetch_1 + efetch_2 + efetch_3

In [27]:
main_url

'http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=nucleotide&WebEnv=MCID_636a1606422503063a0d5131&query_key=1&retstat=0&retmax=1&rettype=gb&mode=text'

In [28]:
gb_data = getDataFromNCBI(main_url)

In [29]:
print(
    gb_data
)

LOCUS       OX222974                2331 bp    mRNA    linear   EST 05-AUG-2022
DEFINITION  Escherichia coli EST, clone
            acgggcccca_tactacgatt.Nanopore_20201210, mRNA sequence.
ACCESSION   OX222974
VERSION     OX222974.1
DBLINK      BioProject: PRJEB54063
KEYWORDS    EST.
SOURCE      Escherichia coli
  ORGANISM  Escherichia coli
            Bacteria; Proteobacteria; Gammaproteobacteria; Enterobacterales;
            Enterobacteriaceae; Escherichia.
REFERENCE   1
  AUTHORS   Vasarhelyi,B.
  TITLE     Direct Submission
  JOURNAL   Submitted (12-JUL-2022) ELKH BRC, Kintses, Temesvari korut 62,
            Szeged, 6726, Hungary
COMMENT     sequences of antibiotic resistance gene from the metagenomes of the
            human gut ans soil samples and from the genomes of different
            pathogenic bacterial strains.
FEATURES             Location/Qualifiers
     source          1..2331
                     /organism="Escherichia coli"
                     /mol_type="mRNA"
    

In [45]:
def writeBGToFile(gbData):
    
    data_id = gb_data.split()[1].strip()
    
    with open(f'./{data_id}.gb', 'w') as f_out:
        f_out.write(gbData)

In [46]:
writeBGToFile(gb_data)

Data is now in GB format -> FASTA

In [78]:
def gbToFasta(path_to_gb):
    header = '>'
    flag = False
    
    with open(path_to_gb, 'r') as f:
        for line in f:
            line = line.strip()
            
            if line.startswith("LOCUS"):
                header += line.split()[1]
                print(header)
                continue
            
            if line.startswith("ORIGIN"):
                flag = True
                continue
                
            if flag:
                sequence = line.split()[1:]
                sequence = ''.join(sequence).upper()
                print(sequence)
            
            

In [79]:
gbToFasta('./OX222974.gb')

>OX222974
GAATTCACGCGCGATACGACCCACTGCCATCAGCCCTTCCGGGGTGATTTCACCACCCGG
AGAACGCGGGATCACCGAGTAGGTGCCGTCTTTTTGGATGTTAGCGAGGAAGTTGTCGTT
AGAATCCTGCAGCGGAGTATGTTCCGGCTTCAGAATGTATTCGTTCCAGCAGGAGGCCAG
CAGCGAACCGACGGTCGGTTTACAAACTTCACAACCGTAACCTTTGCCGTGTTTCGCCAG
CAGTTCTTCGAAGGTTTTAATGCCTTCAACGCGGATCAAATGGAACAGTTCCTGACGCGA
ATAAGCAAAGTGTTCACACAGGTTGTTGTTAACTTCGATACCCTGTTTCGCCAGTTCCGC
GTTCAGTACCTGAGTGACCAGCGGAATACAACCGCCGCAGCCCGTACCGGCTTTGGTTTC
GGCTTTCAGCGCCGCAACGGTGTGGCAGCCTTTGTTGATAGCAGCAATCAGGTCGCCTTT
GGTGACGTCGAAGCAGGAGCAGATTTGCGCGCTGTCCGGCAGTTTATCAACACCGATAGA
CGGCTTACCGCTACCCGAGTGCGCCGGCAGAATCAGAGAATCCGGGTTTTCCGGCAGTTC
GATAGCGTTCAGCACCAGTTGCAGCAGGTTGCCGTAGTCGCTGGTATCGCCCACCAGTAC
CGCACCGAGCAGGGTTTTGTTGTCTTCGCTGACAATCAGGCGTTTGTAGATCTCTTTGCT
TTCGTCGAGGTAAACGTAGCTACGTGCGCCAGGCGTGCGACCGTGCGCATCACCAATACC
GCCTACGTCTACGCCCAGCAGTTTCAGCTTGGCGCTAAGGTCTGCACCTTCAAAGGCGTT
TTCGCTACCAAGAATATGGTCAACGGCGACCTGCGCCATTTTGTAGCCTGGTGCTACCAG
ACCAAATACACGGTTGTTCCAGCTTGCGCATTCACCGATGGCGTAGATATCCGGATCGGA
AGTCTGGCAGGAAT