# Literature Download

In [84]:
import csv
import urllib
import urllib.parse
import urllib.request
import xml.etree.ElementTree as ET
from pathlib import Path
from time import sleep

import pandas as pd
from bs4 import BeautifulSoup

## Searching PubMed Database

In [85]:
"""
Dependency list in case of setting up poetry
- ipykernel
"""

# ESearch endpoint of the NCBI E-utilities. NCBI documents https:// as the
# scheme for E-utilities requests, so use it directly instead of plain http.
URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi"
# Entrez database to query.
DB = "pubmed"
# Both phrases must occur in the title or the abstract of a record.
TERM = "(self-resistance[Title/Abstract]) AND (gene[Title/Abstract])"
# "y" makes ESearch return a QueryKey/WebEnv pair; the fetch cells further down
# use that pair to refer to this result set.
USEHISTORY = "y"
# Sent as the `rettype` parameter of the search request.
# NOTE(review): this name is rebound to "abstract" by the EFetch configuration
# cell later in the notebook, so re-running the search cells after that point
# picks up the other value.
RETTYPE = "xml"

In [86]:
# Query-string fields for the ESearch request, in the order they are encoded.
# NOTE(review): the ESearch documentation lists "uilist" and "count" as the
# values of `rettype`; "xml" looks like it was meant for `retmode` - confirm.
params = dict(db=DB, term=TERM, usehistory=USEHISTORY, rettype=RETTYPE)

# urlencode() percent-escapes the brackets, slashes and spaces of the term.
search_url = f"{URL}?{urllib.parse.urlencode(params)}"
print(search_url)

http://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?db=pubmed&term=%28self-resistance%5BTitle%2FAbstract%5D%29+AND+%28gene%5BTitle%2FAbstract%5D%29&usehistory=y&rettype=xml


In [87]:
# Run the search. The context manager closes the HTTP connection as soon as
# the body has been read instead of leaving it open until garbage collection.
with urllib.request.urlopen(search_url) as response:
    response_str = response.read().decode("utf-8")

# Last expression of the cell: display the raw XML returned by ESearch.
response_str

'<?xml version="1.0" encoding="UTF-8" ?>\n<!DOCTYPE eSearchResult PUBLIC "-//NLM//DTD esearch 20060628//EN" "https://eutils.ncbi.nlm.nih.gov/eutils/dtd/20060628/esearch.dtd">\n<eSearchResult><Count>150</Count><RetMax>20</RetMax><RetStart>0</RetStart><QueryKey>1</QueryKey><WebEnv>MCID_66f4418127f7d7a98a067a59</WebEnv><IdList>\n<Id>38922130</Id>\n<Id>38795484</Id>\n<Id>38704580</Id>\n<Id>38559311</Id>\n<Id>38233135</Id>\n<Id>38051581</Id>\n<Id>37961497</Id>\n<Id>37575356</Id>\n<Id>37569821</Id>\n<Id>37374983</Id>\n<Id>37198233</Id>\n<Id>37166326</Id>\n<Id>36847561</Id>\n<Id>36704842</Id>\n<Id>36671236</Id>\n<Id>36555354</Id>\n<Id>36445346</Id>\n<Id>36187977</Id>\n<Id>35855324</Id>\n<Id>35830808</Id>\n</IdList><TranslationSet/><QueryTranslation>"self-resistance"[Title/Abstract] AND "gene"[Title/Abstract]</QueryTranslation></eSearchResult>\n'

In [88]:
"""
https://docs.python.org/3.11/library/xml.etree.elementtree.html#parsing-xml
fromstring() parses XML from a string directly into an Element, 
which is the root element of the parsed tree.
"""

# Parse the ESearch reply; `root` is the top-level element of the document.
root = ET.fromstring(response_str)

# Take the text of the first direct child with each tag name:
#   Count    -> number of hits, kept as a string
#   WebEnv   -> history-server session identifier
#   QueryKey -> key of this query inside that session
count, webenv, querykey = (
    root.findall(tag)[0].text for tag in ("Count", "WebEnv", "QueryKey")
)

In [89]:
# Show the three values extracted from the search reply, one per line.
print(count, webenv, querykey, sep="\n")

150
MCID_66f4418127f7d7a98a067a59
1


## Getting the Title and Abstract of Each Paper

In [90]:
# EFetch endpoint of the NCBI E-utilities. NCBI documents https:// as the
# scheme for E-utilities requests, so use it directly instead of plain http.
FETCH_URL = "https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi"
# Ask EFetch for XML so the records can be parsed by tag name.
RETMODE = "xml"
# NOTE: rebinds the RETTYPE constant that the search configuration set to "xml".
RETTYPE = "abstract"

In [91]:
# Download every hit of the stored search, one record per request, and collect
# PMID, title and abstract into three parallel lists (same index = same paper).
title_list = []
abstract_list = []
pmid_list = []

# Use the hit count reported by ESearch instead of a hard-coded 150 so the loop
# stays correct when the query or the database contents change.
total_records = int(count)

for i in range(total_records):
    retstart = i
    # `retmax` is the NUMBER of records to return, not an end index. Asking for
    # retstart + 1 made request i download up to i + 1 records, of which only
    # the first was ever used.
    retmax = 1

    fetch_params = {
        "db": DB,
        "query_key": querykey,
        "WebEnv": webenv,
        "retstart": retstart,
        "retmax": retmax,
        "retmode": RETMODE,
        # The documented parameter name is lower-case `rettype`.
        "rettype": RETTYPE,
    }

    fetch_url = "{}?{}".format(FETCH_URL, urllib.parse.urlencode(fetch_params))

    # Close the connection as soon as the body has been read.
    with urllib.request.urlopen(fetch_url) as fetch_response:
        abstract_response = fetch_response.read().decode("utf-8")
    # Throttle right after the request so every code path below, including the
    # skip, still waits before the next request.
    sleep(1)

    soup = BeautifulSoup(abstract_response, features="xml")

    # The first PMID element in the reply identifies the fetched record.
    pmid_tag = soup.find("PMID")
    if pmid_tag is None:
        # Nothing came back for this offset; keep the three lists aligned by
        # skipping the position entirely.
        print(f"{i}: no record returned, skipped")
        continue
    pmid = pmid_tag.text

    title_tag = soup.find("ArticleTitle")
    title = title_tag.text if title_tag is not None else ""

    # Some records have no abstract, and structured abstracts are split into
    # several AbstractText sections. Join every section of the main <Abstract>
    # element; a missing abstract becomes an empty string instead of raising
    # IndexError and aborting the whole download.
    abstract_tag = soup.find("Abstract")
    sections = abstract_tag.find_all("AbstractText") if abstract_tag is not None else []
    abstract = " ".join(section.text for section in sections)

    pmid_list.append(pmid)
    title_list.append(title)
    abstract_list.append(abstract)

    print(f"{i}: {title}")

0: Mechanism of Fumonisin Self-Resistance: Fusarium verticillioides Contains Four Fumonisin B1-Insensitive-Ceramide Synthases.


In [92]:
# Column name -> list of values; the three lists are index-aligned per paper.
df_dict = dict(title=title_list, pmid=pmid_list, abstract=abstract_list)

In [93]:
# One row per paper, one column per key of df_dict.
abstract_df = pd.DataFrame(df_dict)

In [94]:
# Write the table to CSV without the index column. to_csv() does not create
# missing directories, so make sure ./abstracts exists first; on a fresh
# checkout the write would otherwise fail with OSError.
output_path = Path("./abstracts/abstracts_self_resistance_and_gene.csv")
output_path.parent.mkdir(parents=True, exist_ok=True)
abstract_df.to_csv(output_path, index=False)