In [17]:
from abc import ABCMeta, abstractmethod
from urllib.parse import urlencode
import requests
from datetime import datetime
from bs4 import BeautifulSoup as bsoup

class NCBI_Searcher(metaclass=ABCMeta):
    """Abstract base ('interface') for querying NCBI databases via E-utilities.

    Concrete subclasses provide the database name (``_db``), the sort order
    (``_sort_order``), the fields used by ``meta_data`` searches (``_fields``)
    and the base URL of an article page (``_article_url``).
    """

    search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    meta_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

    # Seconds to wait for a response, so a stalled connection cannot block
    # the caller indefinitely.
    request_timeout = 30

    def search(self, queryterms: list = None, search_type: str = None,
               start_year: int = 1900, end_year: int = None,
               max_records: int = 20, start_record: int = 0,
               author: str = None):
        """
        Run a search against the subclass's NCBI database.

        @param queryterms: list of lists. Terms within the same list are
            separated by an OR. Lists are separated by an AND.
        @param search_type: 'meta_data' or 'querytext' (None behaves like
            'querytext').
            meta_data: every term is restricted to the fields listed in
                ``self._fields``.
            querytext: terms are sent without any field restriction.
        @param start_year: Start value of Publication Year to restrict results by.
        @param end_year: End value of Publication Year to restrict results by.
            Defaults to the current year.
        @param max_records: The number of records to fetch.
        @param start_record: Sequence number of first record to fetch.
        @param author: An author's name, appended as an ``[Author]`` clause.

        @return: a list of titles and IDs in the form [(title, id)]
        @raise ValueError: if neither query terms nor an author are given, or
            if ``search_type`` is not supported.
        """
        # Build the term from whichever parts were supplied, so an author-only
        # search works and the default queryterms=None no longer crashes.
        clauses = []
        if queryterms:
            clauses.append(self._search_term(queryterms, search_type=search_type))
        if author:
            clauses.append("%s[Author]" % author)
        if not clauses:
            raise ValueError("search() needs at least one query term or an author")

        payload = {
            "term": " AND ".join(clauses),
            "retmax": max_records, "retstart": start_record,
            "mindate": start_year, "maxdate": end_year or datetime.now().year,
            "retmode": "json", "datetype": "pdat",
            "db": self._db, "sort": self._sort_order,
        }

        response = self._get_json(self.search_url, payload)['esearchresult']

        print('QTD. resultados: %s' % response['count'])

        id_list = response['idlist']
        if not id_list:
            return []

        # The search response only carries IDs; titles come from the summary call.
        result = self.get_article_metadata(*id_list)['result']
        return [(result[uid]['title'], uid) for uid in result['uids']]

    def get_article_metadata(self, *args):
        """
        Return the metadata of one or more articles.

        @param args: article IDs. An article's ID depends on the database:
            the same article can have different IDs in PMC and in PubMed,
            for example. Several IDs may be passed at once.

        @return: the raw JSON returned by the API, decoded into Python objects.
        """
        id_list = ','.join(str(x) for x in args)
        payload = {"id": id_list, "db": self._db, "retmode": "json"}
        return self._get_json(self.meta_url, payload)

    def _get_json(self, url, payload):
        """GET ``url`` with ``payload`` as its query string and decode the JSON body.

        Raises the ``requests`` HTTP error for a 4xx/5xx response instead of
        trying to decode an error page as JSON.
        """
        response = requests.get("%s?%s" % (url, urlencode(payload)),
                                timeout=self.request_timeout)
        response.raise_for_status()
        return response.json()

    def _search_term(self, queryterms: list, search_type: str = None):
        """Build the complete search term to send to the API.

        @param queryterms: list of lists of terms (OR inside a list, AND
            between lists).
        @param search_type: 'querytext' / None for plain terms, 'meta_data'
            to restrict every term to ``self._fields``.
        @return: the parenthesised boolean expression as a string.
        @raise ValueError: if ``search_type`` is not one of the supported values.
        """
        if search_type in ('querytext', None):
            # Plain search: the terms are used exactly as given.
            groups = queryterms
        elif search_type == 'meta_data':
            # Field-restricted search: expand every term into its per-field form.
            groups = [[self._embutir_fields(term) for term in group]
                      for group in queryterms]
        else:
            raise ValueError(
                'Tipo de pesquisa não faz sentido: %s\n'
                'Tipos suportados: querytext, meta_data' % search_type)

        return "(%s)" % " AND ".join("(%s)" % " OR ".join(group) for group in groups)

    def _embutir_fields(self, term: str):
        """Expand a term into an OR over every field in ``self._fields``.

        This restricts the search to a few fields instead of all of them.

        Example: with self._fields = ['title', 'abstract'], the call
        `self._embutir_fields("machine learning")` turns
            machine learning ---> (machine learning[title] OR machine learning[abstract])
        """
        return "(%s)" % " OR ".join("%s[%s]" % (term, field) for field in self._fields)

    @property
    @abstractmethod
    def _fields(self):
        """Each subclass must define the search fields applied to each term.
        The return value must be a list of fields.
        Example:
        return ['title', 'abstract']
        """
        pass

    @property
    @abstractmethod
    def _db(self):
        """Each subclass must define its database.
        Example:
        return 'pmc'
        """
        pass

    @property
    @abstractmethod
    def _sort_order(self):
        """Each subclass must define the value sent as the ``sort`` parameter.
        Example:
        return 'Journal'
        """
        pass

    @property
    @abstractmethod
    def _article_url(self):
        """Each subclass must define the base URL of an article's page."""
        pass
    
class PMC_Searcher(NCBI_Searcher):
    """Searcher bound to the PMC database."""

    @property
    def _fields(self):
        """Fields each term is restricted to in a field-restricted search."""
        field_names = [
            'Abstract',
            'Body - Key Terms',
            'MeSH Terms',
            'MeSH Major Topic',
            'Methods - Key Terms',
        ]
        return field_names

    @property
    def _db(self):
        """Database name sent as the ``db`` request parameter."""
        database = 'pmc'
        return database

    @property
    def _sort_order(self):
        """Value sent as the ``sort`` request parameter."""
        ordering = 'relevance'
        return ordering

    @property
    def _article_url(self):
        """Base URL of a PMC article page."""
        base_url = 'https://www.ncbi.nlm.nih.gov/pmc/articles/'
        return base_url
    
    
class PubMed_Searcher(NCBI_Searcher):
    """Searches the PubMed database."""

    # Lower-cased English month abbreviations in calendar order. Used instead
    # of strptime("%b"), whose result depends on the process locale.
    _MONTH_ABBREVIATIONS = ('jan', 'feb', 'mar', 'apr', 'may', 'jun',
                            'jul', 'aug', 'sep', 'oct', 'nov', 'dec')

    # Seconds to wait for the fetch response before giving up.
    _FETCH_TIMEOUT = 30

    @property
    def _fields(self):
        return ['Text Words']

    @property
    def _db(self):
        return 'pubmed'

    @property
    def _sort_order(self):
        return ''

    @property
    def _article_url(self):
        return "https://www.ncbi.nlm.nih.gov/pubmed/"

    def get(self, *args):
        """
        Fetch full PubMed records and flatten each into a dict.

        @param args: PubMed IDs. Several IDs may be passed at once.

        @return: a list with one dict per ``PubmedArticle`` element, with keys
            'resumo' (first abstract text), 'resumo_url', 'autores'
            (comma-separated), 'doi', 'palavras_chaves' (comma-separated),
            'data_publicacao' (datetime on the first day of the month, or
            None when no year can be read), 'titulo_publicacao' and 'titulo'.
            Text fields missing from a record come back as ''.
        """
        if not args:
            # Nothing to look up; skip the request.
            return []

        id_list = ','.join(str(x) for x in args)
        payload = {"id": id_list, "db": self._db, "retmode": "xml"}
        url = "%s?%s" % (self.fetch_url, urlencode(payload))

        response = requests.get(url, timeout=self._FETCH_TIMEOUT)
        response.raise_for_status()
        soup = bsoup(response.content, "xml")

        return [self._parse_article(p_art)
                for p_art in soup.find_all('PubmedArticle')]

    def _parse_article(self, p_art):
        """Convert one ``PubmedArticle`` element into the dict returned by ``get``."""
        authors = []
        for author in p_art.find_all("Author"):
            fore_name = self._tag_text(author.find("ForeName"))
            last_name = self._tag_text(author.find("LastName"))
            # Not every author entry has both name parts; fall back to the
            # collective (group) name when neither is present.
            full_name = (" ".join(part for part in (fore_name, last_name) if part)
                         or self._tag_text(author.find("CollectiveName")))
            if full_name:
                authors.append(full_name)

        keywords = [k.text for k in p_art.find_all("Keyword")]

        # NOTE(review): only the first AbstractText element is kept, as before;
        # abstracts split into several sections would lose the later ones - confirm
        # whether they should be concatenated.
        documento = {}
        documento['resumo'] = self._tag_text(p_art.find("AbstractText"))
        documento['resumo_url'] = "%s%s" % (self._article_url,
                                            self._tag_text(p_art.find("PMID")))
        documento['autores'] = ",".join(authors)
        documento['doi'] = self._tag_text(p_art.find("ArticleId", {"IdType": "doi"}))
        documento['palavras_chaves'] = ",".join(keywords)
        documento['data_publicacao'] = self._parse_pub_date(p_art.find("PubDate"))
        documento['titulo_publicacao'] = self._tag_text(p_art.find("Title"))
        documento['titulo'] = self._tag_text(p_art.find("ArticleTitle"))
        return documento

    @staticmethod
    def _tag_text(tag, default=''):
        """Return ``tag.text``, or ``default`` when the element was not found."""
        return tag.text if tag is not None else default

    @classmethod
    def _parse_pub_date(cls, pub_date):
        """Read a ``PubDate`` element as a datetime on the first day of its month.

        Handles Year plus Month (abbreviated name or number), Year alone
        (month defaults to January) and a free-text ``MedlineDate`` whose first
        token starts with the year. Returns None when no year can be read.
        """
        if pub_date is None:
            return None

        year_text = cls._tag_text(pub_date.find("Year")).strip()
        month_text = cls._tag_text(pub_date.find("Month")).strip()

        if not year_text:
            # e.g. "1998 Dec-1999 Jan" or "2000 Spring": take the leading year
            # and, when present, the token that follows it as the month.
            tokens = cls._tag_text(pub_date.find("MedlineDate")).split()
            if tokens:
                year_text = tokens[0][:4]
                month_text = tokens[1] if len(tokens) > 1 else ''

        if not year_text.isdigit():
            return None

        if month_text.isdigit():
            month = int(month_text)
        else:
            abbreviation = month_text[:3].lower()
            if abbreviation in cls._MONTH_ABBREVIATIONS:
                month = cls._MONTH_ABBREVIATIONS.index(abbreviation) + 1
            else:
                month = 1
        if not 1 <= month <= 12:
            month = 1

        try:
            return datetime(int(year_text), month, 1)
        except ValueError:
            # Year outside the range datetime supports.
            return None

In [6]:
# Terms inside one group are OR-ed together; the two groups are AND-ed.
technology_queryterms = [
    'machine learning',
    'deep learning',
    'artificial intelligence',
    'neural network',
    'scoring system',
]

health_queryterms = [
    'coronary artery disease',
    'chest pain',
    'heart disease',
    'MACE',
    'Acute Cardiac Complications',
]

# Smaller term sets kept around for quicker experiments:
# technology_queryterms = [
#     'machine learning', 'deep learning'
# ]

# health_queryterms = [
#     'coronary artery disease', 'chest pain'
# ]

queryterms = [technology_queryterms, health_queryterms]

# Search PMC from 2008 onwards and show the first ten hits,
# with each title cut to 80 characters.
r = PMC_Searcher().search(start_year=2008, queryterms=queryterms)
[(title[:80], uid) for title, uid in r[:10]]

QTD. resultados: 72277


[('ESICM LIVES 2016: part one: Mi', '5042924'),
 ('36th International Symposium o', '5493079'),
 ('Abstracts from the 36th Annual', '3654146'),
 ('Abstracts from the 38th Annual', '4405523'),
 ('Abstracts from the 37th Annual', '4429500'),
 ('25th Annual Computational Neur', '5001212'),
 ('ESICM LIVES 2016: part three: ', '5042925'),
 ('Heart Disease and Stroke Stati', '5408160'),
 ('ESICM LIVES 2016: part two: Mi', '5042923'),
 ('Korean Guidelines for the Appr', '4347263')]

In [8]:
# Same query against PubMed; show the first ten hits with each title cut to 80 characters.
r = PubMed_Searcher().search(start_year=2008, queryterms=queryterms)
[(title[:80], uid) for title, uid in r[:10]]

QTD. resultados: 2675


[('Cytokine Responses to Rhinovirus and Development of Asthma, Allergic Sensitizati',
  '29466680'),
 ('Comparison of fast multi-slice and standard segmented techniques for detection o',
  '29458430'),
 ('Do patients with diabetes type 2 or chronic heart failure understand a medicatio',
  '29441963'),
 ('Development and preliminary testing of the Brief Developmental Assessment: an ea',
  '29433600'),
 ('Classification of the clinical images for benign and malignant cutaneous tumors ',
  '29428356'),
 ('Controlling the Risk Domain in Pediatric Asthma through Personalized Care.',
  '29427984'),
 ('Scoring system to guide decision making for the use of bilateral internal mammar',
  '29413876'),
 ('Big Data Analytics, the Microbiome, Host-omic and Bug-omic Data and Risk for Car',
  '29413172'),
 ('Automatic Calcium Scoring in Low-Dose Chest CT Using Deep Neural Networks With D',
  '29408789'),
 ('Predicting cardiogenic pulmonary edema in heart failure patients by using an N-t',
  '29408167

In [9]:
# Fetch two PMC records as XML through efetch and keep every <article>
# element found in the response.
xml = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pmc&id=5042924,5493079&retmode=xml")
soup = bsoup(xml.content, "xml").findAll('article')

In [12]:
# Fetch PubMed record 29413172 as XML through efetch; the raw response is kept in `pubmed`.
pubmed = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=pubmed&id=29413172&retmode=xml")

In [13]:
# Parse the fetched PubMed XML and collect its <PubmedArticle> elements.
# This rebinds `soup`, which the PMC cell above also assigns.
soup = bsoup(pubmed.content, "xml").findAll('PubmedArticle')

In [18]:
# Try PubMed_Searcher.get with two PubMed IDs; the notebook displays the returned list.
PubMed_Searcher().get(29413172,29408789)

[{'autores': 'Chayakrit Krittanawong',
  'data_publicacao': datetime.datetime(2018, 3, 1, 0, 0),
  'doi': '10.1016/j.hlc.2017.07.012',
  'palavras_chaves': 'Big Data Analytics,Big data,Bug-omic,Host-omic,Human microbiome,Machine Learning',
  'resumo': '',
  'resumo_url': 'https://www.ncbi.nlm.nih.gov/pubmed/29413172',
  'titulo': 'Big Data Analytics, the Microbiome, Host-omic and Bug-omic Data and Risk for Cardiovascular Disease.',
  'titulo_publicacao': 'Heart, lung & circulation'},
 {'autores': 'Nikolas Lessmann,Bram van Ginneken,Majd Zreik,Pim A de Jong,Bob D de Vos,Max A Viergever,Ivana Isgum',
  'data_publicacao': datetime.datetime(2018, 2, 1, 0, 0),
  'doi': '10.1109/TMI.2017.2769839',
  'palavras_chaves': '',
  'resumo': 'Heavy smokers undergoing screening with low-dose chest CT are affected by cardiovascular disease as much as by lung cancer. Low-dose chest CT scans acquired in screening enable quantification of atherosclerotic calcifications and thus enable identification of s

In [37]:
# Inspect the ForeName element of the first Author in the first parsed article.
# NOTE(review): `soup` is whatever the notebook last bound it to - cells were run out of order.
soup[0].findAll("Author")[0].ForeName

<ForeName>Kevin</ForeName>

In [14]:
# List the text of every Keyword element in the first parsed article.
[k.text for k in soup[0].findAll("Keyword")]

['Big Data Analytics',
 'Big data',
 'Bug-omic',
 'Host-omic',
 'Human microbiome',
 'Machine Learning']

In [53]:
# Show the text of the first ArticleTitle element in the first parsed article.
soup[0].ArticleTitle.text

'Automatic prediction of coronary artery disease from clinical narratives.'