In [None]:
from abc import ABCMeta, abstractmethod
from urllib.parse import urlencode
import requests
from datetime import datetime
from bs4 import BeautifulSoup as bsoup
from functools import reduce

In [37]:
class NCBI_Searcher(metaclass=ABCMeta):
    """Abstract base class ('interface') for querying NCBI databases.

    It builds the search term, calls the E-utilities ``esearch`` endpoint and
    hands the resulting ID list to ``_get_article_metadata``, which each
    subclass implements for its own database.
    """

    search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    meta_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'

    # Search types accepted by `_search_term` (None behaves like 'querytext').
    _search_types = ('querytext', 'meta_data')
    # Seconds to wait for the search endpoint before giving up.
    _request_timeout = 30

    def search(self, queryterms: list = None, search_type: str = None,
               start_year: int = 1900, end_year: int = None,
               max_records: int = 20, start_record: int = 0,
               author: str = None, journal: str = None):
        """
        Run a search against the subclass' NCBI database.
        @param queryterms: list of lists. Terms within the same list are
            separated by an OR. Lists are separated by an AND. May be
            omitted when `author` and/or `journal` is given.
        @param search_type: meta_data or querytext (default).
            meta_data: every term is restricted to the fields listed in
                `self._fields`.
            querytext: free-text search, terms are sent as they are.
        @param start_year: Start value of Publication Year to restrict results by.
        @param end_year: End value of Publication Year to restrict results by.
            Defaults to the current year.
        @param max_records: The number of records to fetch.
        @param start_record: Sequence number of first record to fetch.
        @param author: An author's name, or a list of author names (OR-ed).
        @param journal: A journal name, or a list of journal names (OR-ed).
        @return: whatever `_get_article_metadata` returns for the matching
            IDs (a list of dictionaries), or an empty list when nothing matches.
        @raise ValueError: if no query term, author or journal was given, or
            if `search_type` is not supported.
        """

        # Every restriction is one parenthesised clause; clauses are AND-ed.
        clauses = []
        if queryterms:
            clauses.append(self._search_term(queryterms, search_type=search_type))

        if author:
            names = [author] if isinstance(author, str) else author
            clauses.append("(%s)" % " OR ".join('%s[Author]' % a for a in names))

        if journal:
            titles = [journal] if isinstance(journal, str) else journal
            clauses.append("(%s)" % " OR ".join('"%s"[Journal]' % j for j in titles))

        if not clauses:
            # The original code failed here with an opaque TypeError.
            raise ValueError('Informe ao menos um termo de pesquisa, autor ou revista.')

        term = " AND ".join(clauses)

        fixed_payload = {"retmode": "json", "datetype": "pdat",
                         "db": self._db, "sort": self._sort_order}
        payload = {"term": term,
                   "retmax": max_records, "retstart": start_record,
                   "mindate": start_year or '', "maxdate": end_year or datetime.now().year}
        payload.update(fixed_payload)

        url = "%s?%s" % (self.search_url, urlencode(payload))

        print("URL SEARCH: %s" % url)

        print("Você pode realizar essa mesma busca no navegador com o termo de busca:\n%s" % term)

        # Fail loudly on HTTP errors instead of on a confusing KeyError below,
        # and never hang forever on an unresponsive server.
        http_response = requests.get(url, timeout=self._request_timeout)
        http_response.raise_for_status()
        response = http_response.json()['esearchresult']

        print('QTD. resultados: %s' % response['count'])

        id_list = response['idlist']

        if id_list:
            return self._get_article_metadata(*id_list)
        return []

    def _search_term(self, queryterms: list, search_type: str = None):
        """Build the complete boolean search term sent to the API.

        @param queryterms: list of lists of terms (inner lists are OR-ed,
            outer list is AND-ed).
        @param search_type: None/'querytext' leaves the terms untouched;
            'meta_data' expands each term over `self._fields`.
        @return: the parenthesised search expression.
        @raise ValueError: if `search_type` is not supported.
        """

        if search_type is not None and search_type not in self._search_types:
            raise ValueError('Tipo de pesquisa não faz sentido: %s\nTipos suportados: %s'
                             % (search_type, ', '.join(self._search_types)))

        if search_type == 'meta_data':
            # Restrict every single term to the configured search fields.
            queryterms = [[self._embutir_fields(term) for term in group]
                          for group in queryterms]

        return "(%s)" % " AND ".join("(%s)" % " OR ".join(group) for group in queryterms)

    def _embutir_fields(self, term: str):
        """Expand a term so that it is searched only in `self._fields`.

        Example: with self._fields = ['title', 'abstract'], the call
        `self._embutir_fields("machine learning")`
        turns:
            machine learning ---> (machine learning[title] OR machine learning[abstract])
        """

        return "(%s)" % " OR ".join("%s[%s]" % (term, field) for field in self._fields)

    @staticmethod
    def deepgetter(obj, attrs, default=None):
        """Follow a dotted attribute path, returning `default` on the first miss.

        Example:
        deepgetter(Cidade, 'regiao.pais') is equivalent to Cidade.regiao.pais

        The walk stops at the first missing attribute, so the remaining names
        are never looked up on `default` itself.
        """
        missing = object()
        current = obj
        for name in attrs.split('.'):
            current = getattr(current, name, missing)
            if current is missing:
                return default
        return current

    @abstractmethod
    def _get_article_metadata(self, *args):
        """Subclasses fetch the records for the given IDs (passed as
        positional arguments) and return a list of dictionaries in the
        format of the Documento model."""
        pass

    @property
    @abstractmethod
    def _fields(self):
        """Subclasses define the fields each term is searched in.
        Must return a list of field names.
        Example:
        return ['title', 'abstract']
        """
        pass

    @property
    @abstractmethod
    def _db(self):
        """Subclasses define their database.
        Example:
        return 'pmc'
        """
        pass

    @property
    @abstractmethod
    def _sort_order(self):
        """Subclasses define the value sent as the `sort` parameter.
        Example:
        return 'Journal'
        """
        pass

    @property
    @abstractmethod
    def _article_url(self):
        """Subclasses define the URL prefix of an article page."""
        pass


class PMC_Searcher(NCBI_Searcher):
    """Runs searches against the PMC database."""

    @property
    def _fields(self):
        return ['Abstract', 'Body - Key Terms', 'MeSH Terms',
                'MeSH Major Topic', 'Methods - Key Terms']

    @property
    def _db(self):
        return 'pmc'

    @property
    def _sort_order(self):
        return 'relevance'

    @property
    def _article_url(self):
        return 'https://www.ncbi.nlm.nih.gov/pmc/articles/'

    @staticmethod
    def _get_unique_id(p_art):
        """Return a unique ID for one <article> element.

        The DOI is preferred. When the XML has no DOI, the first
        <article-id> is used, prefixed by its `pub-id-type`.
        Raises IndexError when the article has no <article-id> at all.
        """

        doi = p_art.find("article-id", {"pub-id-type": "doi"})
        if doi is not None:
            return doi.text

        first_id = p_art.find_all("article-id")[0]
        return "%s%s" % (first_id['pub-id-type'], first_id.text)

    @staticmethod
    def _get_pub_date(p_art):
        """Return the publication date of one <article> as a `date`, or None.

        The electronic publication date is preferred, then the print one,
        then whatever <pub-date> comes first. A missing month is read as
        January; the day is always the first of the month.
        """

        pub_date = None
        for pub_type in ("epub", "ppub"):
            pub_date = p_art.find("pub-date", {"pub-type": pub_type})
            if pub_date is not None:
                break
        if pub_date is None:
            pub_date = p_art.find("pub-date")
        if pub_date is None:
            return None

        year = pub_date.find("year")
        if year is None:
            return None
        month = getattr(pub_date.find("month"), "text", None) or "1"

        data_pub_string = "%s %s" % (year.text, month)
        # The month may be numeric ("03") or abbreviated ("Mar").
        for date_format in ("%Y %m", "%Y %b"):
            try:
                return datetime.strptime(data_pub_string, date_format).date()
            except ValueError:
                continue
        return None

    def _get_article_metadata(self, *args):
        """Fetch the given PMC IDs and return one dictionary per article.

        @param args: the article IDs returned by the search.
        @return: list of dictionaries with the keys 'resumo', 'html_url',
            'autores', 'doi', 'palavras_chaves', 'titulo' and, when a
            publication date could be parsed, 'data'.
        """
        id_list = ','.join([str(x) for x in args])

        payload = {"id": id_list, "db": self._db, "retmode": "xml"}
        url = "%s?%s" % (self.fetch_url, urlencode(payload))
        print("URL META: %s" % url)

        http_response = requests.get(url, timeout=30)
        http_response.raise_for_status()
        soup = bsoup(http_response.content, "xml")

        documentos = []

        for p_art in soup.find_all('article'):
            authors = []
            for author in p_art.find_all("contrib", {"contrib-type": "author"}):
                try:
                    authors.append("%s %s" % (author.find("given-names").text,
                                              author.find("surname").text))
                except AttributeError:
                    # No given-names/surname pair (e.g. a group author).
                    authors.append(author.text)

            keywords = [k.text for k in p_art.find_all("kwd")]

            # Look the ID up inside THIS article; searching the whole
            # response would give every document the first article's URL.
            pmc_id = getattr(p_art.find("article-id", {"pub-id-type": 'pmc'}), 'text', '')

            documento = {}
            documento['resumo'] = getattr(p_art.find("abstract"), 'text', ' - ')
            documento['html_url'] = "%s%s" % (self._article_url, pmc_id)
            documento['autores'] = ",".join(authors)
            documento['doi'] = self._get_unique_id(p_art)
            documento['palavras_chaves'] = ",".join(keywords)
            documento['titulo'] = getattr(p_art.find("article-title"), 'text', '')

            data = self._get_pub_date(p_art)
            if data is not None:
                documento['data'] = data

            documentos.append(documento)

        return documentos


class PubMed_Searcher(NCBI_Searcher):
    """Runs searches against the PubMed database."""

    @property
    def _fields(self):
        return ['Text Words']

    @property
    def _db(self):
        return 'pubmed'

    @property
    def _sort_order(self):
        return ''

    @property
    def _article_url(self):
        return "https://www.ncbi.nlm.nih.gov/pubmed/"

    @staticmethod
    def _get_data(p_art):
        """Find the publication date of one <PubmedArticle>.

        Returns a `datetime.date` (first day of the month) when a year and,
        optionally, a month can be parsed. A missing month is read as
        January. Otherwise returns the raw <PubDate> text as a string,
        which is expected to carry some date information, or an empty
        string when there is no <PubDate> at all.
        """

        pub_date = p_art.find("PubDate")
        if pub_date is None:
            return ''

        year = pub_date.find("Year")
        medline_date = pub_date.find("MedlineDate")
        if year is not None:
            # This is a staticmethod, so the helper is reached through the
            # class rather than through a (non-existent) `self`.
            month = PubMed_Searcher.deepgetter(pub_date, 'Month.text', default='Jan')
            data_pub_string = "%s %s" % (year.text, month)
        elif medline_date is not None:
            # Free-form date such as "1998 Dec-1999 Jan": keep "YYYY Mon".
            data_pub_string = medline_date.text[:8]
        else:
            return str(pub_date.text)

        # The month may be abbreviated ("Mar") or numeric ("03").
        for date_format in ("%Y %b", "%Y %m"):
            try:
                return datetime.strptime(data_pub_string, date_format).date()
            except ValueError:
                continue

        return str(pub_date.text)

    @staticmethod
    def _get_author_name(author):
        """Return a display name for one <Author> element.

        Uses "ForeName LastName" when available, then <CollectiveName>,
        then the element's whole text.
        """

        parts = [getattr(author.find(tag), 'text', '') for tag in ("ForeName", "LastName")]
        name = " ".join(part for part in parts if part)
        return name or getattr(author.find("CollectiveName"), 'text', '') or author.text.strip()

    @staticmethod
    def _get_unique_id(p_art):
        """Return a unique ID for one <PubmedArticle>.

        The DOI is preferred. When the XML has no DOI, the first
        <ArticleId> is used, prefixed by its `IdType`.
        Raises IndexError when the article has no <ArticleId> at all.
        """

        doi = p_art.find("ArticleId", {"IdType": "doi"})
        if doi is not None:
            return doi.text

        first_id = p_art.find_all("ArticleId")[0]
        return "%s%s" % (first_id['IdType'], first_id.text)

    def _get_article_metadata(self, *args):
        """Fetch the given PubMed IDs and return one dictionary per article.

        @param args: the article IDs returned by the search.
        @return: list of dictionaries with the keys 'resumo', 'html_url',
            'autores', 'doi', 'palavras_chaves', 'titulo' and, when the
            publication date could be parsed, 'data'. When it could not,
            the raw date text is prepended to 'resumo' instead.
        """
        id_list = ','.join([str(x) for x in args])

        payload = {"id": id_list, "db": self._db, "retmode": "xml"}
        url = "%s?%s" % (self.fetch_url, urlencode(payload))

        print("URL META: %s" % url)

        http_response = requests.get(url, timeout=30)
        http_response.raise_for_status()
        soup = bsoup(http_response.content, "xml")

        documentos = []

        for p_art in soup.find_all('PubmedArticle'):
            authors = [self._get_author_name(a) for a in p_art.find_all("Author")]
            keywords = [k.text for k in p_art.find_all("Keyword")]

            documento = {}
            documento['resumo'] = getattr(p_art.find("AbstractText"), 'text', ' - ')
            documento['html_url'] = "%s%s" % (self._article_url, p_art.PMID.text)
            documento['autores'] = ",".join(authors)
            documento['doi'] = self._get_unique_id(p_art)
            documento['palavras_chaves'] = ",".join(keywords)
            documento['titulo'] = p_art.ArticleTitle.text

            data = self._get_data(p_art)
            if isinstance(data, str):
                # Unparsed date: keep the information at the top of the abstract.
                if data:
                    documento['resumo'] = "%s\n%s" % (data, documento['resumo'])
            else:
                documento['data'] = data

            documentos.append(documento)

        return documentos

In [2]:
# Search terms related to technology.
technology_queryterms = [
    'machine learning',
    'deep learning',
    'artificial intelligence',
    'neural network',
    'scoring system',
]

# Search terms related to the health domain.
health_queryterms = [
    'coronary artery disease',
    'chest pain',
    'heart disease',
    'MACE',
    'Acute Cardiac Complications',
]

# One inner list per group of terms: technology first, then health.
queryterms = [
    technology_queryterms,
    health_queryterms,
]

# Journals used to narrow the search.
journal = [
    "BioMedical Engineering OnLine",
    "Biomedical Engineering",
]

# r = PMC_Searcher().search(queryterms=queryterms, start_year=2008, max_records=5, journal=journal)
# [a['titulo'][:50] for a in r]



In [None]:
# Two single-term groups passed to the PubMed search.
queryterms = [
    ['artificial intelligence'],
    ['cardiovascular death'],
]

# Fetch at most five PubMed records for the query above.
r = PubMed_Searcher().search(max_records=5, queryterms=queryterms)
# Show the first 50 characters of every title as the cell's output.
[documento['titulo'][:50] for documento in r]

In [38]:
# Two single-term groups passed to the PMC search.
queryterms = [
    ['rup'],
    ['cardiovascular disease'],
]

# Fetch at most five PMC records for the query above.
r = PMC_Searcher().search(max_records=5, queryterms=queryterms)
# Show the first 50 characters of every title as the cell's output.
[documento['titulo'][:50] for documento in r]

URL SEARCH: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?term=%28%28rup%29+AND+%28cardiovascular+disease%29%29&retmax=5&retstart=0&mindate=1900&maxdate=2018&retmode=json&datetype=pdat&db=pmc&sort=relevance
Você pode realizar essa mesma busca no navegador com o termo de busca:
((rup) AND (cardiovascular disease))
QTD. resultados: 110
URL META: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id=5095330%2C5388187%2C1888177%2C4212304%2C3264995&db=pmc&retmode=xml
> [0;32m<ipython-input-37-64b6fb5ce4d9>[0m(206)[0;36m_get_article_metadata[0;34m()[0m
[0;32m    205 [0;31m            [0;32mimport[0m [0mipdb[0m[0;34m;[0m [0mipdb[0m[0;34m.[0m[0mset_trace[0m[0;34m([0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m--> 206 [0;31m            [0mdocumento[0m[0;34m[[0m[0;34m'doi'[0m[0;34m][0m [0;34m=[0m [0mself[0m[0;34m.[0m[0m_get_unique_id[0m[0;34m([0m[0mp_art[0m[0;34m)[0m[0;34m[0m[0m
[0m[0;32m    207 [0;31m            [0mdocumento[0m

['Tongxinluo modulates cytokine secretion by cardiac',
 'Increased Vascular Permeability Measured With an A',
 'Myocardial diseases of animals.',
 'UEG Week 2014 Oral Presentations',
 'Circulating levels of vascular endothelial markers']