In [76]:
from abc import ABCMeta, abstractmethod
from urllib.parse import urlencode
import requests
from datetime import datetime

class NCBI_Searcher(metaclass=ABCMeta):
    """Abstract base ("interface") for searching NCBI databases via E-utilities.

    Concrete subclasses pick the database through ``_db`` and the field tags
    used by ``meta_data`` searches through ``_fields``.
    """

    search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    meta_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    sort_order = 'Journal'
    # Seconds to wait for the server; without a timeout ``requests`` can block
    # forever on a stalled connection. Override on a subclass/instance if needed.
    request_timeout = 30
    # Explicit values accepted for ``search_type`` (``None`` means 'querytext').
    _search_types = ('querytext', 'meta_data')

    def search(self, queryterms: list=None, search_type: str=None,
               start_year: int=1900, end_year: int=None,
               max_records: int=20, start_record: int=0,
               author: str=None):
        """
        Run a search against the subclass's database.

        Prints the ``querytranslation`` and ``count`` values of the search
        response before returning.

        @param queryterms: list of lists. Terms within the same list are
            separated by an OR. Lists are separated by an AND. May be omitted
            when ``author`` is given, in which case only the author is searched.
        @param search_type: meta_data or querytext (None behaves as querytext).
            meta_data: every term is restricted to the field tags listed in
                ``self._fields``.
            querytext: terms are sent as given, without field tags.
        @param start_year: Start value of Publication Year to restrict results by.
        @param end_year: End value of Publication Year to restrict results by.
            Defaults to the current year.
        @param max_records: The number of records to fetch.
        @param start_record: Sequence number of first record to fetch.
        @param author: An author's name, appended as an ``[Author]`` clause.

        @return: a list of titles and IDs in the form [(title, id)]; an empty
            list when nothing matched.
        @raise ValueError: if neither ``queryterms`` nor ``author`` is given,
            or if ``search_type`` is not supported.
        @raise requests.HTTPError: if the server answers with an error status.
        """
        clauses = []
        if queryterms:
            clauses.append(self._search_term(queryterms, search_type=search_type))
        if author:
            clauses.append("%s[Author]" % author)
        if not clauses:
            # Previously this path crashed with a TypeError while iterating None.
            raise ValueError("search() needs queryterms and/or author")
        term = " AND ".join(clauses)

        payload = {
            "term": term,
            "retmax": max_records, "retstart": start_record,
            "mindate": start_year, "maxdate": end_year or datetime.now().year,
            "retmode": "json", "datetype": "pdat",
            "sort": self.sort_order, "db": self._db,
        }

        reply = requests.get(self.search_url, params=payload,
                             timeout=self.request_timeout)
        reply.raise_for_status()
        response = reply.json()['esearchresult']

        print ('TERMs: %s' % response['querytranslation'])
        print('QTD. resultados: %s' % response['count'])

        id_list = response['idlist']
        if not id_list:
            return []

        # The search reply carries IDs only; titles come from the summary call.
        result = self.get_article_metadata(*id_list)['result']
        return [(result[uid]['title'], uid) for uid in result['uids']]

    def get_article_metadata(self, *args, retmode: str='json'):
        """
        Fetch the metadata (summary) of one or more articles.

        @param args: article IDs. IDs are database specific: the same article
            may have different IDs in PMC and in PubMed, for example. Several
            IDs may be passed at once.
        @param retmode: response format. With None the parameter is not sent
            at all, leaving the format to the server default (XML according to
            the original note here -- confirm against the E-utilities docs).

        @return: the decoded JSON object when ``retmode`` is 'json'; otherwise
            the raw response text.
        @raise requests.HTTPError: if the server answers with an error status.
        """
        payload = {"id": ','.join(str(x) for x in args),
                   "db": self._db, "retmode": retmode}

        # ``params=`` drops keys whose value is None, so retmode=None is simply
        # omitted instead of being sent as the literal string "None".
        reply = requests.get(self.meta_url, params=payload,
                             timeout=self.request_timeout)
        reply.raise_for_status()

        if retmode == 'json':
            return reply.json()
        # Non-JSON formats cannot go through .json(); hand back the raw text.
        return reply.text

    def _search_term(self, queryterms: list, search_type: str=None):
        """Build the complete boolean search term sent to the API.

        @param queryterms: list of lists of terms (OR inside a list, AND
            between lists).
        @param search_type: None/'querytext' to use the terms as given, or
            'meta_data' to expand every term over ``self._fields``.

        @return: the parenthesised term, e.g. ``((a OR b) AND (c))``.
        @raise ValueError: if ``search_type`` is not supported.
        """
        if search_type is not None and search_type not in self._search_types:
            raise ValueError(
                'Tipo de pesquisa não faz sentido: %s\nTipos suportados: %s'
                % (search_type, ', '.join(self._search_types)))

        if search_type == 'meta_data':
            # Restrict every single term to the configured search fields.
            queryterms = [[self._embutir_fields(term) for term in group]
                          for group in queryterms]

        return "(%s)" % " AND ".join(
            "(%s)" % " OR ".join(group) for group in queryterms)

    def _embutir_fields(self, term: str):
        """Expand a term into an OR over the configured field tags.

        This limits the search to a few fields instead of all of them.

        Example: with self._fields == ['title', 'abstract'], the call
        `self._embutir_fields("machine learning")`
        transforms:
            machine learning ---> (machine learning[title] OR machine learning[abstract])
        """
        return "(%s)" % " OR ".join(
            "%s[%s]" % (term, field) for field in self._fields)

    @property
    @abstractmethod
    def _fields(self):
        """Field tags used for ``meta_data`` searches; defined by each subclass.

        Must return a list of fields.
        Example:
        return ['title', 'abstract']
        """
        pass

    @property
    @abstractmethod
    def _db(self):
        """Database name to query; defined by each subclass.

        Example:
        return 'pmc'
        """
        pass
    
    
class PMC_Searcher(NCBI_Searcher):
    """Searcher bound to the PMC database."""

    @property
    def _db(self):
        """Name of the database this searcher queries."""
        return 'pmc'

    @property
    def _fields(self):
        """Field tags each term is expanded over in ``meta_data`` searches."""
        text_fields = ['Abstract', 'Body - Key Terms']
        mesh_fields = ['MeSH Terms', 'MeSH Major Topic']
        # Order is kept as-is because it determines the order of the OR clauses.
        return text_fields + mesh_fields + ['Methods - Key Terms']

In [77]:
# Terms inside one list are OR-ed together; the two lists are AND-ed.
technology_queryterms = [
    'machine learning',
    'deep learning',
    'artificial intelligence',
    'neural network',
    'scoring system',
]

health_queryterms = [
    'coronary artery disease',
    'chest pain',
    'heart disease',
    'MACE',
    'Acute Cardiac Complications',
]

# Reduced term sets, handy for a quick trial run:
# technology_queryterms = ['machine learning', 'deep learning']
# health_queryterms = ['coronary artery disease', 'chest pain']

queryterms = [technology_queryterms, health_queryterms]

r = PMC_Searcher().search(
    queryterms=queryterms,
    author='nan liu',
    search_type="meta_data",
)
# Show titles cut to 30 characters next to their IDs.
[(title[:30], uid) for title, uid in r]

TERMs: (((machine learning[Abstract] OR machine learning[Body - Key Terms] OR "machine learning"[MeSH Terms] OR "machine learning"[MeSH Major Topic] OR machine learning[Methods - Key Terms]) OR (deep learning[Abstract] OR deep learning[Body - Key Terms] OR deep learning[Methods - Key Terms]) OR (artificial intelligence[Abstract] OR artificial intelligence[Body - Key Terms] OR "artificial intelligence"[MeSH Terms] OR "artificial intelligence"[MeSH Major Topic] OR artificial intelligence[Methods - Key Terms]) OR (neural network[Abstract] OR neural network[Body - Key Terms] OR "neural networks (computer)"[MeSH Terms] OR "neural networks (computer)"[MeSH Major Topic] OR neural network[Methods - Key Terms]) OR (scoring system[Abstract] OR scoring system[Body - Key Terms] OR scoring system[Methods - Key Terms])) AND ((coronary artery disease[Abstract] OR coronary artery disease[Body - Key Terms] OR "coronary artery disease"[MeSH Terms] OR "coronary artery disease"[MeSH Major Topic] OR corona

[('Prediction of adverse cardiac ', '4150554'),
 ('Prediction of cardiac arrest i', '3580666')]