In [94]:
from abc import ABCMeta, abstractmethod
from urllib.parse import urlencode
import requests
from datetime import datetime

class NCBI_Searcher(metaclass=ABCMeta):
    """Abstract base ("interface") for searching NCBI databases through the E-utilities API.

    Concrete subclasses pick the target database by implementing the
    ``_db``, ``_fields`` and ``_sort_order`` properties. This class builds the
    search expression, calls ``esearch`` to get the matching IDs and then
    ``esummary`` to turn those IDs into article metadata.
    """

    search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    meta_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'

    # Seconds to wait on each HTTP call; without it a stalled connection
    # would block the caller (and the notebook kernel) indefinitely.
    request_timeout = 30

    # Explicit values accepted for ``search_type`` (None behaves like 'querytext').
    _search_types = ('querytext', 'meta_data')

    def search(self, queryterms: list=None, search_type: str=None,
               start_year: int=1900, end_year: int=None,
               max_records: int=20, start_record: int=0,
               author: str=None):
        """
        Run a search against the database selected by ``self._db``.

        @param queryterms: list of lists. Terms within the same list are
            separated by an OR. Lists are separated by an AND.
            May be omitted when ``author`` is given.
        @param search_type: 'meta_data' or 'querytext' (None means 'querytext').
            meta_data: every term is restricted to the fields listed in
                ``self._fields``.
            querytext: terms are sent without any field restriction.
        @param start_year: Start value of Publication Year to restrict results by.
        @param end_year: End value of Publication Year to restrict results by.
            Defaults to the current year.
        @param max_records: The number of records to fetch.
        @param start_record: Sequence number of first record to fetch.
        @param author: An author's name, appended as an ``[Author]`` clause.

        @return: a list of titles and IDs in the form [(title, id)];
            an empty list when nothing matched.
        @raise ValueError: when neither ``queryterms`` nor ``author`` is given,
            or when ``search_type`` is not supported.
        @raise RuntimeError: when the esearch response carries an error entry.
        """

        # Build the term only from what was actually supplied, so that an
        # author-only search works and an empty search fails with a clear
        # message instead of sending a malformed "()" expression.
        term = self._search_term(queryterms, search_type=search_type) if queryterms else None
        if author:
            author_clause = "%s[Author]" % author
            term = "%s AND %s" % (term, author_clause) if term else author_clause
        if not term:
            raise ValueError('Informe ao menos um termo de pesquisa ou um autor.')

        payload = {"term": term,
                   "retmax": max_records, "retstart": start_record,
                   "mindate": start_year, "maxdate": end_year or datetime.now().year,
                   "retmode": "json", "datetype": "pdat",
                   "db": self._db, "sort": self._sort_order}

        url = "%s?%s" % (self.search_url, urlencode(payload))

        http_response = requests.get(url, timeout=self.request_timeout)
        # Fail on HTTP errors here rather than on a confusing JSON/KeyError later.
        http_response.raise_for_status()
        response = http_response.json()['esearchresult']

        # NOTE(review): esearch appears to report a rejected query through an
        # 'ERROR' entry in place of the usual fields - confirm against the API docs.
        if 'ERROR' in response:
            raise RuntimeError('Erro retornado pela NCBI: %s' % response['ERROR'])

        print ('TERMs: %s' % response['querytranslation'])
        print('QTD. resultados: %s' % response['count'])

        id_list = response['idlist']
        if not id_list:
            return []

        # esearch only returns IDs; the titles come from a second (esummary) call.
        result = self.get_article_metadata(*id_list)['result']
        return [(result[uid]['title'], uid) for uid in result['uids']]

    def get_article_metadata(self, *args):
        """
        Fetch the metadata of one or more articles from ``self._db``.

        @param args: article IDs. Several IDs can be passed at once. An ID
            is specific to a database: the same article may have different
            IDs in PMC and in PubMed, for example.

        @return: the raw JSON returned by the API, decoded into a dict.
        """
        id_list = ','.join(str(x) for x in args)

        payload = {"id": id_list, "db": self._db, "retmode": "json"}
        url = "%s?%s" % (self.meta_url, urlencode(payload))

        http_response = requests.get(url, timeout=self.request_timeout)
        http_response.raise_for_status()

        return http_response.json()

    def _search_term(self, queryterms: list, search_type: str=None):
        """Build the complete search expression sent to the API.

        @param queryterms: list of lists of terms (OR inside a list, AND
            between lists).
        @param search_type: None/'querytext' joins the terms as they are;
            'meta_data' first expands each term over ``self._fields``.

        @return: the parenthesised expression, e.g. "((a OR b) AND (c))".
        @raise ValueError: when ``search_type`` is not supported.
        """

        if search_type is not None and search_type not in self._search_types:
            raise ValueError('Tipo de pesquisa não faz sentido: %s\nTipos suportados: %s'
                             % (search_type, ', '.join(self._search_types)))

        if search_type == 'meta_data':
            # Embed the search fields into every single term before joining.
            queryterms = [[self._embutir_fields(term) for term in group]
                          for group in queryterms]

        return "(%s)" % " AND ".join("(%s)" % " OR ".join(group) for group in queryterms)

    def _embutir_fields(self, term: str):
        """Expand one term into an OR over every field in ``self._fields``.

        This restricts the search to a few fields instead of all of them.

        Example: with self._fields == ['title', 'abstract'], the call
        `self._embutir_fields("machine learning")`
        transforms:
            machine learning ---> (machine learning[title] OR machine learning[abstract])
        """

        return "(%s)" % " OR ".join(["%s[%s]" % (term, field) for field in self._fields])

    @property
    @abstractmethod
    def _fields(self):
        """Each subclass must define the search fields applied to every term.
        The return value must be a list of fields.
        Example:
        return ['title', 'abstract']
        """
        pass

    @property
    @abstractmethod
    def _db(self):
        """Each subclass must define its database.
        Example:
        return 'pmc'
        """
        pass

    @property
    @abstractmethod
    def _sort_order(self):
        """Each subclass must define the value sent as the ``sort`` parameter.
        Example:
        return 'Journal'
        """
        pass
    
class PMC_Searcher(NCBI_Searcher):
    """Searcher bound to the PMC database."""

    # Field tags handed out by ``_fields``; kept immutable here and copied on
    # access so every caller still receives its own fresh list.
    _FIELD_TAGS = (
        'Abstract',
        'Body - Key Terms',
        'MeSH Terms',
        'MeSH Major Topic',
        'Methods - Key Terms',
    )

    @property
    def _db(self):
        """Database name sent to the API."""
        return 'pmc'

    @property
    def _sort_order(self):
        """Value sent as the ``sort`` parameter."""
        return 'Journal'

    @property
    def _fields(self):
        """Search fields embedded into each term."""
        return list(self._FIELD_TAGS)
    
    
class PubMed_Searcher(NCBI_Searcher):
    """Searcher bound to the PubMed database."""

    @property
    def _db(self):
        """Database name sent to the API."""
        return 'pubmed'

    @property
    def _sort_order(self):
        """Value sent as the ``sort`` parameter (empty string here)."""
        return ''

    @property
    def _fields(self):
        """Search fields embedded into each term."""
        text_words_tag = 'Text Words'
        return [text_words_tag]

In [92]:
# Terms inside one list are OR-ed together; the two lists are AND-ed.
technology_queryterms = [
    'machine learning',
    'deep learning',
    'artificial intelligence',
    'neural network',
    'scoring system',
]

health_queryterms = [
    'coronary artery disease',
    'chest pain',
    'heart disease',
    'MACE',
    'Acute Cardiac Complications',
]

# For a quicker trial run, trim each list above to its first two terms.

queryterms = [technology_queryterms, health_queryterms]

r = PMC_Searcher().search(queryterms=queryterms, start_year=2008)
# Display each hit as (first 30 characters of the title, ID).
[(title[:30], uid) for title, uid in r]

TERMs: ((("machine learning"[MeSH Terms] OR ("machine"[All Fields] AND "learning"[All Fields]) OR "machine learning"[All Fields]) OR (deep[All Fields] AND ("learning"[MeSH Terms] OR "learning"[All Fields])) OR ("artificial intelligence"[MeSH Terms] OR ("artificial"[All Fields] AND "intelligence"[All Fields]) OR "artificial intelligence"[All Fields]) OR ("neural networks (computer)"[MeSH Terms] OR ("neural"[All Fields] AND "networks"[All Fields] AND "(computer)"[All Fields]) OR "neural networks (computer)"[All Fields] OR ("neural"[All Fields] AND "network"[All Fields]) OR "neural network"[All Fields]) OR (scoring[All Fields] AND system[All Fields])) AND (("coronary artery disease"[MeSH Terms] OR ("coronary"[All Fields] AND "artery"[All Fields] AND "disease"[All Fields]) OR "coronary artery disease"[All Fields]) OR ("chest pain"[MeSH Terms] OR ("chest"[All Fields] AND "pain"[All Fields]) OR "chest pain"[All Fields]) OR ("heart diseases"[MeSH Terms] OR ("heart"[All Fields] AND "diseases"[

[('Auditory evoked potential audi', '4560088'),
 ('500,000 fish phenotypes: The n', '3377363'),
 ('Cladosporium cladosporioides f', '4697913'),
 ('Structural elucidation and mol', '4522731'),
 ('Current methods in structural ', '3376864'),
 ('Partner in fat metabolism: rol', '3339616'),
 ('Clostridium difficile Infectio', '5666691'),
 ('Sleep Disturbances and Fatigue', '3149788'),
 ('Symptom Identification in the ', '2958710'),
 ('Stressors Among Latino Day Lab', '2964275'),
 ('Regulator of G Protein Signali', '5256612'),
 ('Computational Advances for the', '4540734'),
 ('Predicting when Biliary Excret', '4147063'),
 ('Development of a Poly (lactic-', '4147059'),
 ('Protective Effects of Kaempfer', '3691431'),
 ('Comparative In Silico–In Vivo ', '3691424'),
 ('The Challenges of Assessing Os', '3675751'),
 ('What about Pain in Disorders o', '3385819'),
 ('Pain Assessment in Human Fetus', '3385812'),
 ('Applications of Human Pharmaco', '3326168')]

In [93]:
# Run the same query against PubMed.
r = PubMed_Searcher().search(queryterms=queryterms, start_year=2008)
# Display each hit as (first 30 characters of the title, ID).
[(title[:30], uid) for title, uid in r]

TERMs: ((("machine learning"[MeSH Terms] OR ("machine"[All Fields] AND "learning"[All Fields]) OR "machine learning"[All Fields]) OR (deep[All Fields] AND ("learning"[MeSH Terms] OR "learning"[All Fields])) OR ("artificial intelligence"[MeSH Terms] OR ("artificial"[All Fields] AND "intelligence"[All Fields]) OR "artificial intelligence"[All Fields]) OR ("neural networks (computer)"[MeSH Terms] OR ("neural"[All Fields] AND "networks"[All Fields] AND "(computer)"[All Fields]) OR "neural networks (computer)"[All Fields] OR ("neural"[All Fields] AND "network"[All Fields]) OR "neural network"[All Fields]) OR (scoring[All Fields] AND system[All Fields])) AND (("coronary artery disease"[MeSH Terms] OR ("coronary"[All Fields] AND "artery"[All Fields] AND "disease"[All Fields]) OR "coronary artery disease"[All Fields]) OR ("chest pain"[MeSH Terms] OR ("chest"[All Fields] AND "pain"[All Fields]) OR "chest pain"[All Fields]) OR ("heart diseases"[MeSH Terms] OR ("heart"[All Fields] AND "diseases"[

[('Comparison of fast multi-slice', '29458430'),
 ('Do patients with diabetes type', '29441963'),
 ('Development and preliminary te', '29433600'),
 ('Classification of the clinical', '29428356'),
 ('Controlling the Risk Domain in', '29427984'),
 ('Scoring system to guide decisi', '29413876'),
 ('Big Data Analytics, the Microb', '29413172'),
 ('Automatic Calcium Scoring in L', '29408789'),
 ('Predicting cardiogenic pulmona', '29408167'),
 ('Development and Validation of ', '29404559'),
 ('Marseille scoring system for e', '29397446'),
 ('An Advanced Bio-Inspired Photo', '29385774'),
 ('Prediction of Incident Hyperte', '29382633'),
 ('Stroke risks and patterns of w', '29381974'),
 ('Predicting non-melanoma skin c', '29374196'),
 ('Prognostic value of coronary c', '29365193'),
 ('Improving Practice Guideline A', '29364732'),
 ('Prognostic Implication of Func', '29361444'),
 ('Application of stacked convolu', '29358103'),
 ('Support Vector Machine Based M', '29354062')]

In [86]:
# Show the raw metadata JSON for a single PubMed ID
# (29458430 is the first hit listed in the PubMed search output above).
PubMed_Searcher().get_article_metadata(29458430)

{'header': {'type': 'esummary', 'version': '0.3'},
 'result': {'29458430': {'articleids': [{'idtype': 'pubmed',
     'idtypen': 1,
     'value': '29458430'},
    {'idtype': 'doi', 'idtypen': 3, 'value': '10.1186/s12968-018-0434-2'},
    {'idtype': 'pii', 'idtypen': 4, 'value': '10.1186/s12968-018-0434-2'},
    {'idtype': 'rid', 'idtypen': 8, 'value': '29458430'},
    {'idtype': 'eid', 'idtypen': 8, 'value': '29458430'}],
   'attributes': ['Has Abstract'],
   'authors': [{'authtype': 'Author', 'clusterid': '', 'name': 'Muehlberg F'},
    {'authtype': 'Author', 'clusterid': '', 'name': 'Arnhold K'},
    {'authtype': 'Author', 'clusterid': '', 'name': 'Fritschi S'},
    {'authtype': 'Author', 'clusterid': '', 'name': 'Funk S'},
    {'authtype': 'Author', 'clusterid': '', 'name': 'Prothmann M'},
    {'authtype': 'Author', 'clusterid': '', 'name': 'Kermer J'},
    {'authtype': 'Author', 'clusterid': '', 'name': 'Zange L'},
    {'authtype': 'Author',
     'clusterid': '',
     'name': 'von K