In [46]:
from abc import ABCMeta, abstractmethod
from urllib.parse import urlencode
import requests
from datetime import datetime
from functools import reduce
import time
import xmltodict, json
import ipdb
from threading import Thread

In [47]:
class ThreadWithReturnValue(Thread):
    def __init__(self, group=None, target=None, name=None,
                 args=(), kwargs=None, daemon=None):
        Thread.__init__(self, group=group, target=target, name=name,
                 args=args, kwargs=kwargs, daemon=daemon)
        self._return = None
    def run(self):
        if self._target is not None:
            self._return = self._target(*self._args,
                                                **self._kwargs)
    def join(self):
        Thread.join(self)
        return self._return
    
def dive(item, routes, starter_item=None):
    """Faz uma chamada recursiva de keys de um dicionário aninhado.
    @param item: lista/dicionario a ser recursivamente resgatado.
    @param routes: rota por onde seguir, composta pelos campos/indices separados
        por pontos. Os campos/índices puramente numericos serão convertidos para inteiro,
        como se fossem índices de lista.
        Se routes for uma lista, a funcao ira tentar sucessivamente as rotas da lista,
        ate que um funcione ou que acabe a lista.
    @param starter_item: proxy para permitir que o item original seja acessivel dentro
        da funcao mesmo com certo nivel de recursividade. Nao deve ser usado pelo usuario!
        
    @return o valor/objeto no final do resgate. Se houver algum erro ao longo do caminho,
        será retornada a string 'error:%s' % key que deu erro.
        
    Exemplo:
    a = {"lista":[1,2]}
    dive(a, "lista.1") ---> 2
    
    """
    if starter_item == None:
        # Starter item serve para poder resetar e tentar novamente do começo
        starter_item = item
    
    if isinstance(routes, str):
        routes = [routes]
        
    checkpoints = routes[0].split('.')
    next_checkpoint = checkpoints.pop(0)
    # Fazemos a modificacao da rota in-place.
    routes[0] = ".".join(checkpoints)
    
    if next_checkpoint.isdigit():
        next_checkpoint = int(next_checkpoint)
    try:
        content = item[next_checkpoint]
    except Exception as e:
        # Deu errado: joga essa rota que estavamos tentando fora
        routes.pop(0)
        if isinstance(routes, list) and len(routes):
            # Comeca do zero, na proxima rota, se tiver
            return dive(starter_item, routes)
        # Nao tem proxima rota, entao deu erro mesmo
        return "error:%s|%s<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<<" % (next_checkpoint, repr(e))
    
    if len(checkpoints) == 0:
        return content
    # Passa o terceiro argumento para poder saber como reiniciar se der erro
    return dive(content, routes, starter_item)

def deepgetter(obj, attrs, default=None):
        """Faz uma chamada sucessiva da função getattr, para ir pegando os atributos
        de um objeto.
        Exemplo:
        deepgetter(Cidade, 'regiao.pais') é equivalente a fazer Cidade.regiao.pais
        """
        getter = lambda x, y: getattr(x, y, default)
        return reduce(getter, attrs.split('.'), obj)
    
def inclusive_range(start, end, step=1):
    """Retorna tuplas que servirao para delimitar os retstart (offset de inicio do retorno da API)
    e o retmax (quantos artigos virao em cada requisicao).
    Serve para poder pegar todos os artigos em chunks, e é necessario pois
    não é possivel pegar varios artigos de uma vez por causa do erro HTTP 413 (request URI too large)"""
    
    while start < end:
        tupla = None
        if start + step < end:
            tupla = (start, step)
            start = start + step
        else:
            tupla = (start, end-start)
            start = end
            
        yield tupla
        
def get_all_text(item, rota_extra=[]):
    """Recursivamente explora uma lista ou dicionario, em busca de campos de texto
    segundo o passado em rota_extra e seguindo o padrão de procurar 'p' e '#text'
    retornando todas as strings encontradas"""
    
    retorno = []
    
    if isinstance(item, str):
        retorno.append(item)
    if isinstance(item, list):
        for i in item:
            retorno.extend(get_all_text(i, rota_extra))
    if isinstance(item, dict):
        rota = ['p', '#text']
        rota.extend(rota_extra)
        item = dive(item, rota)
        retorno.extend(get_all_text(item, rota_extra))
        
    return retorno
    

    
class NCBI_Searcher(metaclass=ABCMeta):
    """ 'Interface' que define a utilização da API das databases da NCBI.
    """

    search_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi'
    meta_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esummary.fcgi'
    fetch_url = 'https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi'
    ncbi_register = {"tool":"Atena", "email":"ddddiegolima@gmail.com"}
    recursive = True
    request_uri_limit = 100

    def search(self, queryterms: list = None, search_type: str = None,
               start_year: int = None, end_year: int = None,
               max_records: int = None, start_record: int = None,
               author: str = None, journal: str = None, search_url: str = None):
        """
        Realiza uma pesquisa NCBI.
        @param queryterms: list of lists. Terms within the same list are
            separated by an OR. Lists are separated by an AND
        @param search_type: meta_data or querytext.
            meta_data: This field enables a free-text search of all
                configured metadata fields and the abstract.
            querytext: This field enables a free-text search of all
                fields.
        @param start_year: Start value of Publication Year to restrict results by.
        @param end_year: End value of Publication Year to restrict results by.
        @param max_records: The number of records to fetch.
        @param start_record: Sequence number of first record to fetch.
        @param author: An author's name. Searches both first name and last name
            Accepts a list of author names too.
        @param journal: An author's name. Accepts a list of journals too.
        @param search_url: Optionally you can directly specify the URL to 
            query from. Setting this parameter will ignore the other parameters.
        @return: a dictionaries list whose keys are compatible with Documento model.
        """

        term = self._search_term(queryterms, search_type=search_type)
        if author:
            author = [author] if type(author) == str else author
            author = ['%s[Author]' % a for a in author]
            term = "%s AND (%s)" % (term, " OR ".join(author) )

        if journal:
            journal = [journal] if type(journal) == str else journal
            journal = ['"%s"[Journal]' % j for j in journal]
            term = "%s AND (%s)" % (term, " OR ".join(journal))
            
        print (term)
            
        if max_records and max_records > self.request_uri_limit:
            retmax = self.request_uri_limit
        else:
            retmax = max_records

        fixed_payload = {"retmode": "json", "datetype": "pdat",
                         "db": self._db, "sort": self._sort_order}
        payload = {"term": term,
                   "retmax": retmax or '', "retstart": start_record or '',
                   "mindate": start_year or '', "maxdate": end_year or ''}
        payload.update(fixed_payload)
        payload.update(self.ncbi_register)
        url = search_url if search_url else "%s?%s" % (self.search_url, urlencode(payload))
        
        print("URL SEARCH: %s" % url)
        t_00 = time.time()
        response = requests.get(url).json()['esearchresult']
        print('{:15s}{:6.3f}'.format("response",time.time() - t_00))
        quantidade_artigos = int(response['count'])
        if self.recursive:
            print("Artigos encontrados: ",quantidade_artigos)
        # Se o usuário não limitou quantidade de resultados, então traz tudo
        max_records = max_records or quantidade_artigos
        
        retorno = []
        id_list = response['idlist']

        if id_list:
            lista = self._get_article_metadata(*id_list)
            retorno.extend(lista)
        
        if max_records > self.request_uri_limit and self.recursive:
            # self.recursive só sera True se a chamada estiver sendo feita pelo usuário.
            # Isso serve para garantir que cada chamada da função self.search
            # neste bloco não provocará recursividade.
            self.recursive = False
            
            for retstart, retmax in inclusive_range(len(retorno), max_records, self.request_uri_limit):
                payload.update({'retstart':retstart})
                payload.update({'retmax':retmax})
                kwargs = {"search_url": "%s?%s" % (self.search_url, urlencode(payload))}

                lista = self.search(**kwargs)
                retorno.extend(lista)
            
            self.recursive = True
        
        
        return retorno

    def _search_term(self, queryterms: list, search_type: str = None):
        """Monta o termo de pesquisa completo para mandar para a API."""
        
        if type(queryterms) != list:
            return

        if search_type in ['querytext', None]:
            # Retorna simplesmente a busca concatenando com os OR's e AND's
            return "(%s)" % " AND ".join(["(%s)" % " OR ".join(orses) for orses in queryterms])
        elif search_type != 'meta_data':
            raise Exception('Tipo de pesquisa não faz sentido: %s\nTipos suportados:' % search_type)

        # Retorna concacentando com os OR'S e AND's, mas embutindo também os campos de pesquisa em cada termo
        queryterms = [[self._embutir_fields(orses) for orses in andes] for andes in queryterms]
        return "(%s)" % " AND ".join(["(%s)" % " OR ".join(orses) for orses in queryterms])

    def _embutir_fields(self, term: str):
        """Faz uma transformação, embutindo fields no termo de pesquisa.
        Isso é para poder realizar a pesquisa em apenas alguns campos ao invés de todos.
        Exemplo: sendo self.__fields = ['title', 'abstract'],
        a chamada
        `self._embutir_fields("machine learning")`
        Transforma:
            machine learning ---> (machine learning[title] OR machine learning[abstract])
        """

        return "(%s)" % " OR ".join(["%s[%s]" % (term, field) for field in self._fields])  

    @abstractmethod
    def _get_article_metadata(self, *args):
        """Cada subclasse deverá implementar a função que pega o retorno da API e transforma numa lista de dicionários
        no formato do modelo Documento."""
        pass

    @property
    @abstractmethod
    def _fields(self):
        """Cada subclasse deverá definir quais serão os campos de pesquisa de cada termo.
        O retorno deverá ser uma lista de fields.
        Exemplo:
        return ['title', 'abstract']
        """
        pass

    @property
    @abstractmethod
    def _db(self):
        """Cada subclasse deverá definir o seu banco.
        Exemplo:
        return 'pmc'
        """
        pass

    @property
    @abstractmethod
    def _sort_order(self):
        """Cada classe deverá definir o parâmetro sort_order.
        Exemplo:
        return 'Journal'
        """
        pass

    @property
    @abstractmethod
    def _article_url(self):
        """Cada classe deverá definir a URL da página de um artigo."""
        pass


class PMC_Searcher(NCBI_Searcher):
    """Realiza pesquisas na base PMC."""

    @property
    def _fields(self):
        return ['Abstract', 'Body - Key Terms', 'MeSH Terms',
                'MeSH Major Topic', 'Methods - Key Terms']

    @property
    def _db(self):
        return 'pmc'

    @property
    def _sort_order(self):
        return 'relevance'

    @property
    def _article_url(self):
        return 'https://www.ncbi.nlm.nih.gov/pmc/articles/'

    def _get_article_metadata(self, *args):
        id_list = ','.join([str(x) for x in args])

        payload = {"id": id_list, "db": self._db, "retmode": "xml"}
        payload.update(self.ncbi_register)
        url = "%s?%s" % (self.fetch_url, urlencode(payload))
        print("URL META: %s" % url)

        t_05 = time.time()
        r = requests.get(url)
        print('{:15s}{:6.3f}'.format("response_M",time.time() - t_05))
        
        t_02 = time.time()
        # Pegar o XML, e transformar num dicionário
        d = json.loads(json.dumps(xmltodict.parse(r.content)))
        artigos = d['pmc-articleset']['article']
        print('{:15s}{:6.3f}'.format("parse",time.time() - t_02))

        documentos = []
        append = documentos.append

        t_04 = time.time()
        debug = True
        
        for artigo in artigos:
            diver = lambda route: dive(artigo, route)
            doc = {}

            ### TITULO
            try:
                titulo = diver('front.article-meta.title-group.article-title')
                titulo = get_all_text(titulo)
                doc['titulo'] = " ".join(titulo)
            except Exception as e:
                print('erro:','titulo')
                print(repr(e))
                if debug:
                    ipdb.set_trace()
            
            ### RESUMO / ABSTRACT
            try:
                resumo = diver('front.article-meta.abstract')
                resumo = get_all_text(resumo, ['sec'])
                doc['resumo'] = "\n".join(resumo)
            except Exception as e:
                print('erro:','resumo')
                print(repr(e))
                if debug:
                    ipdb.set_trace()
            
            ### PALAVRAS-CHAVE
            try:
                keywords = diver('front.article-meta.kwd-group')
                keywords = get_all_text(keywords, ['kwd'])
                doc['palavras_chave'] = ",".join(keywords)
            except Exception as e:
                print('erro:','keywords')
                print(repr(e))
                if debug:
                    ipdb.set_trace()
            
            ### AUTORES    
            try:
                authors = diver('front.article-meta.contrib-group')
                if isinstance(authors, dict):
                    authors = authors['contrib']

                elif isinstance(authors, list):
                    authors = authors[0]['contrib']

                if isinstance(authors, dict):
                    name = authors.get('name', None)
                    if name:
                        authors = "%s %s" % (name['given-names'], name['surname'])
                elif isinstance(authors, list):
                    list_ = []
                    for a in authors:
                        name = a.get('name', None)
                        if name:
                            list_.append("%s %s" % (name['given-names'], name['surname']))
                    authors = ",".join(list_)

                doc['autores'] = authors
            except Exception as e:
                print('erro:','authors')
                print(repr(e))
                if debug:
                    ipdb.set_trace()
            
            
            article_ids = diver('front.article-meta.article-id')
            ### HTML_URL
            try:
                pmc_id = [a for a in article_ids if a["@pub-id-type"] =='pmc'][0]['#text']

                doc['html_url'] = "%s%s" % (self._article_url, pmc_id)
            except Exception as e:
                print('erro:','html_url')
                print(repr(e))
                if debug:
                    ipdb.set_trace()
                    
            ### DOI
            try:
                doi = [a for a in article_ids if a["@pub-id-type"] == 'doi'][0]['#text']

                doc['doi'] = doi
            except Exception as e:
                print('erro:','doi')
                print(repr(e))
                if debug:
                    ipdb.set_trace()
            
            ### DATA
            try:
                
                data = diver('front.article-meta.pub-date')
                if isinstance(data, dict):
                    data = [data]
                data = data[0]
                data = "%s %s" % (data['year'], data.get('month','01'))
                doc['data'] = datetime.strptime(data, "%Y %m").date()
            except Exception as e:
                print('erro:','data')
                print(repr(e))
                if debug:
                    ipdb.set_trace()


            append(doc)
            
        print('{:15s}{:6.3f}'.format("fetch",time.time() - t_04))

        return documentos


class PubMed_Searcher(NCBI_Searcher):
    """Realiza pesquisas na base PubMed."""

    @property
    def _fields(self):
        return ['Text Words']

    @property
    def _db(self):
        return 'pubmed'

    @property
    def _sort_order(self):
        return ''

    @property
    def _article_url(self):
        return "https://www.ncbi.nlm.nih.gov/pubmed/"

    def _get_article_metadata(self, *args):
        id_list = ','.join([str(x) for x in args])

        payload = {"id": id_list, "db": self._db, "retmode": "xml"}
        payload.update(self.ncbi_register)
        url = "%s?%s" % (self.fetch_url, urlencode(payload))

        print("URL META: %s" % url)

        t_05 = time.time()
        r = requests.get(url)
        print('{:15s}{:6.3f}'.format("response_M", time.time() - t_05))

        t_02 = time.time()
        # Pegar o XML, e transformar num dicionário
        d = json.loads(json.dumps(xmltodict.parse(r.content)))
        artigos = d['PubmedArticleSet']['PubmedArticle']
        if isinstance(artigos, dict):
            # No caso de vir somente um artigo no resultado
            artigos = [artigos]
        print('{:15s}{:6.3f}'.format("parse", time.time() - t_02))

        documentos = []
        append = documentos.append

        t_04 = time.time()
        debug = True

        for artigo in artigos:
            diver = lambda route: dive(artigo, route)
            doc = {}
            
            ### TITULO
            titulo = diver('MedlineCitation.Article.ArticleTitle')
            titulo = get_all_text(titulo)
            doc['titulo'] = " ".join(titulo)
            
            ### RESUMO / ABSTRACT
            try:
                resumo = diver('MedlineCitation.Article.Abstract.AbstractText')
                resumo = get_all_text(resumo)
                doc['resumo'] = "\n".join(resumo)
            except:
                print("resumo")
                if debug:
                    ipdb.set_trace()
            
            ### PALAVRAS-CHAVE
            try:
                keywords = diver('MedlineCitation.KeywordList.Keyword')
                keywords = get_all_text(keywords)
                doc['palavras_chave'] = ",".join(keywords)
            except:
                print("keywords")
                if debug:
                    ipdb.set_trace()
            
            ### AUTORES
            authors = diver('MedlineCitation.Article.AuthorList.Author')
            try:
                if isinstance(authors, list):
                    authors = ",".join(["%s %s %s" % (a.get('ForeName',''), a.get('LastName',''), a.get('CollectiveName','')) for a in authors])
                elif isinstance(authors, dict):
                    authors = "%s %s %s" % (authors.get('ForeName',''), authors.get('LastName',''), authors.get('CollectiveName',''))
                doc['autores'] = authors
            except:
                print("authors")
                if debug:
                    ipdb.set_trace()
             
            ### PMID (URL HTML)
            try:
                html_url = diver('MedlineCitation.PMID.#text')
                doc['html_url'] = "%s%s" % (self._article_url, html_url)
            except:
                print('html_url')
                if debug:
                    ipdb.set_trace()
                
            ### DOI
            try:
                doi = diver('PubmedData.ArticleIdList.ArticleId')
                doi = [d['#text'] for d in doi if d['@IdType'] == 'doi']
                if not doi:
                    doi = ''
                else:
                    doi = doi[0]
                doc['doi'] = doi
            except:
                print('doi')
                if debug:
                    ipdb.set_trace()
                    
            ### DATA
            try:
                data = diver(['MedlineCitation.Article.ArticleDate','PubmedData.History.PubMedPubDate'])
                if isinstance(data, dict):
                    data = "%s %s" % (data['Year'], data.get('Month', '01'))
                if isinstance(data, list):
                    data = "%s %s" % (data[0]['Year'], data[0].get('Month', '01'))
                doc['data'] = datetime.strptime(data, "%Y %m").date()
            except:
                print('data')
                if debug:
                    ipdb.set_trace()

            append(doc)
        
        print('{:15s}{:6.3f}'.format("fetch", time.time() - t_04))

        return documentos

In [48]:
# termos de pesquisa relacionados a tecnologia
technology_queryterms = [
    'machine learning', 'deep learning', 'artificial intelligence', 
    'neural network', 'scoring system'
]

# termos de pesquisa relacionados a area da saude
health_queryterms = [
    'coronary artery disease', 'chest pain', 'heart disease', 'MACE', 
    'Acute Cardiac Complications'
]

queryterms = [technology_queryterms, health_queryterms]

scimago_journals = ["Journal of the American College of Cardiology","Circulation","European Heart Journal"
                    ,"Circulation Research","Nature Biotechnology","Current Opinion in Biotechnology"
                   ,"Annual Review of Biomedical Engineering", "Circulation: Cardiovascular Interventions"]
eigenfactor_journals = ["Medical image Analysis","Biomaterials","Acta Biomaterialia","Physics in medicine and biology",
                        "IEEE TRANSACTIONS ON MEDICAL IMAGING","COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE "
                       ,"INTERNATIONAL JOURNAL OF CARDIOLOGY", "CARDIOVASCULAR RESEARCH", "HEART RHYTHM"
                       ,"EUROPEAN JOURNAL OF CARDIO-THORACIC SURGERY","JACC-Cardiovascular Interventions"
                        ,"JOURNAL OF MOLECULAR AND CELLULAR CARDIOLOGY ", "JACC-Cardiovascular Imaging "
                        ,"Circulation-Heart Failure", "EUROPEAN JOURNAL OF HEART FAILURE", "EUROPACE"
                       ,"CATHETERIZATION AND CARDIOVASCULAR INTERVENTIONS", "Journal of the American Heart Association"
                       ,"JOURNAL OF THE AMERICAN SOCIETY OF ECHOCARDIOGRAPHY", "Circulation-Cardiovascular Imaging"]

journal = ["BioMedical Engineering OnLine",
           "Biomedical Engineering"] + scimago_journals + eigenfactor_journals


# queryterms = [['machine learning'], ['ck mb']]

In [49]:
print(','.join(technology_queryterms))
print()
print(','.join(health_queryterms))
print()
print(','.join(journal))

machine learning,deep learning,artificial intelligence,neural network,scoring system

coronary artery disease,chest pain,heart disease,MACE,Acute Cardiac Complications

BioMedical Engineering OnLine,Biomedical Engineering,Journal of the American College of Cardiology,Circulation,European Heart Journal,Circulation Research,Nature Biotechnology,Current Opinion in Biotechnology,Annual Review of Biomedical Engineering,Circulation: Cardiovascular Interventions,Medical image Analysis,Biomaterials,Acta Biomaterialia,Physics in medicine and biology,IEEE TRANSACTIONS ON MEDICAL IMAGING,COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE ,INTERNATIONAL JOURNAL OF CARDIOLOGY,CARDIOVASCULAR RESEARCH,HEART RHYTHM,EUROPEAN JOURNAL OF CARDIO-THORACIC SURGERY,JACC-Cardiovascular Interventions,JOURNAL OF MOLECULAR AND CELLULAR CARDIOLOGY ,JACC-Cardiovascular Imaging ,Circulation-Heart Failure,EUROPEAN JOURNAL OF HEART FAILURE,EUROPACE,CATHETERIZATION AND CARDIOVASCULAR INTERVENTIONS,Journal of the American He

In [None]:
r = PMC_Searcher().search(queryterms=queryterms, journal=journal)

((machine learning OR deep learning OR artificial intelligence OR neural network OR scoring system) AND (coronary artery disease OR chest pain OR heart disease OR MACE OR Acute Cardiac Complications)) AND ("BioMedical Engineering OnLine"[Journal] OR "Biomedical Engineering"[Journal] OR "Journal of the American College of Cardiology"[Journal] OR "Circulation"[Journal] OR "European Heart Journal"[Journal] OR "Circulation Research"[Journal] OR "Nature Biotechnology"[Journal] OR "Current Opinion in Biotechnology"[Journal] OR "Annual Review of Biomedical Engineering"[Journal] OR "Circulation: Cardiovascular Interventions"[Journal] OR "Medical image Analysis"[Journal] OR "Biomaterials"[Journal] OR "Acta Biomaterialia"[Journal] OR "Physics in medicine and biology"[Journal] OR "IEEE TRANSACTIONS ON MEDICAL IMAGING"[Journal] OR "COMPUTER METHODS AND PROGRAMS IN BIOMEDICINE "[Journal] OR "INTERNATIONAL JOURNAL OF CARDIOLOGY"[Journal] OR "CARDIOVASCULAR RESEARCH"[Journal] OR "HEART RHYTHM"[Journa

response        1.331
URL META: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id=4830908%2C5035208%2C4105825%2C5002995%2C5586421%2C4262069%2C3858079%2C5519142%2C4499869%2C4340094%2C4419782%2C5580027%2C5393905%2C3506927%2C4086864%2C5437366%2C4845210%2C4756468%2C3124072%2C5634271%2C4258873%2C4155404%2C3971383%2C3104928%2C5548137%2C5491385%2C3395835%2C3992174%2C3182143%2C3471090%2C5210400%2C4429294%2C5332127%2C4536577%2C4845242%2C4299916%2C4425127%2C3955029%2C5121484%2C4135373%2C3835239%2C3881188%2C3117288%2C3970195%2C4996262%2C4073645%2C5391045%2C3724582%2C2730965%2C5099118&db=pmc&retmode=xml&tool=Atena&email=ddddiegolima%40gmail.com
response_M      2.837
parse           0.607
fetch           0.004
None
URL SEARCH: https://eutils.ncbi.nlm.nih.gov/entrez/eutils/esearch.fcgi?term=%28%28machine+learning+OR+deep+learning+OR+artificial+intelligence+OR+neural+network+OR+scoring+system%29+AND+%28coronary+artery+disease+OR+chest+pain+OR+heart+disease+OR+MACE+OR+Acute+Cardiac+Complica

In [14]:
xml = requests.get("https://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?id=4830908%2C5035208%2C4105825%2C5002995%2C5586421%2C4262069%2C3858079%2C5519142%2C4499869%2C4340094%2C4419782%2C5580027%2C5393905%2C3506927%2C4086864%2C5437366%2C4845210%2C4756468%2C3124072%2C5634271%2C4258873%2C4155404%2C3971383%2C3104928%2C5548137%2C5491385%2C3395835%2C3992174%2C3182143%2C3471090%2C5210400%2C4429294%2C5332127%2C4536577%2C4845242%2C4299916%2C4425127%2C3955029%2C5121484%2C4135373%2C3835239%2C3881188%2C3117288%2C3970195%2C4996262%2C4073645%2C5391045%2C3724582%2C2730965%2C5099118&db=pmc&retmode=xml&tool=Atena&email=ddddiegolima%40gmail.com").content
d = json.loads(json.dumps(xmltodict.parse(xml)))
artigos = d['pmc-articleset']['article']

In [37]:
for i,artigo in enumerate(artigos):
    diver = lambda route: dive(artigo, route)
    doc = {}
    
    ### TITULO
    titulo = diver('front.article-meta.title-group.article-title')
    titulo = get_all_text(titulo)
    doc['titulo'] = titulo
    
    ### RESUMO / ABSTRACT
    resumo = diver('front.article-meta.abstract')
    resumo = get_all_text(resumo, ['sec'])
    doc['resumo'] = "\n".join(resumo)
    
    ### PALAVRAS-CHAVE
    keywords = diver('front.article-meta.kwd-group')
    keywords = get_all_text(keywords, ['kwd'])
    doc['palavras_chave'] = keywords
    
    
    ### AUTORES    
#     authors = diver('front.article-meta.contrib-group')
#     if isinstance(authors, dict):
#         authors = authors['contrib']
        
#     elif isinstance(authors, list):
#         authors = authors[0]['contrib']
    
#     if isinstance(authors, dict):
#         name = authors.get('name', None)
#         if name:
#             authors = "%s %s" % (name['given-names'], name['surname'])
#     elif isinstance(authors, list):
#         list_ = []
#         for a in authors:
#             name = a.get('name', None)
#             if name:
#                 list_.append("%s %s" % (name['given-names'], name['surname']))
#         authors = ",".join(list_)
        
#     doc['autores'] = authors
    
    ### HTML_URL E DOI
#     article_ids = diver('front.article-meta.article-id')
#     pmc_id = [a for a in article_ids if a["@pub-id-type"] =='pmc'][0]['#text']
#     doi = [a for a in article_ids if a["@pub-id-type"] == 'doi'][0]['#text']
    
#     doc['html_url'] = "%s%s" % ('www/', pmc_id)
#     doc['doi'] = doi
    
    ### DATA
#     data = diver('front.article-meta.pub-date')[0]
#     data = "%s %s" % (data['year'], data['month'])
#     doc['data'] = datetime.strptime(data, "%Y %m").date()
    
    print(i,'---')
    print(doc['resumo'][:50])
#     for key in doc:
#         try:
#             print(doc[key][:100])
#         except:
#             print(doc[key])
    


0 ---
Myocardial strain is a principle for quantificatio
1 ---
error:abstract|KeyError('abstract',)<<<<<<<<<<<<<<
2 ---
A large number of papers are appearing in the biom
3 ---
Automatic and reliable segmentation of the prostat
4 ---
Data on the clinical utility of coronary computed 
5 ---
Cost-effectiveness of percutaneous coronary interv
6 ---
Outcomes evaluation is enhanced by assignment of o
7 ---
The number of patients surviving with congenital h
8 ---
The process of coronary artery disease progression
9 ---
This paper aims to update clinicians on “Hot Topic
10 ---
Although age and sex distributions of calcified pl
11 ---
The prognostic value of coronary artery calcium (C
12 ---
Sleep apnea is highly prevalent in patients with c
13 ---
Myocardial ischemia can be developed into more ser
14 ---
error:abstract|KeyError('abstract',)<<<<<<<<<<<<<<
15 ---
Despite advances in myocardial reperfusion therapi
16 ---
The mechanistic basis of the proposed relationship
17 ---
Advances in ather

In [26]:
dive(artigos,'4.front.article-meta.kwd-group')[0]

{'@kwd-group-type': 'author-generated',
 'kwd': [{'#text': 'computed tomography angiography',
   '@id': 'jah32374-kwd-0001'},
  {'#text': 'coronary artery disease', '@id': 'jah32374-kwd-0002'},
  {'#text': 'imaging', '@id': 'jah32374-kwd-0003'},
  {'#text': 'positron emission tomography', '@id': 'jah32374-kwd-0004'}]}

In [24]:
item

[{'@kwd-group-type': 'author-generated',
  'kwd': [{'#text': 'computed tomography angiography',
    '@id': 'jah32374-kwd-0001'},
   {'#text': 'coronary artery disease', '@id': 'jah32374-kwd-0002'},
   {'#text': 'imaging', '@id': 'jah32374-kwd-0003'},
   {'#text': 'positron emission tomography', '@id': 'jah32374-kwd-0004'}]},
 {'@kwd-group-type': 'subject-categories',
  'kwd': ['Computerized Tomography (CT)',
   'Diagnostic Testing',
   'Imaging',
   'Nuclear Cardiology and PET',
   'Angiography'],
  'title': 'Subject Categories'}]

In [33]:
def get_all_text(item, rota_extra=[]):
    """Recursivamente explora uma lista ou dicionario, em busca de campos de texto
    segundo o passado em rota_extra e seguindo o padrão de procurar 'p' e '#text'
    retornando todas as strings encontradas"""
    
    retorno = []
    
    if isinstance(item, str):
        retorno.append(item)
    if isinstance(item, list):
        for i in item:
            retorno.extend(get_all_text(i, rota_extra))
    if isinstance(item, dict):
        rota = ['p', '#text']
        rota.extend(rota_extra)
        item = dive(item, rota)
        retorno.extend(get_all_text(item, rota_extra))
        
    return retorno

get_all_text(item, ['kwd'])

['computed tomography angiography',
 'coronary artery disease',
 'imaging',
 'positron emission tomography',
 'Computerized Tomography (CT)',
 'Diagnostic Testing',
 'Imaging',
 'Nuclear Cardiology and PET',
 'Angiography']

In [31]:
item

[{'@kwd-group-type': 'author-generated',
  'kwd': [{'#text': 'computed tomography angiography',
    '@id': 'jah32374-kwd-0001'},
   {'#text': 'coronary artery disease', '@id': 'jah32374-kwd-0002'},
   {'#text': 'imaging', '@id': 'jah32374-kwd-0003'},
   {'#text': 'positron emission tomography', '@id': 'jah32374-kwd-0004'}]},
 {'@kwd-group-type': 'subject-categories',
  'kwd': ['Computerized Tomography (CT)',
   'Diagnostic Testing',
   'Imaging',
   'Nuclear Cardiology and PET',
   'Angiography'],
  'title': 'Subject Categories'}]