In [65]:
import json
from collections import defaultdict
import pandas as pd
from pathlib import Path
from lxml import etree

In [99]:
with open("output/dblp_map.json", "r") as f:
    dblp_map = json.load(f)

In [100]:
with open("output/collaborators.json", "r") as f:
    collaborators = json.load(f)

In [106]:
with open("output/professor_map.json", "r") as f:
    professor_map = json.load(f)

In [109]:
professor_pids = set()
for name, professor in professor_map.items():
    for author in professor:
        professor_pids.add(author['pid'])

In [126]:
substrings = [
    "brazil",
    "brasil",
    'acre',
    'alagoas',
    'amapá',
    'amazonas',
    'bahia',
    'ceará',
    'distrito federal',
    'espírito santo',
    'goiás',
    'maranhão',
    'mato grosso',
    'mato grosso do sul',
    'minas gerais',
    'pará',
    'paraíba',
    'paraná',
    'pernambuco',
    'piauí',
    'rio de janeiro',
    'rio grande do norte',
    'rio grande do sul',
    'rondônia',
    'roraima',
    'santa catarina',
    'são paulo',
    'sergipe',
    'tocantins',
    "fluminense"
]

count = 0
brazil_affiliation = 0
total = 0
orcid_count = 0
for pid, collaborator in collaborators.items():
    total += 1
    if pid in professor_pids: # UFF
        count += 1
        brazil_affiliation += 1
        continue
    dblp_entry = dblp_map.get(pid, {})
    affiliation = ""
    if len(dblp_entry.get("affiliation", [])) == 1:
        affiliation = dblp_entry["affiliation"][0].lower()
        count += 1
    if not affiliation and 'orcid' in dblp_entry:
        orcid_count += 1
    if not affiliation and 'url' in dblp_entry:
        urls = [url for url in dblp_entry['url'] if 'orcid.org' not in url]
        if urls:
            print(urls)

    for substring in substrings:
        if substring in affiliation:
            brazil_affiliation += 1
            break
    #else:
    #    if affiliation:
    #        print(affiliation)

total, count, brazil_affiliation, orcid_count

['https://zbmath.org/authors/?q=ai:ceschia.sara', 'https://www.wikidata.org/entity/Q85349339']
['http://www-di.inf.puc-rio.br/~afgarcia/', 'https://scholar.google.com/citations?user=rP1LYboAAAAJ']
['http://lattes.cnpq.br/0180606646463169']
['https://www.wikidata.org/entity/Q40942227', 'https://www.researcherid.com/rid/B-8177-2013', 'https://d-nb.info/gnd/1170895220']
['https://www.wikidata.org/entity/Q45854683', 'https://www.researcherid.com/rid/A-1305-2008']
['https://research.google/people/106908/', 'https://scholar.google.com/citations?user=tFHkg5MAAAAJ']
['https://www.wikidata.org/entity/Q122202506']
['https://scholar.google.com/citations?user=D8aXI2YAAAAJ', 'https://www.wikidata.org/entity/Q42389552', 'https://d-nb.info/gnd/131532847']
['http://www.berndresch.com/', 'https://scholar.google.com/citations?user=TlkErZoAAAAJ', 'https://www.wikidata.org/entity/Q57691095', 'https://d-nb.info/gnd/1033522686']
['https://ieeexplore.ieee.org/author/37086349432']
['https://www.wikidata.org/e

(1124, 224, 122, 168)

In [91]:
len(all_collaborators)

1124

In [92]:
all_collaborators

defaultdict(<function __main__.<lambda>()>,
            {'156/6577': {'name': 'Jonnathan Carvalho',
              'professors': {'Alexandre Plastino de Carvalho',
               'Aline Marins Paes Carvalho'},
              'orcid': '0000-0003-0983-2308'},
             '98/10070': {'name': 'Rafael B. Pereira',
              'professors': {'Alexandre Plastino de Carvalho'}},
             '77/5253': {'name': 'Bianca Zadrozny',
              'professors': {'Alexandre Plastino de Carvalho'}},
             '80/1853': {'name': 'Luiz H. C. Merschmann',
              'professors': {'Alexandre Plastino de Carvalho'},
              'orcid': '0000-0002-9948-2673'},
             '149/1519': {'name': 'Luiz Laerte Nunes da Silva Junior',
              'professors': {'Alexandre Plastino de Carvalho',
               'Leonardo Gresta Paulino Murta',
               'Troy Costa Kohwalter'}},
             '123/3298': {'name': 'Troy C. Kohwalter',
              'professors': {'Alexandre Plastino de Carvalho

In [30]:
cyear = list(map(str.strip, open("input/2024").readlines()))

In [32]:
professor = professor_map[cyear[0]]

In [34]:
pid = professor[0]['pid']

In [39]:
tree = etree.parse(f"output/pids/{pid}.xml")


In [40]:
root = tree.getroot()

In [48]:
year = "2024"
elements = root.xpath(f'//r/*/year[text()="{year}"]')

In [77]:
work = elements[0].getparent()
etree.tostring(work)

b'<article key="journals/ipm/MouraCPP24" mdate="2024-07-19">\n<author orcid="0000-0001-5108-7482" pid="96/128">Ricardo Moura</author>\n<author orcid="0000-0003-0983-2308" pid="156/6577">Jonnathan Carvalho</author>\n<author orcid="0000-0003-4039-0915" pid="54/401">Alexandre Plastino 0001</author>\n<author orcid="0000-0002-9089-7303" pid="95/4928">Aline Paes</author>\n<title>Less is more: Pruning BERTweet architecture in Twitter sentiment analysis.</title>\n<pages>103688</pages>\n<year>2024</year>\n<volume>61</volume>\n<journal>Inf. Process. Manag.</journal>\n<number>4</number>\n<ee>https://doi.org/10.1016/j.ipm.2024.103688</ee>\n<url>db/journals/ipm/ipm61.html#MouraCPP24</url>\n</article>\n'

In [78]:

authors = work.findall("author")

In [79]:
authors[0].text

'Ricardo Moura'

In [23]:
import requests
import shutil
from pathlib import Path
 
def download_dblp_xml(pid, output_dir="output/pids"):
    path = Path(output_dir) / (pid + ".xml")
    url = f"https://dblp.org/pid/{pid}.xml"
    path.parent.mkdir(parents=True, exist_ok=True)
    with requests.get(url, stream=True) as r:
        r.raise_for_status()
        with open(path, 'wb') as f:
            for chunk in r.iter_content(chunk_size=8192): 
                # If you have chunk encoded response uncomment if
                # and set chunk_size parameter to None.
                #if chunk: 
                f.write(chunk)
    return path

In [25]:
for _, authors in professor_map.items():
    for author in authors:
        download_dblp_xml(author['pid'])

In [16]:
requests.get("https://dblp.org/pid/25/3645.xml")

<Response [200]>

In [17]:
r = _

In [22]:
r.raw.text()

AttributeError: 'HTTPResponse' object has no attribute 'text'

In [None]:
https://dblp.org/pid/25/3645.html