In [2]:
import json
from collections import defaultdict
import pandas as pd

In [3]:
# Load the mapping from professor name to a list of DBLP author records
# (each record is a dict that may carry an "author" list of DBLP name variants).
# The encoding is pinned so that accented names decode the same way on every
# platform instead of depending on the locale's default encoding.
with open("output/professor_map.json", "r", encoding="utf-8") as f:
    professor_map = json.load(f)

In [4]:
# Flatten professor_map into two lookup structures:
#   name_map            -- DBLP name variant -> professor name (last writer wins)
#   all_professor_names -- every DBLP name variant that belongs to some professor
name_map = {}
for professor, dblp_records in professor_map.items():
    for record in dblp_records:
        # A record without an "author" list contributes no name variants.
        name_map.update(dict.fromkeys(record.get("author", []), professor))
all_professor_names = set(name_map)

In [5]:
import argparse
import html
import json
import xml.sax
import xml.sax.saxutils
from collections import defaultdict
from pathlib import Path


class PublicationFilter(xml.sax.saxutils.XMLFilterBase):
    """SAX filter that collects the records authored by known professors.

    The document is expected to have a single root element whose direct
    children are the records. Every direct child except ``www`` is treated as
    a candidate publication; ``www`` records and everything inside them are
    ignored. Each child element of a record is stored in the record dict as
    ``record[tag] -> [text, ...]`` and ``record[f"{tag}-attrs"] -> [attrs, ...]``.
    The record's own tag and attributes are kept under ``"_name"`` and
    ``"_attrs"``.

    When a record closes, it is appended to ``publications[professor]`` for
    every professor that ``name_map`` associates with one of its ``author``
    values. A professor receives a given record at most once, even if several
    of their name variants appear in it.

    Args:
        name_map: dict mapping a DBLP author name to the professor name.
        all_professor_names: collection of DBLP author names to match against.
        *args, **kwargs: forwarded to ``XMLFilterBase`` (typically the parent
            parser).
    """

    # Nesting depth (root element = 1) at which records live.
    _RECORD_DEPTH = 2
    # Matches are echoed with print() while ``count`` is below this value.
    _PREVIEW_LIMIT = 20

    def __init__(self, name_map, all_professor_names, *args, **kwargs):
        super().__init__(*args, **kwargs)
        # Open-element stack; the leading None is a sentinel below the root.
        self.stack = [None]
        # Attribute dicts of the currently open elements inside a record.
        self.attrs = []
        self.name_map = name_map
        self.all_professor_names = all_professor_names
        # professor name -> list of matching record dicts.
        self.publications = defaultdict(list)
        # One text buffer (list of chunks) per open element inside a record.
        self.text_active = []
        # Record currently being built, or None when outside a tracked record.
        self.current = None
        # Number of (professor, record) matches stored so far.
        self.count = 0

    def _depth(self):
        """Return the depth of the innermost open element (root element = 1)."""
        return len(self.stack) - 1

    def startElement(self, name, attrs):
        """Open a new record or a new field buffer, depending on depth."""
        self.stack.append(name)
        depth = self._depth()
        if depth < self._RECORD_DEPTH:
            # The document root is only a container, never a record.
            return
        if depth == self._RECORD_DEPTH:
            if name != "www":
                # Every top-level record other than www may be a publication.
                self.current = defaultdict(list)
                self.current["_name"] = name
                self.current["_attrs"] = dict(attrs.items())
            return
        if self.current is not None:
            # Element inside a tracked record: start collecting its text.
            self.text_active.append([])
            self.attrs.append(dict(attrs.items()))

    def endElement(self, name):
        """Store a finished field, or match and store a finished record."""
        depth = self._depth()
        if self.current is not None:
            if depth == self._RECORD_DEPTH:
                self._finish_record()
            elif depth > self._RECORD_DEPTH:
                attribs = self.attrs.pop()
                # Entities re-emitted by skippedEntity are decoded here.
                value = html.unescape("".join(self.text_active.pop()))
                self.current[name].append(value)
                self.current[f"{name}-attrs"].append(attribs)
        self.stack.pop()

    def _finish_record(self):
        """Attach the current record to each matching professor and reset it."""
        record = self.current
        self.current = None
        # Walk authors in document order so the result is deterministic and a
        # professor listed under several name variants is matched only once.
        professors = []
        for author in record.get("author", ()):
            if author in self.all_professor_names:
                professor = self.name_map[author]
                if professor not in professors:
                    professors.append(professor)
        for professor in professors:
            self.publications[professor].append(record)
            self.count += 1
            if self.count < self._PREVIEW_LIMIT:
                print(record)

    def characters(self, content):
        """Append text to the buffer of the innermost open field, if any."""
        if self.text_active:
            self.text_active[-1].append(content)

    def skippedEntity(self, name):
        """Re-emit a skipped entity as ``&name;`` so endElement can decode it."""
        if self.text_active:
            self.text_active[-1].append(f"&{name};")


In [None]:
# Stream the DBLP dump through the filter: the SAX parser is the filter's
# parent, so calling parse() on the filter drives the whole pipeline.
parser = xml.sax.make_parser()
reader = PublicationFilter(
    name_map=name_map,
    all_professor_names=all_professor_names,
    parent=parser,
)
reader.parse("input/dblp.xml")

In [7]:
# Display the professor -> matched records mapping collected by the filter.
reader.publications

defaultdict(list, {})

In [146]:
# Index DBLP author records by professor: a record is attached to every one of
# its "author" name variants that is also listed in `professors`.
professor_map = defaultdict(list)
for author in pid_map.values():
    matching_names = [name for name in author.get('author', []) if name in professors]
    for name in matching_names:
        professor_map[name].append(author)


In [151]:
# Interactive fallback for professors that were not matched automatically:
# offer to load a CSV with "professor" and "pid" columns and attach the
# referenced pid_map records to professor_map. The loop repeats until nothing
# is missing or the user answers "n".
skip_prompt = False  # True: load `load_manual` once without asking
no = False  # True: never offer the manual map for missing professors
force_load = False  # True: offer the manual map even when nothing is missing
load_manual = "input/manual_map.csv"
missing = set(professors) - set(professor_map)
while (missing and not no) or force_load:
    if not skip_prompt:
        print(f"{len(missing)} professor not found:")
        for professor in missing:
            print(f"- {professor}")
        answer = input(f"Do you want to load map from '{load_manual}'? [Y,n,other]").strip()
    else:
        answer = 'y'
    skip_prompt = False
    # Only the first letter matters; an empty answer means the default (yes).
    choice = answer[:1].lower()
    if choice == 'n':
        break
    if choice == 'o':
        new_file = input("Which file do you want to load? Leave it in blank to go back").strip()
        if new_file:
            load_manual = new_file
            choice = 'y'
    if choice in ('', 'y', 's'):
        try:
            df = pd.read_csv(load_manual)
        except FileNotFoundError:
            # Keep the session alive so the user can pick another file.
            print(f"WARNING: could not find '{load_manual}'")
            continue
        for professor, pid in zip(df["professor"], df["pid"]):
            if professor not in missing:
                continue
            if pid not in pid_map:
                # Check before touching professor_map: indexing the defaultdict
                # would create an empty entry and hide the professor from
                # `missing` even though no record was attached.
                print(f"WARNING: pid '{pid}' for '{professor}' not found in pid_map")
                continue
            professor_map[professor].append(pid_map[pid])
        missing = set(professors) - set(professor_map)
        force_load = False

if missing:
    print(f"WARNING: creating map without {len(missing)} professors:")
    for professor in missing:
        print(f"- {professor}")

    

In [152]:
# Display the professor -> DBLP author records mapping built above.
professor_map

defaultdict(list,
            {'Aline Marins Paes Carvalho': [{'pid': '95/4928',
               'author': ['Aline Paes',
                'Aline Marins Paes',
                'Aline Marins Paes Carvalho'],
               'affiliation': ['Fluminense Federal University, Brazil'],
               'url': ['http://www2.ic.uff.br/~alinepaes/',
                'https://orcid.org/0000-0002-9089-7303'],
               'orcid': ['0000-0002-9089-7303']}],
             'Aura Conci': [{'pid': 'c/AuraConci',
               'author': ['Aura Conci'],
               'affiliation': ['Fluminense Federal University, Brazil'],
               'url': ['http://www.ic.uff.br/~aconci/',
                'https://orcid.org/0000-0003-0782-2501',
                'https://www.wikidata.org/entity/Q58727483'],
               'orcid': ['0000-0003-0782-2501']}],
             'Taiane Coelho Ramos': [{'pid': '352/3463',
               'author': ['Taiane Coelho Ramos']}],
             'Troy Costa Kohwalter': [{'pid': '123/32

In [133]:

# Read the manual mapping CSV at `load_manual` into a DataFrame.
# NOTE(review): the captured output below prints individual rows, which this
# statement alone does not do -- the cell source looks stale; confirm.
df = pd.read_csv(load_manual)

professor    Alexandre Plastino de Carvalho
pid                                  54/401
Name: 0, dtype: object
professor    Celso da Cruz Carneiro Ribeiro
pid                                 26/4167
Name: 1, dtype: object
professor    Diego Gimenez Passos
pid                       47/4581
Name: 2, dtype: object
professor    Débora Christina Muchaluat Saade
pid                                   21/5891
Name: 3, dtype: object
professor    Eugene Francis Vinod Rebello
pid                               79/2476
Name: 4, dtype: object
professor    Isabel Cristina Mello Rosseti
pid                                42/6628
Name: 5, dtype: object
professor    Luís Felipe Ignácio Cunha
pid                           117/3901
Name: 6, dtype: object
professor    Marcos Vinícius Naves Bêdo
pid                            133/1121
Name: 7, dtype: object
professor    Marcos de Oliveira Lage Ferreira
pid                                   46/4560
Name: 8, dtype: object
professor    Pedro Cortez Fetter Lope

In [106]:
# Collect every field name used across the records in reader.pid_map and echo
# the first 10 records that carry a non-empty 'deprecated' field.
# NOTE(review): `pid_map` is not an attribute set by the PublicationFilter
# shown in this notebook -- this cell presumably targets an earlier reader.
keys = set()
count = 0
for value in reader.pid_map.values():
    keys.update(value.keys())
    has_deprecated = value.get('deprecated', '') != ''
    if has_deprecated and count < 10:
        count += 1
        print(value)

defaultdict(<class 'list'>, {'pid': '157/1995', 'author': ['Alireza Khalili Golmankhaneh', 'Ali Khalili Golmankhaneh', 'Ali K. Golmankhaneh'], 'affiliation': ['Islamic Azad University, Urmia, Iran'], 'url': ['https://orcid.org/0000-0002-5008-0163', 'https://www.wikidata.org/entity/Q63148849'], 'deprecated': ['https://orcid.org/0000-0002-3490-7976']})
defaultdict(<class 'list'>, {'pid': '63/2223-2', 'author': ['Yimin Zhou 0002', 'Yi-Min Zhou 0002'], 'affiliation': ['Chengdu University of Information Technology, School of Cybersecurity, China', 'University of Electronic Science and Technology of China, School of Computer Science and Engineering, Chengdu, China'], 'url': ['https://orcid.org/0000-0001-8692-9635', 'https://ieeexplore.ieee.org/author/37399624300'], 'deprecated': ['https://orcid.org/0000-0002-6539-0316']})
defaultdict(<class 'list'>, {'pid': '63/8189', 'author': ['Xue-Yang Zhu', 'Xueyang Zhu'], 'url': ['http://lcs.ios.ac.cn/~zxy/', 'https://dl.acm.org/profile/81472645829', 'h

In [102]:
# Display the set of field names gathered from the records.
keys

{'affiliation',
 'archive',
 'author',
 'award',
 'cite',
 'crossref',
 'deprecated',
 'former',
 'isnot',
 'note',
 'pid',
 'uname',
 'url'}

In [76]:
# Professors listed in reader.professors that have no entry in
# reader.professors_map.
# NOTE(review): neither attribute is set by the PublicationFilter shown in
# this notebook -- presumably left over from an earlier reader; confirm.
set(reader.professors) - set(reader.professors_map)

{'Alexandre Plastino de Carvalho',
 'Celso da Cruz Carneiro Ribeiro',
 'Diego Gimenez Passos',
 'Débora Christina Muchaluat Saade',
 'Eugene Francis Vinod Rebello',
 'Isabel Cristina Mello Rosseti',
 'Luís Felipe Ignácio Cunha',
 'Marcos Vinícius Naves Bêdo',
 'Marcos de Oliveira Lage Ferreira',
 'Pedro Cortez Fetter Lopes'}

In [77]:
# Display the reader's professors_map attribute.
# NOTE(review): not set by the PublicationFilter shown in this notebook.
reader.professors_map

defaultdict(list,
            {'Aline Marins Paes Carvalho': ['95/4928'],
             'Aura Conci': ['c/AuraConci'],
             'Taiane Coelho Ramos': ['352/3463'],
             'Troy Costa Kohwalter': ['123/3298'],
             'Ricardo Leiderman': ['203/3036'],
             'Mariza Ferro': ['09/10314'],
             'Vanessa Braganholo Murta': ['84/2055'],
             'Mario Roberto Folhadela Benevides': ['29/5198'],
             'André Maués Brabo Pereira': ['28/7852'],
             'Fábio Protti': ['61/2851'],
             'Milton Brown Do Coutto Filho': ['184/9663'],
             'Julio Cesar Stacchini de Souza': ['184/9658'],
             'Yuri Abitbol de Menezes Frota': ['304/9105'],
             'João Felipe Nicolaci Pimentel': ['161/7127'],
             'Daniela Gorski Trevisan': ['t/DanielaGorskiTrevisan'],
             'Bruno Lopes Vieira': ['187/9712'],
             'Luiz Satoru Ochi': ['o/LuizSatoruOchi'],
             'Célio Vinicius Neves de Albuquerque': ['40/809'],

In [54]:
# Check that html.unescape decodes named HTML entities such as &aacute;.
# NOTE(review): the saxutils `unescape` imported here is not used in this cell.
from xml.sax.saxutils import unescape
html.unescape('Jo&aacute;o Felipe Nicolaci Pimentel')

'Joáo Felipe Nicolaci Pimentel'