In [298]:
import requests as req
from bs4 import BeautifulSoup 
import re
import dblp
from requests import RequestException
import json
from lxml import etree
from dataclasses import dataclass
from typing import Optional, List

### Parse all available volumes

In [299]:
Web = req.get('http://ceurspt.wikidata.dbis.rwth-aachen.de/index.html') 
  
S = BeautifulSoup(Web.text, 'lxml') 
html_txt = S.prettify()
#extract all volumes
reg1 = r'Vol-(\d+)">'
#all volumes from the ceurspt api
volumes = re.findall(reg1, html_txt)

In [300]:
# Here we set the volume we want to consider in the comparisons below
cur_volumes = ['2350']
assert(all([vol_nr in volumes for vol_nr in cur_volumes]))

In [301]:
#extract all pages for each vol
papers = {}
for v in cur_volumes:
    url = 'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-' + v 
    Web = req.get(url) 
    reg2 = r'paper(\d+).pdf'
    papers[int(v)] = re.findall(reg2, BeautifulSoup(Web.text, 'lxml').prettify())
papers

{2350: ['1',
  '2',
  '3',
  '4',
  '5',
  '6',
  '7',
  '8',
  '9',
  '10',
  '11',
  '12',
  '13',
  '14',
  '15',
  '16',
  '17',
  '18',
  '19',
  '20',
  '21',
  '22',
  '23',
  '24',
  '25',
  '26']}

In [302]:
def elem_to_text(elem = None):
    if elem:
        return elem.getText()
    return ''

In [303]:
@dataclass
class Author:
    name: str
    affiliation: Optional[List[str]] = None
    email: Optional[List[str]] = None
    aff_ok: Optional[bool] = None
    
    def __eq__(self, other):
        if isinstance(other, Author):
            return self.name == other.name and str(self.email) == str(other.email) and str(self.affiliation) == str(other.affiliation) 
        return False

### GROBID

In [304]:
class GrobitFile():
    def __init__(self, filename):
        self.grobidxml = BeautifulSoup(req.get(filename).text, 'lxml')
        self._title = ''


    @property
    def title(self):
        if not self._title:
            self._title = self.grobidxml.title.getText()
        return self._title


    @property
    def authors(self):
        authors_in_header = self.grobidxml.analytic.find_all('author')
        result = []
        authors_list = []
        affs = []
        emails = []
        for author in authors_in_header:
            persname = author.persname
            affiliations = author.findAll('affiliation')

            if persname: 
                firstname = elem_to_text(persname.find("forename", type="first"))
                middlename = elem_to_text(persname.find("forename", type="middle"))
                surname = elem_to_text(persname.surname)
                name = ''
                if middlename != '':
                    name = firstname + ' ' + middlename + ' ' + surname
                else:
                    name = firstname + ' ' + surname

                authors_list.append(name)
                emails.append(elem_to_text(author.email))
        
                if len(affiliations) != 0:
                    aff_list = []
                    for affiliation in affiliations:
                        root = etree.fromstring(str(affiliation))
                        affiliation_text = ' '.join(root.xpath('.//text()'))
                        aff = ', '.join([aff.strip() for aff in affiliation_text.split('\n') if aff.strip() != ''])
                        aff_list += [aff.strip()]
                    affs += [aff_list]

            elif len(affiliations) != 0: 
                aff_list = []
                for affiliation in affiliations:
                    root = etree.fromstring(str(affiliation))
                    affiliation_text = ' '.join(root.xpath('.//text()'))
                    aff = ', '.join([aff.strip() for aff in affiliation_text.split('\n') if aff.strip() != ''])
                    aff_list += [aff.strip()]
                affs += [aff_list]
        # assert(len(authors_list)==len(affiliations))
        # assert(len(authors_list)==len(emails))
        for i in range(len(authors_list)):
            author_name = authors_list[i]
            author_affiliation = affs[i] if affs[i] else []
            author_email = [emails[i]] if emails[i] else []
            author = Author(author_name, author_affiliation, author_email)
            result.append(author)

        return result

In [305]:
# Following lines are just for quick testing
filename = f'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2350/paper1'
grobid =  GrobitFile(filename+'.grobid')
grobid.authors

[Author(name='Michael Walch', affiliation=['University of Vienna Universit채tsring, 1, 1010, Vienna, Austria'], email=[], aff_ok=None)]

### CERMINE

In [306]:
class CermineFile():
    def __init__(self, filename):
        self.cermine = BeautifulSoup(req.get(filename).text, 'lxml')
        self._title = ''


    @property
    def title(self):
        if not self._title:
            self._title = elem_to_text(self.cermine.find('article-title'))
        return self._title


    @property
    def authors(self):
        authors_in_header = self.cermine.find('article-meta').find('contrib-group').findAll('contrib')
        result = []
        for author in authors_in_header:
            name = elem_to_text(author.find('string-name'))
            email = []
            for e in author.findAll('email'):
                email.append(elem_to_text(e))
            #email = elem_to_text(author.email)

            #xref_aff_ids = 'aff' + elem_to_text(author.xref)
            xref_aff = author.findAll('xref')
            xref_aff_id = ['aff' + elem_to_text(a) for a in xref_aff]
            affiliations = []
            for xref_id in xref_aff_id:
                aff_tag = self.cermine.find('article-meta').find('contrib-group').find('aff', {'id': xref_id})
                institutions = aff_tag.findAll('institution')
                addr = aff_tag.findAll('addr-line')
                countries = aff_tag.findAll('country')
                if aff_tag:
                    affl = ''
                    if len(institutions) == len(addr):
                        if len(countries) != 0:
                                if len(countries) == len(institutions):
                                    for i in range(len(institutions)): #aff_tag.findAll('institution'):
                                        affl = (', ').join([elem_to_text(institutions[i])]+  [elem_to_text(addr[i])] + [elem_to_text(countries[0])])
                                else:
                                    for i in range(len(institutions)): #aff_tag.findAll('institution'):                                        
                                        affl = (', ').join([elem_to_text(institutions[i])] + [elem_to_text(addr[i])])
                                    affl += elem_to_text(countries[0])
                        else:
                            for i in range(len(institutions)): #aff_tag.findAll('institution'):
                                affl = elem_to_text(institutions[i]) + " " +  elem_to_text(addr[i]) +  ' ' 
                        affiliations += [affl.strip()]
                
                else:
                    print(f"Author: {name}, Institution not found")
            
            author = Author(name=name, affiliation=affiliations, email=email)
            result.append(author)

        return result

In [307]:
filename = f'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2350/paper1'
cermine =  CermineFile(filename+'.cermine')
cermine.authors

[Author(name='Michael Walch', affiliation=['Proceedings of the AAAI 2019 Spring Symposium on Combining Machine Learning with Knowledge Engineering (AAAI-MAKE 2019). Stanford University, Palo Alto, CaliforniaUSA'], email=[], aff_ok=None)]

#### Multiple affiliations for a person cannot be detected

## Compare results from CERMINE and GROBID, enhance with attributes from other sources

In [308]:
def are_equal_list_authors(list1: List[Author], list2: List[Author]):
    if len(list1) != len(list2):
        return False
    for author in list1:
        if not any([author == a2 for a2 in list2]):
            return False
    for author in list2:
        if not any([author == a1 for a1 in list1]):
            return False
    return True
        
    


for k in papers.keys():
    for p in papers[k]:
        paper_path = f'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-{k}/paper{p}'
        # extract metadata for each paper using GROBID 
        grobid =  GrobitFile(paper_path+'.grobid')

        # extract metadata for each paper using CERMINE 
        cermine =  CermineFile(paper_path+'.cermine')
        
        if grobid.title != cermine.title or not are_equal_list_authors(grobid.authors, cermine.authors):
            print(f'\n{"*"*100}\n')
            print(f'Paper {p} PDF: {paper_path}.pdf\n')
            print(f'Paper {p} grobid: {paper_path}.grobid\n')
            print(f'Paper {p} cermine: {paper_path}.cermine\n')

        # check if title is the same
        if grobid.title != cermine.title:
            print(f'Title discrepancies\n')
            print(f'grobid title: {grobid.title} \ncermin title: {cermine.title} \n' )
        
        if not are_equal_list_authors(grobid.authors, cermine.authors):
            print('Author discrepancies\n')
            print(f'grobid authors: {grobid.authors}')
            print('\n')
            print(f'cermine authors: {cermine.authors}')
        
        # (provided through API), including title, authors, affiliations, publication year



****************************************************************************************************

Paper 1 PDF: http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2350/paper1.pdf

Paper 1 grobid: http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2350/paper1.grobid

Paper 1 cermine: http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2350/paper1.cermine

Author discrepancies

grobid authors: [Author(name='Michael Walch', affiliation=['University of Vienna Universit채tsring, 1, 1010, Vienna, Austria'], email=[], aff_ok=None)]


cermine authors: [Author(name='Michael Walch', affiliation=['Proceedings of the AAAI 2019 Spring Symposium on Combining Machine Learning with Knowledge Engineering (AAAI-MAKE 2019). Stanford University, Palo Alto, CaliforniaUSA'], email=[], aff_ok=None)]

****************************************************************************************************

Paper 2 PDF: http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2350/paper2.pdf

Paper 2 grobid: http://ceurspt.wikidat

IndexError: list index out of range

### Findings:
- take email from grobid (Vol-2462/paper2.cermine)
- take title from grobid - cermine doesn't work properly with subtitles (Vol-2462/paper3.cermine)
- take affiliations from cermine - except in cases for papers with title and subtitle (Vol-2462/paper3.cermine)
- Paper titles and subtitles cannot be extracted properly with cermine


##### Further findings (Volume 1999)

	Paper 1:
		- cermine does not extract aff (it's problem in our code) 
	Paper 2:
		- cermine wrong author name (due to the star)
		- cermine does not extract aff (it's problem in our code)
	Paper 3:
		- cermine wrong author name (totally senseless)
	Paper 4:
		- cermine wrong author name (german umlauts not decoded)
		- There is a slight difference - a missing space (we can handle this in our code)
	Paper 5:
		- cermine missing email
		- grobid is not giving full affiliation name

##### Findings on volume 2350
	- Cannot extract authors with multiple affiliations correctly.

	- Grobid and cermine stores different affilations information on a same author. In paper 1, the author has 2 affiliations in cermine but only 1 in grobid.
	
	- Affs index error in grobid class (this error can be reproduced via replaceing cur_volumes with '2350'). A problem on our side and can be fix in the actual implementation. Confirm again that we extract affiliation from cermine.
	

### Compare with dblp 

In [None]:
results = dblp.search([grobid.title])

results.head()





Unnamed: 0,Type,Link,Authors,Title,Where,Year
0,inproceedings,https://ceur-ws.org/Vol-2346/paper5.pdf,"[Rudi Palmieri, Elena Musi]",Trust-oriented Argumentation in Rhetorical Sub...,CMNA@PERSUASIVE,2019


### ORCID Disambiguation

In [None]:
import orcid

In [None]:
api = orcid.PublicAPI('APP-WNBUUWPD8MWY07XM', 'a5b8023a-cea1-4aa0-92f0-263c186d5556')
search_token = api.get_search_token_from_orcid()

In [None]:
work = api.read_record_public('0000-0002-8997-7517', 'activities', search_token)
work

{'last-modified-date': {'value': 1693840577589},
 'educations': {'last-modified-date': None,
  'education-summary': [],
  'path': '/0000-0002-8997-7517/educations'},
 'employments': {'last-modified-date': {'value': 1561046952392},
  'employment-summary': [{'created-date': {'value': 1561046905359},
    'last-modified-date': {'value': 1561046952392},
    'source': {'source-orcid': {'uri': 'http://orcid.org/0000-0002-8997-7517',
      'path': '0000-0002-8997-7517',
      'host': 'orcid.org'},
     'source-client-id': None,
     'source-name': {'value': 'Marco Pegoraro'}},
    'department-name': 'Computer Science',
    'role-title': 'Doctoral Student',
    'start-date': {'year': {'value': '2018'},
     'month': {'value': '06'},
     'day': {'value': '11'}},
    'end-date': None,
    'organization': {'name': 'RWTH Aachen University',
     'address': {'city': 'Aachen', 'region': 'NRW', 'country': 'DE'},
     'disambiguated-organization': None},
    'visibility': 'PUBLIC',
    'put-code': 811

### join on orcid based on author name and affiliation/organization

In [None]:
author_name = work['employments']['employment-summary'][0]['source']['source-name']['value']
organization = work['employments']['employment-summary'][0]['organization']['name']
print(author_name, organization)

Marco Pegoraro RWTH Aachen University


# Wikidata API 


In [None]:
from wikidata.client import Client

client = Client() 
entity = client.get('Q57983801', load=True)
entity


<wikidata.entity.Entity Q57983801 'Yashar Deldjoo'>

In [None]:
entity.description

m'researcher, ORCID id # 0000-0002-6767-358X'

### A problem of Wikidata is that not all authors have wiki pages.