In [6]:
import json
import os
import re

import dblp
import requests as req
from bs4 import BeautifulSoup
from requests import RequestException


### Parse all available volumes

In [21]:
# Download the index page of the ceurspt API. A timeout keeps the cell from
# hanging forever on an unresponsive server, and raise_for_status() fails fast
# on an HTTP error instead of silently producing an empty volume list.
Web = req.get('http://ceurspt.wikidata.dbis.rwth-aachen.de/index.html', timeout=30)
Web.raise_for_status()

S = BeautifulSoup(Web.text, 'lxml')
html_txt = S.prettify()
# Matches `Vol-<digits>">` in the prettified HTML and captures the digits
# (presumably the tail of each volume link -- confirm against the page markup).
reg1 = r'Vol-(\d+)">'
# All volume numbers (as strings) found on the ceurspt index page.
volumes = re.findall(reg1, html_txt)

In [26]:
# Override the scraped list: the cells below only process volume 2462.
# NOTE(review): remove this assignment to process every volume that was scraped.
volumes = ['2462']

In [27]:
# Extract the paper numbers available in each volume.
# `papers` maps the volume number (int) to the list of paper numbers (strings)
# found on that volume's page, in page order and without repeats.
papers = {}
# The dot is escaped so that only a literal ".pdf" suffix matches; the pattern
# does not depend on the volume, so it is defined once outside the loop.
reg2 = r'paper(\d+)\.pdf'
for v in volumes:
    url = 'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-' + v
    Web = req.get(url)
    found = re.findall(reg2, BeautifulSoup(Web.text, 'lxml').prettify())
    # dict.fromkeys keeps first-seen order while dropping repeated matches, so a
    # paper referenced several times on the page is only processed once later.
    papers[int(v)] = list(dict.fromkeys(found))

In [29]:
def elem_to_text(elem):
    """Return the text of a parsed element, or '' when the element is missing.

    elem: an object exposing getText() (e.g. a BeautifulSoup tag), or a falsy
          value such as None when the lookup found nothing.
    """
    return elem.getText() if elem else ''

In [20]:
from dataclasses import dataclass

@dataclass
class Author:
    """Metadata extracted for a single paper author.

    NOTE(review): every field is annotated as str, but CermineFile.authors
    passes lists of strings for affiliation and email. Dataclasses do not
    enforce annotations, so both shapes occur at runtime and Author objects
    built by the two extractors never compare equal.
    """
    # Full name of the author as a single string.
    name: str
    # Affiliation text (GrobitFile) or list of institution names (CermineFile).
    affiliation:str
    # E-mail address (GrobitFile) or list of e-mail addresses (CermineFile).
    email: str

### GROBID

In [34]:
class GrobitFile(object):
    """Metadata view over the GROBID output of a single paper.

    The document is downloaded and parsed once in the constructor; `title`
    and `authors` read from the parsed tree.
    """

    def __init__(self, filename):
        """Download the GROBID document at URL `filename` and parse it."""
        self.grobidxml = BeautifulSoup(req.get(filename).text, 'lxml')
        self._title = ''  # cache filled on first access of `title`

    @property
    def title(self):
        """Title text of the document, or '' when it has no <title> element."""
        if not self._title:
            # elem_to_text tolerates a missing element; calling getText()
            # directly on None would raise AttributeError.
            self._title = elem_to_text(self.grobidxml.title)
        return self._title

    @staticmethod
    def _affiliation_text(affiliation):
        """Join department and institution names of an <affiliation> element."""
        parts = [
            elem_to_text(affiliation.find("orgname", type="department")),
            elem_to_text(affiliation.find("orgname", type="institution")),
        ]
        # Skip empty parts so a missing department or institution does not
        # leave a stray leading or trailing space in the result.
        return ' '.join(part for part in parts if part)

    @property
    def authors(self):
        """Authors from the document header as a list of Author objects.

        Names and e-mails come from <author> entries that contain a
        <persname>. Affiliations are collected from every <author> entry in
        document order and paired with the named authors by position; an
        author without a corresponding affiliation gets ''. Returns [] when
        the document has no <analytic> section.

        NOTE(review): positional pairing can attach an affiliation to the
        wrong author when an earlier author has none -- confirm against
        real GROBID output before relying on it.
        """
        header = self.grobidxml.analytic
        if header is None:
            return []

        names = []
        emails = []
        affiliations = []
        for author in header.find_all('author'):
            persname = author.persname
            affiliation = author.affiliation
            if persname:
                name_parts = [
                    elem_to_text(persname.find("forename", type="first")),
                    elem_to_text(persname.find("forename", type="middle")),
                    elem_to_text(persname.surname),
                ]
                # Empty parts are dropped so a missing forename or middle
                # name does not produce doubled or leading spaces.
                names.append(' '.join(part for part in name_parts if part))
                emails.append(elem_to_text(author.email))
            if affiliation:
                # Collected whether or not the entry also carries a name.
                affiliations.append(self._affiliation_text(affiliation))

        # Pad with '' so that fewer affiliations than named authors no longer
        # raises IndexError; surplus affiliations are ignored by zip().
        affiliations += [''] * (len(names) - len(affiliations))
        return [
            Author(name, affiliation, email)
            for name, affiliation, email in zip(names, affiliations, emails)
        ]

### CERMINE

In [36]:
class CermineFile(object):
    """Metadata view over the CERMINE output of a single paper.

    The document is downloaded and parsed once in the constructor; `title`
    and `authors` read from the parsed tree.
    """

    def __init__(self, filename):
        """Download the CERMINE document at URL `filename` and parse it."""
        self.cermine = BeautifulSoup(req.get(filename).text, 'lxml')
        self._title = ''  # cache filled on first access of `title`

    @property
    def title(self):
        """Text of the <article-title> element, or '' when it is missing."""
        if not self._title:
            self._title = elem_to_text(self.cermine.find('article-title'))
        return self._title

    @property
    def authors(self):
        """Authors of the paper as a list of Author objects.

        For each <contrib> the name is taken from <string-name>, the e-mails
        from all <email> children, and the institutions from the <aff>
        element whose id is 'aff' followed by the text of the contributor's
        first <xref>. Only that first <xref> is followed, so further
        affiliation references of the same person are ignored. Returns []
        when the document has no <article-meta>/<contrib-group>.
        """
        article_meta = self.cermine.find('article-meta')
        contrib_group = (
            article_meta.find('contrib-group') if article_meta is not None else None
        )
        if contrib_group is None:
            # Nothing to iterate over; chaining .find() on None would raise.
            return []

        result = []
        for contrib in contrib_group.find_all('contrib'):
            name = elem_to_text(contrib.find('string-name'))
            emails = [elem_to_text(e) for e in contrib.find_all('email')]

            xref_aff_id = 'aff' + elem_to_text(contrib.xref)
            aff_tag = contrib_group.find('aff', {'id': xref_aff_id})
            if aff_tag:
                institutions = [
                    elem_to_text(a) for a in aff_tag.find_all('institution')
                ]
            else:
                institutions = []
                print(f"Author: {name}, Institution not found")

            result.append(Author(name, institutions, emails))

        return result

#### Multiple affiliations for a person cannot be detected

## Compare results from CERMINE and GROBID, enhance with attributes from other sources

In [42]:
# For every paper of every selected volume, extract the metadata with both
# tools, report discrepancies and print the raw values for manual inspection.
for k, paper_ids in papers.items():
    for p in paper_ids:
        filename = f'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-{k}/paper{p}'
        # extract metadata for each paper using GROBID
        grobid = GrobitFile(filename + '.grobid')

        # extract metadata for each paper using CERMINE
        cermine = CermineFile(filename + '.cermine')

        # check if title is the same
        if grobid.title != cermine.title:
            print('Title discrepancies')

        # `authors` is a property that re-parses the document on each access
        # (and the CERMINE one prints a warning per missing institution), so
        # evaluate each side once and reuse the result below.
        grobid_authors = grobid.authors
        cermine_authors = cermine.authors

        # NOTE(review): GrobitFile yields plain strings for affiliation/email
        # while CermineFile yields lists, so this reports a discrepancy
        # whenever either side found at least one author.
        if grobid_authors != cermine_authors:
            print('Author discrepancies')

        # TODO: enhance with attributes from other sources (provided through
        # API), including title, authors, affiliations, publication year
        print(grobid.title)
        print(grobid_authors)
        print(cermine.title)
        print(cermine_authors)
        print('\n')



Author discrepancies
A Model for Evaluating Popularity and Semantic Information Variations in Radio Listening Sessions
[Author(name='Lorenzo Porcaro', affiliation='Music Technology Group Universitat Pompeu Fabra Barcelona', email='lorenzo.porcaro@upf.edu'), Author(name='Emilia Gómez', affiliation='Music Technology Group Universitat Pompeu Fabra Barcelona', email='emilia.gomez@upf.edu')]
A Model for Evaluating Popularity and Semantic Information Variations in Radio Listening Sessions
[Author(name='Lorenzo Porcaro', affiliation=['Music Technology Group, Universitat Pompeu Fabra'], email=['lorenzo.porcaro@upf.edu']), Author(name='Emilia Gómez', affiliation=['Music Technology Group, Universitat Pompeu Fabra', 'Joint Research Centre, European Commission'], email=['emilia.gomez@upf.edu'])]


Title discrepancies
Author discrepancies
Assessing the Impact of a User-Item Collaborative Attack on Class of Users *
[Author(name='Yashar Deldjoo', affiliation=' Polytechnic University of Bari Bari', em

### Findings:
- Take e-mail addresses from GROBID (see Vol-2462/paper2.cermine).
- Take titles from GROBID: CERMINE does not handle subtitles properly (see Vol-2462/paper3.cermine).
- Take affiliations from CERMINE, except for papers that have both a title and a subtitle (see Vol-2462/paper3.cermine).

### Compare with dblp 

In [None]:
# Search dblp for the title extracted by GROBID.
# NOTE(review): `grobid` is whatever the comparison loop assigned last, so
# only the final paper processed there is looked up -- confirm this is intended.
results = dblp.search([grobid.title])

results.head()





Unnamed: 0,Type,Link,Authors,Title,Where,Year
0,inproceedings,https://ceur-ws.org/Vol-2462/paper2.pdf,"[Yashar Deldjoo, Tommaso Di Noia, Felice Anton...",Assessing the Impact of a User-Item Collaborat...,ImpactRS@RecSys,2019
1,informal,http://arxiv.org/abs/1908.07968,"[Yashar Deldjoo, Tommaso Di Noia, Felice Anton...",Assessing the Impact of a User-Item Collaborat...,CoRR,2019


### ORCID Disambiguation

In [1]:
import orcid

In [2]:
# Read the ORCID API credentials from the environment instead of embedding
# them in the notebook, where they leak to everyone who can read the file.
# Set ORCID_CLIENT_ID and ORCID_CLIENT_SECRET before running this cell; a
# missing variable raises KeyError naming it.
# NOTE(review): the client secret that used to be hard-coded here should be
# treated as compromised and rotated.
api = orcid.PublicAPI(os.environ['ORCID_CLIENT_ID'], os.environ['ORCID_CLIENT_SECRET'])
search_token = api.get_search_token_from_orcid()

In [16]:
# Read the 'activities' section of the public ORCID record for this iD,
# passing the search token obtained from the API client.
work = api.read_record_public('0000-0002-8997-7517', 'activities', search_token)
work

{'last-modified-date': {'value': 1693840577589},
 'educations': {'last-modified-date': None,
  'education-summary': [],
  'path': '/0000-0002-8997-7517/educations'},
 'employments': {'last-modified-date': {'value': 1561046952392},
  'employment-summary': [{'created-date': {'value': 1561046905359},
    'last-modified-date': {'value': 1561046952392},
    'source': {'source-orcid': {'uri': 'http://orcid.org/0000-0002-8997-7517',
      'path': '0000-0002-8997-7517',
      'host': 'orcid.org'},
     'source-client-id': None,
     'source-name': {'value': 'Marco Pegoraro'}},
    'department-name': 'Computer Science',
    'role-title': 'Doctoral Student',
    'start-date': {'year': {'value': '2018'},
     'month': {'value': '06'},
     'day': {'value': '11'}},
    'end-date': None,
    'organization': {'name': 'RWTH Aachen University',
     'address': {'city': 'Aachen', 'region': 'NRW', 'country': 'DE'},
     'disambiguated-organization': None},
    'visibility': 'PUBLIC',
    'put-code': 811

### join on orcid based on author name and affiliation/organization

In [18]:
# Take the first employment entry of the ORCID record and read the name of
# the record's source and the name of the employing organization from it.
_first_employment = work['employments']['employment-summary'][0]
author_name = _first_employment['source']['source-name']['value']
organization = _first_employment['organization']['name']
print(author_name, organization)

Marco Pegoraro RWTH Aachen University
