In [1]:
import requests as req
from bs4 import BeautifulSoup 
import re
import dblp

### Parse all available volumes

In [3]:
# Download the CEURSPT index page; it is scanned below for volume references.
Web = req.get('http://ceurspt.wikidata.dbis.rwth-aachen.de/index.html') 
  
# Parse with lxml and re-serialise, so the regex runs over BeautifulSoup's prettified markup.
S = BeautifulSoup(Web.text, 'lxml') 
html_txt = S.prettify()
# Pattern for volume references: captures the digits of every `Vol-<digits>">` occurrence.
reg1 = r'Vol-(\d+)">'
# All volume numbers (as strings) found on the index page of the ceurspt API.
volumes = re.findall(reg1, html_txt)

In [5]:
# Extract the paper numbers linked from every volume page.
# The pattern does not change between iterations, so it is defined once up front.
# The dot is escaped so that only a literal ".pdf" suffix matches (a bare "." matches any character).
reg2 = r'paper(\d+)\.pdf'
# Maps volume number (int) -> list of paper numbers (strings, as captured by the regex).
papers = {}
for v in volumes:
    url = 'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-' + v
    Web = req.get(url)
    papers[int(v)] = re.findall(reg2, BeautifulSoup(Web.text, 'lxml').prettify())

In [7]:
def elem_to_text(elem, default=''):
    """Return ``elem.getText()`` for a truthy ``elem``, otherwise ``default``.

    Args:
        elem: Object exposing ``getText()`` (e.g. the result of a BeautifulSoup
            lookup), or a falsy value such as ``None`` when nothing was found.
        default: Value handed back when ``elem`` is falsy.

    Returns:
        The element's text, or ``default``.
    """
    return elem.getText() if elem else default

### GROBID

In [80]:
from dataclasses import dataclass

@dataclass
class Author_G:
    """One author record built from a GROBID document.

    Attributes:
        firstname: Text of the author's first forename ('' when absent).
        middlename: Text of the author's middle forename ('' when absent).
        surname: Text of the author's surname ('' when absent).
        affiliation: Organisation text built from the department and
            institution names.
        email: Text of the author's e-mail element ('' when absent).
    """

    firstname: str
    middlename: str
    surname: str
    affiliation: str
    email: str

In [117]:
# Overwrite the entry for volume 2462 so that it lists only paper '3'.
# NOTE(review): presumably a manual shortcut for trying out a single paper — confirm before a full run.
papers[2462] = ['3']

In [139]:
class GrobitFile(object):
    """Title and author metadata read from one GROBID document fetched by URL.

    The document is downloaded and parsed once in the constructor; ``title``
    and ``authors`` are read from the parsed tree.
    """

    def __init__(self, filename):
        """Download and parse the GROBID document.

        Args:
            filename: URL of the GROBID resource; it is passed to ``req.get``.
        """
        # NOTE(review): the HTTP status is not checked, so an error page would
        # be parsed like a regular document.
        self.grobidxml = BeautifulSoup(req.get(filename).text, 'lxml')
        self._title = ''

    @property
    def title(self):
        """Text of the document's first ``<title>`` element.

        Returns '' instead of raising when the document has no ``<title>``.
        The value is cached once a non-empty title has been found.
        """
        if not self._title:
            self._title = elem_to_text(self.grobidxml.title)
        return self._title

    @staticmethod
    def _affiliation_text(affiliation):
        """Join the department and institution names of an ``<affiliation>`` with one space."""
        parts = (
            elem_to_text(affiliation.find("orgname", type="department")),
            elem_to_text(affiliation.find("orgname", type="institution")),
        )
        # Skip missing parts so the result carries no leading/trailing separator.
        return " ".join(part for part in parts if part)

    @property
    def authors(self):
        """Authors listed in the ``<analytic>`` section.

        Each ``<author>`` with a ``<persName>`` becomes one ``Author_G``.
        An author that carries its own ``<affiliation>`` keeps it. ``<author>``
        entries that hold only an affiliation are handed out, in document
        order, to the named authors that have none. Authors still left without
        an affiliation get ''. Surplus affiliation-only entries are ignored.

        NOTE(review): the hand-out order is a heuristic — confirm against the
        GROBID output for papers with shared affiliations.

        Returns:
            list[Author_G]: empty when the document has no ``<analytic>`` section.
        """
        analytic = self.grobidxml.analytic
        if analytic is None:
            return []

        named_authors = []          # (firstname, middlename, surname, email, own affiliation or None)
        orphan_affiliations = []    # affiliation texts from <author> entries without a name
        for author in analytic.find_all('author'):
            persname = author.persname
            affiliation = author.affiliation
            if persname is not None:
                own_affiliation = None
                if affiliation is not None:
                    own_affiliation = self._affiliation_text(affiliation)
                named_authors.append((
                    elem_to_text(persname.find("forename", type="first")),
                    elem_to_text(persname.find("forename", type="middle")),
                    elem_to_text(persname.surname),
                    elem_to_text(author.email),
                    own_affiliation,
                ))
            elif affiliation is not None:
                orphan_affiliations.append(self._affiliation_text(affiliation))

        remaining_orphans = iter(orphan_affiliations)
        result = []
        for firstname, middlename, surname, email, own_affiliation in named_authors:
            if own_affiliation is None:
                # Fall back to the next unclaimed affiliation, or '' when none is left.
                own_affiliation = next(remaining_orphans, '')
            result.append(Author_G(firstname, middlename, surname, own_affiliation, email))

        return result

#### Limitation: multiple affiliations for a single person cannot be detected

In [140]:
# Walk over every (volume, paper) pair and print the title and the authors
# that GrobitFile reads from the paper's GROBID resource (provided through the API).
for k, paper_ids in papers.items():
    for p in paper_ids:
        filename = f'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-{k}/paper{p}.grobid'
        grobid = GrobitFile(filename)
        print(grobid.title)
        print(grobid.authors)
        # Blank separator between two papers.
        print('\n')

Does the Bubble Go Beyond? An Exploration of the Urban Filter Bubble
[Author_G(firstname='Annelien', middlename='', surname='Smets', affiliation='imec-SMIT Vrije Universiteit Brussel', email=''), Author_G(firstname='Eladio', middlename='', surname='Montero', affiliation='imec-SMIT Vrije Universiteit Brussel', email=''), Author_G(firstname='Pieter', middlename='', surname='Ballon', affiliation='imec-SMIT Vrije Universiteit Brussel', email='')]






### Compare with dblp 

In [108]:
# Pass the extracted title, as a one-element list, to dblp.search.
# NOTE(review): `grobid` is whatever object a previously executed cell left in that
# variable, so the title searched here depends on the execution order of the notebook.
results = dblp.search([grobid.title])

# Show the first rows of the search result.
results.head()





Unnamed: 0,Type,Link,Authors,Title,Where,Year
0,inproceedings,https://ceur-ws.org/Vol-2462/paper2.pdf,"[Yashar Deldjoo, Tommaso Di Noia, Felice Anton...",Assessing the Impact of a User-Item Collaborat...,ImpactRS@RecSys,2019
1,informal,http://arxiv.org/abs/1908.07968,"[Yashar Deldjoo, Tommaso Di Noia, Felice Anton...",Assessing the Impact of a User-Item Collaborat...,CoRR,2019


# CERMINE


In [78]:
# Fetch the CERMINE result for Vol-2462/paper2 and parse it with lxml.
cermine_response = req.get('http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2462/paper2.cermine')
xml_file = BeautifulSoup(cermine_response.text, 'lxml')

# Text of the first <article-title> element; displayed as the cell result.
article_title = xml_file.find('article-title').text
article_title

'Assessing the Impact of a User-Item Collaborative Atack on Class of Users∗'

In [81]:
from dataclasses import dataclass

@dataclass
class Author_C:
    """One author record built from a CERMINE document.

    Attributes:
        fullname: The author's name as a single string.
        affiliation: List of institution names (strings); empty when no
            affiliation was found for the author.
        email: List of e-mail strings found for the author; may be empty.
    """

    fullname: str
    # Annotated as lists because CermineFile.authors stores lists in these fields.
    affiliation: list
    email: list

In [196]:
class CermineFile(object):
    """Title and author metadata read from one CERMINE document fetched by URL.

    The document is downloaded and parsed once in the constructor; ``title``
    and ``authors`` are read from the parsed tree.
    """

    def __init__(self, filename):
        """Download and parse the CERMINE document.

        Args:
            filename: URL of the CERMINE resource; it is passed to ``req.get``.
        """
        # NOTE(review): the HTTP status is not checked, so an error page would
        # be parsed like a regular document.
        self.cermine = BeautifulSoup(req.get(filename).text, 'lxml')
        self._title = ''

    @property
    def title(self):
        """Text of the first ``<article-title>`` element, or '' when there is none.

        The value is cached once a non-empty title has been found.
        """
        if not self._title:
            self._title = elem_to_text(self.cermine.find('article-title'))
        return self._title

    @property
    def authors(self):
        """Authors listed as ``<contrib>`` entries of the first ``<contrib-group>``.

        For every contributor the name is taken from ``<string-name>``, all
        ``<email>`` texts are collected, and the affiliation is looked up as the
        ``<aff>`` whose id is ``'aff'`` followed by the text of the
        contributor's first ``<xref>``. When no such ``<aff>`` exists a notice
        is printed and the affiliation list stays empty.

        Returns:
            list[Author_C]: empty when the document has no ``<article-meta>``
            or no ``<contrib-group>``.
        """
        article_meta = self.cermine.find('article-meta')
        contrib_group = article_meta.find('contrib-group') if article_meta is not None else None
        if contrib_group is None:
            return []

        result = []
        for contrib in contrib_group.find_all('contrib'):
            name = elem_to_text(contrib.find('string-name'))
            email = [elem_to_text(e) for e in contrib.find_all('email')]
            # Only the first <xref> is consulted, so at most one <aff> is resolved per author.
            xref_aff_id = 'aff' + elem_to_text(contrib.xref)

            aff_tag = contrib_group.find('aff', {'id': xref_aff_id})
            affiliation = []
            if aff_tag is not None:
                affiliation = [elem_to_text(a) for a in aff_tag.find_all('institution')]
            else:
                print(f"Author: {name}, Institution not found")

            result.append(Author_C(name, affiliation, email))

        return result

#### Emails don't work properly for CERMINE (paper 2)

In [198]:
# Run the CERMINE parser on Vol-2462/paper2 and print its title and authors.
filename = 'http://ceurspt.wikidata.dbis.rwth-aachen.de/Vol-2462/paper2.cermine'
# NOTE(review): despite its name, `grobid` holds a CermineFile here and replaces
# whatever the variable referred to before this cell ran.
grobid =  CermineFile(filename)
print(grobid.title)
print(grobid.authors)

Assessing the Impact of a User-Item Collaborative Atack on Class of Users∗
[Author_C(fullname='Yashar Deldjoo', affiliation=['Polytechnic University of Bari'], email=['R@k', 'yashar.deldjoo@poliba.it']), Author_C(fullname='Tommaso Di Noia', affiliation=['Polytechnic University of Bari'], email=['tommaso.dinoia@poliba.it']), Author_C(fullname='Felice Antonio Merra†', affiliation=['Polytechnic University of Bari'], email=['felice.merra@poliba.it'])]
