In [1]:
import os
# Directory whose entries are listed to build `sources`.
SOURCE_DIR = '../data/arxiv_dump/sources'
# Keep every directory entry whose name does not end in "tar.gz".
sources = list(filter(lambda entry: not entry.endswith("tar.gz"), os.listdir(SOURCE_DIR)))

In [2]:
def get_all_files(s):
    """Return the paths of all entries inside the source directory of ``s``.

    ``s`` is the name of a sub-directory of ``SOURCE_DIR``. Every name reported
    by ``os.listdir`` for that directory is joined onto the directory path.
    """
    paper_dir = os.path.join(SOURCE_DIR, s)
    paths = []
    for entry in os.listdir(paper_dir):
        paths.append(os.path.join(paper_dir, entry))
    return paths


def get_tex_files(s):
    """Return the paths from ``get_all_files(s)`` that end in ``.tex``."""
    tex_paths = []
    for path in get_all_files(s):
        if path.endswith(".tex"):
            tex_paths.append(path)
    return tex_paths

def clean_text(text, prune_document=True):
    """Drop LaTeX comment lines and, optionally, everything from the abstract on.

    Args:
        text: Raw LaTeX source as a single string.
        prune_document: When true (the default), the text is cut right before
            the first occurrence of the abstract environment opener, so only
            the part preceding the abstract is kept. When false, the whole
            document is kept. Previously this flag was accepted but ignored.

    Returns:
        The remaining text with every line that starts with ``%`` removed,
        joined back together with newlines.
    """
    if prune_document:
        pos = text.find("\\begin{abstract}")
        # find() returns -1 when there is no abstract; keep the full text then.
        if pos >= 0:
            text = text[:pos]
    # Only lines whose very first character is '%' count as comment lines.
    kept = [line for line in text.split("\n") if not line.startswith("%")]
    return "\n".join(kept)


def get_text(path):
    """Read the file at ``path`` in text mode and return its contents.

    The file is opened with the platform's default encoding. If decoding
    fails with ``UnicodeDecodeError`` the file is treated as unreadable and an
    empty string is returned instead of raising.
    """
    with open(path, "r") as handle:
        try:
            contents = handle.read()
        except UnicodeDecodeError:
            # Undecodable files are silently skipped by yielding no text.
            contents = ""
    return contents

In [43]:
import re

def extract_mail_domains(s):
    """Collect the e-mail domains mentioned in the .tex files of paper ``s``.

    Each .tex file returned by ``get_tex_files(s)`` is read with ``get_text``
    and passed through ``clean_text`` before it is scanned.

    A domain is the text following an ``@``: two or more dot-separated labels
    made of letters, digits, ``-`` or ``_``. Compared with the earlier pattern:

    * the pattern is a raw string, so no invalid-escape warnings are raised;
    * digits are accepted (e.g. ``l3s.de``, ``163.com``);
    * matching is case-insensitive and results are lower-cased;
    * a match always ends on a label character, so a sentence-ending period
      after an address is not captured as part of the domain.

    Returns:
        A list of domain strings (without the leading ``@``), in order of
        appearance, duplicates included.
    """
    domain_pattern = re.compile(r'@([a-z0-9_-]+(?:\.[a-z0-9_-]+)+)', re.IGNORECASE)
    domains = []
    for f in get_tex_files(s):
        text = clean_text(get_text(f))
        # findall yields only the captured group, i.e. the part after '@'.
        domains.extend(found.lower() for found in domain_pattern.findall(text))
    return domains

In [44]:
len(sources)

587

In [57]:
import numpy as np
# os.listdir gives no ordering guarantee and an unseeded shuffle differs on
# every run, so positional lookups such as sources[no] were not reproducible.
# Sorting first and shuffling with a fixed-seed generator makes this cell
# produce the same order on every (re-)execution.
sources.sort()
np.random.RandomState(0).shuffle(sources)

In [58]:
from collections import OrderedDict

In [59]:
annotated_set = OrderedDict()

In [70]:
def generate_affiliations_entry(name_version):
    """Build the initial affiliations record for one versioned paper id.

    Only ``mail_domains`` is populated (via ``extract_mail_domains``); the
    ``names``, ``dbpedia_ids`` and ``types`` lists start out empty.
    """
    return {
        'names': [],
        'mail_domains': extract_mail_domains(name_version),
        'dbpedia_ids': [],
        'types': [],
    }

def generate_entry(name, name_version):
    """Build a fresh, not-yet-annotated dataset entry for a paper.

    ``name`` is stored under ``id`` and ``name_version`` under
    ``versioned_id``; the affiliations record comes from
    ``generate_affiliations_entry(name_version)``.
    """
    affiliations = generate_affiliations_entry(name_version)
    return {
        'id': name,
        'versioned_id': name_version,
        'manually_annotated': False,
        'affiliations': affiliations,
    }

In [77]:
# Register one freshly generated entry per source, keyed by the id without
# its version suffix.
for s in sources:
    # Split on the LAST 'v' only: the version suffix is at the end, and an id
    # may itself contain a 'v' (e.g. "solv-int...v1"), which split('v')[0]
    # would have truncated to "sol". Names without any 'v' are kept whole.
    pruned_version = s.rsplit('v', 1)[0]
    annotated_set[pruned_version] = generate_entry(pruned_version, s)

In [165]:
from babelpy import babelfy

# The Babelfy key is a credential and must not live in the notebook: it is
# read from the BABELFY_API_KEY environment variable instead. Any key that was
# previously committed here should be treated as leaked and rotated.
API_KEY = os.environ.get('BABELFY_API_KEY')
if not API_KEY:
    raise RuntimeError(
        'Set the BABELFY_API_KEY environment variable to your Babelfy API key.'
    )
babelfy_params = {
    'lang': 'EN'
    }
babel_client = babelfy.BabelfyClient(API_KEY, babelfy_params)

In [168]:
def get_dbpedia_links(text):
    """Run ``text`` through the shared Babelfy client and return DBpedia URLs.

    After the call to ``babel_client.babelfy``, the ``DBpediaURL`` value of
    every item in ``babel_client.merged_entities`` is collected, in order.
    """
    babel_client.babelfy(text)
    links = []
    for entity in babel_client.merged_entities:
        links.append(entity['DBpediaURL'])
    return links

In [174]:
# Lower-case substrings whose presence in an affiliation name marks it as academic.
uni_phrases = {
    "universit", "universidad",
    "school", "schule", "ecole", "escuela",
    "college",
    "institut",
    "academ",
    "polyte",
}


def detect_type(aff_name):
    """Classify an affiliation name as ``'academic'`` or ``'company'``.

    The name is lower-cased and checked for any substring from
    ``uni_phrases``; a hit yields ``'academic'``, otherwise ``'company'``.
    """
    lowered = str.lower(aff_name)
    if any(stem in lowered for stem in uni_phrases):
        return 'academic'
    return 'company'

def annotate_types(aff):
    """Return the affiliation types for ``aff``, detecting them if absent.

    If ``aff['types']`` is anything other than an empty list it is returned
    unchanged. Otherwise one type per entry of ``aff['names']`` is computed
    with ``detect_type``.
    """
    existing = aff['types']
    if existing != []:
        return existing
    detected = []
    for aff_name in aff['names']:
        detected.append(detect_type(aff_name))
    return detected
    
def flatten(ls):
    """Concatenate the elements of every iterable in ``ls`` into one flat list."""
    flat = []
    for inner in ls:
        flat.extend(inner)
    return flat
    
def link_dbpedia(aff):
    """Return the DBpedia links found for every name in ``aff['names']``.

    Each name is looked up separately with ``get_dbpedia_links``; the per-name
    result lists are then merged into a single flat list with ``flatten``.
    """
    per_name_links = []
    for name in aff['names']:
        per_name_links.append(get_dbpedia_links(name))
    return flatten(per_name_links)

In [175]:
link_dbpedia({'names': ['University of Warsaw', 'Google Research']})

['http://dbpedia.org/resource/University_of_Warsaw',
 'http://dbpedia.org/resource/Google']

In [178]:
import json
dataset = '../data/affiliations_annotated.json'
# Start from the previously saved annotations, or from nothing on first run.
try:
    with open(dataset, 'r') as f:
        old_data = json.load(f)
except FileNotFoundError:
    old_data = dict()

# Shallow copy: top-level keys can be replaced without touching old_data, but
# the per-paper dicts themselves are still shared with old_data.
new_data = old_data.copy()
manual_count = 0

for k, v in annotated_set.items():
    if k not in old_data or old_data[k]['manually_annotated'] == False:
        # Unknown or not-yet-annotated paper: (re)insert the generated entry.
        new_data[k] = v
    else:
        # Manually annotated paper: keep it and fill in types / DBpedia links.
        affiliation_dict = new_data[k]["affiliations"]
        types = annotate_types(affiliation_dict)
        dbpedia_ids = link_dbpedia(affiliation_dict)
        print(types)
        print(affiliation_dict["names"])
        print(dbpedia_ids)
        affiliation_dict["types"] = types
        affiliation_dict["dbpedia_ids"] = dbpedia_ids
        manual_count += 1

print('Manually labelled: {}/{}'.format(manual_count, len(sources)))

# Write back to the same path that was read above; reusing `dataset` instead
# of repeating the literal keeps the two from drifting apart.
with open(dataset, 'w') as f:
    json.dump(new_data, f, indent=2)

['academic']
['Universidad Autonoma de Madrid']
['http://dbpedia.org/resource/Autonomous_University_of_Madrid']
['academic', 'academic', 'academic', 'company', 'academic', 'academic']
['TU Berlin', 'MPII', 'Korea University', 'Google Research, Brain Team', 'RIKEN AIP', 'TU Kaiserslautern']
['http://dbpedia.org/resource/Technical_University_of_Berlin', 'http://dbpedia.org/resource/Max_Planck_Institute_for_Informatics', 'http://dbpedia.org/resource/Korea_University', 'http://dbpedia.org/resource/Google', 'http://dbpedia.org/resource/Brain', 'http://dbpedia.org/resource/Musical_ensemble', 'http://dbpedia.org/resource/RIKEN', 'http://dbpedia.org/resource/AH_receptor-interacting_protein', 'http://dbpedia.org/resource/Kaiserslautern_University_of_Technology']
['company']
['Diveplane Corporation']
['http://dbpedia.org/resource/Corporation']
['academic', 'academic', 'academic']
['Manchester Metropolitan University', 'Chinese Academy of Sciences', 'Tongji University']
['http://dbpedia.org/resou

In [111]:
def display(s):
    """Print the cleaned text of every .tex file belonging to paper ``s``.

    After each file a blank line, a 50-dash rule and another blank line are
    printed as a separator.
    """
    separator = '-' * 50
    for tex_path in get_tex_files(s):
        print(clean_text(get_text(tex_path)))
        print()
        print(separator)
        print()

#### Types of affiliation
* academic
* company
* government

In [162]:
# Position within the `sources` list of the paper to inspect.
no = 49
# Print the selected entry's name, then dump the cleaned text of its .tex
# files (presumably as the reference for manual annotation — TODO confirm).
print(sources[no])
display(sources[no])

2009.10990v1
\documentclass[letterpaper]{article} % DO NOT CHANGE THIS
\usepackage{aaai21}  % DO NOT CHANGE THIS
\usepackage{times}  % DO NOT CHANGE THIS
\usepackage{helvet} % DO NOT CHANGE THIS
\usepackage{courier}  % DO NOT CHANGE THIS
\usepackage[hyphens]{url}  % DO NOT CHANGE THIS
\usepackage{graphicx} % DO NOT CHANGE THIS
\urlstyle{rm} % DO NOT CHANGE THIS
\def\UrlFont{\rm}  % DO NOT CHANGE THIS
\usepackage{natbib}  % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\usepackage{lipsum}
\usepackage{booktabs}
\usepackage{amsmath}
\usepackage{caption} % DO NOT CHANGE THIS AND DO NOT ADD ANY OPTIONS TO IT
\frenchspacing  % DO NOT CHANGE THIS
\setlength{\pdfpagewidth}{8.5in}  % DO NOT CHANGE THIS
\setlength{\pdfpageheight}{11in}  % DO NOT CHANGE THIS
\usepackage{endnotes}
\let\footnote=\endnote
\usepackage{etoolbox}
\makeatletter
\patchcmd{\@verbatim}
  {\verbatim@font}
  {\verbatim@font\tiny}
  {}{}
\makeatother

\setcounter{secnumdepth}{0} %May be changed to 1 or 2 if section numb