In [2]:
import os
SOURCE_DIR = '../data/arxiv_dump/sources'
sources = [f for f in os.listdir(SOURCE_DIR) if not f.endswith("tar.gz")]

In [8]:
def get_all_files(s):
    paper_dir = os.path.join(SOURCE_DIR, s)
    return [os.path.join(paper_dir, f) for f in os.listdir(paper_dir)]


def get_tex_files(s):
    return [f for f in get_all_files(s) if f.endswith(".tex")]

def clean_text(text, prune_document=True):
    pos = text.find("\\begin{abstract}")
    if pos >= 0:
        text = text[:pos]
    lines = text.split("\n")
    lines = [line for line in lines if not line.startswith("%")]
    return "\n".join(lines)


def get_text(path):
    with open(path, "r") as f:
        try:
            return f.read()
        except UnicodeDecodeError:
            # print('Failed to read the file {}'.format(path))
            return ""


In [9]:
get_tex_files(sources[10])

['../data/arxiv_dump/sources/1811.04896v2/contents.tex',
 '../data/arxiv_dump/sources/1811.04896v2/main.tex']

In [21]:
import re
affiliations_dict = dict()
for s in sources:
    emails = []
    tex_files = get_tex_files(s)
    for f in tex_files:
        text = clean_text(get_text(f))
        emails += re.findall('@[a-z-_\.]+\.[a-z-_\.]+', text)
    affiliations_dict[s] = set(emails)

In [22]:
affiliations_dict

{'1903.12542v1': {'@sheffield.ac.uk'},
 '1912.05100v1': set(),
 '2006.13427v1': set(),
 '2007.05461v1': {'@aspiringminds.com'},
 '1911.01700v1': {'@jpmchase.com', '@jpmorgan.com'},
 '2011.03156v1': {'@discover.com'},
 '2006.10965v1': {'@usc.edu'},
 '2009.10990v1': {'@lumiata.com'},
 '2003.00201v1': set(),
 '1905.04610v1': {'@cs.washington.edu'},
 '1811.04896v2': {'@us.ibm.com'},
 '2006.04766v1': {'@cranfield.ac.uk',
  '@gmail.com',
  '@hotmail.com',
  '@outlook.com'},
 '1710.10967v3': set(),
 '1911.05647v1': {'@uchicago.edu'},
 '1807.00130v1': {'@mit.edu'},
 '1910.12389v2': {'@hitachi.cn'},
 '1812.04608v2': set(),
 '2010.08146v1': {'@emory.edu', '@umbc.edu'},
 '2005.11638v1': {'@gmail.com', '@qq.com', '@sjtu.edu.cn'},
 '2001.09464v1': set(),
 '2008.10740v1': {'@uw.edu'},
 '1711.07111v1': set(),
 '1806.08716v2': {'@g.harvard.edu'},
 '1809.06995v1': set(),
 '1912.07211v1': {'@atb.com', '@gmail.com'},
 '1908.04696v1': {'@rice.edu', '@umn.edu'},
 '2002.07325v1': set(),
 '1904.10016v1': set

In [23]:
len(affiliations_dict)

587

In [26]:
sum(len(values) == 0 for values in affiliations_dict.values())

177

In [27]:
from collections import Counter

In [29]:
affiliations_count = Counter()

In [30]:
for aff in affiliations_dict.values():
    affiliations_count.update(aff)

In [32]:
affiliations_count.most_common(100)

[('@gmail.com', 54),
 ('@google.com', 13),
 ('@mit.edu', 11),
 ('@us.ibm.com', 10),
 ('@tu-berlin.de', 10),
 ('@tamu.edu', 9),
 ('@cs.cmu.edu', 7),
 ('@hhi.fraunhofer.de', 7),
 ('@microsoft.com', 6),
 ('@unimelb.edu.au', 6),
 ('@gatech.edu', 5),
 ('@bristol.ac.uk', 5),
 ('@andrew.cmu.edu', 5),
 ('@ufl.edu', 5),
 ('@usc.edu', 4),
 ('@outlook.com', 4),
 ('@g.harvard.edu', 4),
 ('@cs.duke.edu', 4),
 ('@sutd.edu.sg', 4),
 ('@ibm.com', 4),
 ('@colorado.edu', 4),
 ('@insight-centre.org', 4),
 ('@ucd.ie', 4),
 ('@buffalo.edu', 4),
 ('@ucla.edu', 4),
 ('@kaist.ac.kr', 4),
 ('@example.com', 4),
 ('@umn.edu', 3),
 ('@berkeley.edu', 3),
 ('@umich.edu', 3),
 ('@ucl.ac.uk', 3),
 ('@kcl.ac.uk', 3),
 ('@uwaterloo.ca', 3),
 ('@icmc.usp.br', 3),
 ('@cardiff.ac.uk', 3),
 ('@northeastern.edu', 3),
 ('@tue.nl', 3),
 ('@cmu.edu', 3),
 ('@psu.edu', 3),
 ('@cs.uw.edu', 3),
 ('@uci.edu', 3),
 ('@ntu.edu.sg', 3),
 ('@unice.fr', 3),
 ('@uni-tuebingen.de', 3),
 ('@ieee.org', 3),
 ('@stat.uni-muenchen.de', 3),
 (