In [1]:
import csv
import datetime
import pandas as pd

dparser = lambda x: datetime.datetime.strptime(x, '%Y-%m-%dT%H:%M:%SZ')
df = pd.read_csv(
    'in_lang_cleaned_extended.tsv',
    header=None,
    names = ['uuid', 'lang', 'aid', 'published', 'category', 'journal_ref', 'bibitem_str'],
    sep='\t',
    parse_dates=['published'],
    date_parser=dparser,
    low_memory=False,
    quoting=csv.QUOTE_NONE
)

In [2]:
# Absolute number of papers with E-to-nE references per year per discipline

disc_map = {  # see: https://arxiv.org/category_taxonomy
    'math': 'math',
    'cond-mat': 'phys',
    'math-ph': 'phys',
    'physics': 'phys',
    'cs': 'cs',
    'hep-th': 'phys',
    'nlin': 'phys',
    'gr-qc': 'phys',
    'hep-ph': 'phys',
    'astro-ph': 'phys',
    'quant-ph': 'phys',
    'nucl-th': 'phys',
    'q-fin': 'other',
    'hep-ex': 'phys',
    'stat': 'other',
    'nucl-ex': 'phys',
    'q-alg': 'math',
    'q-bio': 'other',
    'solv-int': 'phys',
    'chao-dyn': 'phys',
    'hep-lat': 'phys',
    'funct-an': 'math',
    'patt-sol': 'phys',
    'dg-ga': 'other',
    'alg-geom': 'math',
    'cmp-lg': 'cs',
    'adap-org': 'phys',
    'econ': 'other',
    'eess': 'other',
    'acc-phys': 'phys',
    'comp-gas': 'cs'
}

df.groupby(
    [
        df['category'].apply(lambda x: disc_map[x.split('.')[0]]),
        df['published'].dt.year
    ]
).agg({'aid': pd.Series.nunique}).unstack(fill_value=0)['aid'].to_dict()

{1992: {'cs': 0, 'math': 0, 'other': 0, 'phys': 18},
 1993: {'cs': 0, 'math': 2, 'other': 0, 'phys': 62},
 1994: {'cs': 1, 'math': 8, 'other': 0, 'phys': 74},
 1995: {'cs': 1, 'math': 24, 'other': 1, 'phys': 113},
 1996: {'cs': 3, 'math': 22, 'other': 5, 'phys': 135},
 1997: {'cs': 1, 'math': 18, 'other': 5, 'phys': 189},
 1998: {'cs': 2, 'math': 33, 'other': 0, 'phys': 260},
 1999: {'cs': 4, 'math': 46, 'other': 0, 'phys': 297},
 2000: {'cs': 7, 'math': 62, 'other': 0, 'phys': 378},
 2001: {'cs': 12, 'math': 70, 'other': 0, 'phys': 318},
 2002: {'cs': 6, 'math': 86, 'other': 0, 'phys': 359},
 2003: {'cs': 7, 'math': 101, 'other': 1, 'phys': 381},
 2004: {'cs': 9, 'math': 141, 'other': 2, 'phys': 387},
 2005: {'cs': 10, 'math': 169, 'other': 2, 'phys': 410},
 2006: {'cs': 24, 'math': 217, 'other': 4, 'phys': 435},
 2007: {'cs': 28, 'math': 274, 'other': 6, 'phys': 407},
 2008: {'cs': 37, 'math': 315, 'other': 15, 'phys': 363},
 2009: {'cs': 39, 'math': 383, 'other': 17, 'phys': 456},
 

In [3]:
# Load unarXive ID set (arXiv IDs w/o slashes)

unarXive_citing_aids = []
with open('unarXive_unique_aids') as f:
    for line in f:
        unarXive_citing_aids.append(line.strip())
len(unarXive_citing_aids)

1192097

In [4]:
# Load arXiv metadata (from arXiv metadata dump; arXiv IDs w/ slashes)

fos_map = {
    'physics': 'phys',
    'math': 'math',
    'cs': 'cs',
    'eess': 'other',
    'econ': 'other'
}

dparser2 = lambda x: datetime.datetime.strptime(x, '%Y-%m-%d')
df_ameta = pd.read_csv(
    'aid_to_cat_n_date.csv',
    sep=',',
    parse_dates=['date'],
    date_parser=dparser2,
    low_memory=False,
    quoting=csv.QUOTE_NONE
)
df_ameta.fos = df_ameta.fos.apply(lambda x: fos_map[x])  # align disciplines
df_ameta.aid = df_ameta.aid.apply(lambda x: x.replace('/', ''))  # align IDs

In [5]:
# Check validity against monthly sumbmission numbers from arXiv website
#
# (slight differences possible b/c metadata dump dating works a bit different;
#  e.g. also contains dates prior to 1991)

arxiv_monthly_df = pd.read_csv('arXiv_monthly_submissions.csv')
total_submissions_per_year = {}
for i, row in arxiv_monthly_df.iterrows():
    year, month = row.month.split('-')
    if int(year) == df['published'].max().year and int(month) > df['published'].max().month:
        continue
    if year not in total_submissions_per_year:
        total_submissions_per_year[year] = 0
    total_submissions_per_year[year] += row.submissions
calc_subs_per_year = df_ameta.date.dt.year.value_counts().to_dict()
print('<year>: <control> -> <calculated> (<error>)')
for year, control_count in total_submissions_per_year.items():
    print('{}: {} -> {} ({:.2f}%)'.format(
        year,
        control_count,
        calc_subs_per_year[int(year)],
        (abs(control_count-calc_subs_per_year[int(year)])/control_count)*100
    ))

<year>: <control> -> <calculated> (<error>)
1991: 306 -> 370 (20.92%)
1992: 3263 -> 3181 (2.51%)
1993: 6743 -> 6524 (3.25%)
1994: 10097 -> 9893 (2.02%)
1995: 13014 -> 12748 (2.04%)
1996: 15866 -> 15578 (1.82%)
1997: 19624 -> 19231 (2.00%)
1998: 24172 -> 23684 (2.02%)
1999: 27704 -> 27174 (1.91%)
2000: 30601 -> 30245 (1.16%)
2001: 33214 -> 32719 (1.49%)
2002: 36121 -> 35664 (1.27%)
2003: 39414 -> 38849 (1.43%)
2004: 43727 -> 42914 (1.86%)
2005: 46855 -> 46021 (1.78%)
2006: 50227 -> 49377 (1.69%)
2007: 55638 -> 54501 (2.04%)
2008: 58915 -> 57404 (2.56%)
2009: 64047 -> 62663 (2.16%)
2010: 70131 -> 68951 (1.68%)
2011: 76578 -> 75120 (1.90%)
2012: 84603 -> 82561 (2.41%)
2013: 92641 -> 90320 (2.51%)
2014: 97517 -> 94798 (2.79%)
2015: 105280 -> 101971 (3.14%)
2016: 113380 -> 109946 (3.03%)
2017: 123523 -> 119812 (3.00%)
2018: 140616 -> 136680 (2.80%)
2019: 11537 -> 152210 (1219.32%)
2020: 67286 -> 5127 (92.38%)


In [9]:
# For some reason the arXiv metadata dump doesn't contain any
# non-phys/math/cs records until 2004

df_ameta.groupby(
    [
        df_ameta.fos,
        df_ameta.date.dt.year
    ]
).agg({'aid': pd.Series.count}).unstack(fill_value=0)['aid'].to_dict()

{1990: {'cs': 0, 'math': 7, 'other': 0, 'phys': 0},
 1991: {'cs': 0, 'math': 15, 'other': 0, 'phys': 84},
 1992: {'cs': 0, 'math': 40, 'other': 0, 'phys': 1274},
 1993: {'cs': 0, 'math': 86, 'other': 0, 'phys': 3218},
 1994: {'cs': 39, 'math': 172, 'other': 0, 'phys': 5280},
 1995: {'cs': 43, 'math': 369, 'other': 0, 'phys': 7241},
 1996: {'cs': 59, 'math': 454, 'other': 0, 'phys': 10652},
 1997: {'cs': 80, 'math': 647, 'other': 0, 'phys': 13885},
 1998: {'cs': 129, 'math': 945, 'other': 0, 'phys': 17143},
 1999: {'cs': 188, 'math': 1228, 'other': 0, 'phys': 19646},
 2000: {'cs': 283, 'math': 1589, 'other': 0, 'phys': 21956},
 2001: {'cs': 271, 'math': 1763, 'other': 0, 'phys': 24069},
 2002: {'cs': 351, 'math': 2302, 'other': 0, 'phys': 26231},
 2003: {'cs': 431, 'math': 3180, 'other': 0, 'phys': 27688},
 2004: {'cs': 530, 'math': 4334, 'other': 0, 'phys': 30115},
 2005: {'cs': 716, 'math': 4947, 'other': 0, 'phys': 31727},
 2006: {'cs': 880, 'math': 6510, 'other': 0, 'phys': 32748},


In [10]:
# Filter sampling candidates to arXiv IDs included in unarXive

df_sampling_candidates = df_ameta[df_ameta.aid.isin(unarXive_citing_aids)]

In [11]:
# Generate random sample

from random import sample

year_disc_distrib = df.groupby(
    [
        df['category'].apply(lambda x: disc_map[x.split('.')[0]]),
        df['published'].dt.year
    ]
).agg({'aid': pd.Series.nunique}).unstack(fill_value=0)['aid'].to_dict()

sample_dict = {}
sample_list = []
other_compensation = 0
total_count = 0
for year, distrib_dict in year_disc_distrib.items():
    # iterate through years
    sample_dict[year] = {}
    for disc, count in distrib_dict.items():
        # iterate through disciplines
        total_count += count
        candidate_list = df_sampling_candidates[
            (df_sampling_candidates.date.dt.year == year) & (df_sampling_candidates.fos == disc)
        ]['aid'].to_list()
        if disc == 'other':
            ocount = count + other_compensation
            if ocount > len(candidate_list):
                # take compensation from following year
                smpl = candidate_list
                other_compensation += count - len(candidate_list)
            else:
                smpl = sample(candidate_list, ocount)
                other_compensation = 0
        else:
            smpl = sample(candidate_list, count)
        sample_dict[year][disc] = smpl
        sample_list.extend(smpl)
print('Sample size: {}'.format(len(sample_list)))
print('Control size: {}'.format(total_count))

Sample size: 18171
Control size: 18171


In [12]:
# write to file

with open('control_sample_18171', 'w') as f:
    for aid in sample_list:
        f.write(f'{aid}\n')