## Prepare refs.db data

In [1]:
from sqlalchemy import create_engine

db_uri = 'sqlite:///arxiv_v2.1_refs.db'
db_engine = create_engine(db_uri)

q_citing_aids = 'select distinct in_doc from bibitem'
q_cited_mids = 'select distinct mag_id from bibitemmagidmap'
q_refs_matched = ('select bibitem.uuid, mag_id, in_doc'
                  ' from bibitemmagidmap join bibitem'
                  ' on bibitemmagidmap.uuid = bibitem.uuid')

print('query 1')
citing_aids = db_engine.execute(q_citing_aids).fetchall()
print('query 2')
cited_mids = db_engine.execute(q_cited_mids).fetchall()
print('query 3')
refs_matched = db_engine.execute(q_refs_matched).fetchall()
print('done')

query 1
query 2
query 3
done


## Prepare arXiv ID → FoS mapping

In [36]:
db_uri_fos = 'sqlite:///aid_fos_subj.db'
db_engine_fos = create_engine(db_uri_fos)

q_aid_cat_subj = 'select aid, fos, subj from paper'

print('query 1')
aid_cat_subj = db_engine_fos.execute(q_aid_cat_subj).fetchall()

aid_fos_map = {}

for tup in aid_cat_subj:
        aid = tup[0].replace('/', '')
        cat = tup[1]
        subj = tup[2]
        aid_fos_map[aid] = [cat, subj]

print('done')

query 1
done


## Prepare MAG ID ↔ arXiv ID mappings

In [11]:
import csv

mid_aid_map = {}
aid_mid_map = {}

with open('mag_id_2_arxiv_id.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        mid = row[0]
        aid = row[3]
        mid_aid_map[mid] = aid
        aid_mid_map[aid] = mid
        
print('done')

done


## Analyze

In [19]:
cited_total = 0
cited_in_arxiv = 0
for tup in cited_mids:
    mid = tup[0]
    aid = mid_aid_map.get(mid)
    if aid:
        cited_in_arxiv += 1
    cited_total += 1

print('Cited documents:')
print('total: {}'.format(cited_total))
print('in arXiv: {} ({:.2f})'.format(cited_in_arxiv, cited_in_arxiv/cited_total))

Cited documents:
total: 2820381
in arXiv: 741402 (0.26)


#### Cited documents:
```
total: 2820381
in arXiv: 741402 (0.26)
```

In [42]:
refs_total = 0
refs_to_arxiv = 0
refs_to_arxiv_by_fos = {}
no_arxiv_metadata = 0
fos_total = 0
for tup in refs_matched:
    mid = tup[1]
    aid = mid_aid_map.get(mid)
    if aid:
        refs_to_arxiv += 1
        fos_tup = aid_fos_map.get(aid)
        if not fos_tup:
            no_arxiv_metadata += 1
            continue
        # fos = fos_tup[1]  # use subject (fine grained)
        fos = fos_tup[0]  # use category (coarse)
        fos = fos.split(':')[0]  # even more coarse
        if fos not in refs_to_arxiv_by_fos:
            refs_to_arxiv_by_fos[fos] = 0
        refs_to_arxiv_by_fos[fos] += 1
        fos_total += 1
    refs_total += 1

print('References:')
print('total: {}'.format(refs_total))
print('to arXiv: {} ({:.2f})'.format(refs_to_arxiv, refs_to_arxiv/refs_total))
print('')
for fos, count in refs_to_arxiv_by_fos.items():
    print('{}: {} ({:.2f})'.format(fos, count, count/fos_total))
print('')
print('(no arxiv metadata: {} ({:.2f}))'.format(no_arxiv_metadata, no_arxiv_metadata/refs_to_arxiv))
    

References:
total: 16723705
to docs in arXiv: 7181576 (0.43)

math: 1015690 (0.14)
cs: 618217 (0.09)
physics: 5456684 (0.77)
q-bio: 3847 (0.00)
stat: 6209 (0.00)
eess: 1343 (0.00)
q-fin: 426 (0.00)
econ: 367 (0.00)

(no arxiv metadata: 78793 (0.01))


#### References:
```
total: 16723705
to arXiv: 7181576 (0.43)
```

##### by Fos:
```
math: 1015690 (0.14)
cs: 618217 (0.09)
physics: 5456684 (0.77)
q-bio: 3847 (0.00)
stat: 6209 (0.00)
eess: 1343 (0.00)
q-fin: 426 (0.00)
econ: 367 (0.00)

(no arxiv metadata: 78793 (0.01))
```

In [26]:
citing_total = 0
citing_with_mid = 0
for tup in citing_aids:
    mid = aid_mid_map.get(aid)
    if mid:
        citing_with_mid += 1
    citing_total += 1

print('Citing documents:')
print('total: {}'.format(citing_total))
print('to docs in arXiv: {} ({:.2f})'.format(citing_with_mid, citing_with_mid/citing_total))

Citing documents:
total: 1192097
to docs in arXiv: 1192097 (1.00)


#### Citing documents:
```
total: 1192097
to docs in arXiv: 1192097 (1.00)
```