## Prepare refs.db

In [3]:
from sqlalchemy import create_engine

db_uri = 'sqlite:///arxiv_v2.1_refs.db'
db_engine = create_engine(db_uri)

q = '''CREATE TABLE bibitem (
	uuid VARCHAR(36) NOT NULL,
	citing_mag_id VARCHAR(36),
	cited_mag_id VARCHAR(36),
	citing_arxiv_id VARCHAR(36),
	cited_arxiv_id VARCHAR(36),
	bibitem_string TEXT,
	PRIMARY KEY (uuid)
);'''

foo = db_engine.execute(q)

manually executed

```
alter table bibitem rename to bibitemOld;

<then ran above>

insert into bibitem (uuid, cited_mag_id, citing_arxiv_id, bibitem_string) select bibitemOld.uuid, mag_id, in_doc, bibitem_string from bibitemOld left join bibitemmagidmap on bibitemOld.uuid = bibitemmagidmap.uuid;sqlite> select count(*) from bibitem;
```

## Prepare MAG ID ↔ arXiv ID mappings

In [4]:
import csv

mid_aid_map = {}
aid_mid_map = {}

with open('mag_id_2_arxiv_id.csv') as f:
    csv_reader = csv.reader(f, delimiter=',')
    for row in csv_reader:
        mid = row[0]
        aid = row[3]
        mid_aid_map[mid] = aid
        aid_mid_map[aid] = mid
        
print('done')

done


In [9]:
q_idx1 = 'CREATE INDEX bibitem_citing_aid_idx ON bibitem (citing_arxiv_id);'
q_idx2 = 'CREATE INDEX bibitem_cited_mid_idx ON bibitem (cited_mag_id);'
foo = db_engine.execute(q_idx1);
print('index 1 built')
foo = db_engine.execute(q_idx2);
print('index 2 built')

index 1 built
index 2 built


In [None]:
import datetime
i = 0
count_all = len(mid_aid_map)
for mid, aid in mid_aid_map.items():
    if i%100000 == 0:
        print('{}: {}/{}  ({:.2f})'.format(datetime.datetime.now(), i, count_all, i/count_all))
    q = 'UPDATE bibitem SET citing_mag_id = "{}" where citing_arxiv_id = "{}";'.format(mid, aid)
    foo = db_engine.execute(q)
    q = 'UPDATE bibitem SET cited_arxiv_id = "{}" where cited_mag_id = "{}";'.format(aid, mid)
    foo = db_engine.execute(q)
    i += 1

```
2019-08-09 18:32:36.920674: 0/1412190  (0.00)
2019-08-09 19:02:41.504153: 100000/1412190  (0.07)
2019-08-09 19:30:45.352849: 200000/1412190  (0.14)
2019-08-09 20:01:49.597559: 300000/1412190  (0.21)
2019-08-09 20:39:33.143241: 400000/1412190  (0.28)
2019-08-09 21:17:19.208086: 500000/1412190  (0.35)
2019-08-09 21:54:53.519787: 600000/1412190  (0.42)
2019-08-09 22:32:23.416555: 700000/1412190  (0.50)
2019-08-09 23:08:59.816808: 800000/1412190  (0.57)
2019-08-09 23:45:21.860331: 900000/1412190  (0.64)
2019-08-10 00:17:21.326966: 1000000/1412190  (0.71)
2019-08-10 00:49:14.437472: 1100000/1412190  (0.78)
2019-08-10 01:20:52.882915: 1200000/1412190  (0.85)
2019-08-10 01:50:09.262802: 1300000/1412190  (0.92)
2019-08-10 02:16:09.689792: 1400000/1412190  (0.99)
```

manually executed

```
drop table bibitemOld;
drop tabel bibitemmagidmap;
drop index bibitem_citing_aid_idx;
drop index bibitem_cited_mid_idx;
```