## Arxiv Mongo Import

This notebook takes a set of `.cermxml` files produced by CERMINE, converts each one into a JSON-like document, and inserts it into MongoDB. A parse status is added to each document to record what was successfully extracted from the PDF.

In [1]:
import json
import xmltodict
import os
from pymongo import MongoClient
from tqdm import tqdm

In [14]:
# (field name, path inside the parsed CERMXML tree) for a complete parse.
_FULL_PARSE_PATHS = (
    ('title', ('article', 'front', 'article-meta', 'title-group', 'article-title')),
    ('authors', ('article', 'front', 'article-meta', 'contrib-group', 'contrib')),
    ('abstract', ('article', 'front', 'article-meta', 'abstract')),
    ('references', ('article', 'back', 'ref-list', 'ref')),
    ('body', ('article', 'body', 'sec')),
)


def _strict_get(node, *keys):
    """Index into nested containers key by key.

    Raises KeyError or TypeError as soon as a step cannot be followed,
    exactly like chained ``node[k1][k2]...`` subscripting would.
    """
    for key in keys:
        node = node[key]
    return node


def _dig(node, *keys):
    """Follow *keys* through nested dicts, returning None if a step is missing.

    xmltodict yields None for empty elements and a plain str for text-only
    elements, so every level is type-checked instead of assumed to be a dict.
    """
    for key in keys:
        if not isinstance(node, dict):
            return None
        node = node.get(key)
    return node


def build_document(data, category, doc_name, fpath):
    """Build the MongoDB document for one parsed CERMXML file.

    Args:
        data: Result of ``xmltodict.parse`` for the file.
        category: Name of the two-letter folder the file came from.
        doc_name: File name of the CERMXML file.
        fpath: Path the file was read from.

    Returns:
        A dict holding the bookkeeping fields (``source``, ``category``,
        ``doc_name``, ``fpath``, ``original``), whichever of ``title``,
        ``authors``, ``abstract``, ``references`` and ``body`` could be
        extracted, and:

        * ``parse_status`` -- ``'full_parse'`` when all five fields were
          found at their expected paths; otherwise ``'partial_parse'`` when
          abstract, body and references were all recovered, ``'no_body'``
          when no body sections were found (and the partial condition does
          not hold), else ``'no_parse'``.
        * ``parse_keys`` -- ``str()`` of the set of extracted field names.
    """
    doc = {
        'source': 'arxiv',
        'category': category,
        'doc_name': doc_name,
        'fpath': fpath,
        'original': data,
        'parse_status': 'no_parse',
    }

    # Attempt the full parse into a scratch dict so that a failure halfway
    # through does not leave stray fields on the document.
    try:
        fields = {name: _strict_get(data, *path)
                  for name, path in _FULL_PARSE_PATHS}
    except (KeyError, TypeError):
        fields = None

    if fields is not None:
        doc.update(fields)
        doc['parse_status'] = 'full_parse'
        # NOTE(review): str() of a set -- element order inside the string is
        # not guaranteed to be stable across interpreter runs.
        doc['parse_keys'] = str({'title', 'authors', 'abstract', 'references', 'body'})
        return doc

    # Fallback: salvage each field independently. Every lookup tolerates
    # missing or empty 'article' / 'front' / 'back' / 'body' nodes.
    parse_keys = set()
    article = _dig(data, 'article')
    meta = _dig(article, 'front', 'article-meta')

    abstract = _dig(meta, 'abstract')
    if abstract:
        doc['abstract'] = abstract
        parse_keys.add('abstract')

    ref_list = _dig(article, 'back', 'ref-list')
    if isinstance(ref_list, dict) and 'ref' in ref_list:
        doc['references'] = ref_list['ref']
        parse_keys.add('references')

    contrib_group = _dig(meta, 'contrib-group')
    if contrib_group:
        # NOTE(review): this stores the whole contrib-group, whereas the
        # full-parse path stores contrib-group['contrib']. Kept as-is so the
        # stored schema does not change; confirm which shape consumers expect.
        doc['authors'] = contrib_group
        parse_keys.add('authors')

    title = _dig(meta, 'title-group', 'article-title')
    if title:
        doc['title'] = title
        parse_keys.add('title')

    body = _dig(article, 'body')
    if isinstance(body, dict) and 'sec' in body:
        doc['body'] = body['sec']
        parse_keys.add('body')
    else:
        doc['parse_status'] = 'no_body'

    if {'abstract', 'body', 'references'} <= parse_keys:
        doc['parse_status'] = 'partial_parse'
    doc['parse_keys'] = str(parse_keys)
    return doc


db = MongoClient().ds_documents
col = db.papers

prefix = 'arxiv/'
# Category folders are the two-character directory names under the prefix.
folders = [x for x in os.listdir(prefix) if len(x) == 2]
for folder in folders:
    print('# Now completing folder:' + folder)
    files = [x for x in os.listdir(prefix + folder) if x.endswith('cermxml')]
    for af in tqdm(files):
        fi = prefix + folder + '/' + af
        # Read raw bytes so the XML parser applies the encoding declared in
        # the file instead of the platform's default text encoding.
        with open(fi, 'rb') as fp:
            data = xmltodict.parse(fp.read())
        col.insert_one(build_document(data, folder, af, fi))

  1%|          | 11/1922 [00:00<00:17, 109.49it/s]

# Now completing folder:IR


100%|██████████| 1922/1922 [00:16<00:00, 115.72it/s]
  1%|          | 7/1242 [00:00<00:19, 62.08it/s]

# Now completing folder:NA


100%|██████████| 1242/1242 [00:12<00:00, 99.78it/s]
  0%|          | 12/19457 [00:00<02:45, 117.53it/s]

# Now completing folder:IT


100%|██████████| 19457/19457 [03:26<00:00, 94.26it/s]
  1%|          | 11/889 [00:00<00:08, 105.08it/s]

# Now completing folder:SC


100%|██████████| 889/889 [00:08<00:00, 102.17it/s]
  1%|          | 12/1382 [00:00<00:11, 115.61it/s]

# Now completing folder:FL


100%|██████████| 1382/1382 [00:13<00:00, 106.13it/s]
  0%|          | 9/9089 [00:00<02:06, 71.77it/s]

# Now completing folder:CV


100%|██████████| 9089/9089 [01:44<00:00, 86.64it/s]
  0%|          | 11/8813 [00:00<01:20, 109.57it/s]

# Now completing folder:LG


100%|██████████| 8813/8813 [01:30<00:00, 97.35it/s]
  0%|          | 11/3087 [00:00<00:28, 107.80it/s]

# Now completing folder:SE


100%|██████████| 3087/3087 [00:25<00:00, 121.63it/s]
  0%|          | 10/3969 [00:00<00:40, 98.35it/s]

# Now completing folder:SY


100%|██████████| 3969/3969 [00:43<00:00, 92.28it/s]
  1%|          | 6/506 [00:00<00:08, 59.44it/s]

# Now completing folder:GR


100%|██████████| 506/506 [00:04<00:00, 105.96it/s]
  0%|          | 11/2276 [00:00<00:23, 98.18it/s]

# Now completing folder:CG


100%|██████████| 2276/2276 [00:20<00:00, 110.33it/s]
  1%|▏         | 9/708 [00:00<00:08, 85.44it/s]

# Now completing folder:ET


100%|██████████| 708/708 [00:07<00:00, 96.61it/s]
  2%|▏         | 11/712 [00:00<00:06, 103.43it/s]

# Now completing folder:MA


100%|██████████| 712/712 [00:07<00:00, 100.57it/s]
  0%|          | 11/2353 [00:00<00:22, 102.02it/s]

# Now completing folder:DB


100%|██████████| 2353/2353 [00:25<00:00, 91.22it/s]
  0%|          | 12/2975 [00:00<00:25, 116.79it/s]

# Now completing folder:GT


100%|██████████| 2975/2975 [00:31<00:00, 95.77it/s]
  1%|          | 10/1777 [00:00<00:20, 86.58it/s]

# Now completing folder:PL


100%|██████████| 1777/1777 [00:18<00:00, 96.61it/s]
  9%|▉         | 7/74 [00:00<00:00, 69.07it/s]

# Now completing folder:GL


100%|██████████| 74/74 [00:00<00:00, 114.93it/s]
  0%|          | 0/1527 [00:00<?, ?it/s]

# Now completing folder:OH


100%|██████████| 1527/1527 [00:10<00:00, 148.36it/s]
  1%|          | 13/1366 [00:00<00:10, 129.32it/s]

# Now completing folder:HC


100%|██████████| 1366/1366 [00:11<00:00, 117.77it/s]
  0%|          | 11/4040 [00:00<00:37, 108.59it/s]

# Now completing folder:DM


100%|██████████| 4040/4040 [00:34<00:00, 117.27it/s]
  2%|▏         | 10/592 [00:00<00:05, 97.81it/s]

# Now completing folder:MS


100%|██████████| 592/592 [00:05<00:00, 111.81it/s]
  3%|▎         | 16/582 [00:00<00:03, 157.53it/s]

# Now completing folder:SD


100%|██████████| 582/582 [00:05<00:00, 110.85it/s]
  5%|▍         | 10/202 [00:00<00:01, 98.18it/s]

# Now completing folder:OS


100%|██████████| 202/202 [00:01<00:00, 115.57it/s]
  1%|          | 12/2072 [00:00<00:17, 114.45it/s]

# Now completing folder:NE


100%|██████████| 2072/2072 [00:19<00:00, 108.33it/s]
  0%|          | 16/5094 [00:00<00:34, 145.58it/s]

# Now completing folder:CL


100%|██████████| 5094/5094 [00:41<00:00, 122.33it/s]
  1%|          | 11/1682 [00:00<00:15, 108.16it/s]

# Now completing folder:CE


100%|██████████| 1682/1682 [00:16<00:00, 104.36it/s]
  0%|          | 12/5685 [00:00<00:50, 112.04it/s]

# Now completing folder:AI


100%|██████████| 5685/5685 [00:48<00:00, 117.55it/s]
  2%|▏         | 11/463 [00:00<00:04, 99.44it/s]

# Now completing folder:PF


100%|██████████| 463/463 [00:04<00:00, 109.37it/s]
  1%|▏         | 9/659 [00:00<00:08, 78.71it/s]

# Now completing folder:AR


100%|██████████| 659/659 [00:05<00:00, 130.21it/s]
  0%|          | 12/5016 [00:00<00:43, 115.91it/s]

# Now completing folder:LO


100%|██████████| 5016/5016 [00:53<00:00, 93.34it/s]
  0%|          | 12/6746 [00:00<00:56, 118.63it/s]

# Now completing folder:DS


100%|██████████| 6746/6746 [01:04<00:00, 104.00it/s]
  2%|▏         | 11/720 [00:00<00:06, 108.26it/s]

# Now completing folder:MM


100%|██████████| 720/720 [00:06<00:00, 118.87it/s]
  0%|          | 12/4328 [00:00<00:39, 109.58it/s]

# Now completing folder:DC


100%|██████████| 4328/4328 [00:41<00:00, 105.06it/s]
  0%|          | 10/3659 [00:00<00:37, 98.15it/s]

# Now completing folder:CC


100%|██████████| 3659/3659 [00:33<00:00, 109.25it/s]
  0%|          | 11/4983 [00:00<00:45, 109.25it/s]

# Now completing folder:CR


100%|██████████| 4983/4983 [00:44<00:00, 111.40it/s]
  0%|          | 11/2445 [00:00<00:22, 106.77it/s]

# Now completing folder:CY


100%|██████████| 2445/2445 [00:19<00:00, 123.44it/s]
  1%|          | 11/1898 [00:00<00:18, 104.59it/s]

# Now completing folder:RO


100%|██████████| 1898/1898 [00:18<00:00, 103.86it/s]
  1%|          | 9/1610 [00:00<00:19, 80.99it/s]

# Now completing folder:DL


100%|██████████| 1610/1610 [00:12<00:00, 132.71it/s]
  0%|          | 10/5275 [00:00<00:54, 96.56it/s]

# Now completing folder:SI


100%|██████████| 5275/5275 [00:56<00:00, 92.74it/s]
  0%|          | 10/6543 [00:00<01:14, 87.12it/s]

# Now completing folder:NI


100%|██████████| 6543/6543 [01:00<00:00, 108.47it/s]


In [15]:
# Total number of documents in the papers collection.
# Collection.count() is deprecated since PyMongo 3.7 and removed in 4.0;
# count_documents({}) is the supported way to count with an empty filter.
col.count_documents({})

128418

In [19]:
# Fetch a single document from the papers collection (no filter given).
t = col.find_one()