# Download the S2ORC Dataset

This notebook processes the latest release (2020-07-05) of the [Semantic Scholar Open Research Corpus](https://www.aclweb.org/anthology/2020.acl-main.447). It is split into 100 uniformly shuffled shards. 

The script will download each shard, filter computer science papers from it and extract the required data, then delete the superfluous data files.

In [1]:
import os
import subprocess
import urllib
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# # Open raw file
# with open("../dataset/full_urls.txt") as f:
#     lines = f.readlines()
# 
# getlines = [line[:-1] for line in lines if line.startswith('wget')]
# getlines = getlines[2:]
# metalines = [str(line.split()[-1]) for line in getlines if line.find('meta') != -1]
# metalines = [line[1:-1] for line in metalines]
# pdflines = [str(line.split()[-1]) for line in getlines if line.find('meta') == -1]
# pdflines = [line[1:-1] for line in pdflines]

# df = pd.DataFrame(list(zip(metalines, pdflines)), columns=['metadata', 'pdf_parses'])
# df.to_csv('../dataset/ss_urls.csv', index=None)
# df.head()

In [3]:
data_url = pd.read_csv("../dataset/ss_urls.csv")
data_url.head()

Unnamed: 0,metadata,pdf_parses
0,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
1,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
2,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
3,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
4,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...


In [4]:
len(data_url)

100

### Download the data that we need

This part of codes are from HESCapstone

In [9]:
# Create required folder structure
ROOT = '../dataset'
CLEAN_DATA = os.path.join(ROOT, 'SS/clean')
METADATA_INPUT_DIR = os.path.join(ROOT,'SS/metadata/raw/') 
METADATA_OUTPUT_DIR = os.path.join(ROOT,'SS/metadata/business/')
PDF_PARSES_INPUT_DIR = os.path.join(ROOT,'SS/pdf_parses/raw/')
PDF_PARSES_OUTPUT_DIR = os.path.join(ROOT,'SS/pdf_parses/business/')

os.makedirs(CLEAN_DATA, exist_ok=True)
os.makedirs(METADATA_INPUT_DIR, exist_ok=True)
os.makedirs(METADATA_OUTPUT_DIR, exist_ok=True)
os.makedirs(PDF_PARSES_INPUT_DIR, exist_ok=True)
os.makedirs(PDF_PARSES_OUTPUT_DIR, exist_ok=True)

In [6]:
class DownloadBatch:
    """Filter raw SS dataset to extract only the papers we need"""

    field = "Business"

    def __init__(self, df):
        self.df = df

    @staticmethod
    def get_filename(url):
        return os.path.basename(url.split("?")[0])

    def get_basename(self, url):
        filename = self.get_filename(url).split(".")[0]
        return filename
    
    
    def create_batch(self, row):
        """Create links for each row."""
        input_metadata_url = row[1]["metadata"]
        input_pdf_parses_url = row[1]["pdf_parses"]
        batch = dict(
            number=self.get_basename(input_metadata_url)[-1],
            input_metadata_url=input_metadata_url,
            input_metadata_path=os.path.join(
                METADATA_INPUT_DIR, self.get_filename(input_metadata_url)
            ),
            output_metadata_path=os.path.join(
                METADATA_OUTPUT_DIR, self.get_filename(input_metadata_url)
            )[:-3],
            input_pdf_parses_url=input_pdf_parses_url,
            input_pdf_parses_path=os.path.join(
                PDF_PARSES_INPUT_DIR, self.get_filename(input_pdf_parses_url)
            ),
            output_pdf_parses_path=os.path.join(
                PDF_PARSES_OUTPUT_DIR,
                self.get_filename(input_pdf_parses_url),
            )[:-3],
        )
        return batch

    def download_data(self, batch: dict):
        """Download the metadata and the pdf parses for the shard"""

        # Download metadata
        print(f"Downloading raw data:")
        
        if not os.path.isfile(batch["input_metadata_path"]):
            print('    Downloading metadata...')
            urllib.request.urlretrieve(batch["input_metadata_url"], batch["input_metadata_path"])
            print('    Done')
        else:
            print('    Metadata already downloaded')
        
        # Download pdf parse
        if not os.path.isfile(batch["input_pdf_parses_path"]):
            print('    Downloading pdf parses...')
            urllib.request.urlretrieve(batch["input_pdf_parses_url"], batch["input_pdf_parses_path"])
            print('    Done')
        else:
            print('    PDF data already downloaded')
        

    def check_pdf_parse(self, metadata_dict):
        """Only keep files that have a corresponding pdf parse"""
        if metadata_dict.get("has_pdf_parsed_body_text"):
            return True
        return False

    def find_topics(self, metadata_dict: dict):
        """Return papers that are in the field of interest"""
        mag_field_of_study = metadata_dict["mag_field_of_study"]
        if mag_field_of_study and self.field in mag_field_of_study:
            return True
        return False

    def filter_metadata(self, metadata_dict):
        """Filter papers of interest"""
        # Filter papers with no pdf parse
        if self.check_pdf_parse(metadata_dict):
            # Filter to selected topic
            if self.find_topics(metadata_dict):
                return True    
        return False

    def filter_batch(self, batch: dict):
        """Download raw data and filter papers of interest"""
        number = batch['number']
        print(f'\nProcessing shard {number}')
        self.download_data(batch)

        print("Filtering metadata:")
        paper_ids_to_keep = set()
        with gzip.open(batch["input_metadata_path"], "rb") as gz, open(batch["output_metadata_path"], "wb") as f_out:
            f = io.BufferedReader(gz)
            for line in tqdm(f.readlines()):
                metadata_dict = json.loads(line)
                if self.filter_metadata(metadata_dict):
                    paper_id = metadata_dict.get("paper_id")
                    paper_ids_to_keep.add(paper_id)
                    f_out.write(line)                
                
        print(f'    {len(paper_ids_to_keep)} parsed papers found')
        
        print("Parsing pdfs:")
        with gzip.open(batch['input_pdf_parses_path'], 'rb') as gz, open(batch['output_pdf_parses_path'], 'wb') as f_out:
            f = io.BufferedReader(gz)
            for line in tqdm(f.readlines()):
                metadata_dict = json.loads(line)
                paper_id = metadata_dict['paper_id']
                if paper_id in paper_ids_to_keep:
                    f_out.write(line)
        print('Done')
                 
    def cleanup(self, batch):
        """Delete the raw files to clear up space for other shards"""
        os.remove(batch['input_metadata_path'])
        os.remove(batch['input_pdf_parses_path'])

    def __call__(self, cleanup=True):
        for row in self.df.iterrows():
            batch = self.create_batch(row)
            self.filter_batch(batch)
            if cleanup:
                self.cleanup(batch)

In [10]:
# Full dataset
download = DownloadBatch(data_url[:20])
download(cleanup=True)


Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366661/1366661 [00:21<00:00, 62777.51it/s]


    1381 parsed papers found
Parsing pdfs:


100%|██████████| 310736/310736 [00:58<00:00, 5329.10it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365929/1365929 [00:22<00:00, 61079.66it/s]


    1399 parsed papers found
Parsing pdfs:


100%|██████████| 310316/310316 [00:59<00:00, 5253.98it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365724/1365724 [00:22<00:00, 60664.07it/s]


    1419 parsed papers found
Parsing pdfs:


100%|██████████| 311205/311205 [00:58<00:00, 5362.62it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364981/1364981 [00:22<00:00, 60815.69it/s]


    1334 parsed papers found
Parsing pdfs:


100%|██████████| 309634/309634 [00:58<00:00, 5330.71it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368337/1368337 [00:22<00:00, 61127.23it/s]


    1439 parsed papers found
Parsing pdfs:


100%|██████████| 310881/310881 [00:58<00:00, 5340.99it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365293/1365293 [00:22<00:00, 60975.97it/s]


    1421 parsed papers found
Parsing pdfs:


100%|██████████| 310782/310782 [00:58<00:00, 5337.40it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366174/1366174 [00:22<00:00, 61446.73it/s]


    1460 parsed papers found
Parsing pdfs:


100%|██████████| 310864/310864 [00:58<00:00, 5338.84it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365392/1365392 [00:22<00:00, 61583.32it/s]


    1466 parsed papers found
Parsing pdfs:


100%|██████████| 310734/310734 [00:58<00:00, 5346.75it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365591/1365591 [00:22<00:00, 60510.15it/s]


    1417 parsed papers found
Parsing pdfs:


100%|██████████| 310129/310129 [00:58<00:00, 5343.74it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365273/1365273 [00:22<00:00, 61190.97it/s]


    1430 parsed papers found
Parsing pdfs:


100%|██████████| 309474/309474 [00:57<00:00, 5338.77it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366666/1366666 [00:21<00:00, 62201.88it/s]


    1403 parsed papers found
Parsing pdfs:


100%|██████████| 310604/310604 [00:58<00:00, 5354.49it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365681/1365681 [00:22<00:00, 60986.22it/s]


    1394 parsed papers found
Parsing pdfs:


100%|██████████| 310565/310565 [00:58<00:00, 5280.76it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365718/1365718 [00:21<00:00, 62086.25it/s]


    1352 parsed papers found
Parsing pdfs:


100%|██████████| 310539/310539 [00:58<00:00, 5326.51it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364615/1364615 [00:21<00:00, 62122.57it/s]


    1412 parsed papers found
Parsing pdfs:


100%|██████████| 310218/310218 [00:58<00:00, 5342.81it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366657/1366657 [00:22<00:00, 61991.84it/s]


    1421 parsed papers found
Parsing pdfs:


100%|██████████| 311241/311241 [00:58<00:00, 5328.73it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364853/1364853 [00:22<00:00, 61059.08it/s]


    1386 parsed papers found
Parsing pdfs:


100%|██████████| 310683/310683 [00:58<00:00, 5350.52it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364882/1364882 [00:21<00:00, 62272.05it/s]


    1420 parsed papers found
Parsing pdfs:


100%|██████████| 310258/310258 [00:58<00:00, 5346.83it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365496/1365496 [00:22<00:00, 61678.30it/s]


    1480 parsed papers found
Parsing pdfs:


100%|██████████| 310260/310260 [00:58<00:00, 5332.68it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365958/1365958 [00:22<00:00, 61556.47it/s]


    1412 parsed papers found
Parsing pdfs:


100%|██████████| 310222/310222 [00:58<00:00, 5322.30it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365290/1365290 [00:22<00:00, 61948.07it/s]


    1387 parsed papers found
Parsing pdfs:


100%|██████████| 309829/309829 [01:00<00:00, 5101.00it/s]


Done


In [11]:
# Full dataset
download = DownloadBatch(data_url[20:40]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365957/1365957 [00:22<00:00, 61075.93it/s]


    1451 parsed papers found
Parsing pdfs:


100%|██████████| 310716/310716 [01:00<00:00, 5171.18it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367851/1367851 [00:22<00:00, 60313.98it/s]


    1435 parsed papers found
Parsing pdfs:


100%|██████████| 311314/311314 [00:59<00:00, 5228.71it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364650/1364650 [00:22<00:00, 60713.15it/s]


    1448 parsed papers found
Parsing pdfs:


100%|██████████| 311174/311174 [00:59<00:00, 5235.74it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365344/1365344 [00:23<00:00, 59092.41it/s]


    1429 parsed papers found
Parsing pdfs:


100%|██████████| 309277/309277 [00:59<00:00, 5230.06it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366402/1366402 [00:22<00:00, 61013.72it/s]


    1435 parsed papers found
Parsing pdfs:


100%|██████████| 310504/310504 [00:59<00:00, 5230.81it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364991/1364991 [00:22<00:00, 60789.31it/s]


    1433 parsed papers found
Parsing pdfs:


100%|██████████| 309424/309424 [00:58<00:00, 5254.77it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366240/1366240 [00:22<00:00, 60790.07it/s]


    1398 parsed papers found
Parsing pdfs:


100%|██████████| 311452/311452 [00:59<00:00, 5264.50it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364205/1364205 [00:23<00:00, 58132.07it/s]


    1436 parsed papers found
Parsing pdfs:


100%|██████████| 309855/309855 [01:03<00:00, 4874.36it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366174/1366174 [00:23<00:00, 57111.68it/s]


    1468 parsed papers found
Parsing pdfs:


100%|██████████| 310380/310380 [01:05<00:00, 4727.30it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365819/1365819 [00:24<00:00, 55408.40it/s]


    1449 parsed papers found
Parsing pdfs:


100%|██████████| 310060/310060 [01:06<00:00, 4665.49it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364712/1364712 [00:24<00:00, 55124.33it/s]


    1414 parsed papers found
Parsing pdfs:


100%|██████████| 309894/309894 [01:07<00:00, 4619.65it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366882/1366882 [00:25<00:00, 54173.55it/s]


    1442 parsed papers found
Parsing pdfs:


100%|██████████| 311391/311391 [01:08<00:00, 4565.79it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366304/1366304 [00:25<00:00, 54092.83it/s]


    1414 parsed papers found
Parsing pdfs:


100%|██████████| 310803/310803 [01:08<00:00, 4539.90it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365752/1365752 [00:25<00:00, 53545.14it/s]


    1524 parsed papers found
Parsing pdfs:


100%|██████████| 310858/310858 [01:08<00:00, 4551.21it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365708/1365708 [00:23<00:00, 58346.50it/s]


    1346 parsed papers found
Parsing pdfs:


100%|██████████| 309825/309825 [01:03<00:00, 4890.97it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366284/1366284 [00:23<00:00, 57037.70it/s]


    1449 parsed papers found
Parsing pdfs:


100%|██████████| 311292/311292 [01:04<00:00, 4849.48it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365538/1365538 [00:24<00:00, 56641.53it/s]


    1453 parsed papers found
Parsing pdfs:


100%|██████████| 309633/309633 [01:05<00:00, 4708.82it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366943/1366943 [00:24<00:00, 56891.10it/s]


    1388 parsed papers found
Parsing pdfs:


100%|██████████| 312126/312126 [01:04<00:00, 4866.68it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364493/1364493 [00:24<00:00, 56195.70it/s]


    1395 parsed papers found
Parsing pdfs:


100%|██████████| 309773/309773 [01:06<00:00, 4630.06it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368071/1368071 [00:23<00:00, 57116.18it/s]


    1457 parsed papers found
Parsing pdfs:


100%|██████████| 310154/310154 [01:03<00:00, 4895.30it/s]


Done


In [12]:
# Full dataset
download = DownloadBatch(data_url[40:60]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364806/1364806 [00:22<00:00, 60546.32it/s]


    1423 parsed papers found
Parsing pdfs:


100%|██████████| 310657/310657 [01:02<00:00, 4957.74it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363787/1363787 [00:23<00:00, 58786.00it/s]


    1433 parsed papers found
Parsing pdfs:


100%|██████████| 310226/310226 [01:01<00:00, 5020.80it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367766/1367766 [00:23<00:00, 58247.73it/s]


    1446 parsed papers found
Parsing pdfs:


100%|██████████| 310199/310199 [01:04<00:00, 4784.79it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366701/1366701 [00:24<00:00, 56067.27it/s]


    1506 parsed papers found
Parsing pdfs:


100%|██████████| 311127/311127 [01:05<00:00, 4752.50it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364986/1364986 [00:24<00:00, 54917.38it/s]


    1416 parsed papers found
Parsing pdfs:


100%|██████████| 310460/310460 [01:06<00:00, 4685.36it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368101/1368101 [00:25<00:00, 54540.93it/s]


    1425 parsed papers found
Parsing pdfs:


100%|██████████| 310763/310763 [01:07<00:00, 4612.79it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365328/1365328 [00:24<00:00, 56366.90it/s]


    1440 parsed papers found
Parsing pdfs:


100%|██████████| 310409/310409 [01:06<00:00, 4660.19it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364619/1364619 [00:24<00:00, 54853.82it/s]


    1451 parsed papers found
Parsing pdfs:


100%|██████████| 309849/309849 [01:06<00:00, 4666.40it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366986/1366986 [00:23<00:00, 58254.07it/s]


    1444 parsed papers found
Parsing pdfs:


100%|██████████| 310533/310533 [01:04<00:00, 4831.81it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365501/1365501 [00:24<00:00, 56351.41it/s]


    1475 parsed papers found
Parsing pdfs:


100%|██████████| 310481/310481 [01:03<00:00, 4881.87it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368897/1368897 [00:23<00:00, 57539.47it/s]


    1387 parsed papers found
Parsing pdfs:


100%|██████████| 312397/312397 [01:02<00:00, 4981.78it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366229/1366229 [00:24<00:00, 56128.78it/s]


    1425 parsed papers found
Parsing pdfs:


100%|██████████| 310917/310917 [01:02<00:00, 4939.88it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366634/1366634 [00:24<00:00, 56734.94it/s]


    1410 parsed papers found
Parsing pdfs:


100%|██████████| 310667/310667 [01:03<00:00, 4854.52it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366568/1366568 [00:23<00:00, 58182.75it/s]


    1442 parsed papers found
Parsing pdfs:


100%|██████████| 310224/310224 [01:03<00:00, 4913.09it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366043/1366043 [00:23<00:00, 57123.73it/s]


    1390 parsed papers found
Parsing pdfs:


100%|██████████| 310018/310018 [01:05<00:00, 4761.82it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365926/1365926 [00:23<00:00, 58173.53it/s]


    1456 parsed papers found
Parsing pdfs:


100%|██████████| 310333/310333 [01:06<00:00, 4677.05it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363968/1363968 [00:23<00:00, 57775.32it/s]


    1449 parsed papers found
Parsing pdfs:


100%|██████████| 310014/310014 [01:06<00:00, 4666.36it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366304/1366304 [00:24<00:00, 55949.85it/s]


    1499 parsed papers found
Parsing pdfs:


100%|██████████| 310793/310793 [01:06<00:00, 4688.69it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367750/1367750 [00:24<00:00, 55348.07it/s]


    1390 parsed papers found
Parsing pdfs:


100%|██████████| 310838/310838 [01:06<00:00, 4665.42it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364545/1364545 [00:23<00:00, 57073.12it/s]


    1422 parsed papers found
Parsing pdfs:


100%|██████████| 311346/311346 [01:04<00:00, 4845.81it/s]


Done


In [13]:
# Full dataset
download = DownloadBatch(data_url[60:80]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364259/1364259 [00:24<00:00, 54649.26it/s]


    1435 parsed papers found
Parsing pdfs:


100%|██████████| 310521/310521 [01:05<00:00, 4759.90it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366142/1366142 [00:25<00:00, 54207.28it/s]


    1440 parsed papers found
Parsing pdfs:


100%|██████████| 310379/310379 [01:07<00:00, 4585.15it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368220/1368220 [00:25<00:00, 53867.87it/s]


    1373 parsed papers found
Parsing pdfs:


100%|██████████| 310904/310904 [01:09<00:00, 4447.12it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367162/1367162 [00:25<00:00, 52904.96it/s]


    1417 parsed papers found
Parsing pdfs:


100%|██████████| 309975/309975 [01:05<00:00, 4750.50it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365948/1365948 [00:23<00:00, 57003.97it/s]


    1401 parsed papers found
Parsing pdfs:


100%|██████████| 311073/311073 [01:06<00:00, 4704.92it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364858/1364858 [00:25<00:00, 53644.51it/s]


    1478 parsed papers found
Parsing pdfs:


100%|██████████| 310443/310443 [01:09<00:00, 4435.99it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366646/1366646 [00:25<00:00, 53126.50it/s]


    1433 parsed papers found
Parsing pdfs:


100%|██████████| 310376/310376 [01:09<00:00, 4465.97it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363902/1363902 [00:25<00:00, 52583.79it/s]


    1417 parsed papers found
Parsing pdfs:


100%|██████████| 310371/310371 [01:03<00:00, 4925.63it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365028/1365028 [00:21<00:00, 62437.15it/s]


    1434 parsed papers found
Parsing pdfs:


100%|██████████| 310230/310230 [00:58<00:00, 5332.62it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368314/1368314 [00:22<00:00, 60142.06it/s]


    1386 parsed papers found
Parsing pdfs:


100%|██████████| 311580/311580 [00:58<00:00, 5350.58it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365873/1365873 [00:21<00:00, 62695.05it/s]


    1476 parsed papers found
Parsing pdfs:


100%|██████████| 310252/310252 [00:57<00:00, 5382.73it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366562/1366562 [00:21<00:00, 62746.75it/s]


    1504 parsed papers found
Parsing pdfs:


100%|██████████| 311205/311205 [00:57<00:00, 5376.90it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365774/1365774 [00:21<00:00, 62498.12it/s]


    1440 parsed papers found
Parsing pdfs:


100%|██████████| 310169/310169 [00:59<00:00, 5237.63it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364113/1364113 [00:21<00:00, 62482.25it/s]


    1435 parsed papers found
Parsing pdfs:


100%|██████████| 310368/310368 [00:57<00:00, 5393.96it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363330/1363330 [00:21<00:00, 62574.90it/s]


    1419 parsed papers found
Parsing pdfs:


100%|██████████| 310117/310117 [00:57<00:00, 5404.77it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366721/1366721 [00:21<00:00, 63107.61it/s]


    1402 parsed papers found
Parsing pdfs:


100%|██████████| 310957/310957 [00:57<00:00, 5408.45it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364952/1364952 [00:22<00:00, 62029.09it/s]


    1416 parsed papers found
Parsing pdfs:


100%|██████████| 309261/309261 [00:57<00:00, 5388.50it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364563/1364563 [00:21<00:00, 62192.38it/s]


    1400 parsed papers found
Parsing pdfs:


100%|██████████| 310149/310149 [00:57<00:00, 5367.68it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366518/1366518 [00:21<00:00, 62502.39it/s]


    1461 parsed papers found
Parsing pdfs:


100%|██████████| 310043/310043 [00:57<00:00, 5394.57it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365523/1365523 [00:21<00:00, 62544.05it/s]


    1490 parsed papers found
Parsing pdfs:


100%|██████████| 310695/310695 [00:57<00:00, 5359.14it/s]


Done


In [14]:
# Full dataset
download = DownloadBatch(data_url[80:]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365028/1365028 [00:21<00:00, 62725.94it/s]


    1359 parsed papers found
Parsing pdfs:


100%|██████████| 309423/309423 [00:57<00:00, 5423.71it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366906/1366906 [00:21<00:00, 62219.67it/s]


    1413 parsed papers found
Parsing pdfs:


100%|██████████| 312067/312067 [00:58<00:00, 5356.86it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367460/1367460 [00:21<00:00, 62404.05it/s]


    1481 parsed papers found
Parsing pdfs:


100%|██████████| 310768/310768 [00:58<00:00, 5323.69it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366204/1366204 [00:21<00:00, 63276.84it/s]


    1395 parsed papers found
Parsing pdfs:


100%|██████████| 310740/310740 [00:58<00:00, 5344.83it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363977/1363977 [00:21<00:00, 63153.19it/s]


    1414 parsed papers found
Parsing pdfs:


100%|██████████| 310006/310006 [00:57<00:00, 5405.94it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368014/1368014 [00:21<00:00, 62516.37it/s]


    1447 parsed papers found
Parsing pdfs:


100%|██████████| 310315/310315 [00:57<00:00, 5410.85it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364573/1364573 [00:21<00:00, 62776.58it/s]


    1432 parsed papers found
Parsing pdfs:


100%|██████████| 310670/310670 [00:57<00:00, 5419.37it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366777/1366777 [00:21<00:00, 63272.73it/s]


    1518 parsed papers found
Parsing pdfs:


100%|██████████| 310565/310565 [00:57<00:00, 5420.49it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366421/1366421 [00:21<00:00, 63407.26it/s]


    1456 parsed papers found
Parsing pdfs:


100%|██████████| 310581/310581 [00:57<00:00, 5414.24it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366131/1366131 [00:21<00:00, 62956.23it/s]


    1450 parsed papers found
Parsing pdfs:


100%|██████████| 310297/310297 [00:57<00:00, 5412.33it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365306/1365306 [00:22<00:00, 61818.70it/s]


    1410 parsed papers found
Parsing pdfs:


100%|██████████| 310532/310532 [00:57<00:00, 5364.51it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365782/1365782 [00:21<00:00, 62936.59it/s]


    1500 parsed papers found
Parsing pdfs:


100%|██████████| 309868/309868 [00:57<00:00, 5377.34it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367685/1367685 [00:21<00:00, 62850.48it/s]


    1417 parsed papers found
Parsing pdfs:


100%|██████████| 310975/310975 [00:57<00:00, 5381.12it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365016/1365016 [00:21<00:00, 63034.78it/s]


    1486 parsed papers found
Parsing pdfs:


100%|██████████| 309525/309525 [00:57<00:00, 5388.08it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366596/1366596 [00:21<00:00, 62873.86it/s]


    1436 parsed papers found
Parsing pdfs:


100%|██████████| 310024/310024 [00:57<00:00, 5374.24it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1369090/1369090 [00:21<00:00, 62822.60it/s]


    1413 parsed papers found
Parsing pdfs:


100%|██████████| 311381/311381 [00:57<00:00, 5408.94it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366006/1366006 [00:21<00:00, 62771.86it/s]


    1415 parsed papers found
Parsing pdfs:


100%|██████████| 310298/310298 [00:57<00:00, 5436.36it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364969/1364969 [00:21<00:00, 63571.43it/s]


    1418 parsed papers found
Parsing pdfs:


100%|██████████| 309381/309381 [00:57<00:00, 5399.49it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367399/1367399 [00:21<00:00, 62845.80it/s]


    1430 parsed papers found
Parsing pdfs:


100%|██████████| 310827/310827 [00:57<00:00, 5429.43it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367311/1367311 [00:21<00:00, 62922.92it/s]


    1462 parsed papers found
Parsing pdfs:


100%|██████████| 311466/311466 [00:57<00:00, 5378.88it/s]


Done
