# Download the S2ORC Dataset

This notebook processes the latest release (2020-07-05) of the [Semantic Scholar Open Research Corpus](https://www.aclweb.org/anthology/2020.acl-main.447). It is split into 100 uniformly shuffled shards. 

The script will download each shard, filter computer science papers from it and extract the required data, then delete the superfluous data files.

In [1]:
import os
import subprocess
import urllib
import gzip
import io
import json
import re
from tqdm import tqdm
import pandas as pd
import numpy as np

In [2]:
# # Open raw file
# with open("../dataset/full_urls.txt") as f:
#     lines = f.readlines()
# 
# getlines = [line[:-1] for line in lines if line.startswith('wget')]
# getlines = getlines[2:]
# metalines = [str(line.split()[-1]) for line in getlines if line.find('meta') != -1]
# metalines = [line[1:-1] for line in metalines]
# pdflines = [str(line.split()[-1]) for line in getlines if line.find('meta') == -1]
# pdflines = [line[1:-1] for line in pdflines]

# df = pd.DataFrame(list(zip(metalines, pdflines)), columns=['metadata', 'pdf_parses'])
# df.to_csv('../dataset/ss_urls.csv', index=None)
# df.head()

In [2]:
data_url = pd.read_csv("../dataset/ss_urls.csv")
data_url.head()

Unnamed: 0,metadata,pdf_parses
0,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
1,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
2,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
3,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
4,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...


In [3]:
len(data_url)

100

### Download the data that we need

This part of codes are from HESCapstone

In [4]:
# Create required folder structure
ROOT = '../dataset'
CLEAN_DATA = os.path.join(ROOT, 'SS/clean')
METADATA_INPUT_DIR = os.path.join(ROOT,'SS/metadata/raw/') 
METADATA_OUTPUT_DIR = os.path.join(ROOT,'SS/metadata/CS/')
PDF_PARSES_INPUT_DIR = os.path.join(ROOT,'SS/pdf_parses/raw/')
PDF_PARSES_OUTPUT_DIR = os.path.join(ROOT,'SS/pdf_parses/CS/')

os.makedirs(CLEAN_DATA, exist_ok=True)
os.makedirs(METADATA_INPUT_DIR, exist_ok=True)
os.makedirs(METADATA_OUTPUT_DIR, exist_ok=True)
os.makedirs(PDF_PARSES_INPUT_DIR, exist_ok=True)
os.makedirs(PDF_PARSES_OUTPUT_DIR, exist_ok=True)

In [5]:
class DownloadBatch:
    """Filter raw SS dataset to extract only the papers we need"""

    field = "Computer Science"

    def __init__(self, df):
        self.df = df

    @staticmethod
    def get_filename(url):
        return os.path.basename(url.split("?")[0])

    def get_basename(self, url):
        filename = self.get_filename(url).split(".")[0]
        return filename
    
    
    def create_batch(self, row):
        """Create links for each row."""
        input_metadata_url = row[1]["metadata"]
        input_pdf_parses_url = row[1]["pdf_parses"]
        batch = dict(
            number=self.get_basename(input_metadata_url)[-1],
            input_metadata_url=input_metadata_url,
            input_metadata_path=os.path.join(
                METADATA_INPUT_DIR, self.get_filename(input_metadata_url)
            ),
            output_metadata_path=os.path.join(
                METADATA_OUTPUT_DIR, self.get_filename(input_metadata_url)
            )[:-3],
            input_pdf_parses_url=input_pdf_parses_url,
            input_pdf_parses_path=os.path.join(
                PDF_PARSES_INPUT_DIR, self.get_filename(input_pdf_parses_url)
            ),
            output_pdf_parses_path=os.path.join(
                PDF_PARSES_OUTPUT_DIR,
                self.get_filename(input_pdf_parses_url),
            )[:-3],
        )
        return batch

    def download_data(self, batch: dict):
        """Download the metadata and the pdf parses for the shard"""

        # Download metadata
        print(f"Downloading raw data:")
        
        if not os.path.isfile(batch["input_metadata_path"]):
            print('    Downloading metadata...')
            urllib.request.urlretrieve(batch["input_metadata_url"], batch["input_metadata_path"])
            print('    Done')
        else:
            print('    Metadata already downloaded')
        
        # Download pdf parse
        if not os.path.isfile(batch["input_pdf_parses_path"]):
            print('    Downloading pdf parses...')
            urllib.request.urlretrieve(batch["input_pdf_parses_url"], batch["input_pdf_parses_path"])
            print('    Done')
        else:
            print('    PDF data already downloaded')
        

    def check_pdf_parse(self, metadata_dict):
        """Only keep files that have a corresponding pdf parse"""
        if metadata_dict.get("has_pdf_parsed_body_text"):
            return True
        return False

    def find_topics(self, metadata_dict: dict):
        """Return papers that are in the field of interest"""
        mag_field_of_study = metadata_dict["mag_field_of_study"]
        if mag_field_of_study and self.field in mag_field_of_study:
            return True
        return False

    def filter_metadata(self, metadata_dict):
        """Filter papers of interest"""
        # Filter papers with no pdf parse
        if self.check_pdf_parse(metadata_dict):
            # Filter to selected topic
            if self.find_topics(metadata_dict):
                return True    
        return False

    def filter_batch(self, batch: dict):
        """Download raw data and filter papers of interest"""
        number = batch['number']
        print(f'\nProcessing shard {number}')
        self.download_data(batch)

        print("Filtering metadata:")
        paper_ids_to_keep = set()
        with gzip.open(batch["input_metadata_path"], "rb") as gz, open(batch["output_metadata_path"], "wb") as f_out:
            f = io.BufferedReader(gz)
            for line in tqdm(f.readlines()):
                metadata_dict = json.loads(line)
                if self.filter_metadata(metadata_dict):
                    paper_id = metadata_dict.get("paper_id")
                    paper_ids_to_keep.add(paper_id)
                    f_out.write(line)                
                
        print(f'    {len(paper_ids_to_keep)} parsed papers found')
        
        print("Parsing pdfs:")
        with gzip.open(batch['input_pdf_parses_path'], 'rb') as gz, open(batch['output_pdf_parses_path'], 'wb') as f_out:
            f = io.BufferedReader(gz)
            for line in tqdm(f.readlines()):
                metadata_dict = json.loads(line)
                paper_id = metadata_dict['paper_id']
                if paper_id in paper_ids_to_keep:
                    f_out.write(line)
        print('Done')
                 
    def cleanup(self, batch):
        """Delete the raw files to clear up space for other shards"""
        os.remove(batch['input_metadata_path'])
        os.remove(batch['input_pdf_parses_path'])

    def __call__(self, cleanup=True):
        for row in self.df.iterrows():
            batch = self.create_batch(row)
            self.filter_batch(batch)
            if cleanup:
                self.cleanup(batch)

In [6]:
# Full dataset
download = DownloadBatch(data_url[:20])
download(cleanup=True)


Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366661/1366661 [00:21<00:00, 63337.03it/s]


    15018 parsed papers found
Parsing pdfs:


100%|██████████| 310736/310736 [00:57<00:00, 5436.78it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365929/1365929 [00:22<00:00, 61647.38it/s]


    14997 parsed papers found
Parsing pdfs:


100%|██████████| 310316/310316 [00:57<00:00, 5430.22it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365724/1365724 [00:22<00:00, 61661.75it/s]


    14824 parsed papers found
Parsing pdfs:


100%|██████████| 311205/311205 [00:56<00:00, 5468.01it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364981/1364981 [00:22<00:00, 61522.91it/s]


    14784 parsed papers found
Parsing pdfs:


100%|██████████| 309634/309634 [00:56<00:00, 5443.42it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368337/1368337 [00:22<00:00, 59549.41it/s]


    14745 parsed papers found
Parsing pdfs:


100%|██████████| 310881/310881 [00:57<00:00, 5434.30it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365293/1365293 [00:22<00:00, 60702.21it/s]


    14704 parsed papers found
Parsing pdfs:


100%|██████████| 310782/310782 [00:56<00:00, 5474.81it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366174/1366174 [00:22<00:00, 61183.06it/s]


    14640 parsed papers found
Parsing pdfs:


100%|██████████| 310864/310864 [00:57<00:00, 5453.42it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365392/1365392 [00:23<00:00, 58500.89it/s]


    14880 parsed papers found
Parsing pdfs:


100%|██████████| 310734/310734 [01:00<00:00, 5165.10it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365591/1365591 [00:23<00:00, 58334.70it/s]


    14945 parsed papers found
Parsing pdfs:


100%|██████████| 310129/310129 [00:59<00:00, 5180.28it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365273/1365273 [00:23<00:00, 58972.36it/s]


    14699 parsed papers found
Parsing pdfs:


100%|██████████| 309474/309474 [01:00<00:00, 5091.10it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366666/1366666 [00:23<00:00, 57610.51it/s]


    14963 parsed papers found
Parsing pdfs:


100%|██████████| 310604/310604 [00:59<00:00, 5187.06it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365681/1365681 [00:23<00:00, 57086.16it/s]


    14887 parsed papers found
Parsing pdfs:


100%|██████████| 310565/310565 [01:00<00:00, 5098.33it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365718/1365718 [00:23<00:00, 58046.71it/s]


    14866 parsed papers found
Parsing pdfs:


100%|██████████| 310539/310539 [00:59<00:00, 5203.56it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364615/1364615 [00:23<00:00, 58108.83it/s]


    15016 parsed papers found
Parsing pdfs:


100%|██████████| 310218/310218 [01:01<00:00, 5058.49it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366657/1366657 [00:23<00:00, 58566.22it/s]


    15079 parsed papers found
Parsing pdfs:


100%|██████████| 311241/311241 [01:01<00:00, 5038.87it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364853/1364853 [00:24<00:00, 56334.81it/s]


    15076 parsed papers found
Parsing pdfs:


100%|██████████| 310683/310683 [01:02<00:00, 4997.06it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364882/1364882 [00:22<00:00, 59430.83it/s]


    14890 parsed papers found
Parsing pdfs:


100%|██████████| 310258/310258 [01:00<00:00, 5158.77it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365496/1365496 [00:23<00:00, 58345.54it/s]


    14779 parsed papers found
Parsing pdfs:


100%|██████████| 310260/310260 [00:59<00:00, 5201.34it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365958/1365958 [00:24<00:00, 56327.65it/s]


    14964 parsed papers found
Parsing pdfs:


100%|██████████| 310222/310222 [01:02<00:00, 4974.85it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365290/1365290 [00:23<00:00, 58418.05it/s]


    14743 parsed papers found
Parsing pdfs:


100%|██████████| 309829/309829 [00:59<00:00, 5198.54it/s]


Done


In [7]:
# Full dataset
download = DownloadBatch(data_url[20:40]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365957/1365957 [00:23<00:00, 58546.24it/s]


    14879 parsed papers found
Parsing pdfs:


100%|██████████| 310716/310716 [01:00<00:00, 5124.88it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367851/1367851 [00:23<00:00, 58556.33it/s]


    15098 parsed papers found
Parsing pdfs:


100%|██████████| 311314/311314 [01:00<00:00, 5182.65it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364650/1364650 [00:23<00:00, 57976.44it/s]


    14800 parsed papers found
Parsing pdfs:


100%|██████████| 311174/311174 [01:00<00:00, 5141.70it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365344/1365344 [00:23<00:00, 57652.67it/s]


    14581 parsed papers found
Parsing pdfs:


100%|██████████| 309277/309277 [01:00<00:00, 5141.22it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366402/1366402 [00:23<00:00, 57854.63it/s]


    14901 parsed papers found
Parsing pdfs:


100%|██████████| 310504/310504 [00:59<00:00, 5213.55it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364991/1364991 [00:23<00:00, 58540.80it/s]


    14842 parsed papers found
Parsing pdfs:


100%|██████████| 309424/309424 [00:59<00:00, 5206.10it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366240/1366240 [00:23<00:00, 58246.40it/s]


    14869 parsed papers found
Parsing pdfs:


100%|██████████| 311452/311452 [01:00<00:00, 5177.31it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364205/1364205 [00:22<00:00, 59317.16it/s]


    14790 parsed papers found
Parsing pdfs:


100%|██████████| 309855/309855 [00:57<00:00, 5421.72it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366174/1366174 [00:22<00:00, 60253.15it/s]


    14983 parsed papers found
Parsing pdfs:


100%|██████████| 310380/310380 [00:57<00:00, 5431.54it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365819/1365819 [00:22<00:00, 60680.59it/s]


    14901 parsed papers found
Parsing pdfs:


100%|██████████| 310060/310060 [00:56<00:00, 5445.86it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364712/1364712 [00:22<00:00, 59504.08it/s]


    14743 parsed papers found
Parsing pdfs:


100%|██████████| 309894/309894 [00:57<00:00, 5352.73it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366882/1366882 [00:23<00:00, 58933.09it/s]


    14833 parsed papers found
Parsing pdfs:


100%|██████████| 311391/311391 [00:58<00:00, 5321.76it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366304/1366304 [00:23<00:00, 59308.08it/s]


    14638 parsed papers found
Parsing pdfs:


100%|██████████| 310803/310803 [00:58<00:00, 5292.03it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365752/1365752 [00:22<00:00, 59475.90it/s]


    15027 parsed papers found
Parsing pdfs:


100%|██████████| 310858/310858 [00:58<00:00, 5327.49it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365708/1365708 [00:22<00:00, 59669.81it/s]


    14980 parsed papers found
Parsing pdfs:


100%|██████████| 309825/309825 [00:58<00:00, 5307.15it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366284/1366284 [00:22<00:00, 60053.25it/s]


    14926 parsed papers found
Parsing pdfs:


100%|██████████| 311292/311292 [00:58<00:00, 5352.17it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365538/1365538 [00:22<00:00, 59527.72it/s]


    14862 parsed papers found
Parsing pdfs:


100%|██████████| 309633/309633 [00:57<00:00, 5350.33it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366943/1366943 [00:22<00:00, 59880.18it/s]


    14934 parsed papers found
Parsing pdfs:


100%|██████████| 312126/312126 [00:58<00:00, 5323.68it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364493/1364493 [00:22<00:00, 59685.35it/s]


    14828 parsed papers found
Parsing pdfs:


100%|██████████| 309773/309773 [00:58<00:00, 5330.67it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368071/1368071 [00:22<00:00, 59786.06it/s]


    14707 parsed papers found
Parsing pdfs:


100%|██████████| 310154/310154 [00:58<00:00, 5315.43it/s]


Done


In [8]:
# Full dataset
download = DownloadBatch(data_url[40:60]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364806/1364806 [00:22<00:00, 59688.42it/s]


    14877 parsed papers found
Parsing pdfs:


100%|██████████| 310657/310657 [00:58<00:00, 5326.30it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363787/1363787 [00:22<00:00, 59576.87it/s]


    14796 parsed papers found
Parsing pdfs:


100%|██████████| 310226/310226 [00:56<00:00, 5472.21it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367766/1367766 [00:22<00:00, 60755.65it/s]


    14824 parsed papers found
Parsing pdfs:


100%|██████████| 310199/310199 [00:57<00:00, 5370.75it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366701/1366701 [00:22<00:00, 61099.14it/s]


    15008 parsed papers found
Parsing pdfs:


100%|██████████| 311127/311127 [00:57<00:00, 5456.77it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364986/1364986 [00:22<00:00, 60136.94it/s]


    14863 parsed papers found
Parsing pdfs:


100%|██████████| 310460/310460 [00:56<00:00, 5479.25it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368101/1368101 [00:22<00:00, 60540.74it/s]


    14948 parsed papers found
Parsing pdfs:


100%|██████████| 310763/310763 [00:57<00:00, 5405.80it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365328/1365328 [00:22<00:00, 60481.70it/s]


    14833 parsed papers found
Parsing pdfs:


100%|██████████| 310409/310409 [00:56<00:00, 5477.06it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364619/1364619 [00:22<00:00, 60592.16it/s]


    14992 parsed papers found
Parsing pdfs:


100%|██████████| 309849/309849 [00:56<00:00, 5468.19it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366986/1366986 [00:22<00:00, 60741.18it/s]


    14845 parsed papers found
Parsing pdfs:


100%|██████████| 310533/310533 [00:56<00:00, 5479.18it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365501/1365501 [00:22<00:00, 61104.57it/s]


    14769 parsed papers found
Parsing pdfs:


100%|██████████| 310481/310481 [00:56<00:00, 5471.72it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368897/1368897 [00:22<00:00, 61452.31it/s]


    15015 parsed papers found
Parsing pdfs:


100%|██████████| 312397/312397 [00:56<00:00, 5498.34it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366229/1366229 [00:22<00:00, 61338.89it/s]


    15323 parsed papers found
Parsing pdfs:


100%|██████████| 310917/310917 [00:56<00:00, 5467.99it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366634/1366634 [00:22<00:00, 61383.59it/s]


    15120 parsed papers found
Parsing pdfs:


100%|██████████| 310667/310667 [00:56<00:00, 5487.28it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366568/1366568 [00:22<00:00, 61458.42it/s]


    14827 parsed papers found
Parsing pdfs:


100%|██████████| 310224/310224 [00:56<00:00, 5508.18it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366043/1366043 [00:22<00:00, 61420.58it/s]


    15071 parsed papers found
Parsing pdfs:


100%|██████████| 310018/310018 [00:56<00:00, 5498.44it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365926/1365926 [00:22<00:00, 61107.26it/s]


    14966 parsed papers found
Parsing pdfs:


100%|██████████| 310333/310333 [00:56<00:00, 5471.19it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363968/1363968 [00:22<00:00, 61167.04it/s]


    14832 parsed papers found
Parsing pdfs:


100%|██████████| 310014/310014 [00:56<00:00, 5496.58it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366304/1366304 [00:22<00:00, 60255.90it/s]


    14722 parsed papers found
Parsing pdfs:


100%|██████████| 310793/310793 [00:58<00:00, 5315.81it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367750/1367750 [00:22<00:00, 59959.77it/s]


    14834 parsed papers found
Parsing pdfs:


100%|██████████| 310838/310838 [00:58<00:00, 5348.59it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364545/1364545 [00:22<00:00, 59767.87it/s]


    14911 parsed papers found
Parsing pdfs:


100%|██████████| 311346/311346 [00:58<00:00, 5364.38it/s]


Done


In [9]:
# Full dataset
download = DownloadBatch(data_url[60:80]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364259/1364259 [00:22<00:00, 59682.05it/s]


    14857 parsed papers found
Parsing pdfs:


100%|██████████| 310521/310521 [00:58<00:00, 5333.52it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366142/1366142 [00:22<00:00, 59469.14it/s]


    14822 parsed papers found
Parsing pdfs:


100%|██████████| 310379/310379 [00:56<00:00, 5459.00it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368220/1368220 [00:22<00:00, 61445.72it/s]


    15022 parsed papers found
Parsing pdfs:


100%|██████████| 310904/310904 [00:57<00:00, 5404.53it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367162/1367162 [00:22<00:00, 60828.58it/s]


    14978 parsed papers found
Parsing pdfs:


100%|██████████| 309975/309975 [00:57<00:00, 5390.36it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365948/1365948 [00:22<00:00, 61143.99it/s]


    14784 parsed papers found
Parsing pdfs:


100%|██████████| 311073/311073 [00:57<00:00, 5436.52it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364858/1364858 [00:22<00:00, 61133.59it/s]


    14783 parsed papers found
Parsing pdfs:


100%|██████████| 310443/310443 [00:57<00:00, 5435.09it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366646/1366646 [00:22<00:00, 60870.19it/s]


    15076 parsed papers found
Parsing pdfs:


100%|██████████| 310376/310376 [00:57<00:00, 5427.92it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363902/1363902 [00:22<00:00, 60985.36it/s]


    14759 parsed papers found
Parsing pdfs:


100%|██████████| 310371/310371 [00:56<00:00, 5449.99it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365028/1365028 [00:22<00:00, 60730.56it/s]


    14779 parsed papers found
Parsing pdfs:


100%|██████████| 310230/310230 [00:57<00:00, 5368.56it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368314/1368314 [00:22<00:00, 60489.02it/s]


    14955 parsed papers found
Parsing pdfs:


100%|██████████| 311580/311580 [00:57<00:00, 5464.77it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365873/1365873 [00:22<00:00, 61056.99it/s]


    14961 parsed papers found
Parsing pdfs:


100%|██████████| 310252/310252 [00:56<00:00, 5455.71it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366562/1366562 [00:22<00:00, 61301.60it/s]


    14882 parsed papers found
Parsing pdfs:


100%|██████████| 311205/311205 [00:56<00:00, 5465.71it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365774/1365774 [00:22<00:00, 61243.86it/s]


    14783 parsed papers found
Parsing pdfs:


100%|██████████| 310169/310169 [00:56<00:00, 5478.36it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364113/1364113 [00:22<00:00, 61033.14it/s]


    14988 parsed papers found
Parsing pdfs:


100%|██████████| 310368/310368 [00:56<00:00, 5450.90it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363330/1363330 [00:23<00:00, 58617.46it/s]


    14813 parsed papers found
Parsing pdfs:


100%|██████████| 310117/310117 [00:57<00:00, 5389.44it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366721/1366721 [00:22<00:00, 60858.66it/s]


    14856 parsed papers found
Parsing pdfs:


100%|██████████| 310957/310957 [00:56<00:00, 5489.57it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364952/1364952 [00:22<00:00, 61195.52it/s]


    14805 parsed papers found
Parsing pdfs:


100%|██████████| 309261/309261 [00:56<00:00, 5443.75it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364563/1364563 [00:22<00:00, 61314.50it/s]


    14855 parsed papers found
Parsing pdfs:


100%|██████████| 310149/310149 [00:57<00:00, 5421.14it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366518/1366518 [00:22<00:00, 60120.03it/s]


    14969 parsed papers found
Parsing pdfs:


100%|██████████| 310043/310043 [00:56<00:00, 5447.02it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365523/1365523 [00:23<00:00, 58688.20it/s]


    14762 parsed papers found
Parsing pdfs:


100%|██████████| 310695/310695 [00:57<00:00, 5403.44it/s]


Done


In [10]:
# Full dataset
download = DownloadBatch(data_url[80:]) # For test, use data_url to get full dataset
download(cleanup=True)


Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365028/1365028 [00:22<00:00, 60676.44it/s]


    14803 parsed papers found
Parsing pdfs:


100%|██████████| 309423/309423 [00:56<00:00, 5453.79it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366906/1366906 [00:22<00:00, 60833.73it/s]


    14754 parsed papers found
Parsing pdfs:


100%|██████████| 312067/312067 [00:57<00:00, 5439.19it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367460/1367460 [00:22<00:00, 60782.71it/s]


    14870 parsed papers found
Parsing pdfs:


100%|██████████| 310768/310768 [00:57<00:00, 5443.53it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366204/1366204 [00:22<00:00, 61326.32it/s]


    14842 parsed papers found
Parsing pdfs:


100%|██████████| 310740/310740 [00:56<00:00, 5465.15it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1363977/1363977 [00:22<00:00, 61449.21it/s]


    14936 parsed papers found
Parsing pdfs:


100%|██████████| 310006/310006 [00:56<00:00, 5473.19it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1368014/1368014 [00:22<00:00, 60538.22it/s]


    14992 parsed papers found
Parsing pdfs:


100%|██████████| 310315/310315 [00:56<00:00, 5446.90it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364573/1364573 [00:22<00:00, 60159.51it/s]


    14928 parsed papers found
Parsing pdfs:


100%|██████████| 310670/310670 [00:56<00:00, 5476.36it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366777/1366777 [00:22<00:00, 61255.83it/s]


    14971 parsed papers found
Parsing pdfs:


100%|██████████| 310565/310565 [00:56<00:00, 5469.41it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366421/1366421 [00:22<00:00, 60405.98it/s]


    14776 parsed papers found
Parsing pdfs:


100%|██████████| 310581/310581 [00:56<00:00, 5479.36it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366131/1366131 [00:22<00:00, 60847.91it/s]


    14843 parsed papers found
Parsing pdfs:


100%|██████████| 310297/310297 [00:56<00:00, 5490.01it/s]


Done

Processing shard 0
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365306/1365306 [00:22<00:00, 60215.71it/s]


    14973 parsed papers found
Parsing pdfs:


100%|██████████| 310532/310532 [00:57<00:00, 5437.79it/s]


Done

Processing shard 1
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365782/1365782 [00:22<00:00, 60808.74it/s]


    14713 parsed papers found
Parsing pdfs:


100%|██████████| 309868/309868 [00:56<00:00, 5436.69it/s]


Done

Processing shard 2
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367685/1367685 [00:22<00:00, 60891.96it/s]


    14907 parsed papers found
Parsing pdfs:


100%|██████████| 310975/310975 [00:56<00:00, 5500.64it/s]


Done

Processing shard 3
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1365016/1365016 [00:22<00:00, 61006.31it/s]


    14642 parsed papers found
Parsing pdfs:


100%|██████████| 309525/309525 [00:56<00:00, 5483.32it/s]


Done

Processing shard 4
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366596/1366596 [00:22<00:00, 61365.62it/s]


    14866 parsed papers found
Parsing pdfs:


100%|██████████| 310024/310024 [00:56<00:00, 5448.21it/s]


Done

Processing shard 5
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1369090/1369090 [00:22<00:00, 61490.67it/s]


    15054 parsed papers found
Parsing pdfs:


100%|██████████| 311381/311381 [00:56<00:00, 5496.67it/s]


Done

Processing shard 6
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1366006/1366006 [00:22<00:00, 60704.70it/s]


    14856 parsed papers found
Parsing pdfs:


100%|██████████| 310298/310298 [00:56<00:00, 5473.04it/s]


Done

Processing shard 7
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1364969/1364969 [00:22<00:00, 60754.63it/s]


    14601 parsed papers found
Parsing pdfs:


100%|██████████| 309381/309381 [00:56<00:00, 5452.24it/s]


Done

Processing shard 8
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367399/1367399 [00:22<00:00, 61042.15it/s]


    14835 parsed papers found
Parsing pdfs:


100%|██████████| 310827/310827 [00:56<00:00, 5476.56it/s]


Done

Processing shard 9
Downloading raw data:
    Downloading metadata...
    Done
    Downloading pdf parses...
    Done
Filtering metadata:


100%|██████████| 1367311/1367311 [00:22<00:00, 61100.86it/s]


    15064 parsed papers found
Parsing pdfs:


100%|██████████| 311466/311466 [00:57<00:00, 5432.60it/s]


Done
