# Download the S2ORC Dataset

This notebook processes the latest release (2020-07-05) of the [Semantic Scholar Open Research Corpus](https://www.aclweb.org/anthology/2020.acl-main.447). It is split into 100 uniformly shuffled shards. 

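For each shard, the notebook downloads the metadata and PDF-parse files, keeps only the computer science papers that have a parsed PDF body, writes the filtered records to the output folders, and then deletes the raw shard files to free disk space.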

In [7]:
import gzip
import io
import json
import os
import re
import urllib
import urllib.request

import numpy as np
import pandas as pd
from tqdm import tqdm

In [9]:
# # Open raw file
# with open("../dataset/full_urls.txt") as f:
#     lines = f.readlines()

# getlines = [line[:-1] for line in lines if line.startswith('wget')]
# getlines = getlines[2:]
# metalines = [str(line.split()[-1]) for line in getlines if line.find('meta') != -1]
# metalines = [line[1:-1] for line in metalines]
# pdflines = [str(line.split()[-1]) for line in getlines if line.find('meta') == -1]
# pdflines = [line[1:-1] for line in pdflines]

# df = pd.DataFrame(list(zip(metalines, pdflines)), columns=['metadata', 'pdf_parses'])
# df.to_csv(os.path.join(ROOT, 'ss_urls.csv'), index=None)
# df.head()

In [2]:
# Load the table of shard URLs (one row per shard; the "metadata" and
# "pdf_parses" columns are read later by DownloadBatch.create_batch).
data_url = pd.read_csv(filepath_or_buffer="../dataset/ss_urls.csv")
# Preview the first five rows as the cell output.
data_url.head(n=5)

Unnamed: 0,metadata,pdf_parses
0,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
1,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
2,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
3,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...
4,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...,https://ai2-s2-s2orc.s3.amazonaws.com/20200705...


### Download the data that we need

This part of the code is from HESCapstone.

In [3]:
# Folder layout used by the download/filter pipeline, all rooted at ROOT:
# raw shard downloads go to the ".../raw/" folders and the filtered
# records go to the ".../CS/" folders.
ROOT = '../dataset'
CLEAN_DATA = os.path.join(ROOT, 'SS/clean')
METADATA_INPUT_DIR = os.path.join(ROOT, 'SS/metadata/raw/')
METADATA_OUTPUT_DIR = os.path.join(ROOT, 'SS/metadata/CS/')
PDF_PARSES_INPUT_DIR = os.path.join(ROOT, 'SS/pdf_parses/raw/')
PDF_PARSES_OUTPUT_DIR = os.path.join(ROOT, 'SS/pdf_parses/CS/')

# Create every folder up front; exist_ok=True keeps the cell re-runnable.
for _required_dir in (
    CLEAN_DATA,
    METADATA_INPUT_DIR,
    METADATA_OUTPUT_DIR,
    PDF_PARSES_INPUT_DIR,
    PDF_PARSES_OUTPUT_DIR,
):
    os.makedirs(_required_dir, exist_ok=True)

In [15]:
class DownloadBatch:
    """Download S2ORC shards and keep only the papers of interest.

    For every row of the URL table the instance downloads the shard's
    metadata and pdf-parse files, writes the records that pass
    ``filter_metadata`` to the output folders and, optionally, deletes
    the raw downloads.

    Attributes:
        field: Field-of-study label a paper must carry to be kept.
        df: DataFrame with one row per shard and the columns
            ``metadata`` and ``pdf_parses`` holding the download URLs.
    """

    field = "Computer Science"

    def __init__(self, df):
        self.df = df

    @staticmethod
    def get_filename(url):
        """Return the file name of ``url`` with any ``?query`` part removed."""
        return os.path.basename(url.split("?")[0])

    def get_basename(self, url):
        """Return the file name of ``url`` up to (not including) the first dot."""
        return self.get_filename(url).split(".")[0]

    def get_shard_number(self, url):
        """Return the shard number encoded at the end of the URL's basename.

        All trailing digits are returned (as a string), so a basename such
        as ``metadata_42`` yields ``"42"`` rather than only its last
        character. When the basename does not end in a digit, its last
        character is returned (empty string for an empty basename).
        """
        basename = self.get_basename(url)
        trailing_digits = re.search(r"\d+$", basename)
        if trailing_digits:
            return trailing_digits.group(0)
        return basename[-1:]

    def create_batch(self, row):
        """Build the dict of URLs and local paths for one shard.

        Args:
            row: An ``(index, Series)`` pair as produced by
                ``DataFrame.iterrows()``; the Series must provide the
                ``metadata`` and ``pdf_parses`` URLs.

        Returns:
            dict with the keys ``number``, ``input_metadata_url``,
            ``input_metadata_path``, ``output_metadata_path``,
            ``input_pdf_parses_url``, ``input_pdf_parses_path`` and
            ``output_pdf_parses_path``.
        """
        input_metadata_url = row[1]["metadata"]
        input_pdf_parses_url = row[1]["pdf_parses"]
        metadata_filename = self.get_filename(input_metadata_url)
        pdf_parses_filename = self.get_filename(input_pdf_parses_url)
        # NOTE(review): the filtered outputs reuse the raw file names, but
        # filter_batch writes them with a plain open() (uncompressed) while
        # the inputs are read with gzip.open(). If the names end in ".gz"
        # the extension is misleading; left unchanged because readers
        # elsewhere may rely on these paths — confirm before renaming.
        batch = dict(
            number=self.get_shard_number(input_metadata_url),
            input_metadata_url=input_metadata_url,
            input_metadata_path=os.path.join(METADATA_INPUT_DIR, metadata_filename),
            output_metadata_path=os.path.join(METADATA_OUTPUT_DIR, metadata_filename),
            input_pdf_parses_url=input_pdf_parses_url,
            input_pdf_parses_path=os.path.join(
                PDF_PARSES_INPUT_DIR, pdf_parses_filename
            ),
            output_pdf_parses_path=os.path.join(
                PDF_PARSES_OUTPUT_DIR, pdf_parses_filename
            ),
        )
        return batch

    @staticmethod
    def _download_atomically(url, path):
        """Download ``url`` to ``path`` without ever leaving a partial file there.

        The data is first written to ``path + ".part"`` and only moved to
        ``path`` once the download finished, so an interrupted run cannot
        leave a truncated file that a later run would mistake for a
        completed download. Any download error is propagated unchanged.
        """
        partial_path = path + ".part"
        try:
            urllib.request.urlretrieve(url, partial_path)
            os.replace(partial_path, path)
        finally:
            # Only still present when the download or the move failed.
            if os.path.exists(partial_path):
                os.remove(partial_path)

    def download_data(self, batch: dict):
        """Download the metadata and the pdf parses for the shard.

        Files that already exist at their input path are not downloaded
        again.

        Args:
            batch: Shard description as returned by ``create_batch``.
        """
        print("Downloading raw data:")

        # Download metadata
        if not os.path.isfile(batch["input_metadata_path"]):
            print('    Downloading metadata...')
            self._download_atomically(
                batch["input_metadata_url"], batch["input_metadata_path"]
            )
            print('    Done')
        else:
            print('    Metadata already downloaded')

        # Download pdf parse
        if not os.path.isfile(batch["input_pdf_parses_path"]):
            print('    Downloading pdf parses...')
            self._download_atomically(
                batch["input_pdf_parses_url"], batch["input_pdf_parses_path"]
            )
            print('    Done')
        else:
            print('    PDF data already downloaded')

    def check_pdf_parse(self, metadata_dict):
        """Return True when the record's ``has_pdf_parsed_body_text`` is truthy."""
        return bool(metadata_dict.get("has_pdf_parsed_body_text"))

    def find_topics(self, metadata_dict: dict):
        """Return True when ``self.field`` is in the record's ``mag_field_of_study``.

        A missing, ``None`` or empty ``mag_field_of_study`` counts as
        "not in the field" instead of raising.
        """
        mag_field_of_study = metadata_dict.get("mag_field_of_study")
        return bool(mag_field_of_study and self.field in mag_field_of_study)

    def filter_metadata(self, metadata_dict):
        """Return True for records that have a pdf parse and are in the field."""
        return self.check_pdf_parse(metadata_dict) and self.find_topics(metadata_dict)

    def filter_batch(self, batch: dict):
        """Download one shard and write its filtered records to the output paths.

        The metadata file is filtered first; the ``paper_id`` values of the
        kept records then select the matching lines of the pdf-parse file.
        Both inputs are streamed line by line so a whole decompressed
        shard is never held in memory (the progress bars therefore show a
        running count without a total).

        Args:
            batch: Shard description as returned by ``create_batch``.
        """
        number = batch['number']
        print(f'\nProcessing shard {number}')
        self.download_data(batch)

        print("Filtering metadata:")
        paper_ids_to_keep = set()
        with gzip.open(batch["input_metadata_path"], "rb") as gz, open(
            batch["output_metadata_path"], "wb"
        ) as f_out:
            for line in tqdm(io.BufferedReader(gz)):
                metadata_dict = json.loads(line)
                if self.filter_metadata(metadata_dict):
                    paper_ids_to_keep.add(metadata_dict.get("paper_id"))
                    # Write the original bytes so kept records stay unmodified.
                    f_out.write(line)

        print(f'    {len(paper_ids_to_keep)} parsed papers found')

        print("Parsing pdfs:")
        with gzip.open(batch['input_pdf_parses_path'], 'rb') as gz, open(
            batch['output_pdf_parses_path'], 'wb'
        ) as f_out:
            for line in tqdm(io.BufferedReader(gz)):
                pdf_parse_dict = json.loads(line)
                if pdf_parse_dict['paper_id'] in paper_ids_to_keep:
                    f_out.write(line)
        print('Done')

    def cleanup(self, batch):
        """Delete the raw files to clear up space for other shards."""
        os.remove(batch['input_metadata_path'])
        os.remove(batch['input_pdf_parses_path'])

    def __call__(self, cleanup=True):
        """Process every shard listed in ``self.df``.

        Args:
            cleanup: When True, delete each shard's raw downloads after
                it has been filtered.
        """
        for row in self.df.iterrows():
            batch = self.create_batch(row)
            self.filter_batch(batch)
            if cleanup:
                self.cleanup(batch)

In [16]:
# Full dataset
download = DownloadBatch(data_url)
download(cleanup=True)


Processing shard 0
Downloading raw data:
    Downloading metadata...


HTTPError: HTTP Error 403: Forbidden