# LaTeXpOsEd: Scraping Stage

In this stage, the manifest (map) of arXiv's AWS S3 bucket is downloaded and converted to an easily parsable JSON format. The order of its entries is also randomized for anonymity. Then the archives are downloaded and extracted one by one, and only the relevant content is saved, so as not to use too much disk space.

Before running this script:

- Install the AWS CLI tool on your local machine.
- Create an AWS account and set up your local configuration to use the account with an access key.

> ⚠️ Note that running this script will incur charges on your AWS account!

In [None]:
%pip install -q boto3 tqdm

In [None]:
import os
import json
import gzip
import random
import tarfile

import xml.etree.ElementTree as ET

import boto3
from tqdm import tqdm
from boto3.s3.transfer import S3Transfer, TransferConfig

In [None]:
# Manifest files: the XML manifest downloaded from arXiv's S3 bucket and its JSON conversion.
ARXIV_MANIFEST_JSON = 'data/arXiv_src_manifest.json'
ARXIV_MANIFEST_XML = 'data/arXiv_src_manifest.xml'
# Number of papers to download. The actual number will be slightly higher because
# whole archives are selected until the running total reaches this count.
TARGET_COUNT = 100_000
# Directory where the downloaded tar files are stored until they have been unpacked.
ARCHIVES_DIR = 'data/archives'
# Temporary directory where the files will be extracted. It's best to mount a memory disk here.
TMP_DIR = 'tmp'
# Final directory where the selected files will be stored. One JSON for each paper.
TARGET_DIR = 'data/final'

## Download and parse storage map

In [None]:
def download_requester_pays(bucket, key, download_path):
    """Download one object from a requester-pays S3 bucket to a local file.

    The request is sent with ``RequestPayer='requester'``, so the transfer is
    charged to the AWS account whose credentials boto3 picks up.

    Args:
        bucket: Name of the S3 bucket.
        key: Key of the object inside the bucket.
        download_path: Local path the object is written to. Missing parent
            directories are created first.
    """
    # boto3 does not create the destination folder and fails when it is missing.
    parent_dir = os.path.dirname(download_path)
    if parent_dir:
        os.makedirs(parent_dir, exist_ok=True)

    transfer = S3Transfer(boto3.client('s3'), config=TransferConfig(use_threads=True))
    transfer.download_file(
        bucket, key, download_path,
        extra_args={'RequestPayer': 'requester'}
    )

In [None]:
# Fetch arXiv's source manifest (XML) from the requester-pays 'arxiv' bucket
download_requester_pays(
    bucket='arxiv',
    key='src/arXiv_src_manifest.xml',
    download_path=ARXIV_MANIFEST_XML,
)

In [None]:
# Convert the XML manifest to JSON, with its entries in random order

# Read every <file> element into a flat dict keyed by the tags of its children
tree = ET.parse(ARXIV_MANIFEST_XML)
root = tree.getroot()
files = []
for file_elem in root.iterfind('file'):
    entry = {child.tag: child.text for child in file_elem}
    # All values arrive as text; the item count is needed as a number later on
    entry['num_items'] = int(entry['num_items'])
    files.append(entry)

# NOTE: a chronological order is the alternative to shuffling; it would sort on
# the 'timestamp' field (format: YYYY-MM-DD HH:MM:SS) instead.

# Randomize the order for anonymity
random.shuffle(files)

# Store the shuffled manifest as JSON
manifest_text = json.dumps(files, indent=2, ensure_ascii=False)
with open(ARXIV_MANIFEST_JSON, 'w', encoding='utf-8') as f:
    f.write(manifest_text)

## Download the archives

In [None]:
# One S3 client, created once and reused for every archive download
s3 = boto3.client('s3')


def s3_download_requester_pays(bucket, key, download_path):
    """Download ``key`` from ``bucket`` to ``download_path`` via the shared client.

    The request carries ``RequestPayer='requester'``, as needed for
    requester-pays buckets.
    """
    transfer_config = TransferConfig(use_threads=True)
    request_options = {'RequestPayer': 'requester'}
    S3Transfer(s3, config=transfer_config).download_file(
        bucket, key, download_path, extra_args=request_options
    )
    
def extract_latex_from_archive(archive_path):
    """Read the LaTeX sources out of one paper archive.

    The file is first read as a gzip-compressed tarball, collecting every
    regular member whose name ends in ``.tex`` (case-insensitive). If that
    fails, the file is read as a single gzip-compressed text file instead.

    Args:
        archive_path: Path of the ``.gz`` file to read.

    Returns:
        A dict mapping file names to their text, decoded as UTF-8 with
        undecodable bytes dropped. For the single-file fallback the only key
        is ``'____main.tex'``. ``None`` if the file can be read neither way;
        in that case both errors are printed.
    """
    try:  # Try extracting tar.gz content
        tex_data = {}
        with tarfile.open(archive_path, "r:gz") as tar:
            for member in tar.getmembers():
                if not (member.isfile() and member.name.lower().endswith('.tex')):
                    continue
                data = tar.extractfile(member)
                if data is not None:
                    tex_data[member.name] = data.read().decode('utf-8', errors='ignore')
        return tex_data
    except Exception as e:  # Deliberately broad: one bad archive must not stop the run
        # Not a readable tarball, so try it as a single gzip-compressed file.
        try:
            with gzip.open(archive_path, 'rt', encoding='utf-8', errors='ignore') as f:
                return {'____main.tex': f.read()}
        except Exception as e2:
            print(f"Failed to extract {archive_path}: {e}, {e2}")
            # Only report failure when the fallback failed as well.
            return None

In [None]:
selected_files = []
cumulative_items = 0

# Start from a clean tmp folder: drop leftover files (subfolders are left alone)
if os.path.exists(TMP_DIR):
    for leftover in os.listdir(TMP_DIR):
        leftover_path = os.path.join(TMP_DIR, leftover)
        if not os.path.isfile(leftover_path):
            continue
        os.remove(leftover_path)

# Read the manifest written by the conversion step
with open(ARXIV_MANIFEST_JSON, 'r', encoding='utf-8') as manifest:
    files = json.load(manifest)


# Take archives in manifest order until together they hold at least TARGET_COUNT items
for fileinfo in files:
    selected_files.append(fileinfo)
    cumulative_items += int(fileinfo['num_items'])
    if cumulative_items >= TARGET_COUNT:
        break
print(f'Selected {len(selected_files)} files, cumulative num_items={cumulative_items}')


os.makedirs(TMP_DIR, exist_ok=True)
os.makedirs(ARCHIVES_DIR, exist_ok=True)
# The per-paper JSON files are written here; open() does not create missing folders,
# so without this the first write fails after the archive has already been downloaded.
os.makedirs(TARGET_DIR, exist_ok=True)

# Process one archive at a time so only a single archive occupies the disk
for fileinfo in tqdm(selected_files, desc='Downloading files'):
    key = fileinfo['filename']  # "src/arXiv_src_2505_191.tar"
    filename = os.path.basename(key)
    download_path = os.path.join(ARCHIVES_DIR, filename)

    # Download from S3
    s3_download_requester_pays('arxiv', key, download_path)

    # Extract main archive
    with tarfile.open(download_path, 'r') as tar:
        for member in tar.getmembers():
            # Adjust the path to remove the top-level folder, so that every
            # member lands directly in TMP_DIR
            member.name = os.path.basename(member.name)
            tar.extract(member, path=TMP_DIR)
    # The outer archive is no longer needed once it has been unpacked
    os.remove(download_path)

    # Delete all folders and everything that is not a .gz file; keep the .gz files
    for item in os.listdir(TMP_DIR):
        item_path = os.path.join(TMP_DIR, item)
        if os.path.isdir(item_path):
            os.rmdir(item_path)
        elif not item.endswith('.gz'):
            os.remove(item_path)

    # Turn each remaining .gz file into one JSON file named after the paper
    for item in os.listdir(TMP_DIR):
        paper_id = item.removesuffix('.gz')
        item_path = os.path.join(TMP_DIR, item)
        tex_data = extract_latex_from_archive(item_path)
        # A failed extraction returns None, which is stored as JSON null
        with open(os.path.join(TARGET_DIR, f'{paper_id}.json'), 'w', encoding='utf-8') as f:
            json.dump(tex_data, f)
        os.remove(item_path)