In [1]:
import time
import urllib.request
import urllib.parse
import sys

# Endpoint of the arXiv query API on the export.arxiv.org host.
BASE = "http://export.arxiv.org/api/query"
# Client identification string with a contact address; presumably meant to be
# sent as the User-Agent header (no use is visible in this section — confirm).
UA = "AsciiHydraArxivClient/0.1 (+mailto:hernan.picatto@ascii.ac.at)"

In [2]:
import boto3
import polars as pl
import botocore
import tarfile
import os
from io import BytesIO
from pypdf import PdfReader, PdfWriter

# S3 client authenticated with the key pair held in the ASCII_* environment
# variables; an unset variable is passed through as None.
client = boto3.client(
    's3',
    aws_access_key_id=os.environ.get('ASCII_AWS_ACCESS_KEY_ID'),
    aws_secret_access_key=os.environ.get('ASCII_AWS_SECRET_ACCESS_KEY'),
)

In [None]:
# Bucket name and the key prefix shared by the "25" PDF tarballs.
BUCKET = "arxiv"
PREFIX = "pdf/arXiv_pdf_25"


def tar_key(i: int, m: int) -> str:
    """Build the S3 key of one tar partition.

    Args:
        i: Partition index within the month, zero-padded to three digits.
        m: Month number, zero-padded to two digits.

    Returns:
        ``PREFIX`` followed by ``<mm>_<iii>.tar``; for example
        ``tar_key(7, 3)`` gives ``"pdf/arXiv_pdf_2503_007.tar"``.
    """
    month_part = format(m, "02d")
    index_part = format(i, "03d")
    return PREFIX + month_part + "_" + index_part + ".tar"

# Discover which monthly tar partitions exist by probing every candidate key
# with a HEAD request (the bucket is requester-pays, so each probe is billed).
#
# Stop probing a month after this many consecutive 404s instead of always
# walking the full index range. This assumes partition numbers within a month
# are contiguous (TODO confirm against the arXiv bulk-data manifest); set it
# to 999 to restore exhaustive probing.
MAX_CONSECUTIVE_MISSES = 10

# Explicit schema so that an empty result is still a valid, typed table:
# pl.from_dicts([]) without a schema raises because nothing can be inferred.
PARTITIONS_SCHEMA = {"year": pl.Int64, "month": pl.Int64, "partitions": pl.Utf8}

existing_keys = []
for m in range(1, 11):
    consecutive_misses = 0
    # Upper bound 1000 so that the last three-digit index (999) is probed too.
    for i in range(1, 1000):
        key = tar_key(i, m)
        try:
            client.head_object(Bucket=BUCKET, Key=key, RequestPayer="requester")
        except botocore.exceptions.ClientError as e:
            # Only "object not found" is expected; anything else (auth,
            # throttling, ...) is a real failure and must surface.
            if e.response["Error"]["Code"] != "404":
                raise
            consecutive_misses += 1
            if consecutive_misses >= MAX_CONSECUTIVE_MISSES:
                break
            continue
        consecutive_misses = 0
        existing_keys.append({"year": 2025, "month": m, "partitions": key})
    # Checkpoint after every month so an interrupted run keeps what was found.
    pl.from_dicts(existing_keys, schema=PARTITIONS_SCHEMA).write_parquet('../../../data/partitions_2025.parquet')

In [None]:
# Load the partition index from the parquet file written by the discovery loop.
index = pl.read_parquet('../../../data/partitions_2025.parquet')
# Last expression of the cell: shows the first rows as a sanity check.
index.head()

In [None]:
# Keep the three largest partition keys (string ordering on the 'partitions'
# column) and list the local file name each one will be stored under.
to_download = index.top_k(3, by='partitions').get_column('partitions').to_list()
for element in to_download:
    # Second '/'-separated component of the key, i.e. the tar file name.
    print(element.split('/', 2)[1])

In [None]:
# Download every selected tarball into the working directory.
#
# - A file that is already present with the same size as the remote object is
#   skipped, so re-running the cell does not pay for the transfer again.
# - Data is streamed into "<name>.part" and renamed only once complete, so an
#   interrupted run never leaves a truncated archive under the final name.
# - The streaming body is always closed to release the HTTP connection.
for element in to_download:
    tar_name = element.split('/')[1]

    remote_size = client.head_object(
        Bucket="arxiv",
        Key=element,
        RequestPayer="requester",
    )["ContentLength"]
    if os.path.exists(tar_name) and os.path.getsize(tar_name) == remote_size:
        continue

    resp = client.get_object(
        Bucket="arxiv",
        Key=element,
        RequestPayer="requester",
    )
    partial_name = tar_name + ".part"
    body = resp["Body"]
    try:
        with open(partial_name, "wb") as f:
            for chunk in body.iter_chunks():
                f.write(chunk)
    finally:
        body.close()
    # Atomic rename: the final name only ever refers to a complete download.
    os.replace(partial_name, tar_name)


In [None]:
# Extract the text of the first page of every file stored in the downloaded
# tarballs. One record is produced per archive member that has file content;
# 'first_page' is None when the member cannot be parsed as a PDF, has no
# pages, or text extraction fails.
pdfs = []
for element in to_download:
    tar_file = element.split('/')[1]
    with tarfile.open(tar_file, mode="r:*") as tar:
        # Iterate TarInfo objects directly and hand them to extractfile():
        # passing the name instead triggers a by-name lookup per member.
        for member in tar:
            pdf_file = tar.extractfile(member)
            if pdf_file is None:
                # Directories and other members without data are not PDFs;
                # skip them instead of recording a bogus failed row.
                continue
            try:
                pdf_bytes = pdf_file.read()
                reader = PdfReader(BytesIO(pdf_bytes))
                if len(reader.pages) == 0:
                    raise ValueError("PDF has no pages")
                first_page = reader.get_page(0).extract_text()
            except Exception:
                # Best effort: keep the row but mark the text as missing.
                # `Exception` (not a bare except) lets Ctrl-C interrupt the run.
                first_page = None
            pdfs.append({
                'source': tar_file,
                'pdf_file': member.name,
                'first_page': first_page,
            })

In [None]:
# Turn the extracted records into a table and persist it.
# The isinstance guard makes the cell safe to re-run: after the first run
# `pdfs` is already a DataFrame and must not be passed to from_dicts again.
if not isinstance(pdfs, pl.DataFrame):
    pdfs = pl.from_dicts(
        pdfs,
        # Explicit schema: 'first_page' may be None for the leading rows, which
        # would otherwise mislead dtype inference, and an empty list cannot be
        # inferred at all.
        schema={'source': pl.Utf8, 'pdf_file': pl.Utf8, 'first_page': pl.Utf8},
    )
pdfs.write_parquet('../../../data/pdfs_table.parquet')
# Last expression of the cell so the preview is actually displayed.
pdfs.head()

In [None]:
# Stream one tarball straight from S3 and print the first page of one PDF
# without writing the archive to disk.
target_name = '2510/2510.27597.pdf'
resp = client.get_object(Bucket="arxiv", Key="pdf/arXiv_pdf_2510_205.tar", RequestPayer="requester")
body = resp["Body"]

# In stream mode ("r|*") the archive can only be read front to back, so the
# member has to be read at the moment it is encountered; getmember() would
# consume the whole stream first and make the later read impossible.
# NOTE(review): the previous code looked up `target_name` but extracted
# '2510/2510.27690.pdf'; `target_name` is now used for both — confirm that it
# is the intended paper.
pdf_bytes = None
try:
    with tarfile.open(fileobj=body, mode="r|*") as tar:
        for member in tar:
            if member.name == target_name and member.isfile():
                pdf_file = tar.extractfile(member)
                pdf_bytes = pdf_file.read()
                break
finally:
    # Release the HTTP connection even if reading the archive fails.
    body.close()

if pdf_bytes is None:
    # Same exception type TarFile.getmember() raises for a missing name.
    raise KeyError(f"{target_name!r} not found in archive")

reader = PdfReader(BytesIO(pdf_bytes))
# Not used in this cell; kept in case later cells rely on `writer`.
writer = PdfWriter()
if len(reader.pages) == 0:
    raise ValueError("PDF has no pages")
print(reader.get_page(0).extract_text())