# ArXiv API 

Testing that the arXiv API client works:

In [3]:
import arxiv
from typing import Literal

def search_arxiv(
    query: str,
    max_results: int = 10,
    sort_criterion: Literal['relevance', 'last_submitted'] = 'relevance'
) -> list[dict]:
    """
    Searches arXiv for the top N articles based on a query.

    Args:
        query (str): The search query (e.g., "AI agents", "quantum computing").
        max_results (int): The maximum number of results to return. Default is 10.
        sort_criterion (Literal['relevance', 'last_submitted']): The criterion to sort the results by.
            Default is 'relevance': this sorts by relevance to the query.
            'last_submitted' sorts by the date the article was submitted to the arXiv.

    Returns:
        list[dict]: One dictionary per article with the keys "id", "title",
            "published" (formatted as YYYY-MM-DD), "authors" (list of author
            names) and "summary" (abstract with newlines replaced by spaces).

    Raises:
        ValueError: If sort_criterion is not 'relevance' or 'last_submitted'.
    """
    # Reject unsupported values explicitly; otherwise `sort_by` would never be
    # assigned and the call would fail with an unrelated-looking UnboundLocalError.
    if sort_criterion not in ('relevance', 'last_submitted'):
        raise ValueError(
            f"Unsupported sort_criterion {sort_criterion!r}; "
            "expected 'relevance' or 'last_submitted'."
        )

    if sort_criterion == 'relevance':
        sort_by = arxiv.SortCriterion.Relevance
    else:
        sort_by = arxiv.SortCriterion.SubmittedDate

    client = arxiv.Client()
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by
    )

    # client.results() is a generator, so the list is built as results arrive
    return [
        {
            # entry_id is a URL like 'http://arxiv.org/abs/2310.12345v1'.
            # Keep everything after '/abs/' rather than only the last path
            # segment, so old-style IDs such as 'hep-th/9901001v1' retain the
            # archive prefix that is part of the identifier.
            "id": result.entry_id.split('/abs/')[-1],
            "title": result.title,
            "published": result.published.strftime("%Y-%m-%d"),
            "authors": [author.name for author in result.authors],
            "summary": result.summary.replace("\n", " ")  # Clean up newlines in abstract
        }
        for result in client.results(search)
    ]

In [2]:
# Smoke test: run a default search and dump every hit, one blank line apart.

result = search_arxiv(query="AI agents")

for entry in result:
    print(entry, end="\n\n")

AttributeError: module 'arxiv' has no attribute 'Client'

Can we read the article without downloading it to the local file system? 

In [None]:
import arxiv
import requests
import fitz  # PyMuPDF
import io
from langchain_core.tools import tool

@tool
def read_arxiv_in_memory(paper_id: str):
    """
    Downloads an arXiv paper into memory (without saving to disk) and extracts its text.

    Args:
        paper_id (str): The arXiv ID (e.g., "2103.00020").

    Returns:
        str: Up to the first 20000 characters of the paper's text, or a message
            starting with "Error" if the paper is not found or cannot be read.
    """
    # 1. Get the PDF URL
    client = arxiv.Client()
    search = arxiv.Search(id_list=[paper_id])

    try:
        paper = next(client.results(search))
        pdf_url = paper.pdf_url
    except StopIteration:
        return f"Error: Paper {paper_id} not found."

    # 2. Download bytes into memory
    try:
        # requests applies no timeout by default; bound the wait so a stalled
        # connection becomes an error message instead of hanging the tool.
        response = requests.get(pdf_url, timeout=30)
        response.raise_for_status()

        # 3. Open PDF from bytes
        # fitz.open(stream=..., filetype="pdf") lets us read from RAM
        with fitz.open(stream=response.content, filetype="pdf") as doc:
            text = "".join(page.get_text() for page in doc)

        return text[:20000]  # Truncate to avoid context overflow

    except Exception as e:
        # Deliberately broad: the tool reports failures as text rather than raising.
        return f"Error reading paper in memory: {e}"

Now let's download the articles.

In [None]:
import arxiv
import os
from langchain_core.tools import tool

def download_arxiv_pdf(paper_id: str, save_dir: str = "./downloads") -> str:
    """
    Downloads the PDF of an arXiv paper given its ID.

    Args:
        paper_id (str): The arXiv ID (e.g., "2103.00020").
        save_dir (str): The directory to save the PDF in. Defaults to "./downloads".

    Returns:
        str: A status message. On success it is
            "Successfully downloaded file to: <path>"; on failure it starts
            with "Error" and describes what went wrong.
    """
    # Ensure directory exists
    os.makedirs(save_dir, exist_ok=True)

    client = arxiv.Client()

    # We must "search" by ID to get the paper object
    search = arxiv.Search(id_list=[paper_id])

    try:
        paper = next(client.results(search))

        # Create a safe filename using the ID and a sanitized title
        # e.g., "2103.00020_Attention_Is_All_You_Need.pdf"
        safe_title = "".join(c for c in paper.title if c.isalnum() or c in (' ', '_', '-')).rstrip()
        safe_title = safe_title.replace(" ", "_")
        # Old-style arXiv IDs (e.g. "hep-th/9901001") contain a slash, which
        # would otherwise be interpreted as a sub-directory of save_dir.
        safe_id = paper_id.replace("/", "_")
        filename = f"{safe_id}_{safe_title}.pdf"

        # Download
        path = paper.download_pdf(dirpath=save_dir, filename=filename)
        return f"Successfully downloaded file to: {path}"

    except StopIteration:
        # The results generator was empty: no paper matches this ID.
        return f"Error: Paper with ID {paper_id} not found."
    except Exception as e:
        # Deliberately broad: failures are reported as text rather than raised.
        return f"Error downloading paper: {e}"

In [None]:
# Run a sample search and print the ID of every hit.
result = search_arxiv(query="Transformer models")

for entry in result:
    print(entry['id'])

# Download the PDF of the first hit. As the last expression in the cell, the
# returned status string is what gets displayed as the cell's value.
# NOTE: result[0] assumes the search returned at least one entry.
download_arxiv_pdf(paper_id=result[0]['id'])

2201.00978v1
2106.02277v1
2104.11502v1
2208.03987v4
2404.05657v1


'Successfully downloaded file to: ./downloads/2106.02277v1_Glance-and-Gaze_Vision_Transformer.pdf'

In [None]:
import pymupdf

def read_pdf_text(filepath: str) -> str:
    """
    Extracts text from a locally saved PDF file.

    Args:
        filepath (str): Path to the PDF file on disk.

    Returns:
        str: Up to the first 10000 characters of the text of all pages, in page
            order, or a message starting with "Error reading PDF:" if the file
            cannot be opened or parsed.
    """
    try:
        # The context manager closes the document (and its file handle) even
        # if text extraction fails part-way through.
        with pymupdf.open(filepath) as doc:
            text = "".join(page.get_text() for page in doc)
        return text[:10000]  # Truncate if too long for context window
    except Exception as e:
        # Deliberately broad: failures are reported as text rather than raised.
        return f"Error reading PDF: {e}"

In [None]:
# List the files in the folder holding the downloaded PDFs.
# NOTE(review): this reads "./test_downloads", while download_arxiv_pdf saves to
# "./downloads" by default — presumably an earlier run passed save_dir
# explicitly; confirm the intended directory.
import os
print(os.listdir("./test_downloads"))    

['2106.02277v1_Glance-and-Gaze_Vision_Transformer.pdf']


In [None]:
# Read the downloaded PDF back from disk and print the extracted text
# (read_pdf_text returns at most the first 10000 characters).
text = read_pdf_text(filepath="./test_downloads/2106.02277v1_Glance-and-Gaze_Vision_Transformer.pdf")
print(text)

Glance-and-Gaze Vision Transformer
Qihang Yu1, Yingda Xia1, Yutong Bai1, Yongyi Lu1, Alan Yuille1, Wei Shen2
1 The Johns Hopkins University
2 Shanghai Jiaotong University
Abstract
Recently, there emerges a series of vision Transformers, which show superior
performance with a more compact model size than conventional convolutional
neural networks, thanks to the strong ability of Transformers to model long-range
dependencies. However, the advantages of vision Transformers also come with a
price: Self-attention, the core part of Transformer, has a quadratic complexity to
the input sequence length. This leads to a dramatic increase of computation and
memory cost with the increase of sequence length, thus introducing difﬁculties
when applying Transformers to the vision tasks that require dense predictions based
on high-resolution feature maps.
In this paper, we propose a new vision Transformer, named Glance-and-Gaze Trans-
former (GG-Transformer), to address the aforementioned issues. It is