# ArXiv API 

Testing that the arXiv API works as expected:

In [1]:
import arxiv
from typing import Literal

def search_arxiv(
    query: str, 
    max_results: int = 10, 
    sort_criterion : Literal['relevance', 'last_submitted'] = 'relevance'
)-> str | list[dict]:
    """
    Searches arXiv for the top N articles based on a query.
    Returns a list of dictionaries containing article ID, title, summary, and authors.
    
    Args:
        query (str): The search query (e.g., "AI agents", "quantum computing").
        max_results (int): The maximum number of results to return. Default is 10.
        sort_criterion (Literal['relevance', 'last_submitted']): The criterion to sort the results by. 
            Default is 'relevance': this sorts by relevance to the query.
            'last_submitted' sorts by the date the article was submitted to the arXiv.
    """
    client = arxiv.Client()

    if sort_criterion == 'relevance':
        sort_by = arxiv.SortCriterion.Relevance
    elif sort_criterion == 'last_submitted':
        sort_by = arxiv.SortCriterion.SubmittedDate
    
    search = arxiv.Search(
        query=query,
        max_results=max_results,
        sort_by=sort_by
    )
    
    results = []
    
    # client.results() is a generator
    for result in client.results(search):
        results.append({
            # entry_id is a URL like 'http://arxiv.org/abs/2310.12345'
            # We split to get just the ID '2310.12345'
            "id": result.entry_id.split('/')[-1],
            "title": result.title,
            "published": result.published.strftime("%Y-%m-%d"),
            "authors": [author.name for author in result.authors],
            "summary": result.summary.replace("\n", " ") # Clean up newlines in abstract
        })
        
    return results

In [2]:
# Smoke test: run a search and dump every hit, separated by a blank line.

result = search_arxiv(query="AI agents")

for entry in result:
    # end="\n\n" emits the entry's own newline plus the blank separator line.
    print(entry, end="\n\n")

{'id': '2501.02842v1', 'title': 'Foundations of GenIR', 'published': '2025-01-06', 'authors': ['Qingyao Ai', 'Jingtao Zhan', 'Yiqun Liu'], 'summary': 'The chapter discusses the foundational impact of modern generative AI models on information access (IA) systems. In contrast to traditional AI, the large-scale training and superior data modeling of generative AI models enable them to produce high-quality, human-like responses, which brings brand new opportunities for the development of IA paradigms. In this chapter, we identify and introduce two of them in details, i.e., information generation and information synthesis. Information generation allows AI to create tailored content addressing user needs directly, enhancing user experience with immediate, relevant outputs. Information synthesis leverages the ability of generative AI to integrate and reorganize existing information, providing grounded responses and mitigating issues like model hallucination, which is particularly valuable in

Can we read the article without downloading it to the local file system? 

In [16]:
import arxiv
import requests
import pymupdf  
from langchain_core.tools import tool

def read_arxiv_in_memory(
    paper_id: str,
    start_page: int = 1,
    end_page: int = 3,
    timeout: float = 30.0,
) -> str:
    """
    Downloads an arXiv paper and returns text from specific pages.
    Use this to read sections without loading the entire document.
    The PDF is kept in memory only; nothing is written to disk.

    Args:
        paper_id (str): The arXiv ID (e.g., "2103.00020").
        start_page (int): The first page to read (1-based index). Default is 1.
            Values below 1 are treated as 1.
        end_page (int): The last page to read (1-based index, inclusive). Default is 3.
            Values beyond the last page are clamped to the last page.
        timeout (float): Seconds to wait for the PDF download before giving up.
            Default is 30.

    Returns:
        str: The text of the requested pages, each preceded by a
            "--- Page N ---" header, or a message starting with "Error"
            if the paper cannot be found, downloaded, or read, or if the
            requested page range is empty.
    """
    # Validate the requested range up front so an impossible request is
    # reported without any network traffic (it used to yield an empty string).
    if start_page < 1:
        start_page = 1
    if end_page < start_page:
        return (
            f"Error: end_page ({end_page}) must be greater than or equal to "
            f"start_page ({start_page})."
        )

    # 1. Fetch PDF URL
    client = arxiv.Client()
    try:
        paper = next(client.results(arxiv.Search(id_list=[paper_id])))
    except StopIteration:
        return f"Error: Paper {paper_id} not found."
    pdf_url = paper.pdf_url

    # 2. Download to RAM
    try:
        # Without a timeout a stalled connection would block forever.
        response = requests.get(pdf_url, timeout=timeout)
        response.raise_for_status()

        # 3. Open PDF stream
        with pymupdf.open(stream=response.content, filetype="pdf") as doc:
            num_pages = len(doc)

            if start_page > num_pages:
                return (
                    f"Error: start_page ({start_page}) is beyond the last page "
                    f"({num_pages}) of paper {paper_id}."
                )
            last_page = min(end_page, num_pages)

            # Page numbers are 1-based for the caller, 0-based for PyMuPDF.
            text_content = [
                f"--- Page {page_no} ---\n{doc[page_no - 1].get_text()}"
                for page_no in range(start_page, last_page + 1)
            ]
            return "\n".join(text_content)

    except Exception as e:
        # Boundary handler: this function reports failures as strings rather
        # than raising, so download and PDF-parsing errors all end up here.
        return f"Error reading paper: {e}"

In [17]:
# Read the default page range of the first search hit and echo it line by line.
first_hit_id = result[0]['id']
text = read_arxiv_in_memory(first_hit_id)
lines = text.splitlines()
for line in lines:
    print(line)

--- Page 1 ---
Foundations of GenIR
Qingyao Ai1†, Jingtao Zhan1†, Yiqun Liu1
1Dept. of Computer Science and Technology, Tsinghua University,
Beijing, China.
Contributing authors: aiqy@tsinghua.edu.cn;
zhanjt20@mails.tsinghua.edu.cn; yiqunliu@tsinghua.edu.cn;
†These authors contributed equally to this work.
Abstract
The chapter discusses the foundational impact of modern generative AI models
on information access (IA) systems. In contrast to traditional AI, the large-scale
training and superior data modeling of generative AI models enable them to pro-
duce high-quality, human-like responses, which brings brand new opportunities
for the development of IA paradigms. In this chapter, we identify and introduce
two of them in details, i.e., information generation and information synthesis.
Information generation allows AI to create tailored content addressing user needs
directly, enhancing user experience with immediate, relevant outputs. Information
synthesis leverages the ability of genera

Now let's download the articles.

In [18]:
import arxiv
import os
from langchain_core.tools import tool

def download_arxiv_pdf(paper_id: str, save_dir: str = "./downloads") -> str:
    """
    Downloads the PDF of an arXiv paper given its ID.

    Args:
        paper_id (str): The arXiv ID (e.g., "2103.00020").
        save_dir (str): The directory to save the PDF in. Defaults to "./downloads".
            It is created if it does not exist.

    Returns:
        str: A status message. On success it contains the path of the
            downloaded PDF; on failure it starts with "Error".
    """
    # Reject an empty ID before creating directories or calling the API.
    if not paper_id or not paper_id.strip():
        return "Error: paper_id must be a non-empty arXiv ID."
    paper_id = paper_id.strip()

    # Ensure directory exists
    os.makedirs(save_dir, exist_ok=True)

    client = arxiv.Client()

    # We must "search" by ID to get the paper object
    search = arxiv.Search(id_list=[paper_id])

    try:
        paper = next(client.results(search))

        # Create a safe filename using the ID and a sanitized title
        # e.g., "2103.00020_Attention_Is_All_You_Need.pdf"
        # Old-style IDs such as "hep-th/9901001" contain a path separator, which
        # would point the file into a non-existent subdirectory, so anything
        # outside [alnum . _ -] in the ID is replaced by "_".
        safe_id = "".join(c if c.isalnum() or c in "._-" else "_" for c in paper_id)
        safe_title = "".join(
            c for c in paper.title if c.isalnum() or c in (' ', '_', '-')
        ).rstrip()
        # Cap the title part: common filesystems reject very long file names.
        max_title_chars = 150
        safe_title = safe_title.replace(" ", "_")[:max_title_chars]
        filename = f"{safe_id}_{safe_title}.pdf"

        # Download
        path = paper.download_pdf(dirpath=save_dir, filename=filename)
        return f"Successfully downloaded file to: {path}"

    except StopIteration:
        return f"Error: Paper with ID {paper_id} not found."
    except Exception as e:
        # Boundary handler: failures are reported as strings, not raised.
        return f"Error downloading paper: {e}"

In [20]:
# Search, list the IDs of all hits, then fetch the PDF of the top hit.
result = search_arxiv(query="Transformer models")

hit_ids = [entry['id'] for entry in result]
for hit_id in hit_ids:
    print(hit_id)

# Download the PDF (the trailing expression is shown as the cell's output)
download_arxiv_pdf(paper_id=hit_ids[0], save_dir="./test_downloads")

2201.00978v1
2106.02277v1
2104.11502v1
2208.03987v4
2404.05657v1
2107.03844v3
2303.00957v1
1901.02860v3
2405.09508v2
2302.14017v1


'Successfully downloaded file to: ./test_downloads/2201.00978v1_PyramidTNT_Improved_Transformer-in-Transformer_Baselines_with_Pyramid_Architecture.pdf'

In [21]:
# Show what has been saved to the test download directory so far.
import os
downloaded_files = os.listdir("./test_downloads")
print(downloaded_files)

['2106.02277v1_Glance-and-Gaze_Vision_Transformer.pdf', '2201.00978v1_PyramidTNT_Improved_Transformer-in-Transformer_Baselines_with_Pyramid_Architecture.pdf']
