In [1]:
import xml.etree.ElementTree as ET
from pathlib import Path
from urllib.parse import quote, urlencode

import requests

You can find detailed documentation about retrieving arXiv data via its REST API here: https://info.arxiv.org/help/api/user-manual.html

#### URL for querying arXiv papers

In [2]:
# Endpoint of the arXiv API; the encoded query string is appended to it
# further down to form the full request URL.
BASE_URL = "https://export.arxiv.org/api/query"

#### Compose a Search Query

In [3]:
# Two search terms: one restricting the category, one restricting the
# submission-date window. They are combined with the AND operator.
category_filter = "cat:cs.AI"
date_filter = "submittedDate:[202501010000+TO+202510012359]"
search_query = " AND ".join((category_filter, date_filter))

#### Configure other parameters and transform them into a query

In [4]:
# Request parameters: the search expression plus paging (start at the first
# match, return 20 results).
params = dict(search_query=search_query, start=0, max_results=20)

# Encode with quote() so spaces become %20, while leaving the characters
# ":+[]*" unescaped so they appear literally in the query string.
query = urlencode(params, safe=":+[]*", quote_via=quote)

In [5]:
# Display the encoded query string to check how the parameters were escaped.
query

'search_query=cat:cs.AI%20AND%20submittedDate:[202501010000+TO+202510012359]&start=0&max_results=20'

#### Construct a URL for a GET request

In [6]:
# Full request URL: endpoint and encoded query string joined by "?".
get_url = BASE_URL + "?" + query

In [7]:
# Display the final URL that will be requested.
get_url

'https://export.arxiv.org/api/query?search_query=cat:cs.AI%20AND%20submittedDate:[202501010000+TO+202510012359]&start=0&max_results=20'

#### Execute the GET request

In [8]:
# A timeout keeps the cell from hanging indefinitely if the server stops
# responding; raise_for_status() turns an HTTP error status into an exception
# here instead of letting an error body reach the XML parsing step.
response = requests.get(get_url, timeout=30)
response.raise_for_status()

In [9]:
# Print the raw response body for inspection before parsing it.
print(response.text)

<?xml version="1.0" encoding="UTF-8"?>
<feed xmlns="http://www.w3.org/2005/Atom">
  <link href="http://arxiv.org/api/query?search_query%3Dcat%3Acs.AI%20AND%20submittedDate%3A%5B202501010000%20TO%20202510012359%5D%26id_list%3D%26start%3D0%26max_results%3D20" rel="self" type="application/atom+xml"/>
  <title type="html">ArXiv Query: search_query=cat:cs.AI AND submittedDate:[202501010000 TO 202510012359]&amp;id_list=&amp;start=0&amp;max_results=20</title>
  <id>http://arxiv.org/api/ZKR0lfoX03mB3UXQnPZDo8ddvV4</id>
  <updated>2025-10-27T00:00:00-04:00</updated>
  <opensearch:totalResults xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">33121</opensearch:totalResults>
  <opensearch:startIndex xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">0</opensearch:startIndex>
  <opensearch:itemsPerPage xmlns:opensearch="http://a9.com/-/spec/opensearch/1.1/">20</opensearch:itemsPerPage>
  <entry>
    <id>http://arxiv.org/abs/2501.00750v2</id>
    <updated>2025-01-29T06:49:30Z</updated>
 

In [10]:
# Parse the raw bytes rather than the decoded text so the XML parser applies
# the encoding declared in the document itself instead of a guessed one.
root = ET.fromstring(response.content)

# Prefix -> namespace URI map used by the namespaced lookups below.
namespaces = {
    'atom': 'http://www.w3.org/2005/Atom',
    'opensearch': 'http://a9.com/-/spec/opensearch/1.1/',
    'arxiv': 'http://arxiv.org/schemas/atom'
}

# Extract total results (None when the feed does not carry the element).
total_results = root.findtext('opensearch:totalResults', default=None, namespaces=namespaces)
print(f"Total results: {total_results}")

# Iterate through entries
entries = root.findall('atom:entry', namespaces)

papers = []
for entry in entries:
    # findtext() returns None for a missing element instead of raising, so a
    # malformed or error entry yields a record with empty fields rather than
    # aborting the whole cell with an AttributeError.
    pdf_link_element = entry.find("atom:link[@title='pdf']", namespaces)
    paper = {
        'id': entry.findtext('atom:id', namespaces=namespaces),
        'title': (entry.findtext('atom:title', namespaces=namespaces) or '').strip(),
        'summary': (entry.findtext('atom:summary', namespaces=namespaces) or '').strip(),
        'published': entry.findtext('atom:published', namespaces=namespaces),
        'updated': entry.findtext('atom:updated', namespaces=namespaces),
        # Names of all authors, in feed order; authors without a name are skipped.
        'authors': [name.text
                    for name in entry.findall('atom:author/atom:name', namespaces)
                    if name.text],
        # None when the entry has no link titled "pdf".
        'pdf_link': pdf_link_element.get('href') if pdf_link_element is not None else None
    }
    papers.append(paper)


Total results: 33121


In [11]:
# Display the parsed paper records.
papers

[{'id': 'http://arxiv.org/abs/2501.00750v2',
  'title': 'Beyond Text: Implementing Multimodal Large Language Model-Powered\n  Multi-Agent Systems Using a No-Code Platform',
  'summary': 'This study proposes the design and implementation of a multimodal LLM-based\nMulti-Agent System (MAS) leveraging a No-Code platform to address the practical\nconstraints and significant entry barriers associated with AI adoption in\nenterprises. Advanced AI technologies, such as Large Language Models (LLMs),\noften pose challenges due to their technical complexity and high implementation\ncosts, making them difficult for many organizations to adopt. To overcome these\nlimitations, this research develops a No-Code-based Multi-Agent System designed\nto enable users without programming knowledge to easily build and manage AI\nsystems. The study examines various use cases to validate the applicability of\nAI in business processes, including code generation from image-based notes,\nAdvanced RAG-based questi

#### Download PDF files of the papers

In [12]:
def download_pdf(url, filename, timeout=60):
    """Download a PDF from ``url`` and save it to ``filename``.

    Args:
        url: Address of the PDF to fetch.
        filename: Destination path (str or path-like). Missing parent
            directories are created before writing.
        timeout: Seconds to wait for the server before giving up, passed
            through to ``requests.get``. Without it a stalled connection
            would block forever.

    Returns:
        None. A message describing the outcome is printed; nothing is
        written when the response status is not 200.
    """
    response = requests.get(url, timeout=timeout)

    if response.status_code != 200:
        print(f"Failed to download. Status code: {response.status_code}")
        return

    target = Path(filename)
    # Create the destination directory on demand so a fresh checkout without
    # the data folder does not fail with FileNotFoundError.
    target.parent.mkdir(parents=True, exist_ok=True)
    target.write_bytes(response.content)
    print(f"PDF saved as {filename}")

In [13]:
# Download each paper's PDF into the shared data directory, naming the file
# after the last path segment of its PDF URL.
for paper in papers:
    pdf_link = paper['pdf_link']
    if not pdf_link:
        # The feed entry carried no usable PDF URL; skip instead of crashing
        # on .split() so the remaining papers are still downloaded.
        print(f"No PDF link for {paper['id']}, skipping")
        continue
    path = f"../../data/arxiv_papers/{pdf_link.split('/')[-1]}.pdf"
    download_pdf(pdf_link, path)

PDF saved as ../../data/arxiv_papers/2501.00750v2.pdf
PDF saved as ../../data/arxiv_papers/2501.01149v2.pdf
PDF saved as ../../data/arxiv_papers/2501.01349v1.pdf
PDF saved as ../../data/arxiv_papers/2501.01835v1.pdf
PDF saved as ../../data/arxiv_papers/2501.02725v4.pdf
PDF saved as ../../data/arxiv_papers/2501.03824v1.pdf
PDF saved as ../../data/arxiv_papers/2501.05435v2.pdf
PDF saved as ../../data/arxiv_papers/2501.06231v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06322v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06423v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06442v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06461v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06471v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06485v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06561v2.pdf
PDF saved as ../../data/arxiv_papers/2501.06577v1.pdf
PDF saved as ../../data/arxiv_papers/2501.06598v3.pdf
PDF saved as ../../data/arxiv_papers/2501.06625v1.pdf
PDF saved as ../../data/arxi

In [14]:
# Persist the collected paper records as indented JSON next to the PDFs.
import json

metadata_path = '../../data/arxiv_papers/metadata.json'
with open(metadata_path, mode='w') as metadata_file:
    json.dump(papers, metadata_file, indent=4)