In [1]:
import feedparser
import requests
import time
import csv
import urllib, urllib.request
from datetime import datetime
import pytz

In [2]:
BASE_URL = "http://export.arxiv.org/api/query?"

In [3]:
def search_arxiv(keywords, start, max_results=10):
    terms = keywords.split()

    query = " AND ".join(
        f"(ti:{t} OR abs:{t})" for t in terms
    )

    params = {
        "search_query": query,
        "start": start,
        "max_results": max_results,
        "sortBy": "submittedDate",
        "sortOrder": "descending"
    }

    response = requests.get(BASE_URL, params=params)
    print(response.url)  # üîç depuraci√≥n REAL

    feed = feedparser.parse(response.text)

    papers = []

    for entry in feed.entries:
        paper = {
            "arxiv_id": entry.id.split("/abs/")[-1],
            "title": entry.title.strip().replace("\n", " "),
            "abstract": entry.summary.strip().replace("\n", " "),
            "published": entry.published,
            "authors": [author.name for author in entry.authors],
            "categories": [tag["term"] for tag in entry.tags],
            "doi": entry.get("arxiv_doi", None),
            "pdf_url": next(
                (link.href for link in entry.links if link.type == "application/pdf"),
                None
            )
        }

        papers.append(paper)

    return papers


In [4]:
#termino_busqueda = "active aging elderly"
busquedas = ["active aging elderly", 
             "recommender systems human learning",  #necesario a√±adir human
             "ontology movement", 
             "human movement language models",  #necesario a√±adir human
             "tai chi", #produce muy pocos resultados en arXiv
             "pathology ontology",
             ]
starting_point = 0
n_results = 5
years_back = 5
current_year = time.localtime().tm_year
ending_year = current_year - years_back

In [5]:
def save_papers_to_csv(papers, filename):
    if not papers:
        return

    fieldnames = papers[0].keys()

    with open(filename, mode="w", newline="", encoding="utf-8") as f:
        writer = csv.DictWriter(f, fieldnames=fieldnames)
        writer.writeheader()

        for paper in papers:
            writer.writerow({
                k: ", ".join(v) if isinstance(v, list) else v
                for k, v in paper.items()
            })

In [None]:
for i in busquedas:
    termino_busqueda = i
    paper_year = current_year
    papers_list = []
    print("Buscando papers para el t√©rmino:", termino_busqueda, paper_year)
    while paper_year > ending_year:
        papers = search_arxiv(termino_busqueda, start=starting_point, max_results=starting_point+n_results)
        if len(papers) == 0:
            break
        last_paper = papers[-1]
        date_str = last_paper["published"]
        dt_utc = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
        dt_utc = dt_utc.replace(tzinfo=pytz.UTC)
        local_tz = pytz.timezone("Europe/Madrid")  # cambia si hace falta
        dt_local = dt_utc.astimezone(local_tz)
        paper_year = dt_local.year
        papers_list.extend(papers)
        starting_point += n_results
        print("A√±o del √∫ltimo paper obtenido:", paper_year, "Total de papers acumulados:", len(papers_list))
        if len(papers_list) >= 500:
            print("L√≠mite de 500 papers alcanzado, deteniendo la b√∫squeda.")
            break
    save_papers_to_csv(papers_list, "arxiv_{}.csv".format(termino_busqueda.replace(" ", "_")))

Buscando papers para el t√©rmino: active aging elderly
https://export.arxiv.org/api/query?search_query=%28ti%3Aactive+OR+abs%3Aactive%29+AND+%28ti%3Aaging+OR+abs%3Aaging%29+AND+%28ti%3Aelderly+OR+abs%3Aelderly%29&start=0&max_results=5&sortBy=submittedDate&sortOrder=descending
A√±o del √∫ltimo paper obtenido: 2025 Total de papers acumulados: 5
https://export.arxiv.org/api/query?search_query=%28ti%3Aactive+OR+abs%3Aactive%29+AND+%28ti%3Aaging+OR+abs%3Aaging%29+AND+%28ti%3Aelderly+OR+abs%3Aelderly%29&start=5&max_results=10&sortBy=submittedDate&sortOrder=descending
A√±o del √∫ltimo paper obtenido: 2024 Total de papers acumulados: 15
https://export.arxiv.org/api/query?search_query=%28ti%3Aactive+OR+abs%3Aactive%29+AND+%28ti%3Aaging+OR+abs%3Aaging%29+AND+%28ti%3Aelderly+OR+abs%3Aelderly%29&start=10&max_results=15&sortBy=submittedDate&sortOrder=descending
A√±o del √∫ltimo paper obtenido: 2022 Total de papers acumulados: 30
https://export.arxiv.org/api/query?search_query=%28ti%3Aactive+OR+abs%

In [8]:
for p in papers:
    print(p["title"])
    print(p["doi"])
    print(p["published"])
    date_str = p["published"]
    dt_utc = datetime.strptime(date_str, "%Y-%m-%dT%H:%M:%SZ")
    dt_utc = dt_utc.replace(tzinfo=pytz.UTC)
    dt_local = dt_utc.astimezone(dt_utc.tzinfo)
    print(dt_local.year)
    print(p["categories"])
    print("-" * 40)