# DOI Scouting (article id) and HTML Download
Questa sezione esegue una query su arxiv.org cercando articoli che parlano di **LLM multi agent** e attraverso *bs4* ottiene i vari [DOI](https://en.wikipedia.org/wiki/Digital_object_identifier).
Con questi DOI è dunque possibile eseguire richieste specifiche per scaricare i documenti HTML del topic scelto.  

### Libraries and Constants

In [1]:
import os
import requests
from bs4 import BeautifulSoup
import logging
import time
# Timestamped INFO-level logging; force=True replaces any handlers that were
# already installed (e.g. when the cell is re-run).
logging.basicConfig(
    format='%(asctime)s - %(levelname)s - %(message)s',
    level=logging.INFO,
    force=True,
)

# Search topic sent to arXiv and the folder that receives the downloaded HTML.
TOPIC = "LLM multi agent"
SOURCES_DIR = 'sources/'

# Shared state filled in by the functions below:
#   article_ids - ids of the articles whose HTML was found and saved
#   found_urls  - URLs that actually served an <article> element
article_ids = set()
found_urls = set()

# Create the output folder on first run.
if not os.path.exists(SOURCES_DIR):
    os.makedirs(f'./{SOURCES_DIR}', exist_ok=True)

### Functions

In [None]:
def save_to_file(article, article_id):
    """Write the prettified HTML of ``article`` into ``SOURCES_DIR``.

    Args:
        article: Object exposing ``prettify()`` (the ``<article>`` element
            returned by ``get_ar5iv_html``).
        article_id: arXiv identifier used to build the file name.

    The file is named ``ar5iv_article_<id>.html``. Old-style arXiv ids such as
    ``cs/0701001`` contain a slash, which would otherwise be interpreted as a
    sub-directory and make ``open`` fail, so slashes are replaced by ``_``.
    """
    safe_id = str(article_id).strip().replace("/", "_")

    # Make sure the target folder exists even if it was removed after start-up.
    os.makedirs(SOURCES_DIR, exist_ok=True)

    file_path = os.path.join(SOURCES_DIR, f"ar5iv_article_{safe_id}.html")
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(article.prettify())
    logging.info(f"Article {article_id} saved.")

In [None]:
def _write_found_urls(path='read.me'):
    """Dump every URL collected in ``found_urls`` to ``path``, one per line."""
    with open(path, 'w', encoding='utf-8') as f:
        f.writelines(f"{found_url}\n" for found_url in sorted(found_urls))
    logging.info(f"Successfully wrote {len(found_urls)} urls to read.me")


def get_ar5iv_html(article_id):
    """Download the HTML version of an arXiv article and return its ``<article>`` tag.

    The ar5iv mirror is tried first, then the HTML endpoint of arxiv.org. A
    failure on one URL (network error, non-200 status, or a page without an
    ``<article>`` element) does not abort the lookup: the next URL is tried.

    Args:
        article_id: arXiv identifier of the article.

    Returns:
        The ``<article>`` element of the first URL that provides one, or
        ``None`` when no URL does.

    Side effects:
        On success the URL is added to ``found_urls`` and the whole set is
        written to ``read.me``.
    """
    urls = [
        f"https://ar5iv.labs.arxiv.org/html/{article_id}",
        # sometimes the HTML is provided on this URL when ar5iv does not show it
        f"https://arxiv.org/html/{article_id}",
    ]
    for url in urls:
        try:
            response = requests.get(url, timeout=10)
        except requests.exceptions.RequestException as e:
            logging.error(f"Error downloading article {article_id}: {e}")
            continue  # give the fallback URL a chance
        logging.info(f"Request URL: {response.url}")

        if response.status_code != 200:
            logging.error(f"Error fetching article {article_id}, status code: {response.status_code}")
            continue  # give the fallback URL a chance

        article = BeautifulSoup(response.text, 'html.parser').find('article')
        if article:
            found_urls.add(url)
            _write_found_urls()
            return article

    return None

In [None]:
def retrieve_article_id(url, params):
    """Scan one arXiv search-result page and download every article that has HTML.

    The page is requested with ``url`` + ``params``; each ``<li>`` of the first
    ``<ol>`` is expected to start with a link whose text ends in the article id
    (the part after the last ``:``). For every id, ``get_ar5iv_html`` is called
    and, when it returns an article, the id is added to ``article_ids`` and the
    HTML is stored with ``save_to_file``.

    Args:
        url: Search endpoint.
        params: Query parameters forwarded to ``requests.get``.

    Returns:
        The number of articles saved from this page, or ``None`` when the page
        could not be retrieved (network error, non-200 status) or contains no
        ``<ol>`` result list.
    """
    try:
        # A timeout keeps a stalled connection from blocking the run forever.
        response = requests.get(url, params=params, timeout=30)
    except requests.exceptions.RequestException as e:
        logging.error(f"Failed to retrieve data: {e}")
        return None
    logging.info(f"Request URL: {response.url}")

    if response.status_code != 200:
        logging.error(f"Failed to retrieve data. Status code: {response.status_code}")
        return None

    soup = BeautifulSoup(response.text, "html.parser")
    ol_tag = soup.find('ol')
    if not ol_tag:
        logging.warning("No <ol> element found.")
        return None

    local_article_counter = 0
    for li in ol_tag.find_all('li'):
        a_tag = li.find('a')
        if not a_tag:
            logging.warning("No <a> element found in the <li> element.")
            continue

        # Link text looks like "<prefix>:<id>"; strip stray whitespace around the id.
        article_id = a_tag.text.split(':')[-1].strip()
        if not article_id:
            logging.warning("Empty article id in the <a> element, skipping...")
            continue

        logging.info(f"Processing article number: {len(article_ids)}: {article_id}")
        article = get_ar5iv_html(article_id)
        if article:
            logging.info(f"Article found: {article_id}")
            local_article_counter += 1
            article_ids.add(article_id)
            save_to_file(article, article_id)
        else:
            logging.warning(f"Article {article_id} not found, skipping...")

    logging.info(f"Number of local articles found: {local_article_counter} over 200")  # inside a single page
    time.sleep(2)  # pause between result pages
    return local_article_counter

Punto di lancio del codice, la query è composta da `url + params`

In [None]:
# Define the URL for the advanced search on arXiv
url = "https://arxiv.org/search/advanced"

PAGE_SIZE = 200        # results requested per search page
TARGET_ARTICLES = 300  # keep paging while the number of saved articles is <= this
MAX_PAGES = 25         # safety bound so the loop always terminates

# Define the search parameters
params = {
    "advanced": "",
    "terms-0-term": TOPIC,              # Search term
    "terms-0-operator": "AND",          # Operator
    "terms-0-field": "all",             # Search field (all)
    "classification-computer_science": "y",  # Limit to computer science
    "classification-physics_archives": "all",  # Include all physics archives
    "classification-include_cross_list": "include",  # Include cross-list
    "date-filter_by": "all_dates",      # Filter by date (all dates)
    "date-year": "",                    # No specific year
    "date-from_date": "",               # No start date
    "date-to_date": "",                 # No end date
    "date-date_type": "submitted_date", # Search by submission date
    "abstracts": "hide",                # Hide abstracts
    "size": PAGE_SIZE,                  # Number of results to display
    "order": "submitted_date"           # Order by submission date (oldest first) #older documents are more likely to have html version
}

doi_retrieved = 0
for page in range(MAX_PAGES):
    if doi_retrieved > TARGET_ARTICLES:
        break

    # The offset must advance by the number of results already scanned, not by
    # the number of articles saved; otherwise consecutive pages overlap and the
    # same articles are downloaded again.
    params["start"] = page * PAGE_SIZE

    found = retrieve_article_id(url, params)
    if found is None:
        # Request failed or the page has no result list: stop instead of
        # crashing on `int += None`.
        logging.warning("No result page retrieved, stopping the search.")
        break
    doi_retrieved += found

logging.info(f"Number of articles retrieved: {len(article_ids)}")