In [1]:
### Phase 1 : enregistrement des notices xml-tei

# phase_1_extract_notices.py
import os
import requests
import pandas as pd
from lxml import etree
from datetime import datetime
from urllib.parse import urlparse

# Configuration of the HAL search request.
base_url = "https://api.archives-ouvertes.fr/search/INRIA"
# Range query on the last-indexed date (notices indexed since NOW-1DAY).
query = "dateLastIndexed_tdate:[NOW-1DAY TO NOW/HOUR]"
# Base query parameters; the extraction loop adds "cursorMark" on each request.
params = {"wt": "xml-tei", "rows": 1, "sort": "docid asc"}
cursor_mark = "*"  # cursor value sent with the very first request
previous_cursor_mark = None
namespaces = {"tei": "http://www.tei-c.org/ns/1.0"}

# Output folders.
# Read the clock once so that `today` and `year` always describe the same
# instant, even if the script starts right at midnight on 31 December.
run_started_at = datetime.now()
today = run_started_at.strftime("%Y_%m_%d")
year = run_started_at.strftime("%Y")
folder_xml = f"/data/Notices_Xml-tei_de_Hal/{year}/{today}"
folder_log = "/data/log"
os.makedirs(folder_xml, exist_ok=True)
os.makedirs(folder_log, exist_ok=True)

# Extraction of the PDF references found in each notice.
pdf_records = []
# Without a timeout a stalled connection would block the script forever.
request_timeout = 30  # seconds

while cursor_mark != previous_cursor_mark:
    params["cursorMark"] = cursor_mark
    try:
        # The query goes through `params` so that requests URL-encodes it,
        # instead of being concatenated into the URL by hand.
        response = requests.get(
            base_url,
            params={"q": query, **params},
            timeout=request_timeout,
        )
    except requests.RequestException as exc:
        # Handle a network failure like a bad HTTP status: stop paging but
        # keep the records collected so far so the CSV is still written.
        print("Erreur de requête :", exc)
        break
    if response.status_code != 200:
        print("Erreur de requête :", response.status_code)
        break

    root = etree.fromstring(response.content)
    hal_id = root.xpath('.//tei:idno[@type="halId"]/text()', namespaces=namespaces)
    version = root.xpath('.//tei:edition[@type="current"]/@n', namespaces=namespaces)
    if not hal_id:
        # A response without any halId means there is nothing left to save.
        print("Plus de résultats.")
        break

    hal_id = hal_id[0]
    version = version[0] if version else "v1"

    # Save the raw response as <halId>_<version>.xml.
    xml_filename = os.path.join(folder_xml, f"{hal_id}_{version}.xml")
    with open(xml_filename, "wb") as f:
        f.write(response.content)

    # Keep only the <ref> elements of the current edition that point to a PDF.
    refs = root.xpath('.//tei:edition[@type="current"]/tei:ref', namespaces=namespaces)
    for ref in refs:
        target = ref.get('target')
        if not target or not target.lower().endswith(".pdf"):
            continue
        # A nested <date notBefore="..."> is recorded as the embargo date.
        not_before = ref.xpath('./tei:date/@notBefore', namespaces=namespaces)
        pdf_records.append({
            "hal_id": hal_id,
            "version": version,
            "ref_type": ref.get('type'),
            "subtype": ref.get('subtype'),
            "n": ref.get('n'),
            "target": target,
            "embargo_date": not_before[0] if not_before else None,
        })

    next_cursor_mark = root.attrib.get("next")
    if not next_cursor_mark:
        # No cursor to continue with. Sending another request without one
        # would fetch the first notice again and duplicate its records.
        break
    previous_cursor_mark = cursor_mark
    cursor_mark = next_cursor_mark

# Save the collected PDF references to a CSV file.
# The columns are named explicitly so that a header row is written even when
# no PDF reference was found; a header-less file cannot be parsed by
# pd.read_csv afterwards (it raises EmptyDataError).
pdf_columns = [
    "hal_id",
    "version",
    "ref_type",
    "subtype",
    "n",
    "target",
    "embargo_date",
]
df_pdf = pd.DataFrame(pdf_records, columns=pdf_columns)
csv_path = os.path.join(folder_log, f"telechargements_{today}.csv")
df_pdf.to_csv(csv_path, index=False)
print(f"\n📄 Données PDF enregistrées dans : {csv_path}")


Plus de résultats.

📄 Données PDF enregistrées dans : /data/log\telechargements_2025_06_11.csv


In [None]:
# Phase 2 : enregistrement des PDF
# phase_2_download_pdfs.py
import os
import requests
import pandas as pd
import re
from datetime import datetime, date
import concurrent.futures

# Today's date, used to locate the CSV written by phase 1.
# Read the clock once so that `today` and `year` always describe the same
# instant.
run_started_at = datetime.now()
today = run_started_at.strftime("%Y_%m_%d")
year = run_started_at.strftime("%Y")
csv_path = f"/data/log/telechargements_{today}.csv"
pdf_folder = f"/data/pdf_de_hal/{year}/{today}/"
os.makedirs(pdf_folder, exist_ok=True)

# Load the data.
try:
    df = pd.read_csv(csv_path)
except pd.errors.EmptyDataError:
    # A CSV without a header row (nothing was collected) means there is
    # nothing to download: continue with an empty table instead of crashing.
    df = pd.DataFrame(
        columns=[
            "hal_id",
            "version",
            "ref_type",
            "subtype",
            "n",
            "target",
            "embargo_date",
        ]
    )

# Nettoyage nom de fichier
def clean_filename(url):
    """Return a filesystem-friendly file name derived from *url*.

    Everything from the first "?" onwards is dropped, the last path component
    is kept, and every character that is not a word character, "-", "_" or
    "." is replaced by "_".
    """
    without_query, _, _ = url.partition("?")
    last_component = os.path.basename(without_query)
    return re.sub(r'[^\w\-_\.]', '_', last_component)

# Téléchargement unitaire
def download_pdf(row):
    """Download the PDF referenced by one record and report the outcome.

    Args:
        row: Mapping-like record (a DataFrame row in this script) providing
            the keys "target", "embargo_date", "hal_id" and "version".

    Returns:
        None when the target does not end with ".pdf"; otherwise a status
        message: under embargo, already downloaded, downloaded, or error.
        Failures are reported in the returned message rather than raised, so
        one bad record does not abort the other downloads.
    """
    import threading  # local import: only needed to name the temporary file

    try:
        if not str(row["target"]).lower().endswith(".pdf"):
            return None
        if pd.notna(row["embargo_date"]):
            embargo = datetime.strptime(row["embargo_date"], "%Y-%m-%d").date()
            if embargo > date.today():
                return f"⛔ Sous embargo : {row['target']}"

        filename = clean_filename(row["target"])
        path = os.path.join(pdf_folder, f"{row['hal_id']}_{row['version']}_{filename}")
        if os.path.exists(path):
            return f"✅ Déjà téléchargé : {filename}"

        # Write to a temporary name first and rename once complete. A file at
        # `path` is treated as "already downloaded" by later runs, so a
        # truncated file must never appear under that name. The thread id
        # keeps concurrent workers from sharing a temporary file.
        tmp_path = f"{path}.{threading.get_ident()}.part"
        try:
            # Stream the body in chunks instead of holding the whole PDF in
            # memory.
            with requests.get(row["target"], timeout=10, stream=True) as response:
                response.raise_for_status()
                with open(tmp_path, "wb") as f:
                    for chunk in response.iter_content(chunk_size=65536):
                        f.write(chunk)
            os.replace(tmp_path, path)
        finally:
            # After a successful rename the temporary file no longer exists;
            # after a failure this removes the partial download.
            if os.path.exists(tmp_path):
                os.remove(tmp_path)
        return f"⬇️ Téléchargé : {filename}"
    except Exception as e:
        # Deliberate catch-all at the worker boundary: report and carry on.
        return f"❌ Erreur sur {row['target']} : {e}"

# Download in parallel: one task per CSV row, spread over 8 worker threads.
rows = [record for _, record in df.iterrows()]
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as pool:
    results = list(pool.map(download_pdf, rows))

# Console log: print every non-empty status message, in row order.
for message in filter(None, results):
    print(message)
