<a href="https://colab.research.google.com/github/GwenTsang/Papyri/blob/main/scrapping_date___provenance_papyri.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
import requests
from bs4 import BeautifulSoup
import copy
import json
import time

def _extract_from_division(soup, label: str):
    """Return the text that follows ``label`` in the first matching block.

    Scans every element carrying the ``division`` class, looks at its first
    ``<span class="semibold">`` and compares that span's text (trailing colon
    removed, case-insensitive) with ``label``.

    Args:
        soup: Parsed document (or tag) to search.
        label: Field name to look for, e.g. ``"Date"`` or ``"Provenance"``.

    Returns:
        The block's text without the label and without tooltip text, or
        ``None`` when no block matches or the matching block holds no value.
    """
    wanted = label.lower()

    for block in soup.select(".division"):
        span = block.find("span", class_="semibold")
        if span is None:
            continue

        span_text = span.get_text(strip=True)  # e.g. 'Provenance:' or 'Date:'
        if span_text.rstrip(":").strip().lower() != wanted:
            continue

        # Work on a copy so the parsed page is not mutated by the clean-up.
        temp = copy.copy(block)
        for tooltip in temp.select(".tooltiptext"):
            tooltip.decompose()

        # Drop the label *node* rather than its text: a substring replace
        # breaks when the label span has nested markup (its text is then
        # joined differently in the block text) and may hit an earlier
        # occurrence of the same characters.
        label_span = temp.find("span", class_="semibold")
        if label_span is not None and label_span.get_text(strip=True) == span_text:
            label_span.decompose()
            remainder = temp.get_text(" ", strip=True)
        else:
            # The label span could not be identified in the cleaned copy;
            # fall back to removing its text once.
            remainder = temp.get_text(" ", strip=True).replace(span_text, "", 1)

        value = remainder.strip(" :\u00A0")
        # Stop at the first matching block, even if its value is empty.
        return value or None

    return None


def scrap_papyrus(url: str) -> dict:
    """Scrape the 'Date' and 'Provenance' fields of a Trismegistos text page.

    Args:
        url: Address of the page to download.

    Returns:
        A dict with the keys ``"Date"`` and ``"Provenance"``; each value is
        the extracted string, or ``None`` when the field was not found.

    Raises:
        requests.HTTPError: If the server answers with an error status.
        requests.RequestException: For other request failures (e.g. timeout).
    """
    # Browser-like User-Agent sent with the request.
    headers = {
        "User-Agent": (
            "Mozilla/5.0 (Windows NT 10.0; Win64; x64) "
            "AppleWebKit/537.36 (KHTML, like Gecko) "
            "Chrome/124.0 Safari/537.36"
        )
    }

    resp = requests.get(url, headers=headers, timeout=10)
    resp.raise_for_status()

    soup = BeautifulSoup(resp.text, "html.parser")

    return {
        "Date": _extract_from_division(soup, "Date"),
        "Provenance": _extract_from_division(soup, "Provenance"),
    }


# Script entry point: scrape Trismegistos text ids 1..1000 and dump the
# collected records to a JSON file.
if __name__ == "__main__":
    base_url = "https://www.trismegistos.org/text/{}"
    output_file = "papyrus_1_1000.json"
    results = {}

    try:
        for i in range(1, 1001):  # ids 1 to 1000 inclusive
            url = base_url.format(i)
            print(f"Scraping id {i}: {url}")
            try:
                data = scrap_papyrus(url)
                # Keep the id and URL alongside the scraped fields.
                data["id"] = i
                data["url"] = url
            except Exception as e:
                # A failed page (e.g. a 404) is still recorded, with the
                # error message, so the output has one entry per id tried.
                kind = "HTTP" if isinstance(e, requests.HTTPError) else "générique"
                print(f"  Erreur {kind} pour {url}: {e}")
                data = {
                    "id": i,
                    "url": url,
                    "Date": None,
                    "Provenance": None,
                    "error": str(e),
                }

            results[i] = data

            # Short pause between requests to go easy on the server.
            time.sleep(0.5)
    finally:
        # Write whatever has been collected, even when the run is cut short
        # (Ctrl-C, kernel stop): the loop takes several minutes and the
        # records gathered so far would otherwise be lost.
        with open(output_file, "w", encoding="utf-8") as f:
            json.dump(results, f, ensure_ascii=False, indent=2)

        print(f"Données sauvegardées dans {output_file}")

Scraping id 1: https://www.trismegistos.org/text/1
Scraping id 2: https://www.trismegistos.org/text/2
Scraping id 3: https://www.trismegistos.org/text/3
Scraping id 4: https://www.trismegistos.org/text/4
Scraping id 5: https://www.trismegistos.org/text/5
Scraping id 6: https://www.trismegistos.org/text/6
Scraping id 7: https://www.trismegistos.org/text/7
Scraping id 8: https://www.trismegistos.org/text/8
Scraping id 9: https://www.trismegistos.org/text/9
Scraping id 10: https://www.trismegistos.org/text/10
Scraping id 11: https://www.trismegistos.org/text/11
Scraping id 12: https://www.trismegistos.org/text/12
Scraping id 13: https://www.trismegistos.org/text/13
Scraping id 14: https://www.trismegistos.org/text/14
Scraping id 15: https://www.trismegistos.org/text/15
Scraping id 16: https://www.trismegistos.org/text/16
Scraping id 17: https://www.trismegistos.org/text/17
Scraping id 18: https://www.trismegistos.org/text/18
Scraping id 19: https://www.trismegistos.org/text/19
Scraping id

In [None]:
# Colab-only cell: `google.colab` is not importable outside a Colab runtime.
from google.colab import drive
# Ask Colab to mount Google Drive at /content/drive.
# NOTE(review): nothing visible in this notebook reads or writes under that
# path; presumably the JSON output is meant to be copied there — confirm.
drive.mount('/content/drive')