In [None]:
# Install the scraping dependencies with the %pip magic so they land in the
# environment of the running kernel (a plain `!pip` uses whatever pip the shell finds).
%pip install requests beautifulsoup4 lxml

In [None]:
import requests
import json
import re

# Browser-like request headers sent with the HTTP requests below; meant to avoid being blocked.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}


def _strip_closing_tags(value):
    """Return ``value`` without closing ``i`` / ``a`` tags; non-strings pass through unchanged."""
    if not isinstance(value, str):
        return value
    # Cover both the JSON-escaped spelling (<\/i>) and the decoded one (</i>):
    # once response.json() has decoded the payload only the second form occurs.
    for tag in ("<\\/i>", "<\\/a>", "</i>", "</a>"):
        value = value.replace(tag, "")
    return value


def fetch_and_transform_data(url, timeout=30):
    """Download the corpus inventory as JSON and normalise its records.

    Every string value has its closing ``</i>`` and ``</a>`` tags removed, and
    a string ``docID`` field is reduced to the bare ``CODEA-<digits>``
    identifier when one can be found in it.

    Args:
        url: Address that is expected to return a JSON array of record objects.
        timeout: Seconds to wait for the server before giving up, so a stalled
            connection cannot hang the notebook indefinitely.

    Returns:
        A list of cleaned record dicts, or ``None`` when the request fails,
        the body is not valid JSON, or the payload is not a list.
    """
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()  # turn HTTP error statuses into exceptions
        raw_data = response.json()
    except json.JSONDecodeError as e:
        # Checked first: recent versions of requests raise a decode error that
        # is also a RequestException, which would otherwise be reported as a
        # failed request.
        print(f"Error al parsear los datos JSON: {e}")
        return None
    except requests.exceptions.RequestException as e:
        print(f"Error al realizar la solicitud: {e}")
        return None

    if not isinstance(raw_data, list):
        print("Error al parsear los datos JSON: se esperaba una lista de documentos.")
        return None

    cleaned_data = []
    for item in raw_data:
        if not isinstance(item, dict):
            # Anything that is not a record object cannot be cleaned; skip it.
            continue

        cleaned_item = {key: _strip_closing_tags(value) for key, value in item.items()}

        # Keep only the document identifier from the 'docID' field.
        doc_id = cleaned_item.get("docID")
        if isinstance(doc_id, str):
            match = re.search(r"CODEA-\d+", doc_id)
            if match:
                cleaned_item["docID"] = match.group(0)

        cleaned_data.append(cleaned_item)

    return cleaned_data

# Corpus URL.
url = "https://corpuscodea.es/corpus/corpus2022/inventario2.php"
data = fetch_and_transform_data(url)

if not data:
    # Reached for a failed fetch (None) as well as for an empty result.
    print("No se pudieron obtener los datos.")
else:
    # Store the records as a JSON file, then echo them.
    with open("corpus_data.json", "w", encoding="utf-8") as out_file:
        json.dump(data, out_file, indent=4, ensure_ascii=False)
    print("Datos guardados en 'corpus_data.json'.")
    print(json.dumps(data, indent=4, ensure_ascii=False))

In [None]:
import os
import requests
import json
from bs4 import BeautifulSoup

# Browser-like request headers sent with the HTTP requests below; meant to avoid being blocked.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

def create_document_structure(doc_id, data, output_dir="Documents"):
    """Write a document's data to ``<output_dir>/<doc_id>.json``.

    Args:
        doc_id: Identifier used as the file name (without extension).
        data: JSON-serialisable content to store.
        output_dir: Folder that receives the file; created, including missing
            parents, when it does not exist. Defaults to ``"Documents"``.
    """
    os.makedirs(output_dir, exist_ok=True)

    file_path = os.path.join(output_dir, f"{doc_id}.json")
    with open(file_path, "w", encoding="utf-8") as file:
        # ensure_ascii=False keeps accented characters readable in the file.
        json.dump(data, file, indent=4, ensure_ascii=False)

def fetch_document_details(doc_id, timeout=30):
    """Fetch a document page and extract one entry per text row.

    The page is expected to contain a ``<table id="textos">`` whose
    ``tr.tr-texto`` rows hold at least two cells: the first supplies the image
    and the paleographic text, the second the critical text.

    Args:
        doc_id: Document identifier inserted into the request URL.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        A list of dicts with the keys ``image_url``, ``paleographic_text`` and
        ``critical_text``. The list is empty when the request fails or the
        table is missing.
    """
    url = f"https://corpuscodea.es/corpus/corpus2022/documento.php?documento={doc_id}&loc=undefined&paleografica=off&abreviaturas=off&mayusculas=off&busqueda="
    try:
        response = requests.get(url, headers=HEADERS, timeout=timeout)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Error al acceder al documento {doc_id}: {e}")
        return []

    soup = BeautifulSoup(response.text, "html.parser")
    table = soup.find("table", {"id": "textos"})
    if table is None:
        print(f"No se encontró tabla en el documento {doc_id}")
        return []

    # html.parser only yields a <tbody> when the markup contains one, so fall
    # back to searching the table itself instead of failing on None.
    body = table.find("tbody")
    if body is None:
        body = table

    pages = []
    for row in body.find_all("tr", class_="tr-texto"):
        cols = row.find_all("td")
        if len(cols) < 2:
            continue

        # Image URL: only built when the cell has an <img> with a src attribute.
        image_tag = cols[0].find("img")
        image_src = image_tag.get("src") if image_tag is not None else None
        image_url = 'https://corpuscodea.es/corpus/corpus2022/' + image_src if image_src else None

        pages.append({
            "image_url": image_url,
            "paleographic_text": cols[0].get_text(strip=True),
            "critical_text": cols[1].get_text(strip=True),
        })

    return pages

def process_corpus_data(corpus_file):
    """Read the corpus JSON file and fetch and store the pages of every document.

    Entries without a ``docID`` are reported and skipped. A missing or
    unparsable file is reported on stdout instead of raising.
    """
    try:
        with open(corpus_file, "r", encoding="utf-8") as handle:
            entries = json.load(handle)

        for entry in entries:
            doc_id = entry.get("docID")
            if doc_id:
                print(f"Procesando documento {doc_id}...")
                pages = fetch_document_details(doc_id)
                page_count = len(pages)
                # Singular wording only for exactly one page.
                if page_count != 1:
                    print(f"Se encontraron {page_count} páginas para el documento {doc_id}.")
                else:
                    print(f"Se encontró {page_count} página para el documento {doc_id}.")

                create_document_structure(doc_id, pages)
            else:
                print("Documento sin ID encontrado, omitiendo...")

    except FileNotFoundError:
        print(f"El archivo {corpus_file} no fue encontrado.")
    except json.JSONDecodeError:
        print(f"Error al leer el archivo JSON {corpus_file}.")

# Entry point: fetch and store the pages of every document listed in the corpus file.
# NOTE: inside a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    # Same file name the earlier fetch step writes its results to.
    corpus_file = "corpus_data.json"
    process_corpus_data(corpus_file)


In [None]:
import os
import json
import requests

# Browser-like request headers sent with the HTTP requests below; meant to avoid being blocked.
HEADERS = {
    "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/91.0.4472.124 Safari/537.36",
    "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8",
    "Accept-Language": "en-US,en;q=0.9",
    "Connection": "keep-alive",
    "Upgrade-Insecure-Requests": "1",
}

def download_image(image_url, save_path, timeout=30):
    """Download an image and store it at ``save_path``.

    The body is streamed into ``<save_path>.part`` and moved into place only
    after the transfer has finished, so a failed download neither leaves a
    truncated image behind nor destroys a file that already exists at
    ``save_path``.

    Args:
        image_url: Address of the image.
        save_path: Destination file path.
        timeout: Seconds to wait for the server before giving up.

    Returns:
        ``save_path`` on success, ``None`` on any failure (the error is
        printed; nothing is raised, so callers can carry on with other images).
    """
    partial_path = f"{save_path}.part"
    try:
        # The context manager releases the streamed connection even on errors.
        with requests.get(image_url, stream=True, headers=HEADERS, timeout=timeout) as response:
            response.raise_for_status()
            with open(partial_path, "wb") as file:
                for chunk in response.iter_content(1024):
                    if chunk:  # skip empty chunks
                        file.write(chunk)
        os.replace(partial_path, save_path)
        return save_path
    except Exception as e:
        # Deliberately broad: a single bad image must not abort the whole run.
        print(f"Error descargando la imagen: {e}")
        if os.path.exists(partial_path):
            try:
                os.remove(partial_path)
            except OSError:
                pass
        return None

def process_json_files(documents_folder):
    """Download the images referenced by each JSON file in ``documents_folder``.

    Every page entry gains a ``local_image_path`` key (the downloaded file's
    path, or ``None`` when there is no image or the download failed) and the
    JSON file is rewritten in place. A failure in one file is printed and the
    remaining files are still processed.
    """
    for entry_name in os.listdir(documents_folder):
        if not entry_name.endswith(".json"):
            continue

        try:
            print(f"Procesando {entry_name}...")
            json_path = os.path.join(documents_folder, entry_name)
            with open(json_path, "r", encoding="utf-8") as source:
                pages = json.load(source)

            enriched = []
            for page in pages:
                remote_url = page.get("image_url")
                if remote_url:
                    # Images are stored next to the JSON files, named after the URL's last segment.
                    target = os.path.join(documents_folder, os.path.basename(remote_url))
                    page["local_image_path"] = download_image(remote_url, target)
                else:
                    page["local_image_path"] = None
                enriched.append(page)

            # Write the pages back together with their local image paths.
            with open(json_path, "w", encoding="utf-8") as sink:
                json.dump(enriched, sink, indent=4, ensure_ascii=False)
        except Exception as e:
            print(f"Error procesando el documento: {e}")


# Entry point: download the images referenced by the JSON files in the documents folder.
# NOTE: inside a notebook kernel __name__ is "__main__", so this runs whenever the cell is executed.
if __name__ == "__main__":
    # Folder holding the per-document JSON files; images are saved into it as well.
    documents_folder = "Documents"
    process_json_files(documents_folder)
