In [None]:
import json
import os

import pdfplumber
import PyPDF2
from langchain.schema import Document
from langchain.text_splitter import CharacterTextSplitter
from pdfplumber.table import TableFinder

pdf_folder = "/content/drive/My Drive/clase 3 carrera/PROY3/Archivos/"

In [None]:
def _words_to_text(col_words, y_tolerance=3):
    """Join pdfplumber word dicts into a single string in reading order.

    Words are first ordered by ``top``. A word whose ``top`` lies within
    ``y_tolerance`` of the first word of the current line is treated as part
    of that line; each line is then ordered left to right by ``x0``.
    Sorting on the raw ``(top, x0)`` pair instead would put a word ahead of
    its left-hand neighbours whenever its ``top`` is even marginally smaller.

    Args:
        col_words: Iterable of word dicts with ``text``, ``top`` and ``x0`` keys.
        y_tolerance: Maximum difference in ``top`` for two words to share a line.

    Returns:
        The words' text joined with single spaces ("" when there are no words).
    """
    lines = []
    for word in sorted(col_words, key=lambda w: w['top']):
        # Compare against the line's first word so the tolerance cannot drift
        # downwards through a chain of nearly-adjacent words.
        if lines and word['top'] - lines[-1][0]['top'] <= y_tolerance:
            lines[-1].append(word)
        else:
            lines.append([word])
    return " ".join(
        word['text']
        for line in lines
        for word in sorted(line, key=lambda w: w['x0'])
    )


async def load_pdf_by_language(
    file_path: str,
    column_margin: float = 15,
    y_tolerance: float = 3,
) -> list[Document]:
    """
    Load a two-column PDF and split its content by language.

    Content on the left half of each page is collected as Valencian and
    content on the right half as Spanish; everything belonging to one
    language is merged into a single document.

    Note: this coroutine contains no ``await``; the PDF is parsed
    synchronously once the coroutine is awaited.

    Args:
        file_path: Path of the PDF to read.
        column_margin: Half-width of the central gutter. Words whose
            horizontal centre lies within ``column_margin`` of the page
            midline are assigned to neither column and are dropped.
        y_tolerance: Maximum difference in ``top`` for two words to be
            considered part of the same text line when rebuilding the text.

    Returns:
        A two-element list: the Valencian document followed by the Spanish
        one. Each carries ``source`` and ``language`` metadata.
    """
    # Line-based table detection; identical for every page, so built once.
    table_settings = {
        "vertical_strategy": "lines",
        "horizontal_strategy": "lines",
        "snap_tolerance": 4,
        "join_tolerance": 10
    }

    valencia_content = []
    castellano_content = []

    with pdfplumber.open(file_path) as pdf:
        for page_num, page in enumerate(pdf.pages, start=1):
            mid_x = page.width / 2

            # Tables first: render each one as pipe-separated rows and assign
            # it to a language by the horizontal centre of its bounding box.
            table_bboxes = []
            for table in TableFinder(page, table_settings).tables:
                rows = [
                    # Missing cells (None) become empty fields.
                    "|".join("" if cell is None else str(cell) for cell in row)
                    for row in table.extract()
                ]
                table_content = "\n".join(rows)
                table_center = (table.bbox[0] + table.bbox[2]) / 2

                target = valencia_content if table_center < mid_x else castellano_content
                target.append(f"Tabla página {page_num}:\n{table_content}")

                table_bboxes.append(table.bbox)

            # Regular text: keep only words not fully contained in a table
            # bounding box, so table text is not emitted twice.
            non_table_words = [
                word for word in page.extract_words()
                if not any(
                    (word['x0'] >= t[0] and word['x1'] <= t[2] and
                     word['top'] >= t[1] and word['bottom'] <= t[3])
                    for t in table_bboxes
                )
            ]

            # Split into columns, leaving a gutter around the midline.
            left_col = []
            right_col = []
            for word in non_table_words:
                word_center = (word['x0'] + word['x1']) / 2
                if word_center < mid_x - column_margin:
                    left_col.append(word)
                elif word_center > mid_x + column_margin:
                    right_col.append(word)
                # Words centred inside the gutter are intentionally skipped.

            # Skip columns with no text so the joined content has no blank
            # segments for empty pages.
            for target, column in (
                (valencia_content, left_col),
                (castellano_content, right_col),
            ):
                text = _words_to_text(column, y_tolerance)
                if text:
                    target.append(text)

    # One final document per language.
    return [
        Document(
            page_content="\n\n".join(valencia_content),
            metadata={"source": file_path, "language": "valencia"}
        ),
        Document(
            page_content="\n\n".join(castellano_content),
            metadata={"source": file_path, "language": "castellano"}
        )
    ]

In [None]:
file_path = "/content/drive/My Drive/clase 3 carrera/PROY3/Archivos/U0957354.pdf"
documents = await load_pdf_by_language(file_path)

# Acceder a los documentos por idioma
doc_valencia = documents[0]
doc_castellano = documents[1]

print(f"Contenido en Valenciano ({len(doc_valencia.page_content)} caracteres):")
print(json.dumps(doc_valencia, indent=4, default=lambda o: o.__dict__))
print(f"\nContenido en Castellano ({len(doc_castellano.page_content)} caracteres):")
print(json.dumps(doc_castellano, indent=4, default=lambda o: o.__dict__))