In [None]:
import fitz  # PyMuPDF
import os

def extract_texts_from_folder(folder_path):
    """Extract the text of every PDF located directly inside ``folder_path``.

    Files are selected by a case-insensitive ``.pdf`` suffix and visited in
    sorted name order, so repeated runs build the dictionary in the same
    order. A file that fails to open or parse is reported on stdout and
    skipped; the remaining files are still processed.

    Args:
        folder_path: Directory to scan (not recursive).

    Returns:
        dict mapping file name -> list of stripped page texts. Pages whose
        text is empty after stripping are omitted, so an index in the list
        does not necessarily match the page's position in the PDF.

    Raises:
        OSError: if ``folder_path`` cannot be listed (e.g. it does not exist).
    """
    pdf_texts = {}

    # Sorted for a reproducible order; os.listdir() order is arbitrary.
    for filename in sorted(os.listdir(folder_path)):
        if not filename.lower().endswith(".pdf"):
            continue

        full_path = os.path.join(folder_path, filename)
        try:
            doc = fitz.open(full_path)
            try:
                # Extract each page exactly once, then drop the blank ones.
                stripped_pages = (page.get_text().strip() for page in doc)
                pdf_texts[filename] = [text for text in stripped_pages if text]
            finally:
                # Release the file handle even if extraction fails midway.
                doc.close()
        except Exception as e:
            # Best effort: one broken PDF must not abort the whole folder.
            print(f"Error procesando {filename}: {e}")

    return pdf_texts  # Dictionary: file name -> list of page texts

# Run the extraction against the local 'PDF' folder.
pdf_folder = "PDF"
pdf_text_data = extract_texts_from_folder(pdf_folder)

# Preview: show only the first extracted document.
for nombre, paginas in pdf_text_data.items():
    print(f"{nombre} - {len(paginas)} páginas")
    # A PDF without extractable text (e.g. a scanned document) produces an
    # empty list, so guard the index instead of raising IndexError.
    if paginas:
        print(paginas[0][:300])  # First characters of the first page
    break


In [None]:
import weaviate
from weaviate.classes.config import Property, DataType

with weaviate.connect_to_local() as client:
    collections = client.collections

    # Start from a clean slate: drop any previous "PdfPage" collection.
    if collections.exists("PdfPage"):
        collections.delete("PdfPage")

    # Schema: page text, originating file name and page number.
    schema = [
        Property(name="content", data_type=DataType.TEXT),
        Property(name="source", data_type=DataType.TEXT),
        Property(name="page_number", data_type=DataType.INT),
    ]

    # Create the new collection.
    collections.create(name="PdfPage", properties=schema)

    print("✅ Colección 'PdfPage' creada correctamente.")


In [None]:

# Load every extracted page into the "PdfPage" collection.
with weaviate.connect_to_local() as client:
    collection = client.collections.get("PdfPage")

    # Use the client's batching API instead of one request per page.
    with collection.batch.dynamic() as batch:
        for nombre_archivo, paginas in pdf_text_data.items():
            for i, texto in enumerate(paginas):
                batch.add_object(properties={
                    "content": texto,
                    "source": nombre_archivo,
                    # 1-based position within the extracted list of pages.
                    "page_number": i + 1
                })

    # Batch errors are collected rather than raised, so check them explicitly
    # before reporting success.
    failed = collection.batch.failed_objects
    if failed:
        raise RuntimeError(
            f"No se pudieron cargar {len(failed)} páginas en Weaviate; primer error: {failed[0]}"
        )

print("📚 Todos los PDFs fueron cargados en Weaviate correctamente.")

In [None]:
import weaviate
from weaviate.classes.query import Filter

# Keyword to look for inside the stored page texts.
palabra_clave = "cliente"

with weaviate.connect_to_local() as client:
    collection = client.collections.get("PdfPage")

    # Wildcards on both sides: the keyword may appear anywhere in the content.
    patron = f"*{palabra_clave}*"
    filtro = Filter.by_property("content").like(patron)

    # Fetch at most five matching pages.
    resultados = collection.query.fetch_objects(filters=filtro, limit=5)

# The results are already in memory, so they can be printed after the
# connection has been closed.
for obj in resultados.objects:
    props = obj.properties
    print(f"{props['source']} (Página {props['page_number']}):")
    print(props['content'][:3000], "\n---\n")
