PyMuPDF LangChain Chunking Ollama   

MCP recibe comando: "ingesta PDF"
PyMuPDF extrae texto
Chunking
Ollama genera embeddings
Se almacenan en ChromaDB
Más adelante, Claude pide información:

El MCP consulta ChromaDB
Devuelve chunks relevantes
Claude genera la respuesta final

In [None]:
import os
import fitz
from langchain_text_splitters import RecursiveCharacterTextSplitter
from chromadb import Client
import subprocess
import json
import uuid
import chromadb
import uuid
import requests

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
CHROMA_DB_PATH = "./chroma_db"
EMBEDDING_MODEL = "nomic-embed-text"
COLLECTION_NAME = "pdf_docs"

In [3]:
def extract_text_and_metadata(pdf_path):
    doc = fitz.open(pdf_path)

    metadata = doc.metadata
    text = ""

    for page in doc:
        text += page.get_text()

    return text, metadata


def chunk_text(text, chunk_size=800, chunk_overlap=150):
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap
    )
    return splitter.split_text(text)

def generate_embedding_ollama(text):
    url = "http://localhost:11434/api/embeddings"
    payload = {
        "model": EMBEDDING_MODEL,
        "prompt": text
    }

    response = requests.post(url, json=payload)

    if response.status_code != 200:
        raise RuntimeError(f"Error de Ollama: {response.text}")

    data = response.json()
    return data["embedding"]

In [4]:

def store_in_chromadb(chunks, metadata, pdf_name):
    client = chromadb.PersistentClient(path=CHROMA_DB_PATH)

    collection = client.get_or_create_collection(
        name=COLLECTION_NAME,
        metadata={"hnsw:space": "cosine"},
        embedding_function=None
    )

    for chunk in chunks:
        emb = generate_embedding_ollama(chunk)

        collection.add(
            ids=[str(uuid.uuid4())],
            metadatas=[{
                "source_pdf": pdf_name,
                "pdf_metadata":  json.dumps(metadata, ensure_ascii=False)
            }],
            documents=[chunk],
            embeddings=[emb]
        )

    print("✓ Datos guardados en ChromaDB (API moderna)")

In [5]:
def ingest_pdf(pdf_path):
    if not os.path.exists(pdf_path):
        raise FileNotFoundError(f"No se encontró el PDF: {pdf_path}")

    print(f"→ Ingestando PDF: {pdf_path}")

    text, metadata = extract_text_and_metadata(pdf_path)
    print("✓ Texto y metadatos extraídos")

    chunks = chunk_text(text)
    print(f"✓ Chunks creados: {len(chunks)}")

    store_in_chromadb(chunks, metadata, os.path.basename(pdf_path))

    print("\n→ Ingesta completa.")
    print("Metadatos del PDF:")
    print(json.dumps(metadata, indent=4))

In [8]:
PDF_FILE = "./Documentos/CobiT4_Espanol.pdf"
ingest_pdf(PDF_FILE)

→ Ingestando PDF: ./Documentos/CobiT4_Espanol.pdf
✓ Texto y metadatos extraídos
✓ Chunks creados: 1037
✓ Datos guardados en ChromaDB (API moderna)

→ Ingesta completa.
Metadatos del PDF:
{
    "format": "PDF 1.5",
    "title": "4",
    "author": "JAVIER DIAZ LOPEZ",
    "subject": "",
    "keywords": "",
    "creator": "Microsoft\u00ae Office Word 2007 Versi\u00f3n de Evaluaci\u00f3n",
    "producer": "Microsoft\u00ae Office Word 2007 Versi\u00f3n de Evaluaci\u00f3n",
    "creationDate": "D:20070328122601",
    "modDate": "D:20070328122601",
    "trapped": "",
    "encryption": null
}
