In [1]:
import sys
import os

PROJECT_ROOT = "/Users/kevingarrison/Code Projects/Private Projects/Agents/Langchain_LECL/langchain-lecl"
SRC_PATH = os.path.join(PROJECT_ROOT, "src")

if SRC_PATH not in sys.path:
    sys.path.append(SRC_PATH)

In [None]:
from langchain.agents.middleware import dynamic_prompt, ModelRequest
from langchain_core.vectorstores import InMemoryVectorStore
from langchain.chat_models import init_chat_model
from langchain_core.prompts import PromptTemplate
from langchain_docling.loader import ExportType
from langchain_openai import OpenAIEmbeddings
from docling.chunking import HybridChunker
from langchain.agents import create_agent
from langchain.tools import tool
from dotenv import load_dotenv
from typing import Literal


load_dotenv()

FILE_PATH = ["/Users/kevingarrison/Code Projects/Private Projects/Agents/Langchain_LECL/langchain-lecl/notebooks/Bachelorarbeit_Kevin_Garrison_85826.pdf"]
EMBED_MODEL_ID = "sentence-transformers/all-MiniLM-L6-v2"
GEN_MODEL_ID = "Qwen/Qwen2.5-7B-Instruct"
EXPORT_TYPE = ExportType.DOC_CHUNKS
QUESTION = "Which are the main AI models in Docling?"
PROMPT = PromptTemplate.from_template(
    "Context information is below.\n---------------------\n{context}\n---------------------\nGiven the context information and not prior knowledge, answer the query.\nQuery: {input}\nAnswer:\n",
)
TOP_K = 3

In [3]:
from langchain_docling import DoclingLoader

from docling.chunking import HybridChunker

loader = DoclingLoader(
    file_path=FILE_PATH,
    export_type=EXPORT_TYPE,
    chunker=HybridChunker(tokenizer=EMBED_MODEL_ID),
)

docs = loader.load()

2025-11-24 12:22:02,599 - INFO - detected formats: [<InputFormat.PDF: 'pdf'>]
2025-11-24 12:22:02,638 - INFO - Going to convert document batch...
2025-11-24 12:22:02,639 - INFO - Initializing pipeline for StandardPdfPipeline with options hash 44ae89a68fc272bc7889292e9b5a1bad
2025-11-24 12:22:02,644 - INFO - Loading plugin 'docling_defaults'
2025-11-24 12:22:02,646 - INFO - Registered picture descriptions: ['vlm', 'api']
2025-11-24 12:22:02,649 - INFO - Loading plugin 'docling_defaults'
2025-11-24 12:22:02,652 - INFO - Registered ocr engines: ['auto', 'easyocr', 'ocrmac', 'rapidocr', 'tesserocr', 'tesseract']
2025-11-24 12:22:03,095 - INFO - Auto OCR model selected ocrmac.
2025-11-24 12:22:03,099 - INFO - Accelerator device: 'mps'
2025-11-24 12:22:03,771 - INFO - Accelerator device: 'mps'
2025-11-24 12:22:04,043 - INFO - Processing document Bachelorarbeit_Kevin_Garrison_85826.pdf
2025-11-24 12:22:19,968 - INFO - Finished converting document Bachelorarbeit_Kevin_Garrison_85826.pdf in 17.

In [4]:
docs

[Document(metadata={'source': '/Users/kevingarrison/Code Projects/Private Projects/Agents/Langchain_LECL/langchain-lecl/notebooks/Bachelorarbeit_Kevin_Garrison_85826.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/0', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 103.33333305833334, 't': 802.6580558356798, 'r': 125.33333305833332, 'b': 797.3384682068138, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 9]}]}, {'self_ref': '#/texts/1', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 1, 'bbox': {'l': 11.999999324444465, 't': 742.1477465717364, 'r': 214.66666599111113, 'b': 722.199292963489, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 7]}]}, {'self_ref': '#/texts/2', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'pa

In [5]:
if EXPORT_TYPE == ExportType.DOC_CHUNKS:
    splits = docs
elif EXPORT_TYPE == ExportType.MARKDOWN:
    from langchain_text_splitters import MarkdownHeaderTextSplitter

    splitter = MarkdownHeaderTextSplitter(
        headers_to_split_on=[
            ("#", "Header_1"),
            ("##", "Header_2"),
            ("###", "Header_3"),
        ],
    )
    splits = [split for doc in docs for split in splitter.split_text(doc.page_content)]
else:
    raise ValueError(f"Unexpected export type: {EXPORT_TYPE}")

In [6]:
for d in splits[:3]:
    print(f"- {d.page_content=}")
print("...")

- d.page_content='STUTTGART\nPORSCHe\nBachelorarbeit Studiengang : Data Science'
- d.page_content='Einsatz von Large Language Models (LLM) zur Extraktion und Strukturierung von Zolldokumenten: Ein KI-gestützter Ansatz zur automatisierten Datenverarbeitung\nbei Porsche AG von Kevin Garrison\n85826\nBetreuender Professor: Prof. Dr. Winfried Bantel Zweitprüfer : Prof. Dr. Tim Dahmen\nEinreichungsdatum : 14. August 2025'
- d.page_content='Angaben zur Firma\nUnternehmen :\nPorsche AG\nBranche :\nAutomobilbranche Finanzstrategie & Data Science Porscheplatz 1 D - 70435 Stuttgart\nAbteilung :\nAdresse :\nBetreuerin :\nMaike Klepsch (FOD) (+49) 0 152 3 911 0075 maike.klepsch@porsche.de\nTelefon :\nE-Mail :'
...


In [7]:
print(f"Split blog post into {len(splits)} sub-documents.")

Split blog post into 327 sub-documents.


In [8]:
embeddings = OpenAIEmbeddings(model="text-embedding-3-large")


vector_store = InMemoryVectorStore(embeddings)


document_ids = vector_store.add_documents(documents=splits)

print(document_ids[:3])

2025-11-24 12:22:23,630 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


['79119573-876e-427a-97cd-3d143e9a715c', 'd8938eac-196a-46bd-9fdc-c6cf44a9e35d', 'a852a9eb-d966-4ae9-8db4-a08b6fcdab2c']


In [9]:
@tool(response_format="content_and_artifact")
def retrieve_context(query: str):
    """Retrieve information to help answer a query."""
    retrieved_docs = vector_store.similarity_search(query, k=TOP_K)
    serialized = "\n\n".join(
        (f"Source: {doc.metadata}\nContent: {doc.page_content}")
        for doc in retrieved_docs
    )
    return serialized, retrieved_docs

In [10]:
tools = [retrieve_context]
# If desired, specify custom instructions
prompt = (
    "You have access to a tool that retrieves context from a blog post. "
    "Use the tool to help answer user queries."
)

model = init_chat_model("gpt-4.1")

agent = create_agent(model, tools, system_prompt=prompt)

In [11]:
query = (
    "What are the key findings of the paper?\n\n"
    "Once you get the answer, retrive the ."
)

for event in agent.stream(
    {"messages": [{"role": "user", "content": query}]},
    stream_mode="values",
):
    event["messages"][-1].pretty_print()


What are the key findings of the paper?

Once you get the answer, retrive the .


huggingface/tokenizers: The current process just got forked, after parallelism has already been used. Disabling parallelism to avoid deadlocks...
	- Avoid using `tokenizers` before the fork if possible
	- Explicitly set the environment variable TOKENIZERS_PARALLELISM=(true | false)
2025-11-24 12:22:25,164 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"


Tool Calls:
  retrieve_context (call_u8KlSG3SIRt3IjXynA8zKspj)
 Call ID: call_u8KlSG3SIRt3IjXynA8zKspj
  Args:
    query: key findings of the paper


2025-11-24 12:22:25,400 - INFO - HTTP Request: POST https://api.openai.com/v1/embeddings "HTTP/1.1 200 OK"


Name: retrieve_context

Source: {'source': '/Users/kevingarrison/Code Projects/Private Projects/Agents/Langchain_LECL/langchain-lecl/notebooks/Bachelorarbeit_Kevin_Garrison_85826.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker.DocMeta', 'version': '1.0.0', 'doc_items': [{'self_ref': '#/texts/27', 'parent': {'$ref': '#/body'}, 'children': [], 'content_layer': 'body', 'label': 'text', 'prov': [{'page_no': 5, 'bbox': {'l': 88.931, 't': 642.6360146484375, 'r': 507.798, 'b': 511.0610146484375, 'coord_origin': 'BOTTOMLEFT'}, 'charspan': [0, 750]}]}], 'headings': ['Kurzfassung'], 'origin': {'mimetype': 'application/pdf', 'binary_hash': 18027861618462669308, 'filename': 'Bachelorarbeit_Kevin_Garrison_85826.pdf'}}}
Content: Kurzfassung
Ergebnisse hinzunehmen.

Source: {'source': '/Users/kevingarrison/Code Projects/Private Projects/Agents/Langchain_LECL/langchain-lecl/notebooks/Bachelorarbeit_Kevin_Garrison_85826.pdf', 'dl_meta': {'schema_name': 'docling_core.transforms.chunker

2025-11-24 12:22:29,159 - INFO - HTTP Request: POST https://api.openai.com/v1/chat/completions "HTTP/1.1 200 OK"



Here are the key findings of the paper:

- The LLM-based (Large Language Model-based) approach is more robust to faulty input data compared to classical systems, thanks to its ability for contextual interpretation.
- The best results in micro-metrics were a recall of 0.53 and an F1-score of 0.60, achieved with exact numerical matches and a Levenshtein threshold above 90% for string comparison with test data.
- The LLM approach shows a higher generalization capability but also has limitations regarding reproducibility and extraction precision.
- The performance of the system is strongly influenced by the quality and structure of underlying data.
- The extraction pipeline, in its current state, is not robust enough to fully and automatically replace processes critical for customs operations.

If you need more detailed findings or another section, please specify!
