In [None]:
# Install (or upgrade) the Docling integration for LangChain, which provides DoclingLoader.
%pip install -qU langchain-docling

In [None]:
# Install the supporting LangChain packages and python-dotenv
# (quiet mode, no progress bar, dependency-conflict warnings suppressed).
%pip install -q --progress-bar off --no-warn-conflicts langchain-core langchain-huggingface langchain_milvus langchain python-dotenv

In [None]:
import os

# Switch off Hugging Face tokenizers parallelism; background in
# https://github.com/huggingface/transformers/issues/5486
os.environ.update({"TOKENIZERS_PARALLELISM": "false"})

In [None]:
from langchain_docling import DoclingLoader

# Source document for the loader: a PDF served from arXiv.
FILE_PATH = "https://arxiv.org/pdf/2408.09869"
# Alternative source: a PDF in the working directory.
# FILE_PATH = "2408.pdf"

# Loader bound to FILE_PATH; the documents themselves are produced in the next cell.
loader = DoclingLoader(file_path=FILE_PATH)

In [None]:
# Load the source and split it into LangChain documents.
# NOTE(review): `load_and_split` applies an additional text splitter on top of whatever the
# loader itself returns — confirm this is wanted rather than a plain `loader.load()`.
docs = loader.load_and_split()

In [None]:
# Display the full list of loaded documents (the output can be long).
docs

In [None]:
# Print the text of the last document in the list.
print(docs[-1].page_content)

In [None]:
# Number of documents produced by the loader.
len(docs)

In [None]:
# Preview the first three documents. The `=` specifier in the f-string prints the
# expression text itself (`d.page_content=`) followed by the repr of its value.
for d in docs[:3]:
    print(f"- {d.page_content=}")

In [None]:
%pip install -Uq "unstructured[all-docs]" pillow lxml pillow
%pip install -Uq chromadb tiktoken
%pip install -Uq langchain langchain-community langchain-openai langchain-groq
%pip install -Uq python_dotenv

In [None]:
from unstructured.partition.pdf import partition_pdf

# Paths are relative to the notebook's working directory, like the other cells that open
# "2408.pdf", so the notebook is not tied to one machine's user profile.
output_path = "doc/"     # only relevant if image_output_dir_path below is enabled
file_path = "2408.pdf"

# Partition the PDF into elements and chunk them by section title.
# Reference: https://docs.unstructured.io/open-source/core-functionality/chunking
chunks = partition_pdf(
    filename=file_path,
    infer_table_structure=True,            # extract tables
    strategy="hi_res",                     # mandatory to infer tables

    # extract_image_block_types=["Image"],   # add 'Table' to the list to also extract table images
    # image_output_dir_path=output_path,     # if None, images and tables are kept as base64

    # NOTE(review): with extract_image_block_types commented out, this flag may have no
    # effect — confirm against the Unstructured documentation.
    extract_image_block_to_payload=True,   # if true, will extract base64 for API usage

    chunking_strategy="by_title",          # or 'basic'
    max_characters=10000,                  # defaults to 500
    combine_text_under_n_chars=2000,       # defaults to 0
    new_after_n_chars=6000,

    # extract_images_in_pdf=True,          # deprecated
)

In [None]:
# Distinct element classes (as strings) among the chunks returned by partition_pdf.
{str(type(element)) for element in chunks}

In [None]:
from docling.document_converter import DocumentConverter
from langchain_community.document_loaders import TextLoader, ToMarkdownLoader
from langchain.text_splitter import CharacterTextSplitter, RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

# Step 1: Convert the PDF with Docling.
converter = DocumentConverter()
result = converter.convert("2408.pdf")
# Optional (disabled): persist the Markdown export to "output1.md", the file name that a
# later "Step 2" cell tries to load.
# with open("output1.md", "w", encoding='utf-8') as f:
#     f.write(result.document.export_to_markdown(mark_annotations=True))


In [None]:
from docling.document_converter import DocumentConverter
import pandas as pd

In [None]:
# Convert "2408.pdf" with a fresh DocumentConverter; `conv_res` is the result that the
# table-export cells read from.
# NOTE(review): the same file is already converted into `result` in an earlier cell, so
# this repeats that conversion — consider reusing one result.
doc_converter = DocumentConverter()
conv_res = doc_converter.convert("2408.pdf")

In [None]:
# Render the converted document as Markdown. The method must be called: the bare attribute
# only displays the bound-method repr, not the document text.
print(conv_res.document.export_to_markdown())

In [None]:
# Export only the first detected table: turn it into a DataFrame and print it as Markdown.
# When the document has no tables, nothing is printed and `table_df` is left undefined.
first_table = next(iter(conv_res.document.tables), None)
if first_table is not None:
    table_ix, table = 0, first_table
    table_df: pd.DataFrame = table.export_to_dataframe()
    print(f"## Table {table_ix}\n\n")
    print(table_df.to_markdown())
    # To persist the table instead, write table_df.to_csv(<path>) or save the string from
    # table.export_to_html(doc=conv_res.document) to an .html file.

In [57]:
# Print the first table as a Markdown table (rendered output follows).
print(table_df.to_markdown())

|    | CPU.                             | Thread budget.   | native backend.TTS   | native backend.Pages/s   | native backend.Mem   | pypdfium backend.TTS   | pypdfium backend.Pages/s   | pypdfium backend.Mem   |
|---:|:---------------------------------|:-----------------|:---------------------|:-------------------------|:---------------------|:-----------------------|:---------------------------|:-----------------------|
|  0 | Apple M3 Max                     | 4                | 177 s 167 s          | 1.27 1.34                | 6.20 GB              | 103 s 92 s             | 2.18 2.45                  | 2.56 GB                |
|  1 | (16 cores) Intel(R) Xeon E5-2690 | 16 4 16          | 375 s 244 s          | 0.60 0.92                | 6.16 GB              | 239 s 143 s            | 0.94 1.57                  | 2.42 GB                |


In [None]:
# Few-shot system prompt asking the model to break a passage into standalone propositions
# and return them as a JSON list of strings.
# Fixes relative to the earlier text: the example title is spelled "Ēostre" (it had been
# garbled to "¯Eostre"), and the example Output list is now closed with "]" so the model
# sees a complete list to imitate.
prompt = """Decompose the "Content" into clear and simple propositions, ensuring they are interpretable out of
context.
1. Split compound sentence into simple sentences. Maintain the original phrasing from the input
whenever possible.
2. For any named entity that is accompanied by additional descriptive information, separate this
information into its own distinct proposition.
3. Decontextualize the proposition by adding necessary modifier to nouns or entire sentences
and replacing pronouns (e.g., "it", "he", "she", "they", "this", "that") with the full name of the
entities they refer to.
4. Present the results as a list of strings, formatted in JSON.

Example:

Input: Title: Ēostre. Section: Theories and interpretations, Connection to Easter Hares. Content:
The earliest evidence for the Easter Hare (Osterhase) was recorded in south-west Germany in
1678 by the professor of medicine Georg Franck von Franckenau, but it remained unknown in
other parts of Germany until the 18th century. Scholar Richard Sermon writes that "hares were
frequently seen in gardens in spring, and thus may have served as a convenient explanation for the
origin of the colored eggs hidden there for children. Alternatively, there is a European tradition
that hares laid eggs, since a hare’s scratch or form and a lapwing’s nest look very similar, and
both occur on grassland and are first seen in the spring. In the nineteenth century the influence
of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular throughout Europe.
German immigrants then exported the custom to Britain and America where it evolved into the
Easter Bunny."
Output: [ "The earliest evidence for the Easter Hare was recorded in south-west Germany in
1678 by Georg Franck von Franckenau.", "Georg Franck von Franckenau was a professor of
medicine.", "The evidence for the Easter Hare remained unknown in other parts of Germany until
the 18th century.", "Richard Sermon was a scholar.", "Richard Sermon writes a hypothesis about
the possible explanation for the connection between hares and the tradition during Easter", "Hares
were frequently seen in gardens in spring.", "Hares may have served as a convenient explanation
for the origin of the colored eggs hidden in gardens for children.", "There is a European tradition
that hares laid eggs.", "A hare’s scratch or form and a lapwing’s nest look very similar.", "Both
hares and lapwing’s nests occur on grassland and are first seen in the spring.", "In the nineteenth
century the influence of Easter cards, toys, and books was to make the Easter Hare/Rabbit popular
throughout Europe.", "German immigrants exported the custom of the Easter Hare/Rabbit to
Britain and America.", "The custom of the Easter Hare/Rabbit evolved into the Easter Bunny in
Britain and America." ]
"""

In [None]:
# Show the Markdown rendering of the table that is sent to the model as user content.
print(table_df.to_markdown())

In [None]:
# Chat-style input for the LLM as a list of (role, content) pairs.
# The earlier flat 4-tuple of strings did not pair roles with contents, so the literal
# words "system" and "user" were passed along as message text instead of acting as roles.
# A separator is also added so the instruction no longer runs straight into the table header.
# NOTE: the name `input` is kept because later cells reference it, although it shadows
# the `input()` builtin in this kernel.
input = [
    ("system", prompt),
    ("user", "Decompose the following:\n\n" + table_df.to_markdown()),
]

In [None]:
# Inspect the assembled LLM input before sending it.
print(input)

In [None]:
from langchain_ollama import OllamaLLM

# Ollama-backed LLM used for the proposition decomposition.
# NOTE(review): none of the visible install cells adds `langchain-ollama`, and the
# "qwen3:8b" model presumably has to be available in Ollama beforehand — confirm.
llm = OllamaLLM(model="qwen3:8b")

In [53]:
# Send the prompt and the table to the model; `x` holds the response that is printed next.
x = llm.invoke(input=input)

In [55]:
# Print the raw model response. As the captured output shows, it starts with a <think>
# block, so that part would have to be stripped before parsing any JSON from it.
print(x)

<think>
Okay, let's tackle this problem. The user wants me to decompose the given table into clear and simple propositions, following specific guidelines. First, I need to understand the input data. The table has rows with different CPU models and various metrics like Thread budget, TTS (Time to Scan?), Pages/s, Mem (Memory usage?), and some backend-specific metrics for native and pypdfium backends.

The example provided earlier shows that the output should be a list of JSON strings, each representing a proposition. The key points from the example are splitting compound sentences, separating named entities with their descriptions into distinct propositions, decontextualizing by replacing pronouns with full names, and maintaining original phrasing as much as possible.

Looking at the input table, the first row is for Apple M3 Max with 4 threads, and then various metrics. The second row is for an Intel Xeon E5-2690 with 16 cores, and different thread counts. Each metric has values for na

In [None]:
# Step 2: Load the Markdown file into LangChain and split it into chunks.
# The encoding is given explicitly: without it TextLoader uses the platform default
# (not UTF-8 on Windows), which can fail on non-ASCII characters in the file.
# NOTE(review): "output.md" is not written by any visible cell — it presumably comes from
# an earlier Markdown export and must exist in the working directory.
loader = TextLoader("output.md", encoding="utf-8")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter()
# `docs` is rebound here; it no longer refers to the DoclingLoader result from earlier.
docs = text_splitter.split_documents(documents)

In [None]:
# Print the full text of the first loaded document, before splitting.
print(documents[0].page_content)

In [None]:
# Step 2 (annotated export): load "output1.md" into LangChain and split it into chunks.
# ToMarkdownLoader is a client for the 2markdown web API (it expects a URL and an API key),
# not a reader for local Markdown files, so the file is read with TextLoader instead.
from pathlib import Path

annotated_md_path = Path("output1.md")
if not annotated_md_path.exists():
    # Recreate the export that the "Step 1" cell leaves commented out, so this cell also
    # works on a fresh run.
    annotated_md_path.write_text(
        result.document.export_to_markdown(mark_annotations=True), encoding="utf-8"
    )

# Explicit UTF-8 so the read does not depend on the platform's default encoding.
loader = TextLoader(str(annotated_md_path), encoding="utf-8")
documents = loader.load()
text_splitter = RecursiveCharacterTextSplitter()
docs1 = text_splitter.split_documents(documents)

In [None]:
# Display the chunks from the TextLoader split; by this point `docs` has been rebound and
# no longer holds the DoclingLoader result from the start of the notebook.
docs

In [None]:
# Number of chunks produced by the text splitter.
len(docs)

In [None]:
# Print the fifth chunk from the end (IndexError if there are fewer than five chunks).
print(docs[-5].page_content)

In [None]:
# Print the ninth chunk from the end (IndexError if there are fewer than nine chunks).
print(docs[-9].page_content)

In [None]:
# Install (or upgrade) the LangChain integration package for Unstructured.
# NOTE(review): nothing in the visible cells imports from it — confirm it is still needed.
%pip install -qU langchain-unstructured

In [None]:
# PDF in the working directory for the PyPDFLoader cell; this rebinds the `file_path`
# that was set for partition_pdf earlier.
file_path = "2408.pdf"

In [None]:
from langchain_community.document_loaders import PyPDFLoader

# Stream the PDF through PyPDFLoader's async lazy loader and collect every yielded
# document in `pages`.
# Top-level `async for` only works because the notebook kernel supports top-level await;
# it would be a SyntaxError in a plain Python script.
loader = PyPDFLoader(file_path)
pages = []
async for page in loader.alazy_load():
    pages.append(page)

In [None]:
# Inspect one loaded page: its metadata, a blank line, then its text.
# Raises IndexError if fewer than seven documents were loaded.
index = 6
selected_page = pages[index]
print(f"{selected_page.metadata}\n")
print(selected_page.page_content)

In [None]:
# Display every loaded page document (the output can be long).
pages