# Install Docling and Process Document

In [1]:
from google.colab import userdata

!pip install docling

!docling https://arxiv.org/pdf/2408.09869 --output /content/

from docling.document_converter import DocumentConverter

source = "https://arxiv.org/pdf/2408.09869"

converter = DocumentConverter()

result = converter.convert(source)

print(result.document.export_to_markdown())

Collecting docling
  Downloading docling-2.44.0-py3-none-any.whl.metadata (10 kB)
Collecting docling-core<3.0.0,>=2.42.0 (from docling-core[chunking]<3.0.0,>=2.42.0->docling)
  Downloading docling_core-2.44.1-py3-none-any.whl.metadata (6.5 kB)
Collecting docling-parse<5.0.0,>=4.0.0 (from docling)
  Downloading docling_parse-4.1.0-cp311-cp311-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (9.6 kB)
Collecting docling-ibm-models<4,>=3.9.0 (from docling)
  Downloading docling_ibm_models-3.9.0-py3-none-any.whl.metadata (6.7 kB)
Collecting filetype<2.0.0,>=1.2.0 (from docling)
  Downloading filetype-1.2.0-py2.py3-none-any.whl.metadata (6.5 kB)
Collecting pypdfium2!=4.30.1,<5.0.0,>=4.30.0 (from docling)
  Downloading pypdfium2-4.30.0-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m48.5/48.5 kB[0m [31m4.1 MB/s[0m eta [36m0:00:00[0m
[?25hCollecting pydantic-settings<3.0.0,>=2.3.0 (from doclin

# Install Required Packages

In [2]:
# Install necessary packages
!pip install -q --progress-bar off llama-index-core llama-index-readers-docling \
llama-index-node-parser-docling llama-index-embeddings-huggingface \
llama-index-llms-huggingface-api llama-index-vector-stores-milvus \
llama-index-readers-file python-dotenv

[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.
ipython 7.34.0 requires jedi>=0.16, which is not installed.[0m[31m
[0m

# Import Modules and Define Helper Functions

In [3]:
# Import Modules & Define Helper Functions
import os
from pathlib import Path
from tempfile import mkdtemp
from warnings import filterwarnings
from dotenv import load_dotenv

# Read variables from a .env file (if one is found) into the environment
load_dotenv()

# Hide selected third-party warnings so the cell outputs stay readable
for _category, _module in ((UserWarning, "pydantic"), (FutureWarning, "easyocr")):
    filterwarnings(action="ignore", category=_category, module=_module)

# Prevent tokenizer parallelism issues
os.environ["TOKENIZERS_PARALLELISM"] = "false"

# Function to get environment variables
def _get_env_from_colab_or_os(key):
    """Look up ``key`` in Colab secrets first, then in the OS environment.

    Args:
        key: Name of the secret / environment variable.

    Returns:
        The value as a string, or ``None`` when it is found in neither place.
    """
    try:
        # Imported lazily so the helper also works outside Google Colab.
        from google.colab import userdata
    except ImportError:
        return os.getenv(key)

    try:
        secret = userdata.get(key)
    except userdata.SecretNotFoundError:
        # The secret is not defined in Colab; fall through to the environment.
        secret = None
    return secret or os.getenv(key)

# Define Main Parameters

In [4]:
# Define Main Parameters
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface_api import HuggingFaceInferenceAPI

# Fetch the Hugging Face token: prefer the Colab secret, then fall back to the
# process environment (which load_dotenv() may have populated from a .env file)
try:
    hf_token = userdata.get("HF_TOKEN")
except userdata.SecretNotFoundError:
    # userdata.get raises instead of returning None when the secret is absent
    hf_token = None
hf_token = hf_token or os.getenv("HF_TOKEN")

# Ensure the token is set
if not hf_token:
    raise EnvironmentError(
        "HF_TOKEN is not set as a Colab secret or environment variable!"
    )

# Embedding model used for indexing and retrieval
EMBED_MODEL = HuggingFaceEmbedding(model_name="BAAI/bge-small-en-v1.5")

# Location of the Milvus database file inside a fresh temporary directory
MILVUS_URI = str(Path(mkdtemp()) / "docling.db")

# Use HuggingFaceTB/SmolLM3-3B which is available on HF Inference API
GEN_MODEL = HuggingFaceInferenceAPI(
    token=hf_token,
    model_name="HuggingFaceTB/SmolLM3-3B",
)

SOURCE = "https://arxiv.org/pdf/2408.09869"  # Docling Technical Report
QUERY = "Which are the main AI models in Docling?"

# Embed a short probe string to discover the embedding dimensionality
embed_dim = len(EMBED_MODEL.get_text_embedding("hi"))

# Define RAG Pipeline with DoclingReader

In [5]:
# Define RAG Pipeline with DoclingReader
from llama_index.core import StorageContext, VectorStoreIndex
from llama_index.core.node_parser import MarkdownNodeParser
from llama_index.readers.docling import DoclingReader
from llama_index.vector_stores.milvus import MilvusVectorStore

# Define the DoclingReader and node parser
reader = DoclingReader()
node_parser = MarkdownNodeParser()

# Configure the vector store. Reuse the MILVUS_URI defined with the main
# parameters instead of creating a second, unrelated temporary database path.
vector_store = MilvusVectorStore(
    uri=MILVUS_URI,
    dim=embed_dim,
    overwrite=True,
)

# Load the source document separately from index construction so the two
# steps can be inspected independently
documents = reader.load_data(SOURCE)

# Create the index
index = VectorStoreIndex.from_documents(
    documents=documents,
    transformations=[node_parser],
    storage_context=StorageContext.from_defaults(vector_store=vector_store),
    embed_model=EMBED_MODEL,
)

  from pkg_resources import DistributionNotFound, get_distribution


# Query the Pipeline and Display Results

In [7]:
# Build a query engine on top of the index, backed by the generation model
query_engine = index.as_query_engine(llm=GEN_MODEL)

# Run the question through the pipeline and show the response
result = query_engine.query(QUERY)
print(result)

<think>
Okay, let's see. The user is asking which are the main AI models in Docling. I need to look through the provided context to find the answer.

First, I'll scan the sections. The context mentions "3.2 AI models" as a header. That's probably where the main models are discussed. Let me check that section.

Under 3.2 AI models, the first model mentioned is a layout analysis model, described as an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], which is a state-of-the-art table structure recognition model. The context also mentions that these models are released by the team and provide pre-trained weights and inference code in docling-ibm-models. 

So the two main AI models in Docling are the layout analysis model and TableFormer. The user is asking for the main ones, and the context clearly lists these two. I should make sure there aren't any other models mentioned elsewhere, but the rest of the context talks about applications and other pac

In [8]:
# Display the results
def _strip_reasoning(text):
    """Return ``text`` without closed ``<think>...</think>`` blocks.

    The generation model may prefix its answer with a reasoning trace wrapped
    in ``<think>`` tags. Only closed blocks are removed; if nothing remains
    afterwards, the stripped original text is returned instead.
    """
    import re

    cleaned = re.sub(r"<think>.*?</think>", "", text, flags=re.DOTALL).strip()
    return cleaned or text.strip()


answer = _strip_reasoning(result.response)
print(f"Q: {QUERY}\nA: {answer}\n\nSources:")

# List the retrieved chunks that the answer was grounded on
for node in result.source_nodes:
    print(f"Text: {node.text}\nMetadata: {node.metadata}\n")

Q: Which are the main AI models in Docling?
A: <think>
Okay, let's see. The user is asking which are the main AI models in Docling. I need to look through the provided context to find the answer.

First, I'll scan the sections. The context mentions "3.2 AI models" as a header. That's probably where the main models are discussed. Let me check that section.

Under 3.2 AI models, the first model mentioned is a layout analysis model, described as an accurate object-detector for page elements [13]. The second model is TableFormer [12, 9], which is a state-of-the-art table structure recognition model. The context also mentions that these models are released by the team and provide pre-trained weights and inference code in docling-ibm-models. 

So the two main AI models in Docling are the layout analysis model and TableFormer. The user is asking for the main ones, and the context clearly lists these two. I should make sure there aren't any other models mentioned elsewhere, but the rest of the