# Multimodal Pipeline for RAG

## Phase 0: Setting up the Project

In [1]:
from unstructured.partition.pdf import partition_pdf
import pytest
from services.categorizer import categorize

  from .autonotebook import tqdm as notebook_tqdm


## Phase 1: Indexing

Indexing starts with the cleaning and extraction of raw data in diverse formats like PDF, HTML, Word, and Markdown, which is then converted into a uniform plain text format.

### Extraction

In [53]:
# Document to index. "./assets/Table.pdf" is an alternative sample file.
file_path = "./assets/CG101_CG102_CG103.pdf"

# Partition the PDF and chunk the result in a single call.
pdf_elements = partition_pdf(
    filename=file_path,
    strategy="hi_res",
    infer_table_structure=True,
    hi_res_model_name="yolox",
    # TODO: enable image extraction once the Bosch logo extraction is fixed:
    #   extract_image_block_types=['Image']
    #   extract_image_block_to_payload=True  (extracts base64 for API usage)
    # Group related elements together under their section titles.
    chunking_strategy="by_title",
    # Chunk-size tuning (original author's notes: max_characters defaults to
    # 500 and combine_text_under_n_chars defaults to 0).
    max_characters=10000,
    combine_text_under_n_chars=2000,
    new_after_n_chars=6000,
)

With recent versions of the `unstructured` library (especially >= 0.11.x), using `chunking_strategy="by_title"` wraps the output elements in `CompositeElement` objects, each grouping together the content found under a heading — which can include `Table`, `Text`, `Image`, etc.

🔍 When to Use Raw Access (No Chunking)

✅ Use this when:
	•	Your primary goal is to extract specific elements, like tables, without worrying about their surrounding context.
	•	You want to classify, transform, or analyze tables or text independently.
	•	You’re building a pipeline where you process each element individually (e.g., sending them to LLMs, storing in a vector DB, etc.).

✅ Pros:
	•	Simple and straightforward.
	•	Full visibility into all content types.
	•	Easier debugging and testing.

❌ Cons:
	•	No semantic grouping — loses the logical structure (e.g., which section the table belongs to).

🧩 When to Use Chunking (e.g. by_title)

✅ Use this when:
	•	You want to preserve the document’s logical structure — e.g., sections, headings, context.
	•	You’re building a retrieval system, summarizer, or LLM pipeline that benefits from cohesive, meaningful chunks.
	•	You want to preserve the relationship between paragraphs and tables/images under a specific section.

✅ Pros:
	•	More semantically meaningful.
	•	Better input for language models.
	•	Maintains context between related elements.

❌ Cons:
	•	More complex to work with — requires digging into each `CompositeElement`'s `metadata.orig_elements`.
	•	Slightly harder to extract just tables.

In [None]:
# pdf_elements is a list of chunks. The individual elements that make up each
# chunk are stored on the chunk's metadata (metadata.orig_elements), so the
# next step is to pull them out of the metadata, chunk by chunk, in a loop.

# Inspect the elements stored behind the third chunk.
pdf_elements[2].metadata.orig_elements

[<unstructured.documents.elements.Text at 0x337a89940>,
 <unstructured.documents.elements.Text at 0x337a8a740>,
 <unstructured.documents.elements.NarrativeText at 0x337a8b770>,
 <unstructured.documents.elements.Image at 0x337a8b930>,
 <unstructured.documents.elements.Image at 0x337a8ba10>,
 <unstructured.documents.elements.Text at 0x337a8b540>,
 <unstructured.documents.elements.Title at 0x337a8bd20>,
 <unstructured.documents.elements.Title at 0x35111a350>,
 <unstructured.documents.elements.Title at 0x337a88050>,
 <unstructured.documents.elements.NarrativeText at 0x337a88750>,
 <unstructured.documents.elements.Title at 0x337a88c20>,
 <unstructured.documents.elements.NarrativeText at 0x337a88fa0>,
 <unstructured.documents.elements.Title at 0x337a8be00>,
 <unstructured.documents.elements.Title at 0x337a89240>,
 <unstructured.documents.elements.Title at 0x337a8ae40>,
 <unstructured.documents.elements.Title at 0x32d15c1a0>,
 <unstructured.documents.elements.Title at 0x32d15c440>,
 <unstruct

In [47]:
# The sub-elements of a chunk are kept in its metadata.orig_elements.
chunks = pdf_elements[0].metadata.orig_elements
# Show the distinct element types found in the first chunk.
{str(type(element)) for element in chunks}

{"<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

{"<class 'unstructured.documents.elements.Image'>",
 "<class 'unstructured.documents.elements.NarrativeText'>",
 "<class 'unstructured.documents.elements.Table'>",
 "<class 'unstructured.documents.elements.Text'>",
 "<class 'unstructured.documents.elements.Title'>"}

### Separate extracted elements into tables, text, and images

Separate tables from texts

In [None]:
# Sort the elements into tables and text chunks based on their type name.
# NOTE(review): `chunks` is assigned above from metadata.orig_elements of the
# first chunk, so the "CompositeElement" branch probably never matches here
# and `texts` stays empty -- confirm whether `pdf_elements` was intended.
tables, texts = [], []

for element in chunks:
    type_name = str(type(element))

    if "Table" in type_name:
        tables.append(element)

    # Deliberately not `elif`: both checks are evaluated for every element.
    if "CompositeElement" in type_name:
        texts.append(element)

[<unstructured.documents.elements.Table at 0x337dab380>]

Get the images from the CompositeElement objects

In [50]:
def get_images_base64(chunks):
    """Collect the base64 payloads of all images nested inside composite chunks.

    Only chunks whose type name contains "CompositeElement" are inspected; for
    each of them, every entry of ``metadata.orig_elements`` whose type name
    contains "Image" contributes its ``metadata.image_base64`` value.

    Images without a payload (``image_base64`` missing, ``None`` or empty) are
    skipped, so the returned list holds usable base64 strings only.
    NOTE(review): the payload is presumably only filled in when
    ``extract_image_block_to_payload=True`` is passed to the partitioner --
    confirm against the library documentation.

    Args:
        chunks: Iterable of chunk objects exposing ``metadata.orig_elements``.

    Returns:
        A list of base64 payloads, in document order.
    """
    images_b64 = []
    for chunk in chunks:
        if "CompositeElement" not in str(type(chunk)):
            continue
        # A chunk may carry no original elements at all; treat that as empty.
        chunk_els = getattr(chunk.metadata, "orig_elements", None) or ()
        for el in chunk_els:
            if "Image" not in str(type(el)):
                continue
            payload = getattr(el.metadata, "image_base64", None)
            if payload:
                images_b64.append(payload)
    return images_b64

# Pass the top-level chunks: get_images_base64 only looks inside
# CompositeElement chunks, whereas `chunks` holds the sub-elements of the
# first chunk (no CompositeElement among them) and would yield no images.
images = get_images_base64(pdf_elements)