### Data will persist inside the container unless map a local volume
docker run -d \
    --name qdrant \
    -p 6333:6333 \
    -p 6334:6334 \
    -v $(pwd)/qdrant_data:/qdrant/storage \
    qdrant/qdrant

In [None]:
import ollama
import re
from docx import Document
from collections import defaultdict
from langchain_ollama import ChatOllama

In [None]:
CHUNK_METHODS = ["sliding", "sentence"]  # "sliding" or "sentence"
SLIDING_WINDOW_SIZE = 200   # characters per chunk
SLIDING_WINDOW_OVERLAP = 50 # characters of overlap between chunks
SENTENCE_MAX_CHARS = 300    # maximum sentence chunk size

In [3]:
LANGUAGE_MODEL = ChatOllama(model="llama3.2",)
EMBEDDING_MODEL = 'hf.co/CompendiumLabs/bge-base-en-v1.5-gguf'

In [4]:
from unstructured.partition.auto import partition

all_metadata_pdf = []
all_text_pdf = []

elements_pdf = partition(filename="../../data/files/rag.pdf")
# elements include metadata and text blocks

for el in elements_pdf:
    # Extract metadata as a dictionary
    metadata_dict = vars(el.metadata)
    all_metadata_pdf.append(metadata_dict)
    # Extract text
    all_text_pdf.append(el.text)



In [5]:
for el in elements_pdf:
    print(type(el), el.metadata.page_number)

<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1

In [6]:
print("Metadata:")
for md in all_metadata_pdf:
    print(md)

Metadata:
{'coordinates': CoordinatesMetadata(points=((16.34, 210.03999999999996), (16.34, 250.03999999999996), (36.34, 250.03999999999996), (36.34, 210.03999999999996)), system=<unstructured.documents.coordinates.PixelSpace object at 0x000001E7A2126CF0>), 'file_directory': '../../data/files', 'filename': 'rag.pdf', 'last_modified': '2025-08-12T12:35:27', 'links': [], 'page_number': 1, 'languages': ['eng'], '_known_field_names': frozenset({'cc_recipient', 'category_depth', 'emphasized_text_tags', 'image_url', 'last_modified', 'key_value_pairs', 'link_start_indexes', 'detection_class_prob', 'text_as_html', 'sent_to', 'coordinates', 'filetype', 'email_message_id', 'file_directory', 'url', 'page_number', 'table_as_cells', 'sent_from', 'links', 'orig_elements', 'link_urls', 'attached_to_filename', 'detection_origin', 'header_footer_type', 'link_texts', 'page_name', 'subject', 'image_path', 'filename', 'is_continuation', 'signature', 'languages', 'bcc_recipient', 'emphasized_text_contents',

In [7]:
first_metadata = all_metadata_pdf[0]
print(list(first_metadata.keys()))

['coordinates', 'file_directory', 'filename', 'last_modified', 'links', 'page_number', 'languages', '_known_field_names', 'filetype']


In [8]:
all_keys = set()

for md in all_metadata_pdf:
    all_keys.update(md.keys())

print(sorted(all_keys))

['_known_field_names', 'coordinates', 'file_directory', 'filename', 'filetype', 'languages', 'last_modified', 'links', 'page_number', 'parent_id']


In [9]:
for key, value in first_metadata.items():
    print(f"{key}: {type(value)}")

coordinates: <class 'unstructured.documents.elements.CoordinatesMetadata'>
file_directory: <class 'str'>
filename: <class 'str'>
last_modified: <class 'str'>
links: <class 'list'>
page_number: <class 'int'>
languages: <class 'list'>
_known_field_names: <class 'frozenset'>
filetype: <class 'str'>


In [10]:
print("\nTexts:")
for txt in all_text_pdf:
    print(txt)


Texts:
4 2 0 2
r a
M 7 2
] L C . s c [
5 v 7 9 9 0 1 . 2 1 3 2 : v i X r a
Retrieval-Augmented Generation for Large Language Models: A Survey
Yunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, Yuxi Bic, Yi Daia, Jiawei Suna, Meng Wangc, and Haofen Wang a,c
aShanghai Research Institute for Intelligent Autonomous Systems, Tongji University bShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University cCollege of Design and Innovation, Tongji University
Abstract—Large Language Models (LLMs) showcase impres- sive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility of the generation, particularly for knowledge-intensive tasks, and allows for continuous knowledge updates and integration of domain- specific i

In [11]:
clean_texts = [txt for txt in all_text_pdf if len(txt.strip()) > 10]  # only keep blocks longer than 10 chars

for txt in clean_texts:
    print(txt)
    print('-----')

] L C . s c [
-----
5 v 7 9 9 0 1 . 2 1 3 2 : v i X r a
-----
Retrieval-Augmented Generation for Large Language Models: A Survey
-----
Yunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, Yuxi Bic, Yi Daia, Jiawei Suna, Meng Wangc, and Haofen Wang a,c
-----
aShanghai Research Institute for Intelligent Autonomous Systems, Tongji University bShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University cCollege of Design and Innovation, Tongji University
-----
Abstract—Large Language Models (LLMs) showcase impres- sive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility of the generation, particularly for knowledge-intensive tasks, and allows for continuous knowledge updates and integration of domain- specif

In [12]:
full_text = "\n\n".join(clean_texts)
print(full_text)

] L C . s c [

5 v 7 9 9 0 1 . 2 1 3 2 : v i X r a

Retrieval-Augmented Generation for Large Language Models: A Survey

Yunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, Yuxi Bic, Yi Daia, Jiawei Suna, Meng Wangc, and Haofen Wang a,c

aShanghai Research Institute for Intelligent Autonomous Systems, Tongji University bShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University cCollege of Design and Innovation, Tongji University

Abstract—Large Language Models (LLMs) showcase impres- sive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility of the generation, particularly for knowledge-intensive tasks, and allows for continuous knowledge updates and integration of domain- specific information. RAG syner

In [13]:
page_texts = defaultdict(list)

for md, txt in zip(all_metadata_pdf, all_text_pdf):
    page_texts[md['page_number']].append(txt)

for page_num, texts in sorted(page_texts.items()):
    print(f"\n\n--- Page {page_num} ---\n")
    print("\n\n".join([t for t in texts if len(t.strip()) > 10]))



--- Page 1 ---

] L C . s c [

5 v 7 9 9 0 1 . 2 1 3 2 : v i X r a

Retrieval-Augmented Generation for Large Language Models: A Survey

Yunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, Yuxi Bic, Yi Daia, Jiawei Suna, Meng Wangc, and Haofen Wang a,c

aShanghai Research Institute for Intelligent Autonomous Systems, Tongji University bShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University cCollege of Design and Innovation, Tongji University

Abstract—Large Language Models (LLMs) showcase impres- sive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility of the generation, particularly for knowledge-intensive tasks, and allows for continuous knowledge updates and integration of domain- specific info

In [14]:
for el in elements_pdf:
    print(type(el), el.metadata.page_number)

<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.Text'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1
<class 'unstructured.documents.elements.Title'> 1
<class 'unstructured.documents.elements.NarrativeText'> 1

In [15]:
from unstructured.documents.elements import Table

for el in elements_pdf:
    if isinstance(el, Table):
        # el is a Table element
        for row in el.rows:
            cells_text = [cell.text for cell in row]
            print(cells_text)

In [16]:
from unstructured.documents.elements import Image

for el in elements_pdf:
    if isinstance(el, Image):
        print("Found an image on page:", el.metadata.page_number)
        # Print metadata (like image path, size, etc.)
        print(el.metadata)

        # If the image data is available (depends on your PDF and library version)
        if hasattr(el, "image_bytes"):
            # You can save the image bytes to a file, e.g.:
            with open(f"extracted_image_page{el.metadata.page_number}.png", "wb") as img_file:
                img_file.write(el.image_bytes)


In [17]:
from unstructured.documents.elements import Table, Image

has_tables = any(isinstance(el, Table) for el in elements_pdf)
has_images = any(isinstance(el, Image) for el in elements_pdf)

print(f"Contains tables? {has_tables}")
print(f"Contains images? {has_images}")

Contains tables? False
Contains images? False


In [18]:
all_metadata_doc = []
all_text_doc = []

elements_doc = partition(filename="../../data/files/rag.docx")

for el in elements_doc:
    all_metadata_doc.append(vars(el.metadata))  # Metadata as dict
    all_text_doc.append(el.text)                 # Text content

In [19]:
print("---- METADATA ----")
for md in all_metadata_doc:
    print(md)

---- METADATA ----
{'category_depth': 0, 'file_directory': '../../data/files', 'filename': 'rag.docx', 'header_footer_type': 'primary', 'languages': ['eng'], '_known_field_names': frozenset({'cc_recipient', 'category_depth', 'emphasized_text_tags', 'image_url', 'last_modified', 'key_value_pairs', 'link_start_indexes', 'detection_class_prob', 'text_as_html', 'sent_to', 'coordinates', 'filetype', 'email_message_id', 'file_directory', 'url', 'page_number', 'table_as_cells', 'sent_from', 'links', 'orig_elements', 'link_urls', 'attached_to_filename', 'detection_origin', 'header_footer_type', 'link_texts', 'page_name', 'subject', 'image_path', 'filename', 'is_continuation', 'signature', 'languages', 'bcc_recipient', 'emphasized_text_contents', 'data_source', 'image_mime_type', 'image_base64', 'parent_id'}), 'filetype': 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'}
{'category_depth': 0, 'file_directory': '../../data/files', 'filename': 'rag.docx', 'last_modified':

In [20]:
first_metadata = all_metadata_doc[0]
print(list(first_metadata.keys()))

['category_depth', 'file_directory', 'filename', 'header_footer_type', 'languages', '_known_field_names', 'filetype']


In [21]:
print("\n---- TEXT ----")
for txt in all_text_doc:
    print(txt)
    print("-" * 40)


---- TEXT ----
1
----------------------------------------
Retrieval-Augmented Generation for Large Language Models: A Survey
----------------------------------------
Yunfan Gaoa, Yun Xiongb, Xinyu Gaob, Kangxiang Jiab, Jinliu Panb, Yuxi Bic, Yi Daia, Jiawei Suna, Meng Wangc, and Haofen Wang a,c
----------------------------------------
aShanghai Research Institute for Intelligent Autonomous Systems, Tongji University bShanghai Key Laboratory of Data Science, School of Computer Science, Fudan University
----------------------------------------
cCollege of Design and Innovation, Tongji University
----------------------------------------
Abstract—Large Language Models (LLMs) showcase impressive capabilities but encounter challenges like hallucination, outdated knowledge, and non-transparent, untraceable reasoning processes. Retrieval-Augmented Generation (RAG) has emerged as a promising solution by incorporating knowledge from external databases. This enhances the accuracy and credibility

In [22]:
from unstructured.documents.elements import Table, Image

has_tables = any(isinstance(el, Table) for el in elements_doc)
has_images = any(isinstance(el, Image) for el in elements_doc)

print(f"Contains tables? {has_tables}")
print(f"Contains images? {has_images}")


Contains tables? True
Contains images? False


In [23]:
laskdfj;laskjd;l

NameError: name 'laskdfj' is not defined

In [None]:
def chunk_sliding_window(text, window_size=200, overlap=50):
    chunks = []
    start = 0
    while start < len(text):
        end = start + window_size
        chunks.append(text[start:end])
        start += window_size - overlap
    return chunks

In [None]:
def chunk_by_sentences(text, max_chars=300):
    # Split on sentence boundaries
    sentences = re.split(r'(?<=[.!?]) +', text)
    chunks = []
    current_chunk = ""
    for sentence in sentences:
        if len(current_chunk) + len(sentence) <= max_chars:
            current_chunk += sentence + " "
        else:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + " "
    if current_chunk.strip():
        chunks.append(current_chunk.strip())
    return chunks

In [None]:
if CHUNK_METHOD == "sliding":
    dataset = chunk_sliding_window(raw_text, SLIDING_WINDOW_SIZE, SLIDING_WINDOW_OVERLAP)
elif CHUNK_METHOD == "sentence":
    dataset = chunk_by_sentences(raw_text, SENTENCE_MAX_CHARS)
else:
    raise ValueError("Invalid CHUNK_METHOD. Choose 'sliding' or 'sentence'.")

In [None]:
print(f'Loaded {len(dataset)} chunks using {CHUNK_METHOD} method.')

### Store vector database in Qdrant (Docker)

In [None]:
from qdrant_client import QdrantClient
from qdrant_client.models import Distance, VectorParams, PointStruct
import ollama
import re
import uuid

In [None]:
qdrant = QdrantClient(host="localhost", port=6333)
COLLECTION_NAME = "raw data"

In [None]:
qdrant.recreate_collection(
    collection_name=COLLECTION_NAME,
    vectors_config=VectorParams(size=768, distance=Distance.COSINE)  # size depends on your embedding model
)

In [None]:
def add_chunk_to_qdrant(chunk, chunk_index, source_file):
    embedding = ollama.embed(model=EMBEDDING_MODEL, input=chunk)['embeddings'][0]
    point_id = str(uuid.uuid4())  # unique ID for the point
    qdrant.upsert(
        collection_name=COLLECTION_NAME,
        points=[
            PointStruct(
                id=point_id,
                vector=embedding,
                payload={
                    "text": chunk,             # the chunk text
                    "chunk_index": chunk_index, # order in document
                    "source": source_file,      # which file it came from
                    "length": len(chunk)        # optional: size of chunk
                }
            )
        ]
    )

In [None]:
for i, chunk in enumerate(dataset):
    add_chunk_to_qdrant(chunk)
    print(f'Added chunk {i+1}/{len(dataset)} to Qdrant.')