- input: PDF file
- output: Vector DB representation

Steps:
1. Load & Parse PDF
2. Text splitting
3. Embedding
4. Vector DB

## 0. Install Modules

In [None]:
%pip install arxiv
%pip install langchain
%pip install pypdf
%pip install langchain_community
%pip install cohere
%pip install faiss-cpu

## 1. PDF Loader

In [11]:
# PDF File sample
import arxiv

def download_arxiv_doi(doi, save_path="paper.pdf"):
    '''
    download only arxiv papers
    '''
    if not doi.startswith("10.48550/arXiv."):
        raise Exception("Not an arXiv DOI.")
    
    arxiv_id = doi.split("arXiv.")[-1]
    search = arxiv.Search(id_list=[arxiv_id])
    paper = next(search.results())
    paper.download_pdf(filename=save_path)
    print(f"Downloaded {arxiv_id} to {save_path}")

# Example:
download_arxiv_doi("10.48550/arXiv.2510.18234")

  paper = next(search.results())


Downloaded 2510.18234 to paper.pdf


In [25]:
from docling.document_converter import DocumentConverter
from docling.datamodel.accelerator_options import AcceleratorDevice, AcceleratorOptions
from docling.datamodel.base_models import InputFormat
from docling_core.types.doc.document import PictureItem
from docling.document_converter import DocumentConverter, PdfFormatOption
from docling.datamodel.pipeline_options import PdfPipelineOptions
from pathlib import Path

class ArixParse:
    def __init__(self, pdf_path: str):
        self.path = pdf_path

        accelerator_options = AcceleratorOptions(
            num_threads=8, device=AcceleratorDevice.CUDA
        )
        self.pipeline_options = PdfPipelineOptions()
        self.pipeline_options.accelerator_options = accelerator_options

        self.pipeline_options.do_ocr = False
        self.pipeline_options.do_table_structure = True
        self.pipeline_options.ocr_options.lang = ["en"]
        self.pipeline_options.images_scale = 2.0
        self.pipeline_options.generate_page_images = False
        self.pipeline_options.generate_picture_images = True
        self.converter = DocumentConverter(
            format_options={
                InputFormat.PDF: PdfFormatOption(pipeline_options=self.pipeline_options)
            }
        )

    def parse(self):
        return self.converter.convert(self.path).document


def parsePDF(path:str, outputdir, withImages = False):
    """ takes PDF file path and returns PDF parsed pages text with metadata"""
    from langchain_community.document_loaders import PyPDFLoader
    llm_loader = PyPDFLoader(path)
    pages = llm_loader.load_and_split()
    
    if withImages:
        doc = ArixParse(pdf_path=path).parse()
        images_path = Path(outputdir) / "images"
        images_path.mkdir(exist_ok=True)

        image_counter = 0
        for element, level in doc.iterate_items():
            if isinstance(element, PictureItem):
                image_counter += 1
                img = element.get_image(doc)
                if img:
                    img_path = images_path / f"figure_{image_counter}.png"
                    img.save(img_path, "PNG")
    return pages



In [30]:
pages = parsePDF("paper.pdf", outputdir="output", withImages=True)

CUDA is not available in the system. Fall back to 'CPU'
CUDA is not available in the system. Fall back to 'CPU'


In [29]:
pages[1].page_content

'Contents\n1 Introduction 3\n2 Related Works 4\n2.1 Typical Vision Encoders in VLMs . . . . . . . . . . . . . . . . . . . . . . . . . . . . 4\n2.2 End-to-end OCR Models . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 4\n3 Methodology 5\n3.1 Architecture . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\n3.2 DeepEncoder . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 5\n3.2.1 Architecture of DeepEncoder . . . . . . . . . . . . . . . . . . . . . . . . . . 5\n3.2.2 Multiple resolution support . . . . . . . . . . . . . . . . . . . . . . . . . . . 6\n3.3 The MoE Decoder . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\n3.4 Data Engine . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\n3.4.1 OCR 1.0 data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . 7\n3.4.2 OCR 2.0 data . . . . . . . . . . . . . . . . . . . . . . . . . . . . . . .

## 2. Text Splitter

In [31]:
from langchain_text_splitters import CharacterTextSplitter

text_splitter = CharacterTextSplitter(
    separator="",
    chunk_size=300,  # configurable variable
    length_function = len,
    chunk_overlap=50, # configurable variable
)

In [32]:
documents = []
metadatas = []
for page in pages:
    documents.append(page.page_content)
    metadatas.append(page.metadata)

In [33]:
chunks = text_splitter.create_documents(documents, metadatas)

In [34]:
len(chunks)

221

## 3. Embedding

In [35]:
from dotenv import dotenv_values
env_values = dotenv_values('app.env')
cohere_api_key = env_values['COHERE_API_KEY']

In [36]:
from langchain_community.embeddings.cohere import CohereEmbeddings
embedding_llm = CohereEmbeddings(cohere_api_key=cohere_api_key, user_agent="langchain")

  embedding_llm = CohereEmbeddings(cohere_api_key=cohere_api_key, user_agent="langchain")


## 4. Vector DB

In [37]:
from langchain_community.vectorstores import FAISS
vector_db = FAISS.from_documents(chunks, embedding_llm)

In [47]:
query = "VLM"
similar_docs = vector_db.similarity_search(query)

In [49]:
print(similar_docs[1].page_content)

VITDet
VIT
Down-
sample VIT
LLM
VIT
(navit) LLM
Down-
sample
LLM
Down-
sample
Vary/DeepSeekVL/...
[× ] unsupported pipeline  parallel   
1024
1024
224224
384
384
384
384
[× ] unsupported extreme resolution   
[× ] low native resolution  [× ] overly small patches
InternVL series/
DeepSeekVL2/...
usua


In [65]:
save_to_dir = "faiss_vector_data"
vector_db.save_local(save_to_dir)