- IRP ENV
- This code extracts text and images from PDFs, converts PDF pages to images, and saves the results in the relevant directories.
- Texts are chunked and embeddings are created.
- Text chunks are stored, together with their metadata, in the given ChromaDB collection.
- Metadata includes:
    - pdf name
    - page number
    - bounding box (empty, kept for future use)
    - dataset name

In [None]:
# Install dependencies (run in Jupyter or your environment)
# !pip install langchain pymupdf chromadb nomic tiktoken pytesseract opencv-python pdf2image openai tqdm


In [5]:
import os
import fitz  # PyMuPDF
import pytesseract
import cv2


In [3]:
# from langchain.embeddings import NomicEmbeddings
from pdf2image import convert_from_path
import os
from openai import OpenAI
from langchain.embeddings.base import Embeddings
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma
from langchain.document_loaders import PyMuPDFLoader
from langchain.docstore.document import Document

In [47]:

# EMBEDDINGS via LM Studio
# Adapter that lets LangChain obtain embeddings from an OpenAI-compatible
# endpoint (by default a local LM Studio server).
class LMStudioEmbeddings(Embeddings):
    """LangChain ``Embeddings`` implementation backed by an OpenAI-style client.

    Every text is sent to the embeddings endpoint in its own request, after
    newlines have been replaced by spaces.

    Args:
        base_url: Base URL of the OpenAI-compatible API.
        api_key: API key passed to the client.
        model: Name of the embedding model to request.
    """

    def __init__(self, base_url="http://localhost:1235/v1", api_key="lm-studio", model="text-embedding-nomic-embed-text-v1.5"):
        self.client = OpenAI(base_url=base_url, api_key=api_key)
        self.model = model

    def embed_documents(self, texts):
        """Return one embedding per entry of ``texts``, in the same order."""
        return [self.embed_query(text) for text in texts]

    def embed_query(self, text):
        """Return the embedding of a single text."""
        flattened = text.replace("\n", " ")
        response = self.client.embeddings.create(input=[flattened], model=self.model)
        return response.data[0].embedding


# Use LM Studio Embeddings in LangChain
embeddings = LMStudioEmbeddings()


In [48]:
# TEXT CHUNKER
# Shared splitter used by process_pdf: chunk_size=1000 with chunk_overlap=100
# between consecutive chunks (sizes are presumably measured in characters, the
# splitter's default length function — TODO confirm for the installed version).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)

## Functions

In [5]:
# 1. Function: Extract images from PDFs
def extract_images(pdf_path, output_base_dir):
    """Save every image embedded in a PDF as a PNG file.

    Files are written to ``<output_base_dir>/images_extracted`` and named
    ``<pdf stem>_img_p<page>_<index>.png``, where ``page`` is the 1-based
    page number and ``index`` is the image's position in that page's image
    list.

    Args:
        pdf_path: Path of the PDF to read.
        output_base_dir: Directory under which ``images_extracted`` is
            created (if it does not exist yet).

    Returns:
        List with the path of every PNG written, in page order.
    """
    output_dir = os.path.join(output_base_dir, "images_extracted")
    os.makedirs(output_dir, exist_ok=True)
    # Loop-invariant: compute the file-name prefix once.
    pdf_stem = os.path.splitext(os.path.basename(pdf_path))[0]

    image_paths = []
    # The context manager closes the document even if an image fails to save.
    with fitz.open(pdf_path) as doc:
        for page_num, page in enumerate(doc, start=1):
            for img_index, img in enumerate(page.get_images(full=True)):
                xref = img[0]
                pix = fitz.Pixmap(doc, xref)
                # PNG can only hold gray or RGB data (optionally with alpha).
                # More than three colour components (e.g. CMYK, with or
                # without alpha) must be converted to RGB before saving.
                if pix.n - pix.alpha > 3:
                    pix = fitz.Pixmap(fitz.csRGB, pix)
                img_path = os.path.join(
                    output_dir, f"{pdf_stem}_img_p{page_num}_{img_index}.png"
                )
                pix.save(img_path)
                image_paths.append(img_path)
    return image_paths



# 2. Function: Load text from PDF using PyMuPDFLoader
def extract_text_from_pdf(pdf_path):
    """Load the text of ``pdf_path`` as a list of LangChain documents.

    The loader is run with ``mode='page'``; this is expected to produce one
    document per PDF page rather than a single document for the whole file
    (process_pdf reads a per-document ``page`` metadata key) — TODO confirm
    against the installed loader version.
    """
    return PyMuPDFLoader(pdf_path, mode="page").load()


# 3. Function: Process a single PDF and return text chunks with metadata
def process_pdf(pdf_path, dataset_name):
    """Split a PDF's text into chunks and tag every chunk with metadata.

    Each document returned by ``extract_text_from_pdf`` is split on its own
    with the module-level ``text_splitter``. Every resulting chunk gets these
    metadata keys added (or overwritten):

    - ``dataset_name``: the ``dataset_name`` argument
    - ``pdf_name``: base name of ``pdf_path``
    - ``page_number``: the chunk's existing ``page`` metadata value, or ``""``
      when that key is absent
    - ``bounding_box``: always ``""`` (placeholder, not used here)

    Returns:
        Flat list of all chunks, in document order.
    """
    pdf_name = os.path.basename(pdf_path)
    tagged_chunks = []
    for page_doc in extract_text_from_pdf(pdf_path):
        for piece in text_splitter.split_documents([page_doc]):
            extra = {
                "dataset_name": dataset_name,
                "pdf_name": pdf_name,
                "page_number": piece.metadata.get("page", ""),
                "bounding_box": "",  # Not used here
            }
            piece.metadata.update(extra)
            tagged_chunks.append(piece)
    return tagged_chunks

# 4. Function: convert pdfs to images and save

def save_pdf_pages_as_images(pdf_path, output_base_dir, dpi=300):
    """Render every page of a PDF and save each one as a PNG file.

    Files are written to ``<output_base_dir>/pdfs_to_images`` and named
    ``<pdf stem>_page_<n>.png`` with ``n`` starting at 1.

    Args:
        pdf_path: Path of the PDF to render.
        output_base_dir: Directory under which ``pdfs_to_images`` is created
            (if it does not exist yet).
        dpi: Rendering resolution passed to ``convert_from_path``.
            Defaults to 300.

    Returns:
        None. The function is called for its side effect of writing files.
    """
    output_dir = os.path.join(output_base_dir, "pdfs_to_images")
    os.makedirs(output_dir, exist_ok=True)

    pdf_name = os.path.splitext(os.path.basename(pdf_path))[0]
    page_images = convert_from_path(pdf_path, dpi=dpi)

    for page_no, img in enumerate(page_images, start=1):
        img.save(os.path.join(output_dir, f"{pdf_name}_page_{page_no}.png"), "PNG")




In [54]:

# 5. Function: Process an entire dataset folder
def process_dataset(dataset_name):
    """Run the full pipeline for every PDF of one dataset.

    PDFs are read from ``<DATA_FOLDER>/<dataset_name>/docs``. For each file
    whose name ends with ``.pdf`` the function renders its pages to images,
    extracts its embedded images (both under
    ``<DATA_FOLDER_NEW>/<dataset_name>``) and adds its text chunks to a Chroma
    collection named after the dataset.

    Args:
        dataset_name: Name of the dataset sub-folder; also used as the
            collection name.
    """
    input_docs_path = os.path.join(DATA_FOLDER, dataset_name, "docs")
    output_base_path = os.path.join(DATA_FOLDER_NEW, dataset_name)
    os.makedirs(output_base_path, exist_ok=True)

    # NOTE(review): the store is persisted under a relative "chromadb" folder;
    # the CHROMA_DB_FOLDER setting is not used here — confirm this is intended.
    persist_dir = os.path.join("chromadb", dataset_name)
    collection = Chroma(
        collection_name=dataset_name,
        embedding_function=embeddings,
        persist_directory=persist_dir,
    )

    for filename in os.listdir(input_docs_path):
        if not filename.endswith(".pdf"):
            continue
        pdf_path = os.path.join(input_docs_path, filename)

        # Save rendered pages
        save_pdf_pages_as_images(pdf_path, output_base_path)

        # Save embedded images
        extract_images(pdf_path, output_base_path)

        # Extract and store text chunks
        collection.add_documents(process_pdf(pdf_path, dataset_name))

    # NOTE(review): persist() may be unavailable in newer Chroma wrappers —
    # confirm against the installed version.
    collection.persist()
    print(f"[✓] Processed: {dataset_name}")

---

# For SPIQA dataset

In [56]:
# === CONFIG ===
# Name of the dataset sub-folder to process.
dataset_name = "spiqa"
# Input root: PDFs are read from <DATA_FOLDER>/<dataset_name>/docs.
DATA_FOLDER = "../../../Data/VisDoM-main"
# Output root: generated files go under <DATA_FOLDER_NEW>/<dataset_name>.
DATA_FOLDER_NEW = "../../../Data/VisDOM"
# NOTE(review): not referenced by any code visible in this notebook —
# presumably the intended Chroma persist location; confirm.
CHROMA_DB_FOLDER = "../../../chromadb"


### Extract Images

In [2]:
from tqdm.auto import tqdm

### Save PDF as Images

In [59]:
# Render every PDF of the configured dataset to per-page PNG files.
input_docs_path = os.path.join(DATA_FOLDER, dataset_name, "docs")
output_base_path = os.path.join(DATA_FOLDER_NEW, dataset_name)
os.makedirs(output_base_path, exist_ok=True)

for filename in tqdm(os.listdir(input_docs_path)):
    # Skip anything that is not a PDF.
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(input_docs_path, filename)
    # Save rendered pages
    save_pdf_pages_as_images(pdf_path, output_base_path)

print(f"[✓] Processed: {dataset_name}")

  0%|          | 0/118 [00:00<?, ?it/s]

[✓] Processed: spiqa


In [6]:
# Same page-rendering run as above, for the scigraphvqa dataset.
dataset_name = "scigraphvqa"
DATA_FOLDER = "../../../Data/VisDoM-main"
DATA_FOLDER_NEW = "../../../Data/VisDOM"
CHROMA_DB_FOLDER = "../../../chromadb"

input_docs_path = os.path.join(DATA_FOLDER, dataset_name, "docs")
output_base_path = os.path.join(DATA_FOLDER_NEW, dataset_name)
os.makedirs(output_base_path, exist_ok=True)

for filename in tqdm(os.listdir(input_docs_path)):
    # Skip anything that is not a PDF.
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(input_docs_path, filename)
    # Save rendered pages
    save_pdf_pages_as_images(pdf_path, output_base_path)

print(f"[✓] Processed: {dataset_name}")

  0%|          | 0/460 [00:00<?, ?it/s]

[✓] Processed: scigraphvqa


In [7]:
# Same page-rendering run as above, for the slidevqa dataset.
dataset_name = "slidevqa"
DATA_FOLDER = "../../../Data/VisDoM-main"
DATA_FOLDER_NEW = "../../../Data/VisDOM"
CHROMA_DB_FOLDER = "../../../chromadb"

input_docs_path = os.path.join(DATA_FOLDER, dataset_name, "docs")
output_base_path = os.path.join(DATA_FOLDER_NEW, dataset_name)
os.makedirs(output_base_path, exist_ok=True)

for filename in tqdm(os.listdir(input_docs_path)):
    # Skip anything that is not a PDF.
    if not filename.endswith(".pdf"):
        continue
    pdf_path = os.path.join(input_docs_path, filename)
    # Save rendered pages
    save_pdf_pages_as_images(pdf_path, output_base_path)

print(f"[✓] Processed: {dataset_name}")

  0%|          | 0/244 [00:00<?, ?it/s]

[✓] Processed: slidevqa


In [None]:
# feta_tab, paper_tab, scigraphvqa, slidevqa, spiqa