In [1]:
import os
import fitz  # PyMuPDF
from PyPDF2 import PdfReader
from nltk.tokenize import sent_tokenize
import nltk
import torch
from PIL import Image
from uuid import uuid4
from qdrant_client import QdrantClient
from qdrant_client.models import PointStruct
from sentence_transformers import SentenceTransformer
from colpali_engine.models import ColPali, ColPaliProcessor
from tqdm import tqdm
import stamina
import io

nltk.download('punkt')

  from .autonotebook import tqdm as notebook_tqdm
[nltk_data] Downloading package punkt to /Users/ishgod01/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [2]:


# Initialize text embedding model
text_model = SentenceTransformer('all-MiniLM-L6-v2')
text_model.to(torch.device("mps" if torch.backends.mps.is_available() else "cpu"))

# Initialize ColPali (image embedding model)
image_model_name = "vidore/colpali-v1.2"
image_model = ColPali.from_pretrained(
    image_model_name, torch_dtype=torch.bfloat16, device_map="mps"
).eval()
image_processor = ColPaliProcessor.from_pretrained(image_model_name)


`config.hidden_act` is ignored, you should use `config.hidden_activation` instead.
Gemma's activation function will be set to `gelu_pytorch_tanh`. Please, use
`config.hidden_activation` if you want to override this behaviour.
See https://github.com/huggingface/transformers/pull/29402 for more details.
Loading checkpoint shards: 100%|██████████| 2/2 [00:10<00:00,  5.27s/it]


In [3]:

# Initialize Qdrant client
client = QdrantClient(path="qdrant_storage")
collection_name = "hybrid_embeddings"

# Check if collection exists, delete and create it
if client.collection_exists(collection_name):
    client.delete_collection(collection_name=collection_name)

client.create_collection(
    collection_name=collection_name,
    vectors_config={
        "size": text_model.get_sentence_embedding_dimension(),  # For text embeddings
        "distance": "Cosine",
    },
)


True

In [4]:

@stamina.retry(on=Exception, attempts=3)
def upsert_to_qdrant(points):
    try:
        client.upsert(collection_name=collection_name, points=points, wait=False)
    except Exception as e:
        print(f"Error during upsert: {e}")
        return False
    return True

# Helper function to extract metadata
def extract_metadata(file_path):
    doc = fitz.open(file_path)
    pdf_reader = PdfReader(file_path)

    metadata = {
        "filename": os.path.basename(file_path),
        "title": doc.metadata.get("title", ""),
        "author": doc.metadata.get("author", ""),
        "subject": doc.metadata.get("subject", ""),
        "keywords": doc.metadata.get("keywords", ""),
        "creator": doc.metadata.get("creator", ""),
        "producer": doc.metadata.get("producer", ""),
        "creation_date": doc.metadata.get("creationDate", ""),
        "modification_date": doc.metadata.get("modDate", ""),
        "page_count": len(doc),
        "custom_metadata": pdf_reader.metadata,
        "pages": [],
    }

    for page_num, page in enumerate(doc, start=1):
        page_data = {
            "page_number": page_num,
            "text": page.get_text(),
            "dimensions": {"width": page.rect.width, "height": page.rect.height},
            "images": [],
            "sentences": [],
        }

        # Extract images
        for img_index, img in enumerate(page.get_images(full=True)):
            xref = img[0]
            base_image = doc.extract_image(xref)
            image_info = {
                "image_index": img_index,
                "image_bytes": base_image["image"],
                "width": base_image.get("width"),
                "height": base_image.get("height"),
            }
            page_data["images"].append(image_info)

        # Extract sentences
        sentences = sent_tokenize(page_data["text"])
        page_data["sentences"] = sentences
        metadata["pages"].append(page_data)

    return metadata

# Process and store embeddings
def process_pdf(file_path):
    metadata = extract_metadata(file_path)

    with tqdm(total=len(metadata["pages"]), desc=f"Processing {metadata['filename']}", leave=False) as page_pbar:
        for page in metadata["pages"]:
            points = []

            # Generate text embeddings
            sentences = page["sentences"]
            text_embeddings = text_model.encode(sentences, batch_size=16, device=torch.device("mps"))

            for sentence, embedding in zip(sentences, text_embeddings):
                points.append(
                    PointStruct(
                        id=str(uuid4()),
                        vector=list(embedding),
                        payload={
                            "type": "text",
                            "filename": metadata["filename"],
                            "page_number": page["page_number"],
                            "sentence": sentence,
                            "title": metadata["title"],
                            "author": metadata["author"],
                            "subject": metadata["subject"],
                            "keywords": metadata["keywords"],
                            "creator": metadata["creator"],
                            "producer": metadata["producer"],
                            "creation_date": metadata["creation_date"],
                            "modification_date": metadata["modification_date"],
                            "dimensions": page["dimensions"],
                        },
                    )
                )

            # Generate image embeddings
            images = [Image.open(io.BytesIO(img["image_bytes"])) for img in page["images"]]
            if images:
                processed_images = image_processor.process_images(images).to(image_model.device)
                image_embeddings = image_model(**processed_images)

                # Add image embeddings as multivectors
                for img_info, embedding in zip(page["images"], image_embeddings):
                    multivector = embedding.cpu().float().numpy().tolist()
                    points.append(
                        PointStruct(
                            id=str(uuid4()),
                            vector=multivector,
                            payload={
                                "type": "image",
                                "filename": metadata["filename"],
                                "page_number": page["page_number"],
                                "image_index": img_info["image_index"],
                                "image_width": img_info["width"],
                                "image_height": img_info["height"],
                                "title": metadata["title"],
                                "author": metadata["author"],
                                "subject": metadata["subject"],
                                "keywords": metadata["keywords"],
                                "creator": metadata["creator"],
                                "producer": metadata["producer"],
                                "creation_date": metadata["creation_date"],
                                "modification_date": metadata["modification_date"],
                                "dimensions": page["dimensions"],
                            },
                        )
                    )

            # Upsert points to Qdrant
            upsert_to_qdrant(points)
            page_pbar.update(1)

# Process all PDFs in the input folder
input_folder = "input_pdfs"
with tqdm(total=len(os.listdir(input_folder)), desc="Processing PDFs") as pdf_pbar:
    for pdf_file in os.listdir(input_folder):
        if pdf_file.endswith(".pdf"):
            file_path = os.path.join(input_folder, pdf_file)
            process_pdf(file_path)
            pdf_pbar.update(1)

Processing PDFs:   0%|          | 0/4 [00:00<?, ?it/s]

Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)


Processing PDFs:  25%|██▌       | 1/4 [2:16:09<6:48:28, 8169.59s/it]

Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)




Error during upsert: could not broadcast input array from shape (1030,128) into shape (384,)


  if inputs_embeds[special_image_mask].numel() != image_features.numel():
Processing PDFs:  25%|██▌       | 1/4 [2:25:38<7:16:54, 8738.05s/it]


RuntimeError: MPS backend out of memory (MPS allocated: 13.54 GB, other allocations: 398.86 MB, max allowed: 18.13 GB). Tried to allocate 5.92 GB on private pool. Use PYTORCH_MPS_HIGH_WATERMARK_RATIO=0.0 to disable upper limit for memory allocations (may cause system failure).