## Install Dependencies

In [None]:
! pip install transformers torch pdf2image pytesseract faiss-cpu python-multipart Pillay

In [5]:
! pip install pdf2image

Collecting pdf2image
  Using cached pdf2image-1.17.0-py3-none-any.whl.metadata (6.2 kB)
Using cached pdf2image-1.17.0-py3-none-any.whl (11 kB)
Installing collected packages: pdf2image
Successfully installed pdf2image-1.17.0


In [1]:
from transformers import pipeline

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
# Question-answering pipeline; the checkpoint is downloaded on first use.
qa_pipeline = pipeline(
    task="question-answering",
    model="deepset/roberta-base-squad2",
)

To support symlinks on Windows, you either need to activate Developer Mode or to run Python as an administrator. In order to activate developer mode, see this article: https://docs.microsoft.com/en-us/windows/apps/get-started/enable-your-device-for-development
Device set to use cpu


## Extract Text and Images from PDF

In [6]:
from pdf2image import convert_from_path
import pytesseract
import os

In [None]:
def extract_pdf_content(pdf_path, output_image_dir="page_images"):
    """Rasterise a PDF and OCR every page into a flat list of content items.

    Each page is rendered with ``convert_from_path``, saved as
    ``page_<n>.jpg`` inside ``output_image_dir`` and passed to Tesseract
    via ``pytesseract.image_to_data``.

    Args:
        pdf_path: Path of the PDF file to process.
        output_image_dir: Directory that receives the rendered page images.
            It is created when missing. Defaults to ``"page_images"`` so the
            function can be called with the PDF path alone.

    Returns:
        list[dict]: Items in page order. Every non-blank OCR token yields a
        ``"type": "text"`` item with the keys ``text``, ``bbox``
        (``(left, top, right, bottom)`` in the coordinates Tesseract reports
        for the page image), ``page`` (0-based) and ``image_path``. The text
        items of a page are followed by one ``"type": "image"`` item whose
        ``bbox`` spans the whole page.
    """
    images = convert_from_path(pdf_path)
    os.makedirs(output_image_dir, exist_ok=True)

    all_data = []
    for page_num, image in enumerate(images):
        # Persist the rendered page so later stages can reload it by path.
        image_path = os.path.join(output_image_dir, f"page_{page_num}.jpg")
        image.save(image_path, "JPEG")

        # Word-level OCR; the DICT output is a set of parallel lists.
        ocr_data = pytesseract.image_to_data(image, output_type=pytesseract.Output.DICT)
        ocr_rows = zip(
            ocr_data["text"],
            ocr_data["left"],
            ocr_data["top"],
            ocr_data["width"],
            ocr_data["height"],
        )
        for text, left, top, width, height in ocr_rows:
            if not text.strip():
                continue  # Tesseract also emits blank structural rows.
            all_data.append({
                "text": text,
                # Convert (left, top, width, height) to corner coordinates.
                "bbox": (left, top, left + width, top + height),
                "page": page_num,
                "type": "text",
                # Lets the embedding step find the page image for this word.
                "image_path": image_path,
            })

        # Placeholder figure detection: treat the whole page as one image
        # region. For production, use an object detection model
        # (e.g., Detectron2) to find the actual figures.
        all_data.append({
            "text": f"[IMAGE: Page {page_num}]",
            "bbox": (0, 0, image.width, image.height),
            "page": page_num,
            "type": "image",
            "image_path": image_path,
        })

    return all_data

## Generate Embeddings with LayoutLMv3

In [9]:
from transformers import LayoutLMv3Processor, LayoutLMv3Model
import torch

In [None]:
# Run on the GPU when PyTorch can see one, otherwise stay on the CPU.
if torch.cuda.is_available():
    device = "cuda"
else:
    device = "cpu"
print(device)

cpu


In [None]:
# Load the LayoutLMv3 base checkpoint: the encoder is moved to the selected
# device, the processor prepares its inputs.
model = LayoutLMv3Model.from_pretrained("microsoft/layoutlmv3-base")
model = model.to(device)
processor = LayoutLMv3Processor.from_pretrained("microsoft/layoutlmv3-base")

def generate_embeddings(content_data):
    """Embed every content item with LayoutLMv3 (mean-pooled hidden states).

    Text items are encoded as a single word plus its bounding box together
    with the page image; image items are encoded from the page image alone.
    Items of any other type are skipped.

    Args:
        content_data: Iterable of item dicts as produced by
            ``extract_pdf_content``. Text items need ``text``, ``bbox`` and
            either their own ``image_path`` or a same-``page`` image item
            that carries one; image items need ``image_path``.

    Returns:
        list[dict]: One ``{"embedding": ndarray, "metadata": item}`` entry
        per embedded item, where the array is the mean over the sequence
        dimension of ``last_hidden_state``.

    Raises:
        ValueError: If no page image can be located for a text item.
    """
    # Local import keeps this cell self-contained; ``transformers`` is already
    # a dependency of this notebook. ``load_image`` turns a path into a PIL image
    # (the processor does not accept path strings).
    from transformers.image_utils import load_image

    content_data = list(content_data)

    # Fallback lookup for text items that do not carry their own image path.
    page_image_paths = {
        item.get("page"): item["image_path"]
        for item in content_data
        if item["type"] == "image" and "image_path" in item
    }
    # image path -> (pixel_values, (width, height)); avoids re-reading the
    # same page image for every word on that page.
    page_cache = {}

    def _load_page(image_path):
        """Return cached pixel values and pixel size for a page image."""
        if image_path not in page_cache:
            image = load_image(image_path)
            # apply_ocr=False: words/boxes come from our own OCR pass, and the
            # processor refuses user-supplied boxes while its OCR is enabled.
            pixel_values = processor.image_processor(
                image, apply_ocr=False, return_tensors="pt"
            )["pixel_values"]
            page_cache[image_path] = (pixel_values, image.size)
        return page_cache[image_path]

    def _normalize_bbox(bbox, width, height):
        """Scale a pixel bbox to the 0-1000 grid LayoutLMv3 expects."""
        def _scale(value, extent):
            return min(1000, max(0, int(1000 * value / extent)))

        left, top, right, bottom = bbox
        return [
            _scale(left, width),
            _scale(top, height),
            _scale(right, width),
            _scale(bottom, height),
        ]

    def _pooled(**model_inputs):
        """Run the model without gradients and mean-pool over the sequence."""
        with torch.no_grad():
            outputs = model(**model_inputs)
        return outputs.last_hidden_state.mean(dim=1).cpu().numpy()

    embeddings = []
    for item in content_data:
        if item["type"] == "text":
            image_path = item.get("image_path") or page_image_paths.get(item.get("page"))
            if image_path is None:
                raise ValueError(
                    f"No page image available for text item on page {item.get('page')!r}"
                )
            pixel_values, (width, height) = _load_page(image_path)

            # The tokenizer wants a list of words with one box per word.
            encoding = processor.tokenizer(
                [item["text"]],
                boxes=[_normalize_bbox(item["bbox"], width, height)],
                truncation=True,
                return_tensors="pt",
            ).to(device)
            embedding = _pooled(pixel_values=pixel_values.to(device), **encoding)
        elif item["type"] == "image":
            # Visual encoder only: the model accepts pixel_values without text.
            pixel_values, _ = _load_page(item["image_path"])
            embedding = _pooled(pixel_values=pixel_values.to(device))
        else:
            continue

        embeddings.append({
            "embedding": embedding,
            "metadata": item,
        })
    return embeddings

## Store in Vector Database (FAISS)

In [None]:
import numpy as np
import faiss

In [None]:
class VectorDB:
    """Minimal FAISS-backed store pairing embedding vectors with metadata.

    Row ``i`` of the FAISS index corresponds to ``self.metadata[i]``.
    """

    def __init__(self):
        # The index is created lazily on the first add, once the embedding
        # width is known.
        self.index = None
        self.metadata = []

    def add_embeddings(self, embeddings):
        """Append embeddings and their metadata to the store.

        Args:
            embeddings: Sequence of ``{"embedding": array, "metadata": obj}``
                dicts. Each embedding is a ``(1, dim)`` (or ``(dim,)``) array.
                An empty sequence is a no-op.
        """
        embeddings = list(embeddings)
        if not embeddings:
            return

        # FAISS only accepts C-contiguous float32 matrices.
        vectors = np.ascontiguousarray(
            np.concatenate([
                np.atleast_2d(np.asarray(e["embedding"], dtype=np.float32))
                for e in embeddings
            ])
        )

        if self.index is None:
            self.index = faiss.IndexFlatL2(vectors.shape[1])

        self.index.add(vectors)
        # Extend metadata only after the index accepted the vectors so the
        # two can never drift apart if ``add`` raises.
        self.metadata.extend(e["metadata"] for e in embeddings)

    def search(self, query_embedding, k=5):
        """Return metadata of up to ``k`` nearest neighbours of the query.

        Args:
            query_embedding: ``(1, dim)`` (or ``(dim,)``) query vector. Only
                the first row is used when several are given.
            k: Maximum number of neighbours to return.

        Returns:
            list: Metadata objects ordered by increasing L2 distance; empty
            when nothing has been indexed yet.
        """
        if self.index is None or not self.metadata:
            return []

        query = np.ascontiguousarray(
            np.atleast_2d(np.asarray(query_embedding, dtype=np.float32))
        )
        _, indices = self.index.search(query, k)
        # FAISS pads the result with -1 when fewer than k vectors exist;
        # indexing with -1 would silently return the last metadata entry.
        return [self.metadata[i] for i in indices[0] if 0 <= i < len(self.metadata)]

# Initialize DB
vector_db = VectorDB()

# Process PDF and store embeddings.
# extract_pdf_content needs a directory for the rendered page images.
pdf_content = extract_pdf_content("document.pdf", "page_images")
embeddings = generate_embeddings(pdf_content)
vector_db.add_embeddings(embeddings)

## Query with RAG Integration

In [3]:
def rag_query(question):
    """Answer a question from the indexed PDF content.

    The question is embedded with the LayoutLMv3 text encoder, the nearest
    items are fetched from ``vector_db`` and their text is handed to
    ``qa_pipeline`` as context.

    Args:
        question: Natural-language question.

    Returns:
        dict: The QA pipeline result (``score``, ``start``, ``end``,
        ``answer``). When no text context is retrieved, an empty answer with
        score 0.0 is returned instead of calling the pipeline.

    Raises:
        ValueError: If ``question`` contains no words.
    """
    words = question.split()
    if not words:
        raise ValueError("question must contain at least one word")

    # A question has no page layout: tokenize it directly (the full processor
    # requires an image) and give every word a dummy zero box.
    question_encoding = processor.tokenizer(
        words,
        boxes=[[0, 0, 0, 0]] * len(words),
        truncation=True,
        return_tensors="pt",
    ).to(device)

    with torch.no_grad():
        question_embedding = (
            model(**question_encoding).last_hidden_state.mean(dim=1).cpu().numpy()
        )

    # Retrieve relevant content
    results = vector_db.search(question_embedding)

    # Only text items contribute to the QA context.
    context = " ".join(item["text"] for item in results if item["type"] == "text")
    if not context.strip():
        # The QA pipeline rejects an empty context; report "no answer".
        return {"score": 0.0, "start": 0, "end": 0, "answer": ""}

    return qa_pipeline(question=question, context=context)

# Example: ask a question about the indexed document and show the raw answer.
print(rag_query(question="What is the total amount on the invoice?"))

NameError: name 'processor' is not defined