In [None]:
# !pip install llama_index.llms.azure_openai
# !pip install llama_index.embeddings.azure_openai
# !pip install python-dotenv
# !pip install pymupdf
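# !pip install azure  # NOTE: the "azure" meta-package is deprecated (v5+ refuses to install); install only the specific azure-* packages you need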
# !pip install azure-ai-documentintelligence
# !pip install surya-ocr
# !pip install pytesseract
# !pip install pandas
# !pip install llama-index llama-index-experimental



# End to End RAG Flow With OCR

In [None]:
from PIL import Image
from surya.ordering import batch_ordering
from surya.model.ordering.processor import load_processor
from surya.model.ordering.model import load_model
import pytesseract  # For OCR

# Function to perform OCR using Tesseract and layout ordering using Surya
def ocr_with_surya(image_path, model=None, processor=None):
    """Run Tesseract OCR on an image and Surya layout ordering on the same image.

    Args:
        image_path: Path (or file object) accepted by ``PIL.Image.open``.
        model: Optional pre-loaded Surya ordering model. When ``None`` the
            model is loaded with ``load_model()``. Pass one in to avoid
            reloading the weights on every call.
        processor: Optional pre-loaded Surya ordering processor. When ``None``
            it is loaded with ``load_processor()``.

    Returns:
        A ``(ocr_text, order_predictions)`` tuple: the string returned by
        ``pytesseract.image_to_string`` and whatever ``batch_ordering``
        returns for the single image.
    """
    # Use the image as a context manager so the underlying file handle is
    # released as soon as OCR and ordering are done, instead of lingering
    # until garbage collection.
    with Image.open(image_path) as image:
        # Perform OCR using Tesseract to extract text
        ocr_text = pytesseract.image_to_string(image)

        # Dummy bounding box covering the whole image (in practice, you'd get
        # real boxes from a layout-detection model).
        width, height = image.size
        bboxes = [[0, 0, width, height]]

        # Load the Surya ordering model/processor only if the caller did not
        # supply them. This happens after the image has been opened and OCR'd
        # so a bad path still fails before the expensive model load.
        if model is None:
            model = load_model()
        if processor is None:
            processor = load_processor()

        # Perform layout ordering on the image
        order_predictions = batch_ordering([image], [bboxes], model, processor)

    # Return the OCR text and layout predictions
    return ocr_text, order_predictions

# Example usage: OCR a single image and index the extracted text.
from llama_index.core import Document, GPTVectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

# Run OCR (and layout ordering) on an image of the document
ocr_text, layout_order = ocr_with_surya("./data/passport.png")

# Wrap the extracted OCR text in a Document for downstream processing
document = Document(text=ocr_text)

# Use a text splitter (SentenceSplitter) to chunk the document into nodes
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=50)
nodes = splitter.get_nodes_from_documents([document])

# Create a vector index from the nodes. Passing the nodes directly makes the
# index use the chunking configured above; `from_documents([document])` would
# ignore `nodes` and re-split the document itself.
index = GPTVectorStoreIndex(nodes)

# Query the index
# query_engine = index.as_query_engine()
# response = query_engine.query("Who does the passport belong to and what is the nationality of the person?")
# print(response)


In [None]:
import fitz  # PyMuPDF
import pytesseract
from PIL import Image
import io
from surya.ordering import batch_ordering
from surya.model.ordering.processor import load_processor
from surya.model.ordering.model import load_model

# Function to extract images from PDF and apply OCR with Surya
def ocr_with_surya_for_pdf(pdf_path, model=None, processor=None):
    """OCR every image embedded in a PDF and run Surya layout ordering on each.

    Args:
        pdf_path: Path passed to ``fitz.open``.
        model: Optional pre-loaded Surya ordering model. When ``None`` it is
            loaded once, lazily, with ``load_model()`` the first time an
            image is found.
        processor: Optional pre-loaded Surya ordering processor. When ``None``
            it is loaded once, lazily, with ``load_processor()``.

    Returns:
        A list with one dict per embedded image, in page/image order, with
        keys ``"page"`` (1-based), ``"image_index"`` (1-based within the
        page), ``"ocr_text"`` and ``"layout_order"``. Empty when the PDF
        contains no embedded images.
    """
    ocr_results = []

    # Open the PDF as a context manager so the file is closed even if OCR or
    # ordering raises part-way through.
    with fitz.open(pdf_path) as doc:
        # Iterate through the pages of the PDF
        for page_num in range(len(doc)):
            page = doc.load_page(page_num)

            # Extract images from the page
            for img_index, img_info in enumerate(page.get_images(full=True)):
                xref = img_info[0]
                image_bytes = doc.extract_image(xref)["image"]

                with Image.open(io.BytesIO(image_bytes)) as image:
                    # Perform OCR on the extracted image
                    ocr_text = pytesseract.image_to_string(image)

                    # Dummy bounding box covering the whole image
                    width, height = image.size
                    bboxes = [[0, 0, width, height]]

                    # Load the Surya model/processor at most once per call
                    # (previously they were reloaded for every image). The
                    # load stays lazy so a PDF without images never pays
                    # for it.
                    if model is None:
                        model = load_model()
                    if processor is None:
                        processor = load_processor()

                    # Perform layout ordering on the image
                    order_predictions = batch_ordering(
                        [image], [bboxes], model, processor
                    )

                # Append OCR text and layout ordering results
                ocr_results.append({
                    "page": page_num + 1,
                    "image_index": img_index + 1,
                    "ocr_text": ocr_text,
                    "layout_order": order_predictions,
                })

    return ocr_results

# Example usage with a PDF file containing scanned images
from llama_index.core import Document, GPTVectorStoreIndex
from llama_index.core.node_parser import SentenceSplitter

pdf_ocr_results = ocr_with_surya_for_pdf("./data/mai.pdf")

# Combine the per-image OCR results into a single text
combined_ocr_text = "\n\n".join(result["ocr_text"] for result in pdf_ocr_results)

# Wrap the combined OCR text in a Document for downstream processing
document = Document(text=combined_ocr_text)

# Use a text splitter (SentenceSplitter) to chunk the document into nodes
splitter = SentenceSplitter(chunk_size=1024, chunk_overlap=50)
nodes = splitter.get_nodes_from_documents([document])

# Create a vector index from the nodes. Passing the nodes directly makes the
# index use the chunking configured above; `from_documents([document])` would
# ignore `nodes` and re-split the document itself.
index = GPTVectorStoreIndex(nodes)

# Query the index
# query_engine = index.as_query_engine()
# response = query_engine.query("What is this program and what university is offering it?")
# print(response)
