## Setup

In [1]:
from langchain_community.document_loaders import PyPDFLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_ollama import OllamaEmbeddings
from langchain_community.vectorstores import FAISS
from dotenv import load_dotenv
import fitz  # PyMuPDF
import os
import sys

load_dotenv()

True

### Extract images from the pdf

- Use a library like PyMuPDF (fitz) or pdf2image to extract images
- Use PyMuPDF or pdfplumber to extract text content

In [None]:
# Open the PDF file
pdf_document = "attention.pdf"  # Replace with your PDF file path
doc = fitz.open(pdf_document)

all_images = dict()

# Loop through each page and extract images
for page_number in range(len(doc)):
    page = doc[page_number]
    images = page.get_images(full=True)
    for img_index, img in enumerate(images):
        xref = img[0]
        base_image = doc.extract_image(xref)
        image_bytes = base_image["image"]
        image_ext = base_image["ext"]
        image_filename = f"page{page_number+1}_img{img_index+1}.{image_ext}"
        
        all_images += [image_filename, image_bytes]

        # Save the image
        with open(image_filename, "wb") as image_file:
            image_file.write(image_bytes)
        print(f"Saved: {image_filename}")

AttributeError: 'dict' object has no attribute 'add'

In [5]:
all_images

[(128, 175, 1520, 2239, 8, 'DeviceRGB', '', 'Im1', 'FlateDecode', 0),
 (182, 200, 445, 884, 8, 'DeviceRGB', '', 'Im2', 'FlateDecode', 0),
 (183, 201, 835, 1282, 8, 'DeviceRGB', '', 'Im3', 'FlateDecode', 0)]

### Process and Summarize the Extracted Text

- Send the extracted text to Azure AI (OpenAI service) or Ollama for summarization.
- Using LangChain, define a prompt to get concise and meaningful summaries.

In [3]:
## PDF loader

## testing sharepoint doc
# attention_loader = PyPDFLoader('C:/Users/ncvjqs/OneDrive - Great American Insurance Group/Regulatory Compliance - Multi-Company/A.M. Best State Rate Filings_10.19.22.pdf')

attention_loader = PyPDFLoader('attention.pdf')
attention_docs = attention_loader.load()

text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
attention_documents = text_splitter.split_documents(attention_docs)

# Vector embedding and vector store
embedding = OllamaEmbeddings(model="nomic-embed-text")

try:
    db = FAISS.load_local("../storage/attention_index", embedding, allow_dangerous_deserialization=True)
except Exception as e:
    db = FAISS.from_documents(attention_docs, embedding)    
    db.save_local("../storage/attention_index")

### Store Summaries in a Vector Database

- Use FAISS or Azure Cognitive Search to store vector embeddings.
- Generate embeddings using Azure AI embeddings API or a compatible embedding model.
- Ensure that each summary is linked to a document ID for easy retrieval.

### Store Images Separately

- Save extracted images in Azure Blob Storage or a local database.
- Maintain a mapping between stored images and their corresponding document IDs.

### Implement Retrieval-Augmented Generation (RAG)

- When querying:
    - Retrieve the relevant text summary from the vector database using similarity search.
    - Fetch the corresponding images from storage using the linked document ID.
- Pass retrieved text to the LLM for final response generation.

### Serving the Pipeline

- Use FastAPI or Flask to expose an API that handles document ingestion and querying.
- For indexing, batch-process incoming PDFs asynchronously.