## Setup

In [9]:
import base64
import io
import mimetypes
import os
import sys

import fitz  # PyMuPDF
from dotenv import load_dotenv
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.vectorstores import FAISS
from langchain_core.messages import HumanMessage, SystemMessage
from langchain_ollama import OllamaEmbeddings
from langchain_openai import AzureChatOpenAI
from PIL import Image

# Load variables from a local .env file into the process environment so the
# os.getenv lookups in the LLM setup cell can find them.
load_dotenv()

True

### Extract images from the PDF

- Use a library like PyMuPDF (fitz) or pdf2image to extract images
- Use PyMuPDF or pdfplumber to extract text content

In [2]:
# Extract every embedded image from the PDF. The raw bytes are kept in memory
# (all_images, keyed by generated filename) and also written to the current
# working directory so they can be inspected by hand.
pdf_document = "attention.pdf"  # Replace with your PDF file path

all_images = dict()

# Open the document in a context manager so the file handle is released even
# if extraction fails part-way through.
with fitz.open(pdf_document) as doc:
    # Page and image counters start at 1 so filenames match human page numbers.
    for page_number, page in enumerate(doc, start=1):
        images = page.get_images(full=True)
        for img_index, img in enumerate(images, start=1):
            xref = img[0]  # first tuple entry is the image's cross-reference number
            base_image = doc.extract_image(xref)
            image_bytes = base_image["image"]
            image_ext = base_image["ext"]
            image_filename = f"page{page_number}_img{img_index}.{image_ext}"

            all_images[image_filename] = image_bytes

            # Save the image
            with open(image_filename, "wb") as image_file:
                image_file.write(image_bytes)
            print(f"Saved: {image_filename}")

Saved: page3_img1.png
Saved: page4_img1.png
Saved: page4_img2.png


In [None]:
# Show each extracted image's size in bytes rather than echoing the raw binary
# payloads, which would bloat the notebook output.
{image_name: len(image_data) for image_name, image_data in all_images.items()}

In [12]:
# Azure OpenAI connection settings. Everything except the API version is read
# from the environment; os.getenv yields None for any variable that is not set.
api_version = "2024-12-01-preview"
subscription_key = os.getenv("NATL_AZURE_OPENAI_KEY")

endpoint = os.getenv("NATL_AZURE_OPENAI_ENDPOINT")
model_name = os.getenv("NATL_AZURE_OPENAI_MODEL_NAME")  # not passed to the client below
deployment = os.getenv("NATL_AZURE_OPENAI_MODEL__DEPLOYMENT_NAME")

# Chat model client used by the image summarisation step.
llm = AzureChatOpenAI(
    azure_endpoint=endpoint,
    azure_deployment=deployment,
    api_key=subscription_key,
    openai_api_version=api_version,
    temperature=0.7,
)

# Summarise every extracted image with the chat model. The result maps each
# image filename to the model's text summary.
image_summaries = {}

# The instruction text is the same for every image, so it is built once.
summary_prompt = """
                                        You are an assistant tasked with summarizing tables and text.
                                        Give a concise summary of the table or text.

                                        Respond only with the summary, no additional comment.
                                        Do not start your message by saying "Here is a summary" or anything like that.
                                        Your summary will be used for future retrieval, so stick with only information that will aid in accurate retrieval later on.
                                        Just give the summary as it is.
                                        """

# Process each image
for image_filename, image_bytes in all_images.items():
    # Convert image bytes to base64
    base64_image = base64.b64encode(image_bytes).decode('utf-8')

    # Declare the real image type in the data URL: the extracted files are not
    # necessarily JPEGs (e.g. PNG), so derive the MIME type from the filename
    # and fall back to the bare extension when it is not a registered type.
    mime_type, _ = mimetypes.guess_type(image_filename)
    if mime_type is None:
        mime_type = f"image/{image_filename.rsplit('.', 1)[-1]}"

    # Create a message with image content
    message = HumanMessage(
        content=[
            {"type": "text", "text": summary_prompt},
            {
                "type": "image_url",
                "image_url": {
                    "url": f"data:{mime_type};base64,{base64_image}"
                }
            }
        ]
    )

    # Get the response
    response = llm.invoke([message])
    image_summaries[image_filename] = response.content

image_summaries

{'page3_img1.png': 'The diagram depicts a Transformer model architecture with separate encoder and decoder stacks each repeated N times. The encoder processes inputs by adding positional encoding to input embeddings, followed by layers of multi-head attention and feed-forward networks with add & norm steps. The decoder processes output embeddings (shifted right) with positional encoding, followed by masked multi-head attention, multi-head attention over encoder outputs, and feed-forward layers with add & norm steps. The decoder output passes through a linear layer and softmax to produce output probabilities.',
 'page4_img1.png': 'The diagram represents the scaled dot-product attention mechanism, starting with the inputs Q (query) and K (key) undergoing matrix multiplication, followed by scaling, optional masking, softmax application, and a final matrix multiplication with V (value) to produce the output.',
 'page4_img2.png': 'Multi-head attention mechanism: Inputs V, K, Q are each line

### Process and Summarize the Extracted Text

- Send the extracted text to Azure AI (OpenAI service) or Ollama for summarization.
- Using LangChain, define a prompt to get concise and meaningful summaries.

In [None]:
## PDF loader
# Load the paper page by page, split it into overlapping chunks, and build
# (or reuse) a FAISS index over those chunks.
attention_loader = PyPDFLoader('attention.pdf')
attention_docs = attention_loader.load()

# Chunks of up to 1000 characters with a 200-character overlap.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
attention_documents = text_splitter.split_documents(attention_docs)

# Vector embedding and vector store
embedding = OllamaEmbeddings(model="nomic-embed-text")

index_dir = "../storage/attention_index"

# Check for a saved index explicitly instead of catching every exception, so
# genuine failures (e.g. the embedding backend being unreachable) surface
# instead of silently triggering a rebuild.
if os.path.exists(os.path.join(index_dir, "index.faiss")):
    # NOTE(security): load_local unpickles the stored docstore; only load an
    # index that this notebook itself produced.
    db = FAISS.load_local(index_dir, embedding, allow_dangerous_deserialization=True)
else:
    # Index the split chunks, not the unsplit documents from the loader.
    # An index saved by an earlier run that embedded the unsplit documents
    # must be deleted from index_dir to be rebuilt.
    db = FAISS.from_documents(attention_documents, embedding)
    db.save_local(index_dir)

In [None]:

# NOTE(review): this cell repeats the splitting and indexing steps of the
# preceding cell and relies on attention_docs from it; consider keeping only one.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=200)
attention_documents = text_splitter.split_documents(attention_docs)

# Vector embedding and vector store
embedding = OllamaEmbeddings(model="nomic-embed-text")

faiss_index_dir = "../storage/attention_index"

# Check for a saved index explicitly instead of catching every exception, so
# genuine failures surface instead of silently triggering a rebuild.
if os.path.exists(os.path.join(faiss_index_dir, "index.faiss")):
    # NOTE(security): load_local unpickles the stored docstore; only load an
    # index that this notebook itself produced.
    db = FAISS.load_local(faiss_index_dir, embedding, allow_dangerous_deserialization=True)
else:
    # Index the split chunks, not the unsplit documents from the loader.
    db = FAISS.from_documents(attention_documents, embedding)
    db.save_local(faiss_index_dir)

### Store Summaries in a Vector Database

- Use FAISS or Azure Cognitive Search to store vector embeddings.
- Generate embeddings using Azure AI embeddings API or a compatible embedding model.
- Ensure that each summary is linked to a document ID for easy retrieval.

### Store Images Separately

- Save extracted images in Azure Blob Storage or a local database.
- Maintain a mapping between stored images and their corresponding document IDs.

### Implement Retrieval-Augmented Generation (RAG)

- When querying:
    - Retrieve the relevant text summary from the vector database using similarity search.
    - Fetch the corresponding images from storage using the linked document ID.
- Pass retrieved text to the LLM for final response generation.

### Serving the Pipeline

- Use FastAPI or Flask to expose an API that handles document ingestion and querying.
- For indexing, batch-process incoming PDFs asynchronously.