Set-ExecutionPolicy -Scope Process -ExecutionPolicy Bypass
.\venv\Scripts\activate

# RAG System Setup
This notebook demonstrates how to set up a Retrieval-Augmented Generation (RAG) system with the following features:

- **PDF Reading**: Supports text, tables, and images.
- **Vector Database**: Uses Milvus for storing embeddings.
- **Embeddings Model**: Local transformer-based model.
- **LLM**: Gemini model for language generation.

In [2]:
# Import necessary libraries
from langchain.document_loaders import PyPDFLoader
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import Milvus
from langchain.llms import OpenAI
import fitz  # PyMuPDF for PDF processing
import pytesseract  # OCR library
import pandas as pd
from PIL import Image, UnidentifiedImageError
from io import BytesIO
import os

In [3]:
# Configure Milvus Vector Database
from pymilvus import connections, FieldSchema, CollectionSchema, DataType, Collection

# Connect to Milvus
connections.connect("default", host="localhost", port="19530")

# Define schema for the collection
fields = [
    FieldSchema(name="id", dtype=DataType.INT64, is_primary=True),
    FieldSchema(name="embedding", dtype=DataType.FLOAT_VECTOR, dim=768)
]
schema = CollectionSchema(fields, description="Document Embeddings")

# Create collection
collection = Collection(name="document_embeddings", schema=schema)

# Create an index for the collection
index_params = {
    "index_type": "IVF_FLAT",  # Index type
    "metric_type": "L2",       # Distance metric
    "params": {"nlist": 128}   # Number of clusters
}
collection.create_index(field_name="embedding", index_params=index_params)

# Load the collection after creating the index
collection.load()

print("Index created and collection loaded successfully.")

Index created and collection loaded successfully.


In [4]:
# Load PDF and extract text using PyPDFLoader with UTF-8 decoding
from langchain.document_loaders import PyPDFLoader
pdf_path = r"metric.pdf"
try:
    loader = PyPDFLoader(pdf_path)
    documents = loader.load()
    # Decode text content to UTF-8 to avoid font issues
    for doc in documents:
        doc.page_content = doc.page_content.encode('utf-8', 'ignore').decode('utf-8')
    print(f"Successfully loaded {len(documents)} documents from the PDF with UTF-8 decoding.")
except FileNotFoundError:
    print(f"The file '{pdf_path}' was not found. Please check the file path.")
    documents = []
except Exception as e:
    print(f"An error occurred while loading the PDF file: {e}")
    documents = []

Successfully loaded 12 documents from the PDF with UTF-8 decoding.


In [6]:
# Perform OCR on images using Vertex AI
from dotenv import load_dotenv
import requests
import base64
from google.auth.transport.requests import Request
from google.oauth2 import id_token

# Load environment variables
load_dotenv()

ocr_results = []

for page_index, page_image in enumerate(pages):
    try:
        # Convert image to Base64
        image_buffer = BytesIO()
        page_image.save(image_buffer, format="PNG")
        image_buffer.seek(0)
        image_base64 = base64.b64encode(image_buffer.read()).decode('utf-8')

        # Prepare request payload
        payload = {
            "instances": [
                {"content": {"imageBytes": image_base64}}
            ]
        }

        # Get OAuth2 token
        auth_request = Request()
        token = id_token.fetch_id_token(auth_request, VERTEX_AI_ENDPOINT)

        # Perform OCR using Vertex AI
        headers = {
            "Authorization": f"Bearer {token}",
            "Content-Type": "application/json"
        }
        response = requests.post(VERTEX_AI_ENDPOINT, headers=headers, json=payload, timeout=10)

        if response.status_code == 200:
            ocr_text = response.json().get("predictions", [{}])[0].get("text", "")
            ocr_results.append({"page_index": page_index, "text": ocr_text})
            print(f"OCR Text from page {page_index}:", ocr_text)
        else:
            print(f"Error from Vertex AI for page {page_index}: {response.text}")

    except requests.exceptions.RequestException as e:
        print(f"Request error on page {page_index}: {e}")
    except Exception as e:
        print(f"Error processing page {page_index}: {e}")

# Print summary of OCR results
if ocr_results:
    print("OCR completed successfully for the following pages:")
    for result in ocr_results:
        print(f"Page {result['page_index']}: {result['text'][:100]}...")  # Print first 100 characters
else:
    print("No OCR results available.")

NameError: name 'pages' is not defined

In [None]:
# Generate Embeddings and Store in Milvus
from langchain.embeddings import HuggingFaceEmbeddings

# Initialize embeddings model
embedding_model = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")

# Generate embeddings
for doc in documents:
    embedding = embedding_model.embed_query(doc.page_content)
    collection.insert([[doc.metadata.get("id", 0)], [embedding]])

print("Embeddings stored in Milvus.")