In [23]:
import os
import logging
from langchain.text_splitter import CharacterTextSplitter
from langchain_openai.embeddings import OpenAIEmbeddings
from langchain_core.documents import Document
from google.cloud import documentai_v1beta3 as documentai
from qdrant_client import QdrantClient, models
from settings import env_settings
from dotenv import load_dotenv

load_dotenv()


logger = logging.getLogger(__name__)

QDRANT_URL = env_settings.QDRANT_URL
QDRANT_API_KEY = env_settings.QDRANT_API_KEY
OPENAI_API_KEY = env_settings.OPENAI_API_KEY
GCP_PROJECT_ID = env_settings.GCP_PROJECT_ID
GCP_LOCATION = env_settings.GCP_LOCATION
GCP_PROCESSOR_ID = env_settings.GCP_PROCESSOR_ID
GCP_PROCESSOR_VERSION = env_settings.GCP_PROCESSOR_VERSION

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = env_settings.GOOGLE_APPLICATION_CREDENTIALS


class EthanRAG:
    _qdrant_client: QdrantClient = None

    def __init__(self):
        if EthanRAG._qdrant_client is None:
            EthanRAG._qdrant_client = self.connect_qdrant()

    @staticmethod
    def connect_qdrant() -> QdrantClient:
        return QdrantClient(
            api_key=QDRANT_API_KEY,
            url=QDRANT_URL,
        )

    @staticmethod
    async def extract_text_from_pdf(file_path: str) -> str:
        try:
            logger.info(f"Processing with Google Document AI: {file_path}")
            client = documentai.DocumentProcessorServiceClient()
            name = client.processor_version_path(
                GCP_PROJECT_ID, GCP_LOCATION, GCP_PROCESSOR_ID, GCP_PROCESSOR_VERSION
            )
            with open(file_path, "rb") as pdf_file:
                raw_document = documentai.RawDocument(
                    content=pdf_file.read(), mime_type="application/pdf"
                )
            request = documentai.ProcessRequest(
                name=name,
                raw_document=raw_document,
                process_options=documentai.ProcessOptions(
                    ocr_config=documentai.OcrConfig(
                        enable_native_pdf_parsing=True)
                ),
            )
            result = client.process_document(request=request)
            document = result.document
            extracted_text = document.text
            return extracted_text
        except Exception as e:
            logger.error(f"Error extracting text from PDF: {e}")
            raise e

    @staticmethod
    async def process_and_store_pdf(file_path: str, filename: str):
        try:
            extracted_text = await EthanRAG.extract_text_from_pdf(file_path)
            text_splitter = CharacterTextSplitter(
                chunk_size=20,
                chunk_overlap=0,
                strip_whitespace=False,
                separator=""
            )
            chunks: list[Document] = text_splitter.create_documents([extracted_text])
            embeddings = OpenAIEmbeddings(
                api_key=OPENAI_API_KEY, 
                model="text-embedding-3-large"
            )
            qdrant_client = EthanRAG._qdrant_client
            # qdrant_client.create_collection(
            #     collection_name=filename,
            #     vectors_config=models.VectorParams(
            #         size=3072,
            #         distance=models.Distance.COSINE,
            #     )
            # )

            qdrant_client.upsert(
                collection_name=filename,
                points=[
                    models.PointStruct(
                        id=str(i),
                        vector=embeddings.embed_documents([chunk.page_content])[0],
                        payload={"text": chunk.page_content}
                    ) for i, chunk in enumerate(chunks)
                ],
            )
            logger.info(f"Successfully processed and stored PDF: {filename}")

        except Exception as e:
            logger.error(f"Error processing PDF: {e}")
            raise e


In [24]:
ethan_rag = EthanRAG()

await ethan_rag.process_and_store_pdf(
    file_path="K:\\EthanAI\\ethan-ai-rag\\uploads\\bala_monthly_report_dec_24-1-5.pdf",
    filename="example_collection"
)

Error processing PDF: The write operation timed out


ResponseHandlingException: The write operation timed out