In [5]:
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough, RunnableParallel
from langchain_core.output_parsers import StrOutputParser
from langchain_chroma import Chroma

from dotenv import load_dotenv
import os
load_dotenv()
from utils import document_loader, chunking, vectorstore, format_docs


In [None]:

PERSIST_DIRECTORY = "../data/chroma_db" 

TEMPLATE = """Use the following pieces of context to answer the question at the end. If you don't know the answer, just say that you don't know, don't try to make up an answer. Use three sentences maximum with maximum of 500 words. Keep the answer as concise as possible. Always say "thanks for asking!" at the end of the answer. 
{context}
Question: {input}
Helpful Answer:"""

class Rag():
    def __init__(self, template=TEMPLATE):
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
        self.embedding = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            task_type="retrieval_document"  # Optimized for search
        )
        
        self.vectordb = None
        self.QAprompt = ChatPromptTemplate.from_template(template=template)
        self.qa_chain = None

         # Share state from disk to uvicorn workers, if any. 
        if os.path.exists(PERSIST_DIRECTORY):
            try:
                temp_db = Chroma(
                    persist_directory=PERSIST_DIRECTORY, 
                    embedding_function=self.embedding
                )
                
                if temp_db._collection.count() > 0:
                    self.vectordb = temp_db
                    print(f"Loaded existing vector store with {self.vectordb._collection.count()} chunks.")
                    
                    # Qa chain is initialized
                    self._initialize_lcel_chain(context="")

                else:
                    print("Persistent directory exists but the Chroma collection is empty.")
            except Exception as e:
                print(f"Error loading existing Chroma DB: {e}. Starting with empty state.")

    def _load_docs(self, file_path: str):
        loaded = document_loader(file_path)
        chunked = chunking(loaded)
        self.vectordb = vectorstore(chunked, self.embedding, PERSIST_DIRECTORY)
        
        self._initialize_lcel_chain()
        print(f"Document loaded and vector store created at {PERSIST_DIRECTORY}")

    def _initialize_lcel_chain(self, context: str = ""):
        if not self.vectordb:
            print("Cannot initialize chain: VectorDB is not loaded.")
            return
        
        rag_chain_from_docs = (
            RunnablePassthrough.assign(
                # context=lambda x: format_docs(x['context'])
                context= self._retriever_info(question=x["question"])
            )
            | self.QAprompt  
            | self.llm
            | StrOutputParser() # Parse the output to a string
        )

        self.qa_chain = RunnableParallel(
            answer=rag_chain_from_docs,
            input=RunnablePassthrough()
        )
    def _retriever_info(self, question: str):
        if not self.vectordb:
            return "No VectorDB loaded."
        results = self.vectordb.similarity_search("", k=3)
        return "\n\n".join([doc.page_content for doc in results])

    def _clear_db(self):
        if self.vectordb:
            try:
                self.vectordb.delete_collection()
                self.vectordb = None
                message = "Chroma DB cleared successfully."
            except Exception as e:
                message = (f"Error clearing Chroma DB: {e}")
            return (message)
        

    def ask(self, question: str):
        if self.qa_chain is None:
            return {
                'status_code': 400,
                'response': "No documents loaded. Please load documents first.",
                'sources': []
            }
        result = self.qa_chain.invoke({"question": question})
        print(result)



Loaded existing vector store with 9 chunks.


AttributeError: 'str' object has no attribute 'page_content'

In [15]:
import os
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from langchain_core.runnables import RunnablePassthrough
from langchain_chroma import Chroma

PERSIST_DIRECTORY = "../data/chroma_db"

TEMPLATE = """Use the following pieces of context to answer the question at the end. 
If you don't know the answer, just say that you don't know. 
Keep the answer concise (max 3 sentences, 500 words). 
Always end the answer with "thanks for asking!".

Context:
{context}

Question: {question}

Helpful Answer:"""

class Rag:
    def __init__(self, template=TEMPLATE):
        self.llm = ChatGoogleGenerativeAI(model="gemini-2.5-flash", temperature=0)
        self.embedding = GoogleGenerativeAIEmbeddings(
            model="models/text-embedding-004",
            task_type="retrieval_document"
        )

        self.vectordb = None
        self.QAprompt = ChatPromptTemplate.from_template(template)
        self.qa_chain = None

        if os.path.exists(PERSIST_DIRECTORY):
            try:
                temp_db = Chroma(
                    persist_directory=PERSIST_DIRECTORY,
                    embedding_function=self.embedding
                )
                if temp_db._collection.count() > 0:
                    self.vectordb = temp_db
                    print(f"Loaded existing vector store with {self.vectordb._collection.count()} chunks.")
                    self._initialize_lcel_chain()
                else:
                    print("Persistent directory exists but collection is empty.")
            except Exception as e:
                print(f"Error loading existing Chroma DB: {e}. Starting fresh.")

    def _load_docs(self, file_path: str):
        loaded = document_loader(file_path)
        chunked = chunking(loaded)

        """Assumes docs is a list of langchain Documents already split."""
        self.vectordb = Chroma.from_documents(
            documents=chunked,
            embedding=self.embedding,
            persist_directory=PERSIST_DIRECTORY
        )
        self._initialize_lcel_chain()
        print(f"✅ Vector store created at {PERSIST_DIRECTORY}")

    def _retriever_info(self, question: str):
        if not self.vectordb:
            return "No VectorDB loaded."
        results = self.vectordb.similarity_search(question, k=3)
        return "\n\n".join(doc.page_content for doc in results)

    def _initialize_lcel_chain(self):
        if not self.vectordb:
            print("Cannot initialize chain: VectorDB not loaded.")
            return

        self.qa_chain = (
            RunnablePassthrough.assign(
                context=lambda x: self._retriever_info(x["question"])
            )
            | self.QAprompt
            | self.llm
            | StrOutputParser()
        )

    def _clear_db(self):
        if self.vectordb:
            try:
                self.vectordb.delete_collection()
                self.vectordb = None
                return "✅ Chroma DB cleared successfully."
            except Exception as e:
                return f"Error clearing DB: {e}"

    def ask(self, question: str):
        if not self.qa_chain:
            return {
                "status_code": 400,
                "response": "No documents loaded. Please load documents first."
            }

        answer = self.qa_chain.invoke({"question": question})
        return {"status_code": 200, "response": answer}


In [None]:
if __name__ == "__main__":    

    path = "../data/uploads/inte.pdf"
    engine = Rag()
    engine._clear_db()

    engine._load_docs(path)
    while(True):
        que = input("What is your question?")
        result = engine.ask(que)
        print(result)

Loaded existing vector store with 9 chunks.
-----Loading Document-----
✅ Vector store created at ../data/chroma_db
{'status_code': 200, 'response': 'This context describes an internship experience, including an internship certificate, report, and project work. It details the skills gained in AI, automation, data analytics, Python programming, and Machine Learning. The internship focused on applying data-driven techniques to solve real-world problems.\nthanks for asking!'}
{'status_code': 200, 'response': "I don't know the answer because the question is missing. Please provide the question you would like me to answer based on the context. thanks for asking!"}


KeyboardInterrupt: Interrupted by user

In [17]:
result

{'status_code': 200,
 'response': "I don't know the answer because the question is missing. Please provide the question you would like me to answer based on the context. thanks for asking!"}

In [2]:
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_core.documents import Document

# Example input document(s)
docs = [Document(page_content="LangChain helps developers build LLM-based applications.")]

# Create a text splitter
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=50,
    chunk_overlap=10,
    length_function=len
)

# Split the documents into chunks
chunked_documents = text_splitter.split_documents(docs)

# Verify
for i, chunk in enumerate(chunked_documents, 1):
    print(f"Chunk {i}: {chunk.page_content}")


Chunk 1: LangChain helps developers build LLM-based
Chunk 2: LLM-based applications.
