In [16]:
import os
from getpass import getpass

import google.generativeai as genai
from langchain.schema.output_parser import StrOutputParser
from langchain_community.document_loaders import PyPDFLoader
from langchain_community.retrievers import BM25Retriever
from langchain_community.vectorstores import FAISS
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
from langchain_google_genai import GoogleGenerativeAIEmbeddings, ChatGoogleGenerativeAI
from langchain_text_splitters import RecursiveCharacterTextSplitter
from rank_bm25 import BM25Okapi

In [17]:
# Set up Google API key.
# The key is taken from the environment so that no secret is stored in the
# notebook. When the variable is missing, prompt for it without echoing the
# input; an already-exported key is never overwritten.
if not os.environ.get("GOOGLE_API_KEY"):
    os.environ["GOOGLE_API_KEY"] = getpass("Enter your Google API key: ")
genai.configure(api_key=os.environ["GOOGLE_API_KEY"])

In [18]:
def load_and_split_documents(file_paths, chunk_size=1000, chunk_overlap=200):
    """Load PDF files and split their pages into overlapping text chunks.

    Args:
        file_paths: Iterable of PDF paths. A single path given as a ``str`` or
            ``os.PathLike`` is treated as a one-element list instead of being
            iterated character by character.
        chunk_size: ``chunk_size`` passed to ``RecursiveCharacterTextSplitter``.
        chunk_overlap: ``chunk_overlap`` passed to the splitter.

    Returns:
        The list returned by ``split_documents``; an empty list when no
        document was loaded.
    """
    if isinstance(file_paths, (str, os.PathLike)):
        file_paths = [file_paths]

    documents = []
    for file_path in file_paths:
        documents.extend(PyPDFLoader(file_path).load())

    if not documents:
        # Nothing to split, so there is no need to build a splitter.
        return []

    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size, chunk_overlap=chunk_overlap
    )
    return text_splitter.split_documents(documents)

In [19]:
def create_vector_store(documents, model="models/embedding-001"):
    """Embed the documents and index them in a FAISS vector store.

    Args:
        documents: Document chunks to index.
        model: Name of the Google Generative AI embedding model to use.

    Returns:
        The store built by ``FAISS.from_documents``.

    Raises:
        ValueError: If ``documents`` is empty (for example when the source
            PDF contained no extractable text). Checked up front so the
            problem is reported clearly and before any embedding call.
    """
    documents = list(documents)
    if not documents:
        raise ValueError("Cannot build a vector store from an empty document list.")

    embeddings = GoogleGenerativeAIEmbeddings(model=model)
    return FAISS.from_documents(documents, embeddings)

In [20]:
def create_bm25_retriever(documents, **kwargs):
    """Build a BM25 (keyword-based) retriever over the given documents.

    Args:
        documents: Document chunks that form the BM25 corpus.
        **kwargs: Extra keyword arguments forwarded unchanged to
            ``BM25Retriever.from_documents`` (for example ``k``).

    Returns:
        The retriever built by ``BM25Retriever.from_documents``.

    Raises:
        ValueError: If ``documents`` is empty; a BM25 index cannot be built
            from an empty corpus, so this is reported up front.
    """
    documents = list(documents)
    if not documents:
        raise ValueError("Cannot build a BM25 retriever from an empty document list.")

    return BM25Retriever.from_documents(documents, **kwargs)

In [21]:
def create_retriever(vector_store, documents, method="dense", k=5):
    """Return a retriever for the requested retrieval method.

    Args:
        vector_store: Vector store used for ``"dense"`` retrieval.
        documents: Document chunks used to build the ``"bm25"`` retriever.
        method: ``"dense"`` (vector similarity) or ``"bm25"`` (keyword-based).
        k: Number of documents each query should return. Applied to both
            methods so they retrieve the same amount of context; previously
            only the dense retriever was configured and BM25 used its own
            default.

    Returns:
        The configured retriever.

    Raises:
        ValueError: If ``method`` is neither ``"dense"`` nor ``"bm25"``.
    """
    if method == "dense":
        return vector_store.as_retriever(search_kwargs={"k": k})
    elif method == "bm25":
        retriever = create_bm25_retriever(documents)
        # BM25Retriever exposes its result count as the ``k`` attribute.
        retriever.k = k
        return retriever
    else:
        raise ValueError("Invalid retrieval method. Choose 'dense' or 'bm25'.")

In [22]:
# Prompt text used to build the chat prompt in create_rag_chain.
# Placeholders substituted for every query:
#   {user_input}   - the user's current question
#   {context}      - text of the retrieved document chunks
#   {chat_history} - transcript of earlier turns in the conversation
# NOTE(review): the wording contains the typo "releated", and it both limits
# answers to the document content and permits outside knowledge; confirm the
# intended policy before changing the text, since any edit alters model output.
prompt_template = """
User Query: {user_input}

Relevant Corpus Data:
{context}

You are a document analysis assistant. Based on the User Query and the relevant Corpus data, please provide a detailed and accurate response. If you need any clarification or additional information, please ask.

The answer should be in points and then subpoints. Use paragraphs only when necessary.

Focus solely on the document content to answer the user's question. But in case user wants some relevant knowledge releated to corpus then allow the outside access.  If there is a user query that cannot be answered using the provided context, respond with 'Please ask questions about the Corpus'.

Do not repeat the user's question. If the user's query is vague, provide answers and also suggest more specific questions.

Chat History:
{chat_history}
"""


In [23]:
def format_docs(docs):
    """Concatenate each document's ``page_content``, separated by a blank line."""
    page_texts = [document.page_content for document in docs]
    separator = "\n\n"
    return separator.join(page_texts)

In [24]:
def create_rag_chain(retriever, model="gemini-1.5-pro", temperature=0):
    """Assemble the retrieval-augmented generation chain.

    The chain expects a dict with the keys ``"user_input"`` and
    ``"chat_history"``. It retrieves documents for the user input, fills
    ``prompt_template``, calls the chat model and parses the reply to a string.

    Args:
        retriever: Retriever queried with the user's input.
        model: Name of the Google Generative AI chat model.
        temperature: Sampling temperature passed to the chat model.

    Returns:
        The composed runnable chain.
    """
    llm = ChatGoogleGenerativeAI(model=model, temperature=temperature)
    prompt = ChatPromptTemplate.from_template(prompt_template)

    def retrieve_context(inputs):
        # ``invoke`` is the retriever's Runnable entry point; it replaces the
        # deprecated ``get_relevant_documents`` call.
        return format_docs(retriever.invoke(inputs["user_input"]))

    rag_chain = (
        {
            "context": retrieve_context,
            "user_input": lambda x: x["user_input"],
            "chat_history": lambda x: x["chat_history"],
        }
        | prompt
        | llm
        | StrOutputParser()
    )
    return rag_chain


In [None]:
def main():
    """Run an interactive question-answering session over the corpus PDF.

    Loads and chunks the documents, builds the vector store, retriever and RAG
    chain, then reads questions from standard input until the user types
    ``exit`` or ends the input stream (Ctrl-D) or interrupts it (Ctrl-C).
    """
    # Load and process documents
    file_paths = ["Corpus.pdf"]  # Add more file paths as needed
    documents = load_and_split_documents(file_paths)

    # Create vector store and retriever
    vector_store = create_vector_store(documents)
    retriever = create_retriever(vector_store, documents, method="dense")  # Change to "bm25" if needed

    # Create RAG chain
    rag_chain = create_rag_chain(retriever)

    # Chat loop
    chat_history = ""
    while True:
        try:
            # Strip so that inputs such as " exit " are still recognised.
            user_input = input("User: ").strip()
        except (EOFError, KeyboardInterrupt):
            # Leave the loop quietly instead of ending with a traceback.
            print()
            break

        if not user_input:
            # Do not spend a model call on a blank line.
            continue
        if user_input.lower() == 'exit':
            break

        try:
            response = rag_chain.invoke({"user_input": user_input, "chat_history": chat_history})
            print("AI:", response)

            chat_history += f"Human: {user_input}\nAI: {response}\n\n"
        except Exception as e:
            # Best effort: report the failure and keep the session alive.
            print(f"An error occurred: {e}")

if __name__ == "__main__":
    main()

User: What is hover?
AI: HOVER stands for HOppyVERification. Here are some key points about it:

* **Purpose:** HOVER is a dataset designed for testing and improving "many-hop" evidence extraction and fact verification in AI models. 
    * This means it challenges models to make connections across multiple pieces of information, rather than finding simple, direct matches.

* **Structure:**
    *  HOVER presents claims that need to be verified.
    *  Evidence for these claims is spread across multiple Wikipedia articles (up to four).
    *  The connections between the claim and the evidence form "reasoning graphs" of various complexities.

* **Challenges:**
    *  HOVER claims often require information from several sources, making it harder for models to find all the necessary evidence.
    *  The claims are written in a way that avoids simple word matching, forcing models to understand the meaning and relationships between concepts.
    *  Many claims are multi-sentence, adding the di