In [1]:
import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")

In [2]:
def replace_t_with_space(list_of_documents):
    """
        Replace all the tab ('\t') keys with white space in the page content of list of documents.

        Args:
            list_of_documents: A list of document obj, each with 'page_content' attribute.
        Return:
            The modified list of documents with tab characters replaced by white spaces
    """
    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\t', " ")
    return list_of_documents

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
#from helper_functions import Helpers

class Data_Ingestion_Pipe:
    """
    A pipeline that showcases the ingestion of documet data into vectorstore
    """
    def __init__(self, file_path: str = r"D:\My Files\RAG-Techniques\RAG.pdf"):
        self.file_path = file_path
        #self.helper_func = Helpers()
        #self.embed_provider = Embedding_Provider()

    
    def encode_pdf(self, chunk_size: int =1000, chunk_overlap: int = 200):
        """
        Set of setps to stores the pdf documents in vectorestore in the form of embeddings
        Args:
            file_path: denotes the location of the file
            chunk_size : denote the size of each chunk the document to be split into
            chunk_overlap: connecting words in each chunk

        Return:
            A FAISS vector store containing the encoded book content.
        """
        #loads the pdf file
        try:
            loader = PyPDFLoader(self.file_path)
            docs = loader.load()
        except FileNotFoundError as e:
            raise f"Error occured: {e}"
        # split the doc file into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = chunk_size, chunk_overlap = chunk_overlap
        )
        doc_chunks = text_splitter.split_documents(documents=docs)

        cleaned_texts = replace_t_with_space(doc_chunks)
        #embeddings
        embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        #vector db
        faiss_vstore = FAISS.from_documents(cleaned_texts, embedding=embedding)
        return faiss_vstore

In [4]:
def doc_retriever():
    """
    retrieves top k similar documents
    """
    obj = Data_Ingestion_Pipe()
    vstore = obj.encode_pdf()
    retriever = vstore.as_retriever(search_kwargs={"k": 3})
    return retriever

In [5]:
from langchain_groq import ChatGroq

llm = ChatGroq(model="llama3-8b-8192")

In [6]:
#tool
from langchain_community.tools import DuckDuckGoSearchResults

web_search = DuckDuckGoSearchResults()

#Retriever Evaluation

In [7]:
from pydantic import BaseModel, Field

class RetrieverInputEvaluator(BaseModel):
    relevance_score: float = Field(..., description="describe the relevance of retrieved document to query. The score should be between 0 and 1")
    

In [8]:
from langchain_core.prompts import PromptTemplate

def retrieveal_evaluator(query: str, document: str)-> float:
    """
    Evaluates the relevance of retrieved documents with the query
    """
    prompt = PromptTemplate(
        input_variables=["query", "document"],
        template="""
        On the scale of 0 to 1 how much relevent is the document to the following query.
        \nQuery: {query}
        \nDocument: {document}
        Relevance Score:
        """
    )
    chain = prompt | llm.with_structured_output(RetrieverInputEvaluator)
    input_variables = {"query": query, "document": document}
    result = chain.invoke(input_variables).relevance_score
    return result

#Knowledge Refinement

In [9]:
class KnowledgeInputRefinement(BaseModel):
    key_points: str = Field(..., description="documents to extract key information from")

In [10]:
from typing_extensions import List

def knowledge_refinement(document: str)->List[str]:
    """
    Takes out key points from the retrieved and combined documents
    """
    prompt = PromptTemplate(
        input_variables=["document"],
        template="""
            Extract the key points from the following documents in bullet points format:
            \nDocuments: {documents}
            \nKey points:
        """
    )
    chain = prompt | llm.with_structured_output(KnowledgeInputRefinement)
    result = chain.invoke({"document": document}).key_points
    return [point.strip() for point in result.split('\n') if point.strip()]

#Query Rewritter

In [11]:
class QueryInputReWritter(BaseModel):
    rewritten_query: str = Field(..., description="The query to rewrite.")

In [12]:
def query_rewritter(query: str)->str:
    """
    rewrites the initial query into one that is suitable for web search
    """
    prompt = PromptTemplate(
        input_variables=["query"],
        template="""
         Rewrite the following query to make it more suitable for a web search:
         \nQuery:{query}
         \nRewritten Query:
        """
    )
    chain = prompt | llm.with_structured_output(QueryInputReWritter)
    result = chain.invoke({"query": query}).query.strip()
    return result

#Helper function to pasre web search results

In [13]:
from typing_extensions import Tuple
import json

def parse_search_results(result_string: str)-> List[Tuple[str, str]]:
    """
    Parse a JSON string of search results into a list of title-link tuples.
    Args:
        results_string (str): A JSON-formatted string containing search results.

    Returns:
        List[Tuple[str, str]]: A list of tuples, where each tuple contains the title and link of a search result.
                               If parsing fails, an empty list is returned.
    """
    try:
        results = json.load(result_string)
        return [(result.get("title","Untitled"), result.get("url")) for result in results]
    except json.JSONDecodeError:
        print("Error parsing search results. Returning empty list")
        return []

#Helper Function

In [14]:
def evaluate_documents(query: str, documents: List[str]) -> List[float]:
    """
    Evaluate the relevance of documents based on a query.

    Args:
        query (str): The query string.
        documents (List[str]): A list of document contents to evaluate.

    Returns:
        List[float]: A list of relevance scores for each document.
    """
    return [retrieveal_evaluator(query, doc) for doc in documents]

In [15]:
def perform_web_search(query: str) -> Tuple[List[str], List[Tuple[str, str]]]:
    """
    Perform a web search based on a query.

    Args:
        query (str): The query string to search for.

    Returns:
        Tuple[List[str], List[Tuple[str, str]]]: 
            - A list of refined knowledge obtained from the web search.
            - A list of tuples containing titles and links of the sources.
    """
    rewritten_query = query_rewritter(query)
    web_results = web_search.run(rewritten_query)
    web_knowledge = knowledge_refinement(web_results)
    sources = parse_search_results(web_results)
    return web_knowledge, sources

In [16]:
def generate_response(query: str, knowledge: str, sources: List[Tuple[str, str]]) -> str:
    """
    Generate a response to a query using knowledge and sources.

    Args:
        query (str): The query string.
        knowledge (str): The refined knowledge to use in the response.
        sources (List[Tuple[str, str]]): A list of tuples containing titles and links of the sources.

    Returns:
        str: The generated response.
    """
    response_prompt = PromptTemplate(
        input_variables=["query", "knowledge", "sources"],
        template="Based on the following knowledge, answer the query. Include the sources with their links (if available) at the end of your answer:\nQuery: {query}\nKnowledge: {knowledge}\nSources: {sources}\nAnswer:"
    )
    input_variables = {
        "query": query,
        "knowledge": knowledge,
        "sources": "\n".join([f"{title}: {link}" if link else title for title, link in sources])
    }
    response_chain = response_prompt | llm
    return response_chain.invoke(input_variables).content

#CRAG Process

In [17]:
def crag_process(query: str, faiss_index: FAISS) -> str:
    """
    Process a query by retrieving, evaluating, and using documents or performing a web search to generate a response.

    Args:
        query (str): The query string to process.
        faiss_index (FAISS): The FAISS index used for document retrieval.

    Returns:
        str: The generated response based on the query.
    """
    print(f"\nProcessing query: {query}")

    # Retrieve and evaluate documents
    retrieved_docs = retrieveal_evaluator(query, faiss_index)
    eval_scores = evaluate_documents(query, retrieved_docs)
    
    print(f"\nRetrieved {len(retrieved_docs)} documents")
    print(f"Evaluation scores: {eval_scores}")

    # Determine action based on evaluation scores
    max_score = max(eval_scores)
    sources = []
    
    if max_score > 0.7:
        print("\nAction: Correct - Using retrieved document")
        best_doc = retrieved_docs[eval_scores.index(max_score)]
        final_knowledge = best_doc
        sources.append(("Retrieved document", ""))

    elif max_score < 0.3:
        print("\nAction: Incorrect - Performing web search")
        final_knowledge, sources = perform_web_search(query)
        
    else:
        print("\nAction: Ambiguous - Combining retrieved document and web search")
        best_doc = retrieved_docs[eval_scores.index(max_score)]
        # Refine the retrieved knowledge
        retrieved_knowledge = knowledge_refinement(best_doc)
        web_knowledge, web_sources = perform_web_search(query)
        final_knowledge = "\n".join(retrieved_knowledge + web_knowledge)
        sources = [("Retrieved document", "")] + web_sources

    print("\nFinal knowledge:")
    print(final_knowledge)
    
    print("\nSources:")
    for title, link in sources:
        print(f"{title}: {link}" if link else title)

    # Generate response
    print("\nGenerating response...")
    response = generate_response(query, final_knowledge, sources)

    print("\nResponse generated")
    return response

In [18]:
query = "What are the main causes of climate change?"
vectorstore = Data_Ingestion_Pipe().encode_pdf()
result = crag_process(query, vectorstore)
print(f"Query: {query}")
print(f"Answer: {result}")

  from .autonotebook import tqdm as notebook_tqdm



Processing query: What are the main causes of climate change?


TypeError: 'float' object is not iterable