In [1]:
# import sys
# import os
# sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), "..")))

import os
from dotenv import load_dotenv
load_dotenv()

os.environ["GROQ_API_KEY"] = os.getenv("GROQ_API_KEY")
os.environ["LANGSMITH_API_KEY"] = os.getenv("LANGSMITH_API_KEY")
os.environ["LANGSMITH_PROJECT"] = "Simple RAG"
os.environ["LANGSMITH_TRACING"] = "true"

In [2]:
def replace_t_with_space(list_of_documents):
    """
        Replace all the tab ('\t') keys with white space in the page content of list of documents.

        Args:
            list_of_documents: A list of document obj, each with 'page_content' attribute.
        Return:
            The modified list of documents with tab characters replaced by white spaces
    """
    for doc in list_of_documents:
        doc.page_content = doc.page_content.replace('\t', " ")
    return list_of_documents

In [3]:
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.vectorstores import FAISS
#from helper_functions import Helpers

class Data_Ingestion_Pipe:
    """
    A pipeline that showcases the ingestion of documet data into vectorstore
    """
    def __init__(self, file_path: str = r"D:\My Files\RAG-Techniques\RAG.pdf"):
        self.file_path = file_path
        #self.helper_func = Helpers()
        #self.embed_provider = Embedding_Provider()
    
    def encode_pdf(self, chunk_size: int =1000, chunk_overlap: int = 200):
        """
        Set of setps to stores the pdf documents in vectorestore in the form of embeddings
        Args:
            file_path: denotes the location of the file
            chunk_size : denote the size of each chunk the document to be split into
            chunk_overlap: connecting words in each chunk

        Return:
            A FAISS vector store containing the encoded book content.
        """
        #loads the pdf file
        try:
            loader = PyPDFLoader(self.file_path)
            docs = loader.load()
        except FileNotFoundError as e:
            raise f"Error occured: {e}"
        # split the doc file into chunks
        text_splitter = RecursiveCharacterTextSplitter(
            chunk_size = chunk_size, chunk_overlap = chunk_overlap
        )
        doc_chunks = text_splitter.split_documents(documents=docs)
        cleaned_texts = replace_t_with_space(doc_chunks)
        #embeddings
        embedding = HuggingFaceEmbeddings(model_name="sentence-transformers/all-MiniLM-L6-v2")
        #vector db
        faiss_vstore = FAISS.from_documents(cleaned_texts, embedding=embedding)
        return faiss_vstore

In [4]:
def doc_retriever(vstore, k):
    """
    retrieves top k similar documents
    """
    retriever =  vstore.as_retriever(search_kwargs={"k": k})
    return retriever

In [None]:
#from helper_functions import show_context

# if __name__ == "__main__":
#     question = input("Enter the retrieval query: ")
#     vstore = Data_Ingestion_Pipe().encode_pdf() ### add chunk_size and chunk_overlap
#     retriever = doc_retriever(vstore=vstore, k=2)
#     docs = retriever.get_relevant_documents(question)
#     result = [doc.page_content for doc in docs]
#     print(result)

In [11]:
from langchain_groq import ChatGroq
from langsmith import traceable

llm = ChatGroq(model="llama3-8b-8192", temperature=1)
question = input("Enter the retrieval query: ")
# Add decorator so this function is traced in LangSmith
@traceable()
def rag_bot(question: str) -> dict:
    # LangChain retriever will be automatically traced
    vstore = Data_Ingestion_Pipe().encode_pdf()
    retriever = doc_retriever(vstore=vstore, k=2)
    docs = retriever.invoke(question)
    docs_string = " ".join(doc.page_content for doc in docs)

    instructions = f"""You are a helpful assistant who is good at analyzing source information and answering questions.       Use the following source documents to answer the user's questions.       If you don't know the answer, just say that you don't know.       Use three sentences maximum and keep the answer concise.

Documents:
{docs_string}"""

    # langchain ChatModel will be automatically traced
    ai_msg = llm.invoke([
            {"role": "system", "content": instructions},
            {"role": "user", "content": question},
        ],
    )

    return {"answer": ai_msg.content, "documents": docs}

In [12]:
from langsmith import Client

client = Client()

# Define the examples for the dataset
examples = [
    {
        "inputs": {"question": "How does the ReAct agent use self-reflection? "},
        "outputs": {"answer": "ReAct integrates reasoning and acting, performing actions - such tools like Wikipedia search API - and then observing / reasoning about the tool outputs."},
    },
    {
        "inputs": {"question": "What are the types of biases that can arise with few-shot prompting?"},
        "outputs": {"answer": "The biases that can arise with few-shot prompting include (1) Majority label bias, (2) Recency bias, and (3) Common token bias."},
    },
    {
        "inputs": {"question": "What are five types of adversarial attacks?"},
        "outputs": {"answer": "Five types of adversarial attacks are (1) Token manipulation, (2) Gradient based attack, (3) Jailbreak prompting, (4) Human red-teaming, (5) Model red-teaming."},
    }
]

# Create the dataset and examples in LangSmith
dataset_name = "Lilian Weng Blogs Q&A"
dataset = client.create_dataset(dataset_name=dataset_name)
client.create_examples(
    dataset_id=dataset.id,
    examples=examples
)

LangSmithConflictError: Conflict for /datasets. HTTPError('409 Client Error: Conflict for url: https://api.smith.langchain.com/datasets', '{"detail":"Dataset with this name already exists."}')

In [13]:
from typing_extensions import Annotated, TypedDict
from langchain_groq import ChatGroq

# Grade output schema
class CorrectnessGrade(TypedDict):
    explanation: Annotated[str, ..., "Explain your reasoning for the score"]
    correct: Annotated[bool, ..., "True if the answer is correct, False otherwise."]

# Grade prompt
correctness_instructions = """You are a teacher grading a quiz. 

You will be given a QUESTION, the GROUND TRUTH (correct) ANSWER, and the STUDENT ANSWER. 

Here is the grade criteria to follow:
(1) Grade the student answers based ONLY on their factual accuracy relative to the ground truth answer. 
(2) Ensure that the student answer does not contain any conflicting statements.
(3) It is OK if the student answer contains more information than the ground truth answer, as long as it is factually accurate relative to the  ground truth answer.

Correctness:
A correctness value of True means that the student's answer meets all of the criteria.
A correctness value of False means that the student's answer does not meet all of the criteria.

Explain your reasoning in a step-by-step manner to ensure your reasoning and conclusion are correct. 

Avoid simply stating the correct answer at the outset."""

# Grader LLM
grader_llm = ChatGroq(model="gemma2-9b-it", temperature=0).with_structured_output(CorrectnessGrade, method="json_schema", strict=True)

def correctness(inputs: dict, outputs: dict, reference_outputs: dict) -> bool:
    """An evaluator for RAG answer accuracy"""
    answers = f"""\
QUESTION: {inputs['question']}
GROUND TRUTH ANSWER: {reference_outputs['answer']}
STUDENT ANSWER: {outputs['answer']}"""

    # Run evaluator
    grade = grader_llm.invoke([
        {"role": "system", "content": correctness_instructions}, 
        {"role": "user", "content": answers}
    ])
    return grade["correct"]

In [14]:
from langsmith import Client

def target(inputs: dict) -> dict:
    return rag_bot(inputs["question"])

experiment_results = client.evaluate(
    target,
    data=dataset_name,
    evaluators=[correctness], #groundedness, relevance, retrieval_relevance
    experiment_prefix="rag-doc-relevance",
    metadata={"version": "LCEL context, gemma2-9b-it-preview"},
)
# Explore results locally as a dataframe if you have pandas installed
experiment_results.to_pandas()

View the evaluation results for experiment: 'rag-doc-relevance-64afe964' at:
https://smith.langchain.com/o/b30f5232-cf83-47ae-9e6c-8156860113d4/datasets/b7114e3e-eb43-4b5e-8a0a-41f10b48b3b6/compare?selectedSessions=a1c4318d-cb1a-4f48-8351-a18c5718497b




3it [01:19, 26.46s/it]

KeyboardInterrupt

