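# Corrective RAG

This notebook loads the PDFs in `Data/`, embeds their chunks with a BGE model into a Chroma vector store, and uses a local Mistral model (via Ollama) to grade whether retrieved chunks are relevant to a question.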

### Importing necessary libraries 

In [2]:
from pathlib import Path

import torch
from langchain.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.vectorstores import Chroma

In [3]:
# Report CUDA visibility, then choose the device string used by later cells.
cuda_available = torch.cuda.is_available()
separator = "*" * 100

print(cuda_available)
print(separator)

if cuda_available:
    DEVICE = "cuda"
else:
    DEVICE = "cpu"

print(f"Using torch {torch.__version__} ({DEVICE})")

True
****************************************************************************************************
Using torch 2.1.2+cu118 (cuda)


In [4]:
# Embedding model used for both indexing and querying.
model_name = "BAAI/bge-small-en-v1.5"

# Run the model on the device selected earlier (DEVICE) instead of a
# hard-coded 'cuda', so this cell also works on machines without a GPU.
model_kwargs = {'device': DEVICE}

# Ask for normalised embeddings.
encode_kwargs = {'normalize_embeddings': True}

embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)

  from .autonotebook import tqdm as notebook_tqdm
  return self.fget.__get__(instance, owner)()


In [5]:
# Load every file matching *.pdf in the Data directory; each match is parsed
# with PyPDFLoader.
pdf_loader_options = {
    "glob": "*.pdf",
    "loader_cls": PyPDFLoader,
}
loader = DirectoryLoader("Data", **pdf_loader_options)

documents = loader.load()

In [6]:
# Sanity check: how many Document objects the loader returned.
len(documents)

224

In [7]:
# Distinct 'source' metadata values across all loaded documents; documents
# without a 'source' key are ignored.
# Sorted so the listing is identical on every run: iteration order of a set of
# strings can change between kernel restarts.
unique_sources = sorted(
    {doc.metadata['source'] for doc in documents if 'source' in doc.metadata}
)

In [8]:
# Show how many distinct source files were loaded, followed by their paths.
source_count = len(unique_sources)
print("Number of unique sources are : ", source_count)
print("Unique sources:", unique_sources)

Number of unique sources are :  9
Unique sources: ['Data\\lebs106.pdf', 'Data\\lebs108.pdf', 'Data\\lebs104.pdf', 'Data\\lebs1ps.pdf', 'Data\\lebs101.pdf', 'Data\\lebs107.pdf', 'Data\\lebs102.pdf', 'Data\\lebs105.pdf', 'Data\\lebs103.pdf']


In [9]:
# Split the loaded documents into chunks (chunk_size=1000, chunk_overlap=100).
text_splitter = RecursiveCharacterTextSplitter(chunk_size=1000, chunk_overlap=100)
texts = text_splitter.split_documents(documents=documents)

PERSIST_DIRECTORY = "stores/data_cosine"
COLLECTION_METADATA = {"hnsw:space": "cosine"}

# Re-running Chroma.from_documents against an existing persist directory adds
# every chunk again, so repeated runs fill the collection with duplicates.
# Reuse the persisted store when it is already populated and only build it on
# the first run. Delete the directory to force a rebuild after changing the
# PDFs, the embedding model or the chunking parameters.
persist_path = Path(PERSIST_DIRECTORY)
if persist_path.is_dir() and any(persist_path.iterdir()):
    vector_store = Chroma(
        persist_directory=PERSIST_DIRECTORY,
        embedding_function=embeddings,
        collection_metadata=COLLECTION_METADATA,
    )
else:
    vector_store = Chroma.from_documents(
        texts,
        embeddings,
        collection_metadata=COLLECTION_METADATA,
        persist_directory=PERSIST_DIRECTORY,
    )

print("*"*100)
print("Chroma Vectore Store Created: " , vector_store)
print("*"*100)


****************************************************************************************************
Chroma Vectore Store Created:  <langchain_community.vectorstores.chroma.Chroma object at 0x000001B2681A8700>
****************************************************************************************************


In [10]:
# Wrap the vector store as a retriever that asks for k=2 results per query.
retriever_search_kwargs = {"k": 2}
retriever = vector_store.as_retriever(search_kwargs=retriever_search_kwargs)

In [11]:
# Try the retriever on a sample query; the cell displays the returned documents.
sample_query = "principles of management"
retriever.invoke(sample_query)

[Document(metadata={'page': 34, 'source': 'Data\\lebs102.pdf'}, page_content='Principles of management are general guidelines, which can be used for conduct in work places under certain situations. They help managers to take and implement decisions.NatureThe nature of management principles can be discussed under the heads- formed by practice; general guidelines; universal; flexible; behavioural; contingent; and cause and effect relationship SignificanceProper understanding of significance of management principles is essential to make sound decisions by managers. The significance can be discussed under the following heads- Increase in efficiency; Optimum utilisation of resources; Scientific decision making; Adaptation to changing environment; Fulfilling social responsibilities; Proper research and development; Training managers; and Effective administration.Scientific ManagementTaylor’s principles of scientific management are — Science, not the rule of thumb; Harmony not discord; Cooper

In [12]:
from langchain_community.chat_models import ChatOllama
# Chat model used by the grader: Ollama's 'mistral' with JSON output format
# and temperature 0.
ollama_settings = {
    "model": "mistral",
    "format": "json",
    "temperature": 0,
}
llm = ChatOllama(**ollama_settings)

In [13]:
from langchain.prompts import PromptTemplate
from langchain_core.output_parsers import JsonOutputParser
from langchain_core.output_parsers import StrOutputParser

In [14]:
### Retrieval Grader

# Prompt for grading whether one retrieved chunk is relevant to a question.
# Inputs: {question} (the user query) and {documents} (the text being graded).
# The rubric and the output instruction both use 1/0: mixing a 1/0 rubric with
# a 'yes'/'no' output request leaves the type of 'score' up to the model.
# The reasoning instruction is kept internal because the reply must be a JSON
# object containing only 'score'.
# NOTE(review): any code consuming the result should compare score against
# 1/0, not 'yes'/'no' — confirm against later cells.
prompt = PromptTemplate(
    template="""You are a teacher grading a quiz. You will be given:
    1/ a QUESTION
    2/ A FACT provided by the student

    You are grading RELEVANCE RECALL:
    A score of 1 means that ANY of the statements in the FACT are relevant to the QUESTION.
    A score of 0 means that NONE of the statements in the FACT are relevant to the QUESTION.
    1 is the highest (best) score. 0 is the lowest score you can give.

    Reason step by step before deciding on the score, and ensure your conclusion is correct.
    Do not include that reasoning in your answer.

    Question: {question} \n
    Fact: \n\n {documents} \n\n

    Give a binary score, 1 or 0, to indicate whether the FACT is relevant to the QUESTION. \n
    Provide the binary score as a JSON with a single key 'score' and no preamble or explanation.
    """,
    input_variables=["question", "documents"],
)

# Chain: fill the prompt, call the chat model, parse its reply as JSON.
retrieval_grader = prompt | llm | JsonOutputParser()
# Smoke test: grade the second retrieved chunk for a sample question.
question = "recruitment process"
docs = retriever.invoke(question)
doc_txt = docs[1].page_content

grader_inputs = {"question": question, "documents": doc_txt}
grade = retrieval_grader.invoke(grader_inputs)
print(grade)

{'score': 1}
