In [None]:
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter

In [None]:
# Step 1: Load raw PDF(s)
DATA_PATH = "data/"


def load_pdf_files(data):
    """Load the PDF files found in a directory.

    Args:
        data: Path of the directory to scan. Only entries matching the
            ``*.pdf`` glob are picked up.

    Returns:
        Whatever ``DirectoryLoader.load()`` returns when each matching file
        is parsed with ``PyPDFLoader``.
    """
    pdf_loader = DirectoryLoader(data, glob="*.pdf", loader_cls=PyPDFLoader)
    return pdf_loader.load()


In [None]:
# Load everything under DATA_PATH and report how many Documents came back
# (the label calls them pages -- presumably one Document per PDF page).
documents = load_pdf_files(DATA_PATH)
print("length of PDF pages:", len(documents))

length of PDF pages: 969


In [None]:
# Step 2: Create chunks
def create_chunks(extracted_data, chunk_size=500, chunk_overlap=50):
    """Split loaded documents into overlapping text chunks.

    Args:
        extracted_data: The documents to split, as returned by the loader.
        chunk_size: Maximum chunk size handed to
            ``RecursiveCharacterTextSplitter``. Defaults to 500, the value
            that used to be hard-coded here.
        chunk_overlap: Overlap between consecutive chunks handed to the
            splitter. Defaults to 50, the previously hard-coded value.

    Returns:
        The chunked documents produced by ``split_documents``.
    """
    text_splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return text_splitter.split_documents(extracted_data)

In [None]:
# Split the loaded documents into chunks and report how many were produced.
text_chunks = create_chunks(documents)
print("length of Text Chunks:", len(text_chunks))

length of Text Chunks: 9742


In [None]:
# Step 3: Create Vector Embeddings
from langchain_huggingface import HuggingFaceEmbeddings
def get_embedding_model():
    """Build the embedding model used to vectorize the text chunks.

    Returns:
        A ``HuggingFaceEmbeddings`` instance configured with the
        ``sentence-transformers/all-MiniLM-L6-v2`` model.
    """
    return HuggingFaceEmbeddings(
        model_name="sentence-transformers/all-MiniLM-L6-v2"
    )

In [None]:
# Instantiate the embedding model once; the FAISS cell below consumes it.
embedding_model = get_embedding_model()

  from .autonotebook import tqdm as notebook_tqdm


In [None]:
# Step 4: Store embeddings in FAISS
from langchain_community.vectorstores import FAISS

DB_FAISS_PATH = "vectorstore/db_faiss"

# Embed every chunk into a FAISS index, then persist it to DB_FAISS_PATH so
# later cells can reload it instead of re-embedding.
db = FAISS.from_documents(documents=text_chunks, embedding=embedding_model)
db.save_local(DB_FAISS_PATH)

In [None]:
import os
from langchain_huggingface import HuggingFaceEndpoint
from langchain_core.prompts import PromptTemplate
from langchain.chains import RetrievalQA
from langchain_huggingface import HuggingFaceEmbeddings


In [None]:
# Step 1: Set up the LLM (Mistral served through Hugging Face)
HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"
# Read from the environment so the token never lives in the notebook;
# this is None when the variable is not set.
HUGGINGFACE_API_KEY = os.getenv("HUGGINGFACE_API_KEY")


In [None]:
def load_llm(huggingface_repo_id):
    """Create a Hugging Face endpoint client for the given model.

    Args:
        huggingface_repo_id: Hub repository id of the model to call.

    Returns:
        A ``HuggingFaceEndpoint`` configured with temperature 0.5, a
        512-token generation cap, and the token held in
        ``HUGGINGFACE_API_KEY``.
    """
    # The generation cap and the access token are dedicated fields on
    # HuggingFaceEndpoint. Previously they were tucked into ``model_kwargs``
    # as "max_length": "512" (a string, and not a text-generation parameter
    # name) and "token", which are forwarded as extra call arguments rather
    # than being used to configure the client.
    # NOTE(review): max_new_tokens caps generated tokens only, whereas
    # max_length conventionally includes the prompt -- confirm 512 is still
    # the intended budget.
    return HuggingFaceEndpoint(
        repo_id=huggingface_repo_id,
        temperature=0.5,
        max_new_tokens=512,
        huggingfacehub_api_token=HUGGINGFACE_API_KEY,
    )

In [None]:
# Step 2: Connect the LLM with FAISS and create the chain

DB_FAISS_PATH = "vectorstore/db_faiss"

# Prompt text sent to the LLM. {context} and {question} are the two
# placeholders substituted when the prompt is formatted. Spelling mistakes
# in the instructions ("uwer's", "dont", "anythin") are corrected so the
# model receives clean instructions.
CUSTOM_PROMPT_TEMPLATE = """
Use the pieces of information provided in the context to answer user's question.
If you don't know the answer, just say that you don't know, don't try to make up an answer.
Don't provide anything out of the given context
context:{context}
Question:{question}
Start the answer directly. No small talk please.

"""

In [None]:
def set_custom_prompt(custom_prompt_template):
    """Wrap a template string in a ``PromptTemplate``.

    Args:
        custom_prompt_template: Template text containing the ``{context}``
            and ``{question}`` placeholders.

    Returns:
        A ``PromptTemplate`` declaring ``context`` and ``question`` as its
        input variables.
    """
    # The field is ``input_variables`` (plural). The singular spelling used
    # previously is not a PromptTemplate field, so the declared variable
    # list never reached the template object.
    return PromptTemplate(
        template=custom_prompt_template,
        input_variables=["context", "question"],
    )

In [None]:
# Load database
# Reload the persisted index using the same embedding model name that was
# used when the index was built.
# NOTE(review): allow_dangerous_deserialization=True opts in to loading
# serialized (pickle-based) data from disk -- only keep it for an index this
# notebook wrote itself, never for files from an untrusted source.
embedding_model = HuggingFaceEmbeddings(
    model_name="sentence-transformers/all-MiniLM-L6-v2"
)
db = FAISS.load_local(
    DB_FAISS_PATH,
    embedding_model,
    allow_dangerous_deserialization=True,
)


In [None]:
# Create QA chain
HUGGINGFACE_REPO_ID = "mistralai/Mistral-7B-Instruct-v0.3"

qa_chain = RetrievalQA.from_chain_type(
    llm=load_llm(HUGGINGFACE_REPO_ID),
    chain_type="stuff",
    # Ask the retriever for 3 chunks per query.
    retriever=db.as_retriever(search_kwargs={"k": 3}),
    # Keep the retrieved chunks in the response alongside the answer.
    return_source_documents=True,
    chain_type_kwargs={"prompt": set_custom_prompt(CUSTOM_PROMPT_TEMPLATE)},
)

In [None]:
# Invoke with a single query
user_query = "how to differentiate coliform from other Gram negative bacteria?"
response = qa_chain.invoke({"query": user_query})
# The chain's answer text is stored under the "result" key.
print("RESULT:", response["result"])
# print("SOURCE DOCUMENTS:",response["source_documents"])




RESULT: Coliform bacteria can be differentiated from other Gram-negative bacteria based on their ability to ferment lactose and produce gas. This is tested using a medium called MacConkey agar, which selects for lactose-fermenting bacteria and inhibits the growth of other bacteria. If a colony on MacConkey agar appears red, it indicates the presence of lactose-fermenting bacteria, which are likely coliform bacteria. However, it's important to note that not all coliform bacteria produce gas, so this test should be used in combination with other tests for accurate identification.
