In [3]:
from langchain_groq import ChatGroq
from langchain_community.document_loaders import PyPDFLoader,DirectoryLoader
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.retrievers import EnsembleRetriever,ContextualCompressionRetriever
from langchain_community.document_transformers import LongContextReorder,EmbeddingsRedundantFilter
from langchain.retrievers.document_compressors import DocumentCompressorPipeline
from langchain_community.retrievers import BM25Retriever
from langchain_cohere import CohereRerank
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import PromptTemplate
from langchain_community.vectorstores import FAISS
from langchain_core.runnables import RunnableParallel,RunnablePassthrough,RunnableLambda
from langchain_google_genai import GoogleGenerativeAIEmbeddings
from dotenv import load_dotenv
# Read key=value pairs from a local .env file into the process environment.
# NOTE(review): presumably this supplies the Groq, Cohere and Google API keys used below — confirm.
load_dotenv()

True

## Document Loading

In [4]:
# Source PDFs for the two corpora (relative paths).
# NOTE(review): path_to_got / path_to_sh are rebound to FAISS index folders in a later cell.
path_to_got = "data/got/game of thrones.pdf"
path_to_sh = "data/Sherlock Holmes/cano.pdf"
def pdf_loader(path):
    """Read the PDF at ``path`` with PyPDFLoader and return the loaded documents.

    Args:
        path: Location of the PDF file to load.

    Returns:
        Whatever ``PyPDFLoader(path).load()`` produces for that file.
    """
    return PyPDFLoader(path).load()
# Load each book once; the results are what the chunking cell splits.
got_doc = pdf_loader(path=path_to_got)
sh_doc = pdf_loader(path=path_to_sh)
print("Document Loaded")

Document Loaded


## Text Chunking

In [5]:
def text_splitter(doc: list, chunk_size: int = 1500, chunk_overlap: int = 250) -> list:
    """Split loaded documents into overlapping chunks.

    Args:
        doc: The documents to split (for example the list returned by
            ``pdf_loader``); passed straight to ``split_documents``.
        chunk_size: Target maximum size of each chunk, forwarded to
            RecursiveCharacterTextSplitter.
        chunk_overlap: Amount of overlap between neighbouring chunks,
            forwarded to RecursiveCharacterTextSplitter.

    Returns:
        The list of chunked documents produced by ``split_documents``.
    """
    splitter = RecursiveCharacterTextSplitter(
        chunk_size=chunk_size,
        chunk_overlap=chunk_overlap,
    )
    return splitter.split_documents(doc)
# Chunk both corpora and report how many pieces each one produced.
got_chunks = text_splitter(doc=got_doc)
sh_chunks = text_splitter(doc=sh_doc)
print(f"length of game of throne chunks is {len(got_chunks)} and length of SherLock Holmes is {len(sh_chunks)}")

length of game of throne chunks is 1503 and length of SherLock Holmes is 3135


### Embedding Model

In [6]:
def build_embedding_model(model: str = "models/embedding-001"):
    """Create the Google Generative AI embeddings client used by the notebook.

    Args:
        model: Embedding model identifier passed to GoogleGenerativeAIEmbeddings.

    Returns:
        A GoogleGenerativeAIEmbeddings instance configured for ``model``.
    """
    return GoogleGenerativeAIEmbeddings(model=model)


# The factory has its own name so that `embedding_model` always refers to the
# embeddings instance and never to the function that builds it.
embedding_model = build_embedding_model()

### Creating Vector Store (FAISS)

In [9]:
# Folders where the FAISS indexes are persisted.
# NOTE(review): this rebinds path_to_got / path_to_sh, which held the PDF paths in the loading cell.
path_to_got = "./faiss/got"
path_to_sh = "./faiss/sherlock_holmes"

def vector_store_create(chunks):
    """Build a FAISS vector store from document chunks.

    The chunks are embedded with the notebook-level ``embedding_model``.

    Args:
        chunks: Documents to index.

    Returns:
        The store returned by ``FAISS.from_documents``.
    """
    store = FAISS.from_documents(documents=chunks, embedding=embedding_model)
    return store

# Embed both corpora into FAISS indexes.
got_vector_store = vector_store_create(chunks=got_chunks)
sh_vector_store = vector_store_create(chunks=sh_chunks)

# Persist each index to its own folder so it can be reloaded later.
got_vector_store.save_local(folder_path=path_to_got)
sh_vector_store.save_local(folder_path=path_to_sh)

print("vector store created")

vector store created


### Loading Vector Store

In [10]:
# Index folders written by the vector-store creation cell.
path_to_got = "./faiss/got"
path_to_sh = "./faiss/sherlock_holmes"

# SECURITY: allow_dangerous_deserialization=True opts in to loading pickled data,
# so only point these calls at index folders you created yourself.
got_vector_store = FAISS.load_local(
    folder_path=path_to_got,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)
sh_vector_store = FAISS.load_local(
    folder_path=path_to_sh,
    embeddings=embedding_model,
    allow_dangerous_deserialization=True,
)

### Retriever

In [11]:
# Dense (embedding-similarity) retrievers, each returning the top 3 chunks of its book.
got_retriever = got_vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
sh_retriever = sh_vector_store.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 3},
)
# Quick smoke test; as the last expression, its result is shown as the cell output.
got_retriever.invoke("who is jon snow")

[Document(id='99ab10ef-242f-48f7-9078-280023e8079e', metadata={'producer': 'Acrobat Web Capture 7.0', 'creator': 'PyPDF', 'creationdate': '2008-06-25T22:32:27+07:00', 'author': 'George R.R. Martin', 'keywords': 'Book One of A Song of Ice and Fire', 'moddate': '2018-01-12T10:46:54+01:00', 'title': 'A Game of Thrones', 'source': 'data/got/game of thrones.pdf', 'total_pages': 755, 'page': 177, 'page_label': '178'}, page_content='Jon smiled at him. “I’m sorry about your wrist. Robb used the same move on me once, \nonly with a wooden blade. It hurt like seven hells, but yours must be worse. Look, if you \nwant, I can show you how to defend that.”\nAlliser Thorne overheard him. “Lord Snow wants to take my place now.” He sneered. \n“I’d have an easier time teaching a wolf to juggle than you will training this aurochs.”\n“I’ll take that wager, Ser Alliser,” Jon said. “I’d love to see Ghost juggle.”\nJon heard Grenn suck in his breath, shocked. Silence fell.\nThen Tyrion Lannister guffawed. Thr

### Prompt

In [12]:
# Final chain step: turns the chat model's output into a plain string.
parser = StrOutputParser()
# Prompt that restricts the model to the retrieved context and gives it a fixed
# fallback sentence when the context does not contain the answer.
# NOTE(review): the leading indentation inside the triple-quoted template is part
# of the text sent to the model.
prompt = PromptTemplate(
    template="""You are a highly accurate story writter .
        Use ONLY the given context to answer the user's question.
        If the context does not contain the information needed, simply reply:
        "I don't know based on the given context."
        CONTEXT:
        {context}
        QUESTION:
        {question}
        Your Answer:""",
input_variables=["context", "question"])

In [13]:
# Run both dense retrievers on the same query, then concatenate their hits
# (Game of Thrones results first, Sherlock Holmes results second).
# NOTE(review): no later cell shown here references this chain; `retriever_chain`
# is the one wired into the compression retriever.
merger_retrival = (
    RunnableParallel(got_ret=got_retriever, sh_ret=sh_retriever)
    | RunnableLambda(lambda hits: hits["got_ret"] + hits["sh_ret"])
)

### Keyword Retriever (BM25)

In [14]:
# Keyword (BM25) retrievers over the same chunks, each limited to 3 hits.
bm25_retriever_got = BM25Retriever.from_documents(got_chunks, k=3)

# NOTE: despite the "_wot" suffix, this retriever indexes the Sherlock Holmes chunks.
bm25_retriever_wot = BM25Retriever.from_documents(sh_chunks, k=3)

## Ensemble Retriever (Hybrid Retriever)

In [15]:
# Hybrid retrieval per book: combine the BM25 and dense retrievers with equal weights.
got_hybrid = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever_got, got_retriever],
)
sh_hybrid = EnsembleRetriever(
    weights=[0.5, 0.5],
    retrievers=[bm25_retriever_wot, sh_retriever],
)

In [16]:
# Query both hybrid retrievers with the same input and join the two result lists
# (Game of Thrones hits first, then Sherlock Holmes hits).
retriever_chain = (
    RunnableParallel(got=got_hybrid, sh=sh_hybrid)
    | RunnableLambda(lambda hits: hits["got"] + hits["sh"])
)


## Re-ranking (CohereRerank) & Redundant Chunk Filtering

In [17]:
# reranker using cohere
# Stage 1: Cohere reranker — ranks documents by how well they answer the user's question.
reranker = CohereRerank(model="rerank-english-v3.0")
# Stage 2: removes duplicate or highly similar chunks, compared via their embeddings.
# Named `redundant_filter` so the builtin `filter` is not shadowed in the notebook namespace.
redundant_filter = EmbeddingsRedundantFilter(embeddings=embedding_model)
# Stage 3: reorders documents for long contexts so the most relevant ones sit at
# the start and end of the list rather than in the middle.
reordering = LongContextReorder()
# The stages are applied in the order listed.
pipeline = DocumentCompressorPipeline(transformers=[reranker, redundant_filter, reordering])


### ContextualCompressionRetriever

In [18]:
# Wrap the merged hybrid retriever so that every result set is post-processed
# by the rerank / de-duplicate / reorder pipeline.
compression_retriever = ContextualCompressionRetriever(
    base_compressor=pipeline,
    base_retriever=retriever_chain,
)

### Extracting Page Content

In [19]:
def context(document):
    """Join the ``page_content`` of each item in ``document`` into one string.

    Args:
        document: Iterable of objects exposing a ``page_content`` attribute.

    Returns:
        The page contents separated by a blank line, or an empty string when
        ``document`` yields nothing.
    """
    page_texts = [chunk.page_content for chunk in document]
    return "\n\n".join(page_texts)

In [20]:

# Build the prompt inputs from a single query string:
#   context  -> retrieved, compressed documents flattened to text
#   question -> the query itself, passed through unchanged
parallel_chain = RunnableParallel(
    context=compression_retriever | RunnableLambda(context),
    question=RunnablePassthrough(),
)

## Text Generation Model

In [21]:
# Groq-hosted chat model used for answer generation; max_tokens=512 is passed to cap the reply length.
model = ChatGroq(model="gemma2-9b-it",max_tokens=512)

## Final Chain

In [22]:
# End-to-end RAG chain: build {context, question} -> fill the prompt -> call the model -> parse to a string.
final_chain = parallel_chain | prompt | model | parser

In [24]:
from IPython.display import Markdown,display

In [25]:
# Ask a question interactively and render the answer as Markdown.
# NOTE(review): input() blocks during "Restart & Run All", and the saved output
# depends on whatever was typed; consider a fixed query string for reproducibility.
query = input("enter query:")
result = final_chain.invoke(query)
display(Markdown(result))

Jon Snow is a bastard, oathbreaker, motherless, friendless, and damned.  He is the son of Eddard Stark and a woman whose name is never mentioned.  He left his home to join the Night's Watch.  Jon believes that he is better than the other brothers at the Wall, which causes them to hate him. 


In [26]:
# Second interactive question; same three steps as the previous cell.
# NOTE(review): this duplicates the previous cell and overwrites `query` / `result`;
# a small helper function taking the query would avoid the copy-paste.
query = input("enter query:")
result = final_chain.invoke(query)
display(Markdown(result))

He is a detective who is an expert in observation and deduction. He has written monographs on technical subjects.  
