In [1]:
from langchain.document_loaders import PyPDFLoader , Docx2txtLoader , CSVLoader
from langchain.vectorstores import LanceDB
from langchain.retrievers import BM25Retriever , EnsembleRetriever
from langchain.embeddings import OllamaEmbeddings
import os


In [15]:
# Load every supported document that sits directly in the current working
# directory into one flat list of LangChain documents.
curr_dir = os.getcwd()
combined_docs = []

# Maps a lower-cased file extension to the loader class that handles it.
loader_by_ext = {
    ".pdf": PyPDFLoader,
    ".docx": Docx2txtLoader,
    ".csv": CSVLoader,
}

# os.listdir() returns entries in arbitrary order; sorting makes the content
# of `combined_docs` (and any later truncation of it) reproducible.
for path in sorted(os.listdir(curr_dir)):
    # Sub-directories (e.g. checkpoint folders) are not documents: skip them
    # instead of reporting an empty "unsupported" extension.
    if not os.path.isfile(os.path.join(curr_dir, path)):
        continue

    # Compare case-insensitively so "REPORT.PDF" is treated like "report.pdf".
    file_fmt = os.path.splitext(path)[1].lower()
    loader_cls = loader_by_ext.get(file_fmt)
    if loader_cls is None:
        print(f"Unsupported Format : {file_fmt}")
        continue

    # `path` is relative to the working directory, which is `curr_dir`.
    loader = loader_cls(path)
    combined_docs.extend(loader.load())


Unsupported Format : .ipynb


I'm using only the first 100 documents for convenience.

In [22]:
# Keep only the first MAX_DOCS loaded documents so the later splitting and
# embedding steps have less to process.
MAX_DOCS = 100
combined_docs = combined_docs[:MAX_DOCS]

In [23]:
from langchain.text_splitter import RecursiveCharacterTextSplitter

# Break the loaded documents into overlapping chunks before indexing them.
splitter_options = {
    "chunk_size": 1000,    # upper bound on the size of a chunk
    "chunk_overlap": 200,  # amount shared between neighbouring chunks
}
text_splitter = RecursiveCharacterTextSplitter(**splitter_options)
splitted_docs = text_splitter.split_documents(combined_docs)

In [24]:
# Embed every chunk with the Ollama-served embedding model and store the
# resulting vectors in a LanceDB vector store.
embedding_model = OllamaEmbeddings(model="nomic-embed-text:latest")
vectordb = LanceDB.from_documents(
    documents=splitted_docs,
    embedding=embedding_model,
)

In [25]:
# Lexical (BM25) retriever over the same chunks, configured to return 4 hits.
LEXICAL_TOP_K = 4
lexicaldb = BM25Retriever.from_documents(documents=splitted_docs)
lexicaldb.k = LEXICAL_TOP_K

# Dense retriever backed by the vector store, using its default settings.
vector_retriever = vectordb.as_retriever()

In [26]:
# Hybrid search: combine both retrievers, weighting the vector retriever at
# 0.6 and the BM25 retriever at 0.4.
hybrid_search = EnsembleRetriever(
    retrievers=[vector_retriever, lexicaldb],
    weights=[0.6, 0.4],
)

In [40]:
from langchain_core.prompts import ChatPromptTemplate

# Single-line prompt with two placeholders: {context} and {input}.
# NOTE(review): the wording is reproduced verbatim (typos and spacing
# included) so the text sent to the model is unchanged; consider proofreading.
prompt = ChatPromptTemplate.from_template(
    "Your are a personal assistant based on the user questions , "
    "Answer in a clear and concise manner and also make sure to use the terminologies  associated with it . "
    "Use the give context to enhance your understanding on specific topics. "
    "<context> {context} </context> "
    "<question> {input} </question> "
)

In [41]:
from langchain_community.llms import ollama

# Model tag passed to the Ollama LLM wrapper that generates the answers.
LLM_MODEL = "llama3.2:1b-instruct-q2_K"
llm = ollama.Ollama(model=LLM_MODEL)


In [42]:
from langchain.chains.combine_documents import create_stuff_documents_chain
from langchain.chains.retrieval import create_retrieval_chain

# Step 1: a chain built from the LLM and the prompt template.
combining_chain = create_stuff_documents_chain(llm, prompt)

# Step 2: put the hybrid retriever in front of that chain.
retrieval_chain = create_retrieval_chain(
    retriever=hybrid_search,
    combine_docs_chain=combining_chain,
)


In [47]:
# Run the retrieval chain on one question; the chain reads it from the
# "input" key.
question = "What is the top llm in 2024"
answer = retrieval_chain.invoke({"input": question})

In [None]:
# Display the "answer" entry of the result returned by retrieval_chain.invoke
# (presumably the generated reply text — confirm against the chain's output).
answer["answer"]