In [11]:
import os
import torch
from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain.document_loaders import PyPDFLoader
from langchain.schema import Document
from langchain.vectorstores import Chroma
from langchain.retrievers import ParentDocumentRetriever
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.storage import InMemoryStore
from langchain.document_loaders import TextLoader
from langchain.chains import RetrievalQA
from langchain import PromptTemplate, LLMChain
from langchain.llms import CTransformers

In [3]:
# Pick the GPU when one is visible to torch, otherwise fall back to the CPU.
# (The previous bare `torch.cuda.is_available()` statement was dropped: its
# result was discarded and never displayed.)
DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using torch {torch.__version__} ({DEVICE})")

Using torch 2.1.2+cu118 (cuda)


In [4]:
# Embedding model used for both the parent/child index and the per-query index.
model_name = "BAAI/bge-small-en-v1.5"
encode_kwargs = {'normalize_embeddings': True}

# Run the encoder on the device detected earlier (DEVICE) instead of assuming
# a GPU is present, so the notebook also runs on CPU-only machines.
bge_embeddings = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs={'device': DEVICE},
    encode_kwargs=encode_kwargs
)

  from .autonotebook import tqdm as notebook_tqdm


In [5]:
# Location of the source PDF. Set the RAFT_PDF_PATH environment variable to
# point the notebook at a different file or machine; when it is unset the
# original hard-coded location is used unchanged.
pdf_path = os.environ.get(
    "RAFT_PDF_PATH",
    r"C:\\Users\\ishaan.kohli\\Downloads\\RAFT.pdf",
)

loaders = [
    PyPDFLoader(pdf_path)

]

# Collect the documents produced by every loader into a single flat list.
docs = []
for loader in loaders:
    docs.extend(loader.load())

In [6]:
# Two splitters with different chunk sizes: 2000-character "parent" chunks and
# 400-character "child" chunks.
parent_splitter = RecursiveCharacterTextSplitter(chunk_size=2000)
child_splitter = RecursiveCharacterTextSplitter(chunk_size=400)

# Chroma collection configured for cosine distance and persisted on disk.
# NOTE(review): this collection has a persist_directory while `store` below is
# an InMemoryStore — presumably the two can drift apart across kernel
# restarts; confirm whether the directory should be cleared between runs.
vectorstore = Chroma(
    collection_name="split_parents",
    embedding_function=bge_embeddings,
    collection_metadata={"hnsw:space": "cosine"},
    persist_directory="stores/PC_memory",
)

# Document store handed to the retriever as its `docstore`.
store = InMemoryStore()

In [7]:
# Wire the two splitters, the vector store and the docstore into one retriever.
big_chunks_retriever = ParentDocumentRetriever(
    parent_splitter=parent_splitter,
    child_splitter=child_splitter,
    vectorstore=vectorstore,
    docstore=store,
)

In [8]:
# Index the loaded PDF pages through the parent/child retriever.
big_chunks_retriever.add_documents(documents=docs)

In [12]:
# Path to the local GGUF model file.
local_llm = r"D://LLM_Models//zephyr-7b-beta.Q5_K_S.gguf"

# Generation settings for the model.
config = {
    'max_new_tokens': 1024,
    'repetition_penalty': 1.1,
    'temperature': 0.1,
    'top_k': 50,
    'top_p': 0.9,
    'stream': True,
    # Use half of the available cores, but never fewer than one thread:
    # os.cpu_count() can return None, and halving a single core would give 0.
    'threads': max(1, (os.cpu_count() or 1) // 2),
}

# The settings must be handed over through the wrapper's `config` field.
# Spreading them as `**config` passes them as unknown constructor fields, so
# they never reach the underlying model.
llm = CTransformers(
    model=local_llm,
    model_type="mistral",
    lib="avx2",  # for CPU use
    config=config,
)

print("LLM Initialized...")

LLM Initialized...


In [10]:
# Prompt for the QA chain, built line by line. `{context}` and `{question}`
# are the placeholders filled in by the chain; the trailing empty entry keeps
# the final newline after "Helpful answer:".
prompt_template = "\n".join([
    "Use the following pieces of information to answer the user's question.",
    "If you don't know the answer, just say that you don't know, don't try to make up an answer.",
    "",
    "Context: {context}",
    "Question: {question}",
    "",
    "Only return the helpful answer below and nothing else.",
    "Helpful answer:",
    "",
])

In [13]:
def generate_response(text_input):
    """Answer a question using the indexed documents and the local LLM.

    Steps:
      1. Fetch the relevant documents for the question from
         ``big_chunks_retriever``.
      2. Re-split them into 500-character chunks (50 overlap) and index those
         chunks in a scratch Chroma collection created for this call only.
      3. Run a "stuff" RetrievalQA chain over the top 2 chunks with
         ``prompt_template``.

    Args:
        text_input: The user's question.

    Returns:
        The mapping returned by the RetrievalQA chain; it is also printed.
        With ``return_source_documents=True`` it carries the ``query``,
        ``result`` and ``source_documents`` entries.
    """
    import uuid  # local import: only needed to name the scratch collection

    query = text_input

    retrieved_docs = big_chunks_retriever.get_relevant_documents(query)

    text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50)
    texts = text_splitter.split_documents(retrieved_docs)

    # Each call gets its own uniquely named collection. Previously every call
    # appended to the same persisted collection, so chunks from earlier
    # questions (and duplicates of the current ones) leaked into retrieval.
    # The store is also used directly rather than re-opened through a second,
    # Windows-only spelling of the directory path.
    vector_store = Chroma.from_documents(
        texts,
        bge_embeddings,
        collection_name=f"relevant_{uuid.uuid4().hex}",
        collection_metadata={"hnsw:space": "cosine"},
    )

    try:
        prompt = PromptTemplate(template=prompt_template, input_variables=['context', 'question'])

        retriever = vector_store.as_retriever(search_kwargs={"k": 2})

        chain_type_kwargs = {"prompt": prompt}

        qa = RetrievalQA.from_chain_type(
            llm=llm,
            chain_type="stuff",
            retriever=retriever,
            return_source_documents=True,
            chain_type_kwargs=chain_type_kwargs,
            verbose=True,
        )

        final_sol = qa(query)
    finally:
        # Drop the scratch collection so repeated calls do not pile up data.
        vector_store.delete_collection()

    print(final_sol)

    return final_sol

In [14]:
# Ask one question end to end and keep the chain's response for later cells.
# NOTE(review): binding the result to `input` shadows Python's builtin
# `input()` for the rest of the kernel session; a later cell reads this name,
# so renaming it needs to be done in both cells together.
input = generate_response("What are the benifits of RAFT?")

  warn_deprecated(
  warn_deprecated(
  warn_deprecated(
  warn_deprecated(




[1m> Entering new RetrievalQA chain...[0m

[1m> Finished chain.[0m
{'query': 'What are the benifits of RAFT?', 'result': "RAFT (Reference Attentive Fusion Transformer) is a technique that enables models to learn domain-specific knowledge through fine-tuning while ensuring robustness against inaccurate document retrievals by understanding the dynamics between the question, the retrieved documents, and the appropriate answer. RAFT trains the model to ignore irrelevant or distracting documents and cites verbatim the relevant sequence from the retrieved documents that would help answer the question, coupled with its chain-of-thought-style response to improve the model's ability to reason consistently across domain specific benchmark datasets like RAG.", 'source_documents': [Document(page_content='performance. RAFT aims to not only enable models to learn\ndomain specific knowledge through fine-tuning, but also\nto ensure robustness against inaccurate retrievals. This is\nachieved by t

In [15]:
# Show only the generated answer text from the stored response.
print(input["result"])

RAFT (Reference Attentive Fusion Transformer) is a technique that enables models to learn domain-specific knowledge through fine-tuning while ensuring robustness against inaccurate document retrievals by understanding the dynamics between the question, the retrieved documents, and the appropriate answer. RAFT trains the model to ignore irrelevant or distracting documents and cites verbatim the relevant sequence from the retrieved documents that would help answer the question, coupled with its chain-of-thought-style response to improve the model's ability to reason consistently across domain specific benchmark datasets like RAG.
