# Retrieval-Augmented Generation with Vector Stores




In [1]:

from langchain_nvidia_ai_endpoints._common import NVEModel

from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

console = Console()
base_style = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=base_style)

from getpass import getpass
import requests
import os

hard_reset = False  ## <-- Set to True if you want to reset your NVIDIA_API_KEY
while "nvapi-" not in os.environ.get("NVIDIA_API_KEY", "") or hard_reset:
    try: 
        assert not hard_reset
        response = requests.get("http://docker_router:8070/get_key").json()
        assert response.get('nvapi_key')
    except: response = {'nvapi_key' : getpass("NVIDIA API Key: ")}
    os.environ["NVIDIA_API_KEY"] = response.get("nvapi_key")
    try: requests.post("http://docker_router:8070/set_key/", json={'nvapi_key' : os.environ["NVIDIA_API_KEY"]}).json()
    except: pass
    hard_reset = False
    if "nvapi-" not in os.environ.get("NVIDIA_API_KEY", ""):
        print("[!] API key assignment failed. Make sure it starts with `nvapi-` as generated from the model pages.")

print(f"Retrieved NVIDIA_API_KEY beginning with \"{os.environ.get('NVIDIA_API_KEY')[:9]}...\"")
from langchain_nvidia_ai_endpoints._common import NVEModel
NVEModel().available_models

Retrieved NVIDIA_API_KEY beginning with "nvapi-Bud..."


{'playground_smaug_72b': '008cff6d-4f4c-4514-b61e-bcfad6ba52a7',
 'playground_gemma_7b': '1361fa56-61d7-4a12-af32-69a3825746fa',
 'ai-recurrentgemma-2b': '2f495340-a99f-4b4b-89bd-1beb003dd896',
 'ai-parakeet-ctc-riva': '22164014-a6cc-4a6f-b048-f3a303e745bb',
 'playground_nemotron_steerlm_8b': '1423ff2f-d1c7-4061-82a7-9e8c67afd43a',
 'playground_yi_34b': '347fa3f3-d675-432c-b844-669ef8ee53df',
 'ai-gemma-2b': '04174188-f742-4069-9e72-d77c2b77d3cb',
 'playground_mamba_chat': '381be320-4721-4664-bd75-58f8783b43c7',
 'ai-embed-qa-4': '09c64e32-2b65-4892-a285-2f585408d118',
 'ai-phi-3-mini': '4a58c6cb-a9b4-4014-99de-3e704d4ae687',
 'ai-microsoft-kosmos-2': '6018fed7-f227-48dc-99bc-3fd4264d5037',
 'playground_kosmos_2': '0bcd1a8c-451f-4b12-b7f0-64b4781190d1',
 'playground_deplot': '3bc390c7-eeec-40f7-a64d-0c6a719985f7',
 'playground_nemotron_qa_8b': '0c60f14d-46cb-465e-b994-227e1c3d5047',
 'ai-vista-3d': '72311276-923f-4478-a506-d5b80914728a',
 'ai-llama2-70b': '2fddadfb-7e76-4c8a-9b82-f7d3f

In [2]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_PROJECT'] = "researchAssistant"


In [3]:
conversation = [  
    "[User]  Hello! My name is Beras, and I'm a big blue bear! Can you please tell me about the rocky mountains?",
    "[Agent] The Rocky Mountains are a beautiful and majestic range of mountains that stretch across North America",
    "[Beras] Wow, that sounds amazing! Ive never been to the Rocky Mountains before, but Ive heard many great things about them.",
    "[Agent] I hope you get to visit them someday, Beras! It would be a great adventure for you!"
    "[Beras] Thank you for the suggestion! Ill definitely keep it in mind for the future.",
    "[Agent] In the meantime, you can learn more about the Rocky Mountains by doing some research online or watching documentaries about them."
    "[Beras] I live in the arctic, so I'm not used to the warm climate there. I was just curious, ya know!",
    "[Agent] Absolutely! Lets continue the conversation and explore more about the Rocky Mountains and their significance!"
]



In [6]:
%%time
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain.vectorstores import FAISS


CPU times: total: 31.2 ms
Wall time: 461 ms


In [None]:
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")

convstore = FAISS.from_texts(conversation, embedder)
retriever = convstore.as_retriever()

In [13]:
pprint(retriever.invoke("What is your name?"))

In [14]:
pprint(retriever.invoke("Where are the ROcky Mountains"))

### **Step 3:** Incorporating Conversation Retrieval Into Our Chain

***always-on RAG formulation***:
- **A retriever is always retrieving context by default**.
- **A generator is acting on the retrieved context**.

In [15]:
from langchain.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda
from langchain.schema.runnable.passthrough import RunnableAssign
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from functools import partial
from operator import itemgetter

In [16]:
def RPting(preface = ""):
    """Simple passthrough "prints, then returns" chain"""
    def print_and_return(x, preface):
        print(f'{preface}{x}') 
        return x
    return RunnableLambda(partial(print_and_return, preface=preface))  

def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str

## Optional; Reorders longer documents to center of output text
long_reorder = RunnableLambda(LongContextReorder().transform_documents)
########################################################################


llm = ChatNVIDIA(model = 'mixtral_8x7b') | StrOutputParser()

context_prompt = ChatPromptTemplate.from_messages([
    ('system',
        "Answer the question using only the context"
        "\n\nQuestion: {question} \n\n Context: {context}"
    ), ('user', "{question}" ),
])

chain = (
    {
        'context': convstore.as_retriever() | long_reorder | docs2str,
        'question': (lambda x:x)
    }
    | context_prompt
    # | RPrint()
    | llm
    | StrOutputParser() 
)

pprint(chain.invoke("Where does Beras live?"))

In [17]:
pprint(chain.invoke("Where are the rocky mountains"))


In [18]:
pprint(chain.invoke("Where are the Rocky Mountains? Are they close to California?"))

In [19]:
pprint(chain.invoke(
    "Where are the Rocky Mountains? Please include"
    " the author's reasoning, but provide more information!"
))

In [20]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

convstore = FAISS.from_texts(conversation, embedding=embedder)

def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([f"User said {d.get('input')}", f"Agent said {d.get('output')}"])
    return d.get('output')

chat_prompt = ChatPromptTemplate.from_messages([
    ('system', "A user has asked a question: {input}\n\n Context: \n{context}\n\n"
    "Please continue the conversation by responding! Keep it brief and conversational!" ),
    ('user', '{input}')
])

conv_chain = ({
    'context': convstore.as_retriever() | long_reorder |docs2str, # population 'context' w8 retriver
    'input' : (lambda x :x)
}
| RunnableAssign({'output' : chat_prompt | llm}) 
| partial(save_memory_and_get_output, vstore=convstore))


In [21]:
pprint(conv_chain.invoke("I'm glad you agree! I can't wait to get some ice cream there! It's such a good food!"))


In [22]:
pprint(conv_chain.invoke("Can you guess what my favorite food is?"))
print()




In [23]:
pprint(conv_chain.invoke("Actually, it's honey! Not sure where you got that idea?"))


In [24]:
pprint(conv_chain.invoke("I see! Fair enough! Do you know my favorite food now?"))

In [25]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import ArxivLoader

In [26]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap = 100,
    separators=["\n\n", "\n", ".", ";", ",", " ", ""],
)

### TODO: Please pick some papers and add them to the list as you'd like
print("Loading Documents")
docs = [
    ArxivLoader(query="1706.03762").load(),  ## Attention Is All You Need Paper
    ArxivLoader(query="1810.04805").load(),  ## BERT Paper
    ArxivLoader(query="2005.11401").load(),  ## RAG Paper
    ArxivLoader(query="2205.00445").load(),  ## MRKL Paper
    ArxivLoader(query="2310.06825").load(),  ## Mistral Paper
    ArxivLoader(query="2306.05685").load(),  ## LLM-as-a-Judge
    ArxivLoader(query="2210.03629").load(),  ## ReAct Paper
    ArxivLoader(query="2112.10752").load(),  ## Latent Stable Diffusion Paper
    ArxivLoader(query="2103.00020").load(),  ## CLIP Paper
]

## Cut the paper short if references is included.
## This is a standard string in papers.
for doc in docs:
    content = doc[0].page_content
    if "References" in content:
        doc[0].page_content = content[:content.index("\nReferences")]

print('chunkin documents')
docs_chunks = [text_splitter.split_documents(doc) for doc in docs]
docs_chunk = [ [c for c in dchunks if len(c.page_content) > 200] for dchunks in docs_chunks  ]


doc_string = 'Available Documents:'
doc_metadata = []

for chunks in docs_chunks:
    metadata = getattr(chunks[0], 'metadata', {})
    doc_string += "\n - " + metadata.get('Title')
    doc_metadata += [str(metadata)]


extra_chunks = [doc_string] + doc_metadata


## Printing out some summary information for reference
pprint(doc_string, '\n')

for i, chunks in enumerate(docs_chunks):
    print(f"Document {i}")
    print(f" - Metadata: {chunks[0].metadata}")
    print(f" - # Chunks: {len(chunks)}")
    print()

print(f"extra_chunks :{extra_chunks}")

Loading Documents
chunkin documents


Document 0
 - Metadata: {'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-Frenc

In [27]:
pip install arxiv

Note: you may need to restart the kernel to use updated packages.


In [28]:
pip install pymupdf




In [30]:

from faiss import IndexFlatL2

from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.prompts import ChatPromptTemplate

embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type=None)


#construct series of document vector stores
print("constructing vector stores")
vecstores = [FAISS.from_texts(extra_chunks, embedder)]

pprint(vecstores)
print("-"*30)
vecstores += [FAISS.from_documents(doc_chunks, embedder) for doc_chunks in docs_chunks]
pprint(vecstores)

constructing vector stores


------------------------------


In [31]:
embed_dims = len(embedder.embed_query("test"))
def default_FAISS():
    '''Useful utility for making an empty FAISS vectorstore'''
    return FAISS(
        embedding_function=embedder,
        index=IndexFlatL2(embed_dims),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )

def aggregate_vstores(vectorstores):
    ## Initialize an empty FAISS Index and merge others into it
    agg_vstore = default_FAISS()
    for vstore in vectorstores:
        agg_vstore.merge_from(vstore)
    return agg_vstore

if 'docstore' not in globals():
    docstore = aggregate_vstores(vecstores)

print(f"Constructed aggregate docstore with {len(docstore.docstore._dict)} chunks")

Constructed aggregate docstore with 543 chunks


In [None]:
from langchain.document_transformers import LongContextReorder
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

import gradio as gr
from functools import partial
from operator import itemgetter


llm = ChatNVIDIA(model="mixtral_8x7b") | StrOutputParser()
convstore = default_FAISS()

def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([
        f"User previously responded with {d.get('input')}",
        f"Agent previously responded with {d.get('output')}"
    ])
    return d.get('output')

initial_msg = (
    "Hello! I am a document chat agent here to help the user!"
    f" I have access to the following documents: {doc_string}\n\nHow can I help you?"
)

chat_prompt = ChatPromptTemplate.from_messages([("system",
    "You are a document chatbot. Help the user as they ask questions about documents."
    " User messaged just asked: {input}\n\n"
    " From this, we have retrieved the following potentially-useful info: "
    " Conversation History Retrieval:\n{history}\n\n"
    " Document Retrieval:\n{context}\n\n"
    " (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)"
), ('user', '{input}')])

################################################################################################

retrieval_chain = (
    {'input' : (lambda x: x)}
    | RunnableAssign({'history' : itemgetter('input') | convstore.as_retriever() | long_reorder | docs2str})
    | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever()  | long_reorder | docs2str})
)

################################################################################################

stream_chain = chat_prompt | llm

def chat_gen(message, history=[], return_buffer=True):
    buffer = ""
    ## First perform the retrieval based on the input message
    retrieval = retrieval_chain.invoke(message)
    line_buffer = ""

    ## stream the results of the stream_chain
    for token in stream_chain.stream(retrieval):
        buffer += token
        if not return_buffer:
            line_buffer += token
            if "\n" in line_buffer:
                line_buffer = ""
            if ((len(line_buffer)>84 and token and token[0] == " ") or len(line_buffer)>100):
                line_buffer = ""
                yield "\n"
                token = "  " + token.lstrip()
        yield buffer if return_buffer else token

    ## save the chat exchange to the conversation memory buffer
    save_memory_and_get_output({'input':  message, 'output': buffer}, convstore)


## Start of Agent Event Loop
test_question = "Tell me about transformer arcitecture!" 

##  make sure  thing works
for response in chat_gen(test_question, return_buffer=False):
    print(response, end='')

In [None]:
chatbot = gr.Chatbot(value = [[None, initial_msg]])
demo = gr.ChatInterface(chat_gen, chatbot=chatbot).queue()

try:
    demo.launch(debug=True, share=True, show_api=False)
    demo.close()
except Exception as e:
    demo.close()
    print(e)
    raise e

In [None]:
## Save and compress your index
docstore.save_local("docstore_index")
!tar czvf docstore_index.tgz docstore_index

!rm -rf docstore_index