# Retrieval-Augmented Generation with Vector Stores




In [8]:
%%capture
## ^^ Comment out if you want to see the pip install process

## Necessary for Colab, not necessary for course environment
# %pip install -q langchain langchain-nvidia-ai-endpoints gradio rich
# %pip install -q arxiv pymupdf faiss-cpu
    
## If you're in colab and encounter a typing-extensions issue,
##  restart your runtime and try again
from langchain_nvidia_ai_endpoints._common import NVEModel

from functools import partial
from rich.console import Console
from rich.style import Style
from rich.theme import Theme

console = Console()
base_style = Style(color="#76B900", bold=True)
pprint = partial(console.print, style=base_style)

from getpass import getpass
import requests
import os

hard_reset = False  ## <-- Set to True if you want to reset your NVIDIA_API_KEY
while "nvapi-" not in os.environ.get("NVIDIA_API_KEY", "") or hard_reset:
    try: 
        assert not hard_reset
        response = requests.get("http://docker_router:8070/get_key").json()
        assert response.get('nvapi_key')
    except: response = {'nvapi_key' : getpass("NVIDIA API Key: ")}
    os.environ["NVIDIA_API_KEY"] = response.get("nvapi_key")
    try: requests.post("http://docker_router:8070/set_key/", json={'nvapi_key' : os.environ["NVIDIA_API_KEY"]}).json()
    except: pass
    hard_reset = False
    if "nvapi-" not in os.environ.get("NVIDIA_API_KEY", ""):
        print("[!] API key assignment failed. Make sure it starts with `nvapi-` as generated from the model pages.")

print(f"Retrieved NVIDIA_API_KEY beginning with \"{os.environ.get('NVIDIA_API_KEY')[:9]}...\"")
from langchain_nvidia_ai_endpoints._common import NVEModel
NVEModel().available_models

In [9]:
os.environ['LANGCHAIN_TRACING_V2'] = 'true'
os.environ['LANGCHAIN_ENDPOINT'] = 'https://api.smith.langchain.com'
os.environ['LANGCHAIN_API_KEY'] = 'ls__ae038ddbc62c4af090ecc776a2e1254b'
os.environ['LANGCHAIN_PROJECT'] = "researchAssistant"


In [10]:
conversation = [  ## This conversation was generated partially by an AI system, and modified to exhibit desirable properties
    "[User]  Hello! My name is Beras, and I'm a big blue bear! Can you please tell me about the rocky mountains?",
    "[Agent] The Rocky Mountains are a beautiful and majestic range of mountains that stretch across North America",
    "[Beras] Wow, that sounds amazing! Ive never been to the Rocky Mountains before, but Ive heard many great things about them.",
    "[Agent] I hope you get to visit them someday, Beras! It would be a great adventure for you!"
    "[Beras] Thank you for the suggestion! Ill definitely keep it in mind for the future.",
    "[Agent] In the meantime, you can learn more about the Rocky Mountains by doing some research online or watching documentaries about them."
    "[Beras] I live in the arctic, so I'm not used to the warm climate there. I was just curious, ya know!",
    "[Agent] Absolutely! Lets continue the conversation and explore more about the Rocky Mountains and their significance!"
]



In [11]:
%%time
## ^^ This cell will be timed to see how long the conversation embedding takes
from langchain_nvidia_ai_endpoints import NVIDIAEmbeddings
from langchain.vectorstores import FAISS


CPU times: total: 0 ns
Wall time: 16.4 ms


In [12]:
embedder = NVIDIAEmbeddings(model="nvolveqa_40k")

convstore = FAISS.from_texts(conversation, embedder)
retriever = convstore.as_retriever()

In [13]:
pprint(retriever.invoke("What is your name?"))

In [14]:
pprint(retriever.invoke("Where are the ROcky Mountains"))

### **Step 3:** Incorporating Conversation Retrieval Into Our Chain

Now that we have our loaded retriever component as a chain, we can incorporate it into our existing chat system as before. Specifically, we can start with an ***always-on RAG formulation*** where:
- **A retriever is always retrieving context by default**.
- **A generator is acting on the retrieved context**.

In [15]:
from langchain.document_transformers import LongContextReorder
from langchain.prompts import ChatPromptTemplate
from langchain.schema.output_parser import StrOutputParser
from langchain.schema.runnable import RunnableLambda
from langchain.schema.runnable.passthrough import RunnableAssign
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from functools import partial
from operator import itemgetter

In [16]:
def RPting(preface = ""):
    """Simple passthrough "prints, then returns" chain"""
    def print_and_return(x, preface):
        print(f'{preface}{x}') 
        return x
    return RunnableLambda(partial(print_and_return, preface=preface))  

def docs2str(docs, title="Document"):
    """Useful utility for making chunks into context string. Optional, but useful"""
    out_str = ""
    for doc in docs:
        doc_name = getattr(doc, 'metadata', {}).get('Title', title)
        if doc_name:
            out_str += f"[Quote from {doc_name}] "
        out_str += getattr(doc, 'page_content', str(doc)) + "\n"
    return out_str

## Optional; Reorders longer documents to center of output text
long_reorder = RunnableLambda(LongContextReorder().transform_documents)
########################################################################


llm = ChatNVIDIA(model = 'mixtral_8x7b') | StrOutputParser()

context_prompt = ChatPromptTemplate.from_messages([
    ('system',
        "Answer the question using only the context"
        "\n\nQuestion: {question} \n\n Context: {context}"
    ), ('user', "{question}" ),
])

chain = (
    {
        'context': convstore.as_retriever() | long_reorder | docs2str,
        'question': (lambda x:x)
    }
    | context_prompt
    # | RPrint()
    | llm
    | StrOutputParser() 
)

pprint(chain.invoke("Where does Beras live?"))

In [17]:
pprint(chain.invoke("Where are the rocky mountains"))


In [18]:
pprint(chain.invoke("Where are the Rocky Mountains? Are they close to California?"))

In [19]:
pprint(chain.invoke(
    "Where are the Rocky Mountains? Please include"
    " the author's reasoning, but provide more information!"
))

In [20]:
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser
from operator import itemgetter

convstore = FAISS.from_texts(conversation, embedding=embedder)

def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([f"User said {d.get('input')}", f"Agent said {d.get('output')}"])
    return d.get('output')

chat_prompt = ChatPromptTemplate.from_messages([
    ('system', "A user has asked a question: {input}\n\n Context: \n{context}\n\n"
    "Please continue the conversation by responding! Keep it brief and conversational!" ),
    ('user', '{input}')
])

conv_chain = ({
    'context': convstore.as_retriever() | long_reorder |docs2str, # population 'context' w8 retriver
    'input' : (lambda x :x)
}
| RunnableAssign({'output' : chat_prompt | llm}) 
| partial(save_memory_and_get_output, vstore=convstore))


In [21]:
pprint(conv_chain.invoke("I'm glad you agree! I can't wait to get some ice cream there! It's such a good food!"))


In [22]:
pprint(conv_chain.invoke("Can you guess what my favorite food is?"))
print()




In [23]:
pprint(conv_chain.invoke("Actually, it's honey! Not sure where you got that idea?"))


In [24]:
pprint(conv_chain.invoke("I see! Fair enough! Do you know my favorite food now?"))

In [25]:
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_community.vectorstores import FAISS
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.document_loaders import ArxivLoader

In [26]:
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1000, chunk_overlap = 100,
    separators=["\n\n", "\n", ".", ";", ",", " ", ""],
)

### TODO: Please pick some papers and add them to the list as you'd like
print("Loading Documents")
docs = [
    ArxivLoader(query="1706.03762").load(),  ## Attention Is All You Need Paper
    ArxivLoader(query="1810.04805").load(),  ## BERT Paper
    ArxivLoader(query="2005.11401").load(),  ## RAG Paper
    ArxivLoader(query="2205.00445").load(),  ## MRKL Paper
    ArxivLoader(query="2310.06825").load(),  ## Mistral Paper
    ArxivLoader(query="2306.05685").load(),  ## LLM-as-a-Judge
    ## Some longer papers
    ArxivLoader(query="2210.03629").load(),  ## ReAct Paper
    ArxivLoader(query="2112.10752").load(),  ## Latent Stable Diffusion Paper
    ArxivLoader(query="2103.00020").load(),  ## CLIP Paper
    ## TODO: Feel free to add more
]

## Cut the paper short if references is included.
## This is a standard string in papers.
for doc in docs:
    content = doc[0].page_content
    if "References" in content:
        doc[0].page_content = content[:content.index("\nReferences")]

print('chunkin documents')
docs_chunks = [text_splitter.split_documents(doc) for doc in docs]
docs_chunk = [ [c for c in dchunks if len(c.page_content) > 200] for dchunks in docs_chunks  ]


doc_string = 'Available Documents:'
doc_metadata = []

for chunks in docs_chunks:
    metadata = getattr(chunks[0], 'metadata', {})
    doc_string += "\n - " + metadata.get('Title')
    doc_metadata += [str(metadata)]


extra_chunks = [doc_string] + doc_metadata


## Printing out some summary information for reference
pprint(doc_string, '\n')

for i, chunks in enumerate(docs_chunks):
    print(f"Document {i}")
    print(f" - Metadata: {chunks[0].metadata}")
    print(f" - # Chunks: {len(chunks)}")
    print()

print(f"extra_chunks :{extra_chunks}")

Loading Documents
chunkin documents


Document 0
 - Metadata: {'Published': '2023-08-02', 'Title': 'Attention Is All You Need', 'Authors': 'Ashish Vaswani, Noam Shazeer, Niki Parmar, Jakob Uszkoreit, Llion Jones, Aidan N. Gomez, Lukasz Kaiser, Illia Polosukhin', 'Summary': 'The dominant sequence transduction models are based on complex recurrent or\nconvolutional neural networks in an encoder-decoder configuration. The best\nperforming models also connect the encoder and decoder through an attention\nmechanism. We propose a new simple network architecture, the Transformer, based\nsolely on attention mechanisms, dispensing with recurrence and convolutions\nentirely. Experiments on two machine translation tasks show these models to be\nsuperior in quality while being more parallelizable and requiring significantly\nless time to train. Our model achieves 28.4 BLEU on the WMT 2014\nEnglish-to-German translation task, improving over the existing best results,\nincluding ensembles by over 2 BLEU. On the WMT 2014 English-to-Frenc

In [27]:
pip install arxiv

Note: you may need to restart the kernel to use updated packages.


In [28]:
pip install pymupdf




In [29]:
?getattr

[1;31mDocstring:[0m
getattr(object, name[, default]) -> value

Get a named attribute from an object; getattr(x, 'y') is equivalent to x.y.
When a default argument is given, it is returned when the attribute doesn't
exist; without it, an exception is raised in that case.
[1;31mType:[0m      builtin_function_or_method

In [30]:


from faiss import IndexFlatL2

from langchain_community.docstore.in_memory import InMemoryDocstore
from langchain_core.prompts import ChatPromptTemplate

embedder = NVIDIAEmbeddings(model="nvolveqa_40k", model_type=None)


#construct series of document vector stores
print("constructing vector stores")
vecstores = [FAISS.from_texts(extra_chunks, embedder)]

pprint(vecstores)
print("-"*30)
vecstores += [FAISS.from_documents(doc_chunks, embedder) for doc_chunks in docs_chunks]
pprint(vecstores)

constructing vector stores


------------------------------


In [31]:
embed_dims = len(embedder.embed_query("test"))
def default_FAISS():
    '''Useful utility for making an empty FAISS vectorstore'''
    return FAISS(
        embedding_function=embedder,
        index=IndexFlatL2(embed_dims),
        docstore=InMemoryDocstore(),
        index_to_docstore_id={},
        normalize_L2=False
    )

def aggregate_vstores(vectorstores):
    ## Initialize an empty FAISS Index and merge others into it
    ## We'll use default_faiss for simplicity, though it's tied to your embedder by reference
    agg_vstore = default_FAISS()
    for vstore in vectorstores:
        agg_vstore.merge_from(vstore)
    return agg_vstore

if 'docstore' not in globals():
    ## Unintuitive optimization; merge_from seems to optimize constituent vector stores away
    docstore = aggregate_vstores(vecstores)

print(f"Constructed aggregate docstore with {len(docstore.docstore._dict)} chunks")

Constructed aggregate docstore with 543 chunks


In [35]:
from langchain.document_transformers import LongContextReorder
from langchain_core.runnables import RunnableLambda
from langchain_core.runnables.passthrough import RunnableAssign
from langchain_nvidia_ai_endpoints import ChatNVIDIA, NVIDIAEmbeddings

from langchain_core.prompts import ChatPromptTemplate
from langchain_core.output_parsers import StrOutputParser

import gradio as gr
from functools import partial
from operator import itemgetter


llm = ChatNVIDIA(model="mixtral_8x7b") | StrOutputParser()
convstore = default_FAISS()

def save_memory_and_get_output(d, vstore):
    """Accepts 'input'/'output' dictionary and saves to convstore"""
    vstore.add_texts([
        f"User previously responded with {d.get('input')}",
        f"Agent previously responded with {d.get('output')}"
    ])
    return d.get('output')

initial_msg = (
    "Hello! I am a document chat agent here to help the user!"
    f" I have access to the following documents: {doc_string}\n\nHow can I help you?"
)

chat_prompt = ChatPromptTemplate.from_messages([("system",
    "You are a document chatbot. Help the user as they ask questions about documents."
    " User messaged just asked: {input}\n\n"
    " From this, we have retrieved the following potentially-useful info: "
    " Conversation History Retrieval:\n{history}\n\n"
    " Document Retrieval:\n{context}\n\n"
    " (Answer only from retrieval. Only cite sources that are used. Make your response conversational.)"
), ('user', '{input}')])

################################################################################################

retrieval_chain = (
    {'input' : (lambda x: x)}
    ## TODO: Make sure to retrieve history & context from convstore & docstore, respectively.
    ## HINT: Our solution uses RunnableAssign, itemgetter, long_reorder, and docs2str
    | RunnableAssign({'history' : itemgetter('input') | convstore.as_retriever() | long_reorder | docs2str})
    | RunnableAssign({'context' : itemgetter('input') | docstore.as_retriever()  | long_reorder | docs2str})
    #| RPrint()
)

################################################################################################

stream_chain = chat_prompt | llm

def chat_gen(message, history=[], return_buffer=True):
    buffer = ""
    ## First perform the retrieval based on the input message
    retrieval = retrieval_chain.invoke(message)
    line_buffer = ""

    ## Then, stream the results of the stream_chain
    for token in stream_chain.stream(retrieval):
        buffer += token
        ## If you're using standard print, keep line from getting too long
        if not return_buffer:
            line_buffer += token
            if "\n" in line_buffer:
                line_buffer = ""
            if ((len(line_buffer)>84 and token and token[0] == " ") or len(line_buffer)>100):
                line_buffer = ""
                yield "\n"
                token = "  " + token.lstrip()
        yield buffer if return_buffer else token

    ## Lastly, save the chat exchange to the conversation memory buffer
    save_memory_and_get_output({'input':  message, 'output': buffer}, convstore)


## Start of Agent Event Loop
test_question = "Tell me about RAG!"  ## <- modify as desired

## Before you launch your gradio interface, make sure your thing works
for response in chat_gen(test_question, return_buffer=False):
    print(response, end='')

RAG is an acronym that stands for ReAct's Generative Apprentice. It is a model for
  synergizing reasoning and acting in language models, as described in the paper "ReAct:
  Synergizing Reasoning and Acting in Language Models" by Andrew Le, Doug Downey, and Yejin
  Choi.

The RAG model is designed to improve the performance of language models on tasks that
  require both reasoning and acting, such as question answering and dialogue systems. It does
  this by using a generative model to generate responses, rather than simply selecting a
  response from a predefined set of options. This allows the model to generate more creative
  and flexible responses, and to better handle tasks that require complex reasoning.

The RAG model consists of two main components: a retriever and a generator. The retriever
  is responsible for retrieving relevant information from a large corpus of text, such
  as a database or the internet. The generator then uses this information to generate a
  response to 

In [34]:
?itemgetter


[1;31mInit signature:[0m [0mitemgetter[0m[1;33m([0m[0mself[0m[1;33m,[0m [1;33m/[0m[1;33m,[0m [1;33m*[0m[0margs[0m[1;33m,[0m [1;33m**[0m[0mkwargs[0m[1;33m)[0m[1;33m[0m[1;33m[0m[0m
[1;31mDocstring:[0m     
itemgetter(item, ...) --> itemgetter object

Return a callable object that fetches the given item(s) from its operand.
After f = itemgetter(2), the call f(r) returns r[2].
After g = itemgetter(2, 5, 3), the call g(r) returns (r[2], r[5], r[3])
[1;31mFile:[0m           c:\program files\windowsapps\pythonsoftwarefoundation.python.3.11_3.11.2288.0_x64__qbz5n2kfra8p0\lib\operator.py
[1;31mType:[0m           type
[1;31mSubclasses:[0m     

In [36]:
chatbot = gr.Chatbot(value = [[None, initial_msg]])
demo = gr.ChatInterface(chat_gen, chatbot=chatbot).queue()

try:
    demo.launch(debug=True, share=True, show_api=False)
    demo.close()
except Exception as e:
    demo.close()
    print(e)
    raise e

Running on local URL:  http://127.0.0.1:7860

Could not create share link. Please check your internet connection or our status page: https://status.gradio.app.


In [None]:
## Save and compress your index
docstore.save_local("docstore_index")
!tar czvf docstore_index.tgz docstore_index

!rm -rf docstore_index