In [None]:
!pip install -qU \
    nemoguardrails==0.4.0 \
    pinecone-client==2.2.2 \
    datasets==2.14.3 \
    openai==0.27.8

To begin, we need to set up our data and retrieval components for RAG. We'll start with a dataset that contains info on the recent Llama 2 models:

In [None]:
from datasets import load_dataset

# load the train split of the chunked Llama 2 arXiv papers dataset
data = load_dataset("jamescalam/llama-2-arxiv-papers-chunked", split="train")
# leave the dataset as the last expression so its summary is displayed
data

In [None]:
# inspect the first record to see which fields each chunk carries
data[0]

In [None]:
# add a 'uid' field to every record, built as "<doi>-<chunk-id>"
data = data.map(
    lambda record: {'uid': "{}-{}".format(record['doi'], record['chunk-id'])}
)
data

In [None]:
# switch to a pandas DataFrame and keep only the fields used downstream
data = data.to_pandas()[['uid', 'chunk', 'title', 'source']]

In [None]:
import os

# make sure OPENAI_API_KEY is set before `openai` is imported: keep an existing
# non-empty value, otherwise fall back to the placeholder "key"
os.environ["OPENAI_API_KEY"] = os.getenv("OPENAI_API_KEY") or "key"

Now we can create embeddings like so:

In [None]:
import openai

# OpenAI embedding model used for the sample texts here, for the document
# chunks, and for user queries. It must be a model id the OpenAI embeddings
# endpoint actually serves; "text-embedding-small" is not one.
embed_model_id = "text-embedding-ada-002"

# embed two sample strings to check the API call works; the length of the
# returned vectors is what the index-creation cell uses as its dimension
res = openai.Embedding.create(
    input=[
        "We would have some text to embed here",
        "And maybe another chunk here too"
    ], engine=embed_model_id
)

In [None]:
# top-level fields of the embedding response
res.keys()

In [None]:
# number of entries under 'data' (two texts were submitted in the request)
len(res['data'])

In [None]:
# length of each of the first two returned embedding vectors
len(res['data'][0]['embedding']), len(res['data'][1]['embedding'])

Now we need a place to store these embeddings and enable an efficient vector search through them all. To do that we use Pinecone. We can get a [free API key](https://app.pinecone.io/) and enter it below, where we will initialize our connection to Pinecone and create a new index.

In [None]:
import os
from pinecone import Pinecone

# initialize connection to pinecone (get API key at app.pinecone.io)
# Pinecone API key: taken from the environment when set, otherwise the literal
# placeholder string (get a real key at app.pinecone.io)
api_key = os.getenv('PINECONE_API_KEY') or 'PINECONE_API_KEY'

# client object used for the index management calls below
pc = Pinecone(api_key=api_key)

In [None]:
from pinecone import ServerlessSpec

# serverless deployment target; override with PINECONE_CLOUD / PINECONE_REGION
cloud = os.getenv('PINECONE_CLOUD') or 'aws'
region = os.getenv('PINECONE_REGION') or 'us-east-1'

spec = ServerlessSpec(region=region, cloud=cloud)

In [None]:
# name of the Pinecone index this notebook creates (if missing) and connects to
index_name = "linkedin-rag-with-actions"

In [None]:
import time

# create the index only when it does not exist yet; otherwise reuse it
if index_name not in pc.list_indexes().names():
    pc.create_index(
        index_name,
        spec=spec,
        metric='cosine',
        # dimension is taken from the sample embeddings created earlier
        dimension=len(res['data'][0]['embedding'])
    )
    # poll once per second until the new index reports that it is ready
    while True:
        if pc.describe_index(index_name).status['ready']:
            break
        time.sleep(1)

# connect to index
index = pc.Index(index_name)
# view index stats
index.describe_index_stats()

In [None]:
from tqdm.auto import tqdm

batch_size = 100  # how many embeddings we create and insert at once

for start in tqdm(range(0, len(data), batch_size)):
    # rows [start, stop) form this batch; the final batch may be shorter
    stop = min(start + batch_size, len(data))
    batch = data.iloc[start:stop]
    # embed the chunk texts of this batch
    res = openai.Embedding.create(
        input=batch['chunk'].to_list(), engine=embed_model_id
    )
    vectors = [record['embedding'] for record in res['data']]
    # metadata stored next to each vector
    metadata = [
        {'chunk': chunk, 'source': source}
        for chunk, source in zip(batch['chunk'], batch['source'])
    ]
    # upsert (id, vector, metadata) tuples to Pinecone
    index.upsert(
        vectors=list(zip(batch['uid'].to_list(), vectors, metadata))
    )

## RAG Functions for Guardrails

Now that we've added all of our text data to the index, let's create a "retrieve function" that will allow us to take a user query, retrieve relevant records, and return them for use by our LLM.

_Note: all functions defined and used with Guardrails `generate_async` must also be async functions._

In [None]:
async def retrieve(query: str) -> list:
    """Return the text chunks from the index that best match `query`.

    Args:
        query: the user's question; it is embedded with `embed_model_id`.

    Returns:
        The 'chunk' metadata value of each of the top 5 matches, in the
        order the index returned them.
    """
    # create query embedding
    res = openai.Embedding.create(input=[query], engine=embed_model_id)
    xq = res['data'][0]['embedding']
    # get relevant contexts from pinecone; the vector is passed by keyword
    # because the 3.x Pinecone client rejects positional arguments to query()
    res = index.query(vector=xq, top_k=5, include_metadata=True)
    # get list of retrieved texts
    contexts = [x['metadata']['chunk'] for x in res['matches']]
    return contexts


async def rag(query: str, contexts: list) -> str:
    """Answer `query` with an OpenAI completion grounded in `contexts`.

    Args:
        query: the user's question.
        contexts: text chunks (strings); they are joined with newlines and
            placed into the prompt ahead of the query.

    Returns:
        The text of the first completion choice, exactly as returned.
    """
    print("> RAG Called")  # we'll add this so we can see when this is being used
    context_str = "\n".join(contexts)
    # place query and contexts into RAG prompt
    prompt = f"""You are a helpful assistant, below is a query from a user and
    some relevant contexts. Answer the question given the information in those
    contexts. If you cannot find the answer to the question, say "I don't know".

    Contexts:
    {context_str}

    Query: {query}

    Answer: """
    # generate answer; text-davinci-003 has been retired by OpenAI, so use the
    # completions-compatible replacement model instead
    res = openai.Completion.create(
        model="gpt-3.5-turbo-instruct",
        prompt=prompt,
        temperature=0.0,
        max_tokens=100
    )
    return res['choices'][0]['text']

## Guardrails

We now need to initialize our configs for Rails:

In [None]:
# YAML half of the rails config: declares the main LLM (engine and model name)
yaml_content = """
models:
- type: main
  engine: openai
  model: gpt-4o-mini
"""

# Colang half of the rails config. It defines two flows:
#   - politics: example user utterances, canned bot replies, then an offer of help
#   - llama:    example user utterances that trigger the `retrieve` and `rag`
#               actions, whose result is sent back as the bot message
rag_colang_content = """
# define limits
define user ask politics
    "what are your political beliefs?"
    "thoughts on the president?"
    "left wing"
    "right wing"

define bot answer politics
    "I'm a shopping assistant, I don't like to talk of politics."
    "Sorry I can't talk about politics!"

define flow politics
    user ask politics
    bot answer politics
    bot offer help

# define RAG intents and flow
define user ask llama
    "tell me about llama 2?"
    "what is large language model"
    "where did meta's new model come from?"
    "how to llama?"
    "have you ever meta llama?"

define flow llama
    user ask llama
    $contexts = execute retrieve(query=$last_user_message)
    $answer = execute rag(query=$last_user_message, contexts=$contexts)
    bot $answer
"""

In [None]:
from nemoguardrails import LLMRails, RailsConfig

# initialize rails config
# build the rails config from the in-memory YAML and Colang strings
config = RailsConfig.from_content(
    yaml_content=yaml_content,
    colang_content=rag_colang_content,
)
# instantiate the rails from that config
rag_rails = LLMRails(config)

Remember! We need to register any actions that are used in the Colang config file, otherwise our rails have no idea how to `execute retrieve` or `execute rag`. We register both like so:

In [None]:
# expose both Python coroutines to the rails under the names that the Colang
# flow uses in its `execute` statements
for action_name, action_fn in (("retrieve", retrieve), ("rag", rag)):
    rag_rails.register_action(action=action_fn, name=action_name)

Now let's try out our RAG agent.

In [None]:
# generic greeting: not one of the example utterances listed for the
# politics or llama intents
await rag_rails.generate_async(prompt="hello")

In [None]:
# close to the "tell me about llama 2?" example of the `ask llama` intent, so
# this is expected to go through the llama flow (watch for "> RAG Called")
await rag_rails.generate_async(prompt="tell me about llama 2")