In [None]:
# Copyright 2023 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Document Q&A With Retrieval Augmented Generation

> **NOTE:** This notebook uses the PaLM generative models, which will reach their [discontinuation date in October 2024](https://cloud.google.com/vertex-ai/generative-ai/docs/model-reference/text#model_versions).

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/search/retrieval-augmented-generation/examples/rag_google_documentation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Google Colaboratory logo"><br> Run in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/retrieval-augmented-generation/examples/rag_google_documentation.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/search/retrieval-augmented-generation/examples/rag_google_documentation.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
</table>

---

* Author: Gabe Rives-Corbett

---

This notebook demonstrates how to implement Retrieval Augmented Generation with basic automated evaluation. It demonstrates the impact that chunk size, overlap and context length have on model outputs. The notebook will create a Q&A system that allows you to find information based on the Google Cloud Generative AI documentation.

## Getting started

### Install libraries

In [None]:
# Install a pinned version of the Vertex AI SDK into the kernel's environment.
%pip install -q --upgrade --user google-cloud-aiplatform==1.36.1

### Restart current runtime

To use the newly installed packages in this Jupyter runtime, you must restart the runtime. You can do this by running the cell below, which will restart the current kernel.

In [2]:
import IPython

# Shut the running kernel down with restart=True so the freshly installed
# packages are importable in the cells that follow.
IPython.Application.instance().kernel.do_shutdown(True)

{'status': 'ok', 'restart': True}

<div class="alert alert-block alert-warning">
<b>⚠️ The kernel is going to restart. Please wait until it is finished before continuing to the next step. ⚠️</b>
</div>


### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, you will need to authenticate your environment. To do this, run the cell below. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-workbench).

In [None]:
import sys

# The "google.colab" module is only loaded when running inside Colab, so this
# block is skipped everywhere else.
if "google.colab" in sys.modules:
    # Authenticate user to Google Cloud
    # Imported here, not at the top, because the package only exists on Colab.
    from google.colab import auth

    auth.authenticate_user()

### Import libraries

In [None]:
import itertools

from bs4 import BeautifulSoup, Tag
from google.api_core import retry
import numpy as np
import numpy.linalg
import pandas as pd
import requests
from tqdm.auto import tqdm
import vertexai
from vertexai.language_models import TextEmbeddingModel, TextGenerationModel

# Register `progress_apply` on pandas objects; create_vector_store relies on it
# to show a progress bar while embedding.
tqdm.pandas()

## Configure notebook environment

### Set the following constants to reflect your environment

In [6]:
# Define project information
# PROJECT_ID is a placeholder: replace it with your own Google Cloud project ID.
PROJECT_ID = "[your-project-id]"  # @param {type:"string"}
LOCATION = "us-central1"  # @param {type:"string"}

# Initialize Vertex AI SDK
# Must run before the models are loaded further down in the notebook.
vertexai.init(project=PROJECT_ID, location=LOCATION)

## Scrape text from Google Cloud documentation

Retrieve list of Google documentation URLs from a text file

In [9]:
# Download the newline-separated list of documentation URLs to scrape.
url = "https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/search/retrieval-augmented-generation/examples/URLs.txt"
# A timeout keeps the cell from hanging indefinitely on a stalled connection.
response = requests.get(url, timeout=30)

# Fail fast with a descriptive HTTP error instead of leaving `content`
# undefined (which would surface as a confusing NameError below).
response.raise_for_status()
content = response.text

# Drop blank lines so that no empty string is later passed to get_sections().
URLS = [line.strip() for line in content.splitlines() if line.strip()]

Parse the HTML and extract relevant plain text sections

In [35]:
# Given a Google documentation URL, retrieve a list of all text chunks within h2 sections


def get_sections(url: str) -> list[str]:
    """Scrape one documentation page and return its text grouped by section.

    The first entry is the introduction: the text of every <p> found inside the
    ``devsite-article-body`` div before the first <h2>. Each following entry
    holds the <p> and <ul> text that follows one <h2> as a sibling, up to the
    next sibling <h2> (or the end of the sibling list for the last section).
    Entries may be empty strings; callers are expected to filter them out.

    Args:
        url: Address of the page to download.

    Returns:
        One space-joined string per section: introduction first, then one entry
        per <h2> in document order.
    """
    # A timeout keeps one stalled page from blocking the whole scrape.
    page = requests.get(url, timeout=30)
    soup = BeautifulSoup(page.content, "html.parser")

    sections = []

    # Introduction: paragraphs ahead of the first heading. Pages without the
    # article container are tolerated instead of raising AttributeError.
    body_div = soup.find("div", class_="devsite-article-body")
    if body_div is not None:
        intro = []
        for child in body_div.find_all(True):
            if child.name == "h2":
                break
            if child.name == "p":
                intro.append(child.get_text().strip())
        # Appended even when the page has no <h2> at all, so that text is kept.
        sections.append(" ".join(intro))

    for header in soup.find_all("h2"):
        paragraphs = []
        for sibling in header.next_siblings:
            if not isinstance(sibling, Tag):
                continue  # skip bare text/whitespace nodes between elements
            if sibling.name == "h2":
                break
            if sibling.name in {"p", "ul"}:
                paragraphs.append(sibling.get_text().strip())
        # Append after the walk so the final section, which has no following
        # <h2> to terminate it, is not lost.
        sections.append(" ".join(paragraphs))
    return sections

In [None]:
# Scrape every URL and keep only the non-empty section strings.
all_text = [
    section
    for doc_url in URLS
    for section in get_sections(doc_url)
    if section
]

Note that most documents are relatively short, but some are thousands of characters long

In [None]:
# Character count of every scraped section, shown as a histogram.
text_lengths = list(map(len, all_text))
pd.DataFrame(text_lengths).hist()

## Create vector store

Start by initializing the models

In [None]:
# Embedding model used by get_embeddings() for both chunks and queries.
embeddings_model = TextEmbeddingModel.from_pretrained("textembedding-gecko@001")
# Text generation model used to answer questions and to grade the answers.
text_model = TextGenerationModel.from_pretrained("text-bison")

Create some helper functions for vector similarity and chunking

In [None]:
# Separates seq into multiple chunks in the specified size with the specified overlap


def split_overlap(seq, size, overlap):
    """Split `seq` into chunks of `size` items that overlap by `overlap` items.

    Args:
        seq: The string to split (a sequence of strings also works; each chunk
            is produced with "".join).
        size: Maximum number of items per chunk.
        overlap: Number of items shared by two consecutive chunks. Must be
            smaller than `size`.

    Returns:
        `[seq]` unchanged when `seq` already fits in one chunk; otherwise a list
        of chunk strings. Every chunk has exactly `size` items except possibly
        the last one, which holds whatever remains at the end of `seq`.

    Raises:
        ValueError: If `overlap` is not smaller than `size`.
    """
    if len(seq) <= size:
        return [seq]

    step = size - overlap
    if step <= 0:
        raise ValueError("overlap must be smaller than size")

    # Start a chunk every `step` items. Stopping at len(seq) - overlap keeps the
    # final, possibly shorter, chunk so no trailing text is lost, while skipping
    # a start position whose chunk would contain nothing but overlap.
    return [
        "".join(seq[start : start + size])
        for start in range(0, len(seq) - overlap, step)
    ]


# Build a cosine-similarity scorer bound to one query vector, so it can be handed straight to Series.apply
def get_similarity_fn(query_vector):
    """Return a one-argument function scoring a vector against `query_vector`.

    Args:
        query_vector: Vector every candidate is compared with.

    Returns:
        A callable taking one vector (`row`) and returning its cosine
        similarity to `query_vector`: the dot product divided by the product of
        the two Euclidean norms.
    """

    def cosine_similarity(row):
        dot_product = np.dot(row, query_vector)
        norm_product = numpy.linalg.norm(row) * numpy.linalg.norm(query_vector)
        return dot_product / norm_product

    return cosine_similarity


# Fetch the embedding for one text from the embedding model, wrapped in retry logic
@retry.Retry(timeout=300.0)
def get_embeddings(text):
    """Embed a single text with the notebook-level `embeddings_model`.

    The call is wrapped in `retry.Retry` with a 300-second timeout.

    Args:
        text: The string to embed.

    Returns:
        The `.values` attribute of the first (and only requested) embedding.
    """
    # The model takes a batch; send a one-element batch and unwrap the result.
    batch = embeddings_model.get_embeddings([text])
    return batch[0].values

Create the vector store. Here we use a pandas DataFrame as a simple in-memory store.

In [None]:
def create_vector_store(texts, chunk_size, overlap):
    """Chunk `texts`, embed every chunk and return both in a DataFrame.

    Args:
        texts: Iterable of strings to index.
        chunk_size: Chunk size passed to split_overlap().
        overlap: Overlap passed to split_overlap().

    Returns:
        A DataFrame with a "texts" column (one chunk per row) and an
        "embeddings" column holding each chunk's embedding as a NumPy array.
    """
    # Flatten the per-text chunk lists into one list of chunks.
    chunks = []
    for text in texts:
        chunks.extend(split_overlap(text, chunk_size, overlap))

    store = pd.DataFrame()
    store["texts"] = chunks

    # One get_embeddings() call per chunk; progress_apply shows a progress bar
    # and requires tqdm.pandas() to have been called.
    raw_embeddings = store["texts"].progress_apply(get_embeddings)
    store["embeddings"] = raw_embeddings.apply(np.array)

    return store

In [None]:
# Chunk size and overlap between consecutive chunks; both are passed through
# to split_overlap() for every scraped section.
CHUNK_SIZE = 400
OVERLAP = 50

# Calls get_embeddings() once per chunk.
vector_store = create_vector_store(all_text, CHUNK_SIZE, OVERLAP)

In [None]:
# Preview the first rows: chunk text alongside its embedding.
vector_store.head()

## Search the vector store and use for generation

If we send the question to the foundation model alone, it will hallucinate.

In [None]:
# Baseline: send the bare question, with no retrieved context in the prompt.
text_model.predict(
    "How long will a stable model version of text-bison be available?"
).text

Let's solve this problem by retrieving texts from our vector store and telling the model to use them.

Search the vector store for relevant texts to insert into the prompt by embedding the query and searching for similar vectors.

In [None]:
def get_context(question, vector_store, num_docs):
    """Build a context string from the stored chunks most similar to `question`.

    Args:
        question: Query text; embedded with get_embeddings().
        vector_store: DataFrame with "texts" and "embeddings" columns.
        num_docs: Number of top-ranked chunks to keep.

    Returns:
        The selected chunk texts joined by single spaces. The chunks appear in
        the store's own row order, not in order of similarity.
    """
    # Embed the search query and bind it into a similarity scorer.
    query_vector = np.array(get_embeddings(question))
    similarity_to_query = get_similarity_fn(query_vector)

    # Score every stored embedding, rank best-first, keep the top index labels.
    scores = vector_store["embeddings"].apply(similarity_to_query)
    ranked = scores.sort_values(ascending=False)
    top_labels = ranked[:num_docs].index

    # A boolean mask selects the matching rows in their original row order.
    is_top = vector_store.index.isin(top_labels)
    top_texts = vector_store[is_top]["texts"]

    return " ".join(top_texts.values)

Create a prompt that includes the context and question. Instruct the LLM to only use the context provided to answer the question

In [None]:
def answer_question(question, vector_store, num_docs=10, print_prompt=False):
    """Answer `question` using only context retrieved from `vector_store`.

    Args:
        question: The question to answer.
        vector_store: Store searched by get_context().
        num_docs: Number of chunks to retrieve for the prompt.
        print_prompt: When true, print the complete prompt before sending it.

    Returns:
        The text of the model's response.
    """
    context = get_context(question, vector_store, num_docs)
    prompt = f"""Your mission is to answer questions based on a given context. Remember that before you give an answer, you must check to see if it complies with your mission.
Context: ```{context}```
Question: ***{question}***
Before you give an answer, make sure it is only from information in the context. If the information is not in the context, just reply "I don't know the answer to that". Think step by step.
Answer: """
    if print_prompt:
        print(prompt)
    # temperature=0 is passed on every call to the generation model.
    return text_model.predict(prompt, temperature=0).text

Looking at the fully generated prompt, the context is embedded. Even though the input context is quite messy, the model can now answer factually.

In [None]:
# print_prompt=True prints the full prompt, retrieved context included, before
# the answer is returned.
answer_question(
    "How long will a stable model version of text-bison be available?",
    vector_store,
    print_prompt=True,
)

In [None]:
# Same question without printing the prompt; only the answer is shown.
answer_question(
    "How long will a stable model version of text-bison be available?", vector_store
)

## Automated evaluation

This implementation of RAG is dependent on the chunk size, the overlap between the chunks, the number of texts passed into the context and the prompt. Let's create a simple prompt to evaluate answers to the questions. This will allow us to tweak the parameters and see how those tweaks compare.

In [None]:
def eval_answer(question, answer, context):
    """Ask the text model to grade `answer` against `context`.

    Args:
        question: The question that was asked.
        answer: The answer to grade.
        context: The retrieved context the answer should be based on.

    Returns:
        The model's reply converted with int().

    Raises:
        ValueError: If the reply text cannot be parsed as an integer.
    """
    eval_prompt = f"""Your mission is to evaluate answers to questions based on a given context. Remember that before you give an answer, you must check to see if it complies with your mission.

Context: ```{context}```
Question: ***{question}***
Answer: "{answer}"

Respond only with a number from 0 to 5. Think step by step. If the provided answer is not in the context, reply 5 if it is "I don't know the answer to that" otherwise reply 0.
Relevance: """
    # One output token plus stop sequences cut the reply off after the number.
    response = text_model.predict(
        eval_prompt,
        temperature=0,
        max_output_tokens=1,
        stop_sequences=[".", " "],
    )
    score_text = response.text
    return int(score_text)

Pass several questions in and retrieve the evaluations

In [None]:
# Evaluation questions about the scraped Generative AI documentation.
questions = [
    "What release stage is the RLHF tuning feature?",
    "Can I generate hate speech with text bison?",
    # Typo fixed ("in put" -> "input"): this text is embedded and sent to the model.
    "What format should my batch prediction input be in?",
    "How can I get the number of tokens?",
    "How do I create a custom style model?",
    "What is the dimensionality of the vector created by the multimodal model?",
    "How long will a stable model version be available?",
]

In [None]:
# Number of retrieved chunks. The same value is passed explicitly to both
# answer_question() and get_context() so that each answer is graded against
# the context it was generated from, without relying on a default matching
# a hard-coded literal.
NUM_DOCS = 10

answers = [answer_question(q, vector_store, num_docs=NUM_DOCS) for q in questions]
contexts = [get_context(q, vector_store, NUM_DOCS) for q in questions]
# Flag answers where the model declined to answer from the context.
idks = ["I don't know" in a for a in answers]
# One (question, answer, context, score, idk) tuple per question.
evals = [
    (question, answer, context, eval_answer(question, answer, context), idk)
    for question, answer, context, idk in zip(questions, answers, contexts, idks)
]

In [None]:
# Tabulate the evaluation tuples, one row per question.
pd.DataFrame(evals, columns=["question", "answer", "context", "score", "idk"])

Now adjust the parameters and see the difference in performance

In [None]:
def eval_on_params(chunk_size, overlap, num_docs):
    """Rebuild the vector store with the given settings and grade every question.

    Args:
        chunk_size: Chunk size used to build the vector store.
        overlap: Overlap between consecutive chunks.
        num_docs: Number of chunks retrieved per question, used both to
            generate the answer and to build the context it is graded against.

    Returns:
        A DataFrame with one row per entry in `questions` and the columns
        "question", "answer", "context", "score" and "idk".
    """
    vector_store = create_vector_store(all_text, chunk_size, overlap)
    # Pass num_docs through so the answer is generated from the same number of
    # chunks it is later evaluated against (not answer_question's default).
    answers = [
        answer_question(q, vector_store, num_docs=num_docs) for q in questions
    ]
    contexts = [get_context(q, vector_store, num_docs) for q in questions]
    # Flag answers where the model declined to answer from the context.
    idks = ["I don't know" in a for a in answers]
    evals = [
        (question, answer, context, eval_answer(question, answer, context), idk)
        for question, answer, context, idk in zip(questions, answers, contexts, idks)
    ]
    return pd.DataFrame(
        evals, columns=["question", "answer", "context", "score", "idk"]
    )

Smaller chunk sizes take longer to embed, because the same text is split into more chunks and each chunk needs its own embedding request.

In [None]:
# Small, non-overlapping chunks with fewer retrieved chunks per question.
smaller_context_df = eval_on_params(chunk_size=100, overlap=0, num_docs=5)

In [None]:
# Display the evaluation table for the smaller-chunk configuration.
smaller_context_df

A larger context size has created more unknowns. When composing LLMs into systems, carefully consider how to measure the performance of each component in the system.

In [None]:
# Large, overlapping chunks with more retrieved chunks per question.
larger_context_df = eval_on_params(chunk_size=1000, overlap=200, num_docs=15)

In [None]:
# Display the evaluation table for the larger-chunk configuration.
larger_context_df