In [None]:
# Copyright 2024 Google LLC
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
#     https://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

# Q&A Chatbot with Vertex AI Search for summarized website results without advanced indexing

<table align="left">
  <td style="text-align: center">
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb">
      <img width="32px" src="https://www.gstatic.com/pantheon/images/bigquery/welcome_page/colab-logo.svg" alt="Google Colaboratory logo"><br> Open in Colab
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/colab/import/https:%2F%2Fraw.githubusercontent.com%2FGoogleCloudPlatform%2Fgenerative-ai%2Fmain%2Fsearch%2Fvertexai-search-options%2Fvertex_ai_search_website_summary.ipynb">
      <img width="32px" src="https://lh3.googleusercontent.com/JmcxdQi-qOpctIvWKgPtrzZdJJK-J3sWE1RsfjZNwshCFgE_9fULcNpuXYTilIR2hjwN" alt="Google Cloud Colab Enterprise logo"><br> Open in Colab Enterprise
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb">
      <img src="https://www.gstatic.com/images/branding/gcpiconscolors/vertexai/v1/32px.svg" alt="Vertex AI logo"><br> Open in Vertex AI Workbench
    </a>
  </td>
  <td style="text-align: center">
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb">
      <img width="32px" src="https://upload.wikimedia.org/wikipedia/commons/9/91/Octicons-mark-github.svg" alt="GitHub logo"><br> View on GitHub
    </a>
  </td>
</table>

<div style="clear: both;"></div>

<b>Share to:</b>

<a href="https://www.linkedin.com/sharing/share-offsite/?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/8/81/LinkedIn_icon.svg" alt="LinkedIn logo">
</a>

<a href="https://bsky.app/intent/compose?text=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/7/7a/Bluesky_Logo.svg" alt="Bluesky logo">
</a>

<a href="https://twitter.com/intent/tweet?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/53/X_logo_2023_original.svg" alt="X logo">
</a>

<a href="https://reddit.com/submit?url=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb" target="_blank">
  <img width="20px" src="https://redditinc.com/hubfs/Reddit%20Inc/Brand/Reddit_Logo.png" alt="Reddit logo">
</a>

<a href="https://www.facebook.com/sharer/sharer.php?u=https%3A//github.com/GoogleCloudPlatform/generative-ai/blob/main/search/vertexai-search-options/vertex_ai_search_website_summary.ipynb" target="_blank">
  <img width="20px" src="https://upload.wikimedia.org/wikipedia/commons/5/51/Facebook_f_logo_%282019%29.svg" alt="Facebook logo">
</a>    

| | |
|-|-|
|Author | [Neeraj Shivhare](https://github.com/nshivhar) |

## Objective

The main goal of this code is to provide a way to query a website data store in Vertex AI Search, retrieve the most relevant webpage, and extract and summarize its content. This can be used to build a question-answering system or to simply retrieve and present information from a website in a concise manner.

### Key Features
- Vertex AI Search Integration: Utilizes the Discovery Engine API to query a website data store in Vertex AI Search.
- Top Result Retrieval: Selects the first (presumably most relevant) URL from the search results.
- Webpage Content Extraction: Fetches the webpage content using requests and extracts relevant information (title, description, page content) using BeautifulSoup.
- Gemini 1.5 Summarization: Sends the extracted page content, wrapped in a prompt template, to Gemini 1.5 (through the LangChain `VertexAI` wrapper) to produce a concise, summarized answer.


## How to use the notebook
- Initialization: Initialize the notebook by setting `PROJECT_ID` and `DATA_STORE_ID` in the Initialization cell.
- Search: Call the `get_page_contents` function with your `search_query`. This function will:
     1. Perform the search using Vertex AI Search.
     2. Extract the first link from the results.
     3. Fetch and format the content from the link.
- Print the formatted document details (title, source, description, page content) returned by `get_page_contents`.
- Summarize the content using Gemini 1.5 by calling `invoke` with your query (see the last cells of the notebook).

## Getting Started

### Install the required Python packages

In [None]:
%pip install google-cloud-discoveryengine==0.12.1 langchain_google_vertexai

### Restart kernel

In [1]:
# The packages installed above only become importable once the kernel restarts.
import IPython

IPython.Application.instance().kernel.do_shutdown(True)

### Authenticate your notebook environment (Colab only)

If you are running this notebook on Google Colab, run the following cell to authenticate your environment. This step is not required if you are using [Vertex AI Workbench](https://cloud.google.com/vertex-ai-notebooks?hl=en).

In [1]:
import sys

# Only a Google Colab runtime needs this explicit user login step.
running_in_colab = "google.colab" in sys.modules
if running_in_colab:
    from google.colab import auth

    # Authenticate the user to Google Cloud.
    auth.authenticate_user()

### Import libraries

In [1]:
import logging
from typing import TypeVar

from bs4 import BeautifulSoup
from google.api_core.client_options import ClientOptions
from google.cloud import discoveryengine_v1beta as discoveryengine
from google.protobuf import json_format
from langchain_core.prompts import PromptTemplate
from langchain_google_vertexai import VertexAI
import requests

# Covariant type variable, declared once next to the imports, for use in annotations.
Output_co = TypeVar("Output_co", covariant=True)

### Initialization

In [None]:
# Use the environment variable if the user doesn't provide Project ID.
import os

import vertexai

PROJECT_ID = "[your-project-id]"  # @param {type: "string", placeholder: "[your-project-id]" isTemplate: true}
if not PROJECT_ID or PROJECT_ID == "[your-project-id]":
    PROJECT_ID = str(os.environ.get("GOOGLE_CLOUD_PROJECT"))

LOCATION = os.environ.get("GOOGLE_CLOUD_REGION", "us-central1")

vertexai.init(project=PROJECT_ID, location=LOCATION)
!gcloud config set project {project_id}

DATA_STORE_LOCATION = "global"  # @param {type: "string"}
DATA_STORE_ID = "your_web_datastore_id"  # @param {type: "string"}

logging.basicConfig(level=logging.INFO)
logger = logging.getLogger(__name__)

### Initialize the Discovery Engine client

In [4]:
logger.info("Initializing Discovery Engine client...")
# The API endpoint must follow the data store's location (DATA_STORE_LOCATION),
# not the Vertex AI region held in LOCATION. A "global" data store uses the
# default endpoint, so no override is needed in that case.
client_options = (
    ClientOptions(api_endpoint=f"{DATA_STORE_LOCATION}-discoveryengine.googleapis.com")
    if DATA_STORE_LOCATION != "global"
    else None
)
client = discoveryengine.SearchServiceClient(client_options=client_options)

### Search the data store using the Google Cloud Discovery Engine API

In [5]:
def get_relevant_snippets(
    search_query: str, page_size: int = 5
) -> discoveryengine.SearchResponse | None:
    """
    Search the data store using the Google Cloud Discovery Engine API.

    The request asks for snippets and enables automatic query expansion and
    spell correction.

    Args:
        search_query (str): The search query string.
        page_size (int): Maximum number of results requested per page.
          Defaults to 5.

    Returns:
        Optional[discoveryengine.SearchResponse]:
        The object returned by ``client.search`` for the request, or None
        if any error occurred (the error is logged, not raised).
    """
    logger.info("Searching data store with query: %s", search_query)
    try:
        # The data store lives in DATA_STORE_LOCATION ("global" by default),
        # which is configured separately from the Vertex AI region in LOCATION.
        serving_config = client.serving_config_path(
            project=PROJECT_ID,
            location=DATA_STORE_LOCATION,
            data_store=DATA_STORE_ID,
            serving_config="default_config",
        )

        content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(
            snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
                return_snippet=True
            )
        )

        request = discoveryengine.SearchRequest(
            serving_config=serving_config,
            query=search_query,
            page_size=page_size,
            content_search_spec=content_search_spec,
            query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(
                condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,
            ),
            spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(
                mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
            ),
        )

        response = client.search(request)
        logger.info("Search successful.")
        return response

    except Exception as e:  # pylint: disable=broad-exception-caught
        # Best-effort: callers treat None as "no results".
        logger.error("Error during data store search: %s", e)
        return None

### Extracts the first link from the top search response.

In [6]:
def get_first_link(response: discoveryengine.SearchResponse | None) -> str | None:
    """
    Return the URL of the first search result.

    Args:
        response (Optional[discoveryengine.SearchResponse]):
          The search response object from the Discovery Engine API, or None.

    Returns:
        Optional[str]: The ``link`` entry of the first result's
          ``derivedStructData``, or None when the response is missing or
          empty, the entry is absent, or extraction fails.
    """
    logger.info("Extracting first link from search response...")
    has_results = response is not None and bool(response.results)
    if not has_results:
        logger.error("No results found or empty response.")
        return None

    try:
        # Turn the underlying protobuf document into a plain dict for lookup.
        top_document = json_format.MessageToDict(
            response.results[0].document._pb  # pylint: disable=protected-access
        )
        link = top_document.get("derivedStructData", {}).get("link")
        logger.info("First link extracted successfully: %s", link)
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error("Error extracting link from results: %s", e)
        return None
    return link

### Loads and formats the full text from the given link using requests and BeautifulSoup.

In [7]:
def load_and_format_page_content(link: str) -> dict[str, str] | None:
    """
    Loads and formats the full text from the given link using requests and
     BeautifulSoup.

    Args:
        link (str): The URL to fetch and extract the content from.

    Returns:
        Optional[Dict[str, str]]:
        A dictionary with formatted source,
          title, description,
          and page content.
    """
    logger.info("Loading and formatting page content from: %s", link)
    try:
        response = requests.get(link)
        response.raise_for_status()  # Ensure we notice bad responses

        soup = BeautifulSoup(response.text, "html.parser")

        # Extract title, source, description, and page content
        title = soup.title.string.strip() if soup.title else "No title available"
        source = link
        description_meta = soup.find("meta", {"name": "description"})
        description = (
            description_meta["content"]
            if description_meta
            else "No description available"
        )
        page_content = " ".join(p.get_text() for p in soup.find_all("p"))

        logger.info("Page content loaded and formatted successfully.")
        return {
            "source": source,
            "title": title,
            "description": description,
            "page_content": page_content,
        }
    except Exception as e:  # pylint: disable=broad-exception-caught
        logger.error("Error loading content from link: %s", e)
        return None

### Performs a search, extracts the first link, and retrieves and formats the full text from the link.

In [8]:
def get_page_contents(search_query: str) -> str | None:
    """
    Performs a search, extracts the first link,
      and retrieves and formats the full text from the link.

    Args:
        search_query (str): The search query string.

    Returns:
        Optional[str]:
        The full text extracted from the first link of the search results.
    """
    logger.info("Getting page contents for query: %s", search_query)
    response = get_relevant_snippets(search_query)
    link = get_first_link(response)
    if link:
        details = load_and_format_page_content(link)
        logger.info("Page contents retrieved successfully.")
        return details if details else None
    logger.warning("No link found for the query.")
    return None

#### Prompt for response summarization

In [9]:
# Prompt sent to Gemini: {context} receives the scraped page details and
# {question} receives the user's question.
WEBPAGE_EXTRACTION_TEMPLATE = """ You are a helpful and informative Q&A bot. A user will provide you with text content from a web page and ask questions related to it. 
Your task is to analyze the content and answer the user's questions accurately and concisely. 

Here's how you should approach each request:

1. Thoroughly read the provided web page content.
2. Understand the user's question.
3. Identify the relevant information within the content.
4. Formulate a clear and concise answer based on the content. Whenever possible, use bullet points to summarize the answer.
5. If the answer cannot be found in the content, say "I'm sorry, but I cannot find the answer to that question in the provided text."

Example:

User:
Here's the content from a web page: {context} 
My question is: {question}

Bot:
[Provide a concise answer based on the web page content, Use Markdown and bullet Points where ever applicable. If the answer is not found, say you cannot find it.] 
"""
# `question` is declared as a partial variable so that callers which only pass
# `context` keep working and get the sample question that used to be hard-coded
# in the template; callers that pass `question=` override it.
WEBPAGE_EXTRACTION_PROMPT = PromptTemplate(
    input_variables=["context"],
    template=WEBPAGE_EXTRACTION_TEMPLATE,
    partial_variables={
        "question": "What are the use cases of LangChain on Vertex AI?"
    },
)

### Create Q&A chain

In [13]:
def get_chain(model_name: str = "gemini-1.5-flash"):
    """Build the Vertex AI LLM used to answer questions about a web page.

    Despite the name, this returns a bare LangChain ``VertexAI`` LLM object
    rather than a multi-step chain; the prompt is formatted by the caller.

    Args:
        model_name (str): Name of the Gemini model to use.
          Defaults to "gemini-1.5-flash".

    Returns:
        VertexAI: The configured LLM object.
    """
    logger.info("Building VertexAI chain...")
    # NOTE(review): sampling settings are passed through VertexAI's declared
    # fields. The earlier `generation_config`, `search_llm_kwargs` and
    # `return_direct` keyword arguments are not documented VertexAI fields,
    # so they were dropped — confirm against the installed
    # langchain_google_vertexai version.
    return VertexAI(
        model_name=model_name,
        verbose=False,
        temperature=0.2,
        top_p=0.95,
        top_k=10,
        max_output_tokens=4000,
    )

### Invoking chain with query

In [14]:
def invoke(query: str) -> str:
    """Retrieve the top page for ``query`` and have the LLM answer from it.

    Args:
        query (str): The user's question; used as the search query and,
          when the prompt template declares a ``question`` variable, also
          passed to the prompt.

    Returns:
        str: The model's answer, or the fixed message
          "No relevant context found to summarize." when no page content
          could be retrieved.
    """
    logger.info("Invoking chain with query: %s", query)
    page_content = get_page_contents(query)
    if not page_content:
        logger.warning("No relevant context found to summarize.")
        return "No relevant context found to summarize."

    logger.info("Page content retrieved successfully.")
    chain = get_chain()

    # Forward the user's query only if the template declares a `question`
    # variable, so this works with templates that take `context` alone.
    prompt_inputs = {"context": page_content}
    declared_variables = set(WEBPAGE_EXTRACTION_PROMPT.input_variables) | set(
        WEBPAGE_EXTRACTION_PROMPT.partial_variables
    )
    if "question" in declared_variables:
        prompt_inputs["question"] = query
    formatted_prompt = WEBPAGE_EXTRACTION_PROMPT.format(**prompt_inputs)

    # .invoke() is the Runnable entry point; calling the LLM object directly
    # is deprecated in LangChain.
    response = chain.invoke(formatted_prompt)

    # Only unwrap mapping-style outputs. On a plain string, `"result" in
    # response` is a substring test and response["result"] raises TypeError.
    if isinstance(response, dict) and "result" in response:
        return response["result"]
    return response

In [15]:
# Example run: search the data store for this question and display the answer
# returned by invoke() as the cell output.
search_query = "What are the benefits of LangChain on Vertex AI?"
invoke(search_query)