# Create a Vertex AI Datastore and Search Engine

<table align="left">

  <td>
    <a href="https://colab.research.google.com/github/GoogleCloudPlatform/generative-ai/blob/main/search/create_datastore_and_search.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/colab-logo-32px.png" alt="Colab logo"> Run in Colab
    </a>
  </td>
  <td>
    <a href="https://github.com/GoogleCloudPlatform/generative-ai/blob/main/search/reate_datastore_and_search.ipynb">
      <img src="https://cloud.google.com/ml-engine/images/github-logo-32px.png" alt="GitHub logo">
      View on GitHub
    </a>
  </td>
  <td>
    <a href="https://console.cloud.google.com/vertex-ai/workbench/deploy-notebook?download_url=https://raw.githubusercontent.com/GoogleCloudPlatform/generative-ai/blob/main/search/reate_datastore_and_search.ipynb">
      <img src="https://lh3.googleusercontent.com/UiNooY4LUgW_oTvpsNhPpQzsstV5W8F7rYgxgGBD85cWJoLmrOzhVs_ksK_vgx40SHs7jCqkTkCk=e14-rj-sc0xffffff-h130-w32" alt="Vertex AI logo">
      Open in Vertex AI Workbench
    </a>
  </td>
</table>

---


* Authors: Kara Greenfield
* Created: 22 Nov 2023

---

## Objective

This notebook shows how to create and populate a Vertex AI Datastore, how to create a Vertex Search Engine connected to that datastore, and how to submit queries through the search engine.



Services used in the notebook:

- ✅ Vertex AI Search for document search and retrieval

## Install pre-requisites

If running in Colab install the pre-requisites into the runtime. Otherwise it is assumed that the notebook is running in Vertex AI Workbench. 


In [62]:
! pip install google-cloud-aiplatform -q
! pip install google-cloud-discoveryengine -q
! pip install langchain -q

---
#### ⚠️ Do not forget to restart the kernel!
---


## Authenticate

If running in Colab authenticate with `google.colab.google.auth` otherwise assume that running on Vertex AI Workbench.


In [1]:
import sys

if "google.colab" in sys.modules:
    from google.colab import auth as google_auth

    google_auth.authenticate_user()

from google.auth import default
creds, _ = default()


## Configure notebook environment


In [57]:
PROJECT_ID = "<PROJECT_ID>"

# Separate locations are used for the datastore and search engine vs the LLM, because vertexai can't be initialized in the global region
LOCATION = "global" 
LLM_LOCATION = "<LLM_LOCATION>" 


In [59]:
PROJECT_ID = "learning-405915"
LOCATION = "global" # Replace with your data store location
LLM_LOCATION = "us-central1"


## Create and Populate a Datastore


In [20]:
from google.cloud import discoveryengine_v1alpha

def create_data_store(data_store_name):
    # Create a client
    client = discoveryengine_v1alpha.DataStoreServiceClient()

    # Initialize request argument(s)
    data_store = discoveryengine_v1alpha.DataStore()
    data_store.display_name = data_store_name
    data_store.industry_vertical = "GENERIC"
    data_store.content_config = "CONTENT_REQUIRED"
    

    request = discoveryengine_v1alpha.CreateDataStoreRequest(
        parent=f"projects/{PROJECT_ID}/locations/{LOCATION}/collections/default_collection",
        data_store=data_store,
        data_store_id=data_store_name,
    )

    # Make the request
    # The try block is necessary to prevent execution from haulting due to an error being thrown when the datastore takes a while to instantiate
    try:
        operation = client.create_data_store(request=request)
        response = operation.result(timeout=90)
    except:
        print("long-running operation")

In [23]:
# The datastore name can only contain lowercase letters, numbers, and hyphens
DATASTORE_ID = "alphabet-contracts"

create_data_store(DATASTORE_ID)

In [47]:
from google.cloud import storage
storage_client = storage.Client()

def import_single_document(
    project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,
) -> str:
    #  For more information, refer to:
    # https://cloud.google.com/generative-ai-app-builder/docs/locations#specify_a_multi-region_for_your_data_store
    client_options = (
        ClientOptions(api_endpoint=f"{location}-discoveryengine.googleapis.com")
        if location != "global"
        else None
    )

    # Create a client
    client = discoveryengine_v1alpha.DocumentServiceClient(client_options=client_options)

    # The full resource name of the search engine branch.
    # e.g. projects/{project}/locations/{location}/dataStores/{data_store_id}/branches/{branch}
    parent = client.branch_path(
        project=project_id,
        location=location,
        data_store=data_store_id,
        branch="default_branch",
    )

    request = discoveryengine_v1alpha.ImportDocumentsRequest(
        parent=parent,
        gcs_source=discoveryengine_v1alpha.GcsSource(
            input_uris=[gcs_uri], data_schema="content"
        ),
        # Options: `FULL`, `INCREMENTAL`
        reconciliation_mode=discoveryengine_v1alpha.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL,
    )
    
    # Make the request
    operation = client.import_documents(request=request)

    print(f"Waiting for operation to complete: {operation.operation.name}")
    response = operation.result()

    # Once the operation is complete,
    # get information from operation metadata
    metadata = discoveryengine_v1alpha.ImportDocumentsMetadata(operation.metadata)

    # Handle the response
    print(response)
    print(metadata)

    return operation.operation.name


def import_documents(project_id: str,
    location: str,
    data_store_id: str,
    gcs_uri: str,):
    
    
    source_documents_gs_uri = "gs://cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs"
    bucket_name = source_documents_gs_uri.split('/')[2]
    bucket = storage_client.get_bucket(bucket_name)
    source_documents = list(bucket.list_blobs(prefix=source_documents_gs_uri[source_documents_gs_uri.find(bucket_name) + len(bucket_name) + 1:]))
    
    for doc in source_documents:
        gsu = f'gs://{bucket.name}/{doc.name}'
        import_single_document(project_id, location, data_store_id, gsu)
    


In [49]:
source_documents_gs_uri = "cloud-samples-data/gen-app-builder/search/alphabet-investor-pdfs"

import_documents(PROJECT_ID, LOCATION, DATASTORE_ID, source_documents_gs_uri)

Waiting for operation to complete: projects/861331366245/locations/global/collections/default_collection/dataStores/alphabet-contracts/branches/0/operations/import-documents-13165973765131656714
error_config {
  gcs_prefix: "gs://861331366245_us_import_content/errors13165973765131656769"
}

create_time {
  seconds: 1700710067
  nanos: 698322000
}
update_time {
  seconds: 1700710068
  nanos: 987636000
}
success_count: 1

Waiting for operation to complete: projects/861331366245/locations/global/collections/default_collection/dataStores/alphabet-contracts/branches/0/operations/import-documents-15353831033435618437
error_config {
  gcs_prefix: "gs://861331366245_us_import_content/errors15353831033435618422"
}

create_time {
  seconds: 1700710070
  nanos: 860566000
}
update_time {
  seconds: 1700710071
  nanos: 899856000
}
success_count: 1

Waiting for operation to complete: projects/861331366245/locations/global/collections/default_collection/dataStores/alphabet-contracts/branches/0/operat

## Create a Search Engine

This is used to set the search_tier to enterprise and to enable advanced LLM features.

Enterprise tier is required to get extractive answers from a search query and advanced LLM features are required to sumarize search results.


In [52]:
def create_engine(project_id: str, location: str, data_store_name: str):
    # Create a client
    client = discoveryengine_v1alpha.EngineServiceClient()
    

    # Initialize request argument(s)
    engine = discoveryengine_v1alpha.Engine()
    engine.display_name = data_store_name
    engine.data_store_ids.append(data_store_name)
    engine.solution_type = "SOLUTION_TYPE_SEARCH" # This can be replaced with "SOLUTION_TYPE_CHAT" to make a chat engine instead
    engine.industry_vertical = "GENERIC"
    engine.search_engine_config.search_tier = "SEARCH_TIER_ENTERPRISE"
    engine.search_engine_config.search_add_ons.append("SEARCH_ADD_ON_LLM")
       

    request = discoveryengine_v1alpha.CreateEngineRequest(
        parent = f"projects/{project_id}/locations/{location}/collections/default_collection",
        engine = engine,
        engine_id = engine.display_name,
    )

    # Make the request
    operation = client.create_engine(request=request)
    response = operation.result(timeout=90)



In [53]:
create_engine(PROJECT_ID, LOCATION, DATASTORE_ID)

alphabet-contracts


## Query your Datastore

In [63]:
import vertexai
from langchain.llms import VertexAI

# Initialize Vertex AI SDK
vertexai.init(project=PROJECT_ID, location=LLM_LOCATION)

llm = VertexAI(
    model_name='text-bison-32k',
    max_output_tokens=256,
    temperature=0,
    top_p=0.8,
    top_k=40,
    verbose=True,
)

In [67]:
from langchain.chains import RetrievalQA
from langchain.retrievers import GoogleVertexAISearchRetriever


def get_retrieval_qa(datastore_id):
    search_retriever = GoogleVertexAISearchRetriever(
        project_id=PROJECT_ID,
        location_id=LOCATION,
        data_store_id=datastore_id,
        max_documents=5,
    )
    return RetrievalQA.from_chain_type(
        llm=llm,
        chain_type="stuff",
        retriever=search_retriever,
        return_source_documents=True,
        output_key="response"
        
    )

In [69]:
query = "Who is the CEO of DeepMind?"

result = get_retrieval_qa(DATASTORE_ID)({"query": query})
print(result['response'])

 The CEO of DeepMind is Demis Hassabis.
