In [1]:
import os

# GCP project and region this notebook targets.
REGION = "us-central1"
PROJECT_ID = "lramsey-dev"

# Publish the project id as an environment variable before the project
# modules are imported (presumably `config` reads it -- confirm).
os.environ.update(PROJECT_ID=PROJECT_ID)

In [2]:
import sys

# Add the shared `common` sources and this component's `src` tree to the
# import path (paths are relative to the notebook's working directory);
# presumably this is where the `common.*`, `services.*`, `utils.*` and
# `config` modules imported later live.
sys.path.extend(["../../common/src", "../src"])

In [8]:
import tempfile
import os
from typing import List, Optional, Tuple, Dict
from common.utils.logging_handler import Logger
from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,
                           QueryReference, QueryDocumentChunk, BatchJobModel)
from common.utils.errors import (ResourceNotFoundException,
                                 ValidationError)
from common.utils.http_exceptions import InternalServerError
from utils.errors import NoDocumentsIndexedException
from google.cloud import storage
from services.query import query_prompts
from services.query.vector_store import VectorStore
from services.query.data_source import DataSource
    
from config import (PROJECT_ID, DEFAULT_QUERY_CHAT_MODEL,
                        DEFAULT_QUERY_EMBEDDING_MODEL)

In [9]:
# Cloud Storage client bound to the project id. It is not referenced again
# in the cells shown in this notebook.
storage_client = storage.Client(project=PROJECT_ID)

In [10]:
# Inputs for the query engine built in this walkthrough.
query_engine = "lukman-test-vs-7"            # engine name
query_description = "test vertex search"     # stored as the description
doc_url = "gs://lramsey-dev-mira-demo-docs"  # bucket holding the documents
user_id = "epKVwPZeIBz1rrqcDm1B"             # recorded as `created_by`
is_public = True

In [11]:
from common.models.llm_query import QE_TYPE_VERTEX_SEARCH, QE_TYPE_LLM_SERVICE

# Create and persist the QueryEngine record for the Vertex Search engine.
params = {"depth_limit": 1}
query_engine_type = QE_TYPE_VERTEX_SEARCH
embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL
# The LLM type must be the chat model; the embedding model belongs only in
# `embedding_type`.
llm_type = DEFAULT_QUERY_CHAT_MODEL
vector_store_type = "langchain_pgvector"
# NOTE(review): `is_public` from the parameters cell is not passed to the
# model here -- confirm whether QueryEngine should receive it.
q_engine = QueryEngine(name=query_engine,
                       created_by=user_id,
                       query_engine_type=query_engine_type,
                       llm_type=llm_type,
                       description=query_description,
                       embedding_type=embedding_type,
                       vector_store=vector_store_type,
                       params=params,
                       doc_url=doc_url)
# Last expression: saves the record and echoes the saved model.
q_engine.save()

<common.models.llm_query.QueryEngine at 0x132d2cdc0>

In [12]:
from google.cloud import discoveryengine_v1alpha as discoveryengine
# Discovery Engine location and project used by the cells below.
location = "global"
project_id = PROJECT_ID
# Build the default-collection path from the values above instead of
# repeating the project id and location as literals, so it cannot drift from
# the configured project.
parent = (f"projects/{project_id}/locations/{location}"
          "/collections/default_collection")

In [22]:
from services.query.vertex_search import (create_data_store, 
                                          create_search_engine, 
                                          import_documents_to_datastore, 
                                          import_documents_gcs, wait_for_operation, datastore_id_from_name, inventory_gcs_files,
                                          wait_for_operation)

In [15]:
# Derive the data store id from the engine name and echo it.
data_store_id = datastore_id_from_name(q_engine.name)
data_store_id

'lukman-test-vs-7'

In [17]:
# Source location of the documents, taken from the saved engine record.
data_url = q_engine.doc_url
location = "global"
project_id = PROJECT_ID

# Bookkeeping for the import: candidates, successes and failures
# (three independent empty lists).
docs_to_be_processed, docs_processed, docs_not_processed = [], [], []

In [20]:
# Inventory the files under the source URL; these are the import candidates.
docs_to_be_processed = inventory_gcs_files(data_url)

In [21]:
# Show the inventory; in the recorded output the entries are paths relative
# to the bucket, including one under a `test/` prefix.
docs_to_be_processed

['A59449_20240106.pdf',
 'FQHC-bg-20210101.pdf',
 'Federally Qualified Health Center (FQHC) - Dental Billing Guide - Horizon NJ Health.pdf',
 'Mary Foley - Understanding the Prospective Payment System.pdf',
 'test/genai-sample-doc.pdf']

In [23]:
# Create the data store for this engine, then wait on the returned operation
# before importing documents into it.
operation = create_data_store(q_engine, project_id, data_store_id)
wait_for_operation(operation)

In [25]:
# Document service client, plus the resource path of the data store's
# default branch, which is the import target.
client = discoveryengine.DocumentServiceClient()
parent = client.branch_path(project=project_id, location=location,
                            data_store=data_store_id, branch="default_branch")

In [26]:
# Confirm the source URL that is handed to the import call.
data_url

'gs://lramsey-dev-mira-demo-docs'

In [27]:
# Start the import of the inventoried files from `data_url` into the branch
# identified by `parent`; `operation` is rebound to the import operation.
operation = import_documents_gcs(data_url,
                                 docs_to_be_processed,
                                 client,
                                 parent)

In [28]:
# Report the fully qualified name of the import operation.
print(f"Waiting for import operation to complete: {operation.operation.name}")

Waiting for import operation to complete: projects/944045413892/locations/global/collections/default_collection/dataStores/lukman-test-vs-7/branches/0/operations/import-documents-16429642499957673036


In [29]:
# Wait on the import operation before reading its metadata and result.
wait_for_operation(operation)

In [30]:
# Wrap the operation metadata as ImportDocumentsMetadata so its counters
# (e.g. success_count) can be read as attributes.
metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

In [31]:
# Inspect the import metadata (create/update timestamps and success_count).
metadata

create_time {
  seconds: 1708189132
  nanos: 603150000
}
update_time {
  seconds: 1708189133
  nanos: 943553000
}
success_count: 5

In [32]:
# Debug-only dump of the operation object's internal attributes, used to see
# where the result and metadata are stored; not needed for the workflow.
operation.__dict__

{'_polling': <google.api_core.retry.retry_unary.Retry at 0x1089189d0>,
 '_result': error_config {
   gcs_prefix: "gs://944045413892_us_import_content/errors16429642499957673123"
 },
 '_exception': None,
 '_result_set': True,
 '_polling_thread': None,
 '_done_callbacks': [],
 '_operation': name: "projects/944045413892/locations/global/collections/default_collection/dataStores/lukman-test-vs-7/branches/0/operations/import-documents-16429642499957673036"
 metadata {
   type_url: "type.googleapis.com/google.cloud.discoveryengine.v1alpha.ImportDocumentsMetadata"
   value: "\n\014\010\314\313\303\256\006\020\260\255\315\237\002\022\014\010\315\313\303\256\006\020\350\363\365\301\003\030\005"
 }
 done: true
 response {
   type_url: "type.googleapis.com/google.cloud.discoveryengine.v1alpha.ImportDocumentsResponse"
   value: "\022@\n>gs://944045413892_us_import_content/errors16429642499957673123"
 },
 '_refresh': functools.partial(<bound method OperationsClient.get_operation of <google.api_core

In [36]:
# Result of the import operation; in the recorded run it carries only the
# error_config with the GCS prefix for error files.
operation.result()

error_config {
  gcs_prefix: "gs://944045413892_us_import_content/errors16429642499957673123"
}

In [38]:
!gsutil ls -R gs://944045413892_us_import_content/errors16429642499957673123

gs://944045413892_us_import_content/errors16429642499957673123/:
gs://944045413892_us_import_content/errors16429642499957673123/


In [41]:
# Reconcile the import counters with the inventory.
if metadata.success_count == len(docs_to_be_processed):
  # Every candidate was imported. Copy rather than alias so later changes to
  # one list do not silently change the other.
  docs_processed = list(docs_to_be_processed)
else:
  # TODO: build list of documents processed/not processed from results
  # Until that exists, surface the mismatch instead of passing silently.
  print(f"Imported {metadata.success_count} of {len(docs_to_be_processed)} "
        f"documents; see {operation.result().error_config.gcs_prefix} "
        "for error details")

In [42]:
# Documents recorded as successfully imported.
docs_processed

['A59449_20240106.pdf',
 'FQHC-bg-20210101.pdf',
 'Federally Qualified Health Center (FQHC) - Dental Billing Guide - Horizon NJ Health.pdf',
 'Mary Foley - Understanding the Prospective Payment System.pdf',
 'test/genai-sample-doc.pdf']

In [43]:
# Create the search engine on top of the data store and wait on the returned
# operation, mirroring how the data store creation is handled.
# NOTE(review): assumes create_search_engine returns a long-running operation
# like create_data_store does -- confirm.
operation = create_search_engine(q_engine, project_id, data_store_id)
wait_for_operation(operation)

In [45]:
# Record the data store id as the engine's index id and persist the change;
# the update call echoes the model as the cell output.
q_engine.index_id = data_store_id
q_engine.update()

<common.models.llm_query.QueryEngine at 0x132d2cdc0>

In [46]:
from services.query.vertex_search import query_vertex_search, perform_vertex_search

In [47]:
# Sample question used to exercise the search.
search_query = "what qualifies as an encounter in medicaid billing?"

In [48]:
# Search service client. NOTE: this rebinds `client`, which previously held
# the DocumentServiceClient used for the import.
client = discoveryengine.SearchServiceClient()

In [49]:
# Resource path of the data store's "default_config" serving config, which
# the search request is sent to.
serving_config = client.serving_config_path(
    project=project_id,
    location=location,
    data_store=data_store_id,
    serving_config="default_config",
)

In [50]:
# Content options for the search: return a snippet per result, and request a
# summary (summary_result_count=5) with citations. Both ignore_* flags on the
# summary spec are enabled.
content_search_spec = discoveryengine.SearchRequest.ContentSearchSpec(
    snippet_spec=discoveryengine.SearchRequest.ContentSearchSpec.SnippetSpec(
        return_snippet=True
    ),
    summary_spec=discoveryengine.SearchRequest.ContentSearchSpec.SummarySpec(
        summary_result_count=5,
        include_citations=True,
        ignore_adversarial_query=True,
        ignore_non_summary_seeking_query=True,
    ),
)

In [51]:
# Assemble the search request: page size of 10, the content spec built
# earlier, and query expansion and spell correction both set to AUTO.
request = discoveryengine.SearchRequest(
    serving_config=serving_config,
    query=search_query,
    page_size=10,
    content_search_spec=content_search_spec,
    query_expansion_spec=discoveryengine.SearchRequest.QueryExpansionSpec(
        condition=discoveryengine.SearchRequest.QueryExpansionSpec.Condition.AUTO,
    ),
    spell_correction_spec=discoveryengine.SearchRequest.SpellCorrectionSpec(
        mode=discoveryengine.SearchRequest.SpellCorrectionSpec.Mode.AUTO
    ),
)

In [52]:
# Execute the search request.
response = client.search(request)

In [56]:
# Number of results available on the response (4 in the recorded run).
len(response.results)

4

In [58]:
# Keep the result list for the cells below and check the element type.
results = response.results
type(results[0])

google.cloud.discoveryengine_v1alpha.types.search_service.SearchResponse.SearchResult

In [60]:
# Check the type of the object returned by `client.search` (a SearchPager in
# the recorded run).
type(response)

google.cloud.discoveryengine_v1alpha.services.search_service.pagers.SearchPager

In [89]:
# Take the document attached to the first search result.
document = results[0].document

In [112]:
# Check the document's type.
type(document)

google.cloud.discoveryengine_v1alpha.types.document.Document

In [115]:
from google.protobuf.json_format import MessageToDict

In [116]:
import proto

In [118]:
# Convert the Document message into a plain Python dict for easy inspection.
document_dict = proto.Message.to_dict(document)

In [119]:
# List the top-level fields of the converted document.
document_dict.keys()

dict_keys(['name', 'id', 'derived_struct_data', 'schema_id', 'parent_document_id'])

In [123]:
# Text of the first snippet; the recorded output shows it contains HTML
# markup (<b> tags and &nbsp; entities).
document_dict["derived_struct_data"]["snippets"][0]["snippet"]

'96 | FEDERALLY <b>QUALIFIED</b> ... See the Physician-Related Services/Healthcare Professional Services <b>Medicaid Billing</b> Guide. How do I <b>bill</b> for more than one <b>encounter</b>&nbsp;...'

In [124]:
# Derived data for the document: title, snippets and the source link in the
# recorded run.
document_dict["derived_struct_data"]

{'title': 'Federally Qualified Health Centers Billing Guide',
 'snippets': [{'snippet_status': 'SUCCESS',
   'snippet': '96 | FEDERALLY <b>QUALIFIED</b> ... See the Physician-Related Services/Healthcare Professional Services <b>Medicaid Billing</b> Guide. How do I <b>bill</b> for more than one <b>encounter</b>&nbsp;...'}],
 'link': 'gs://lramsey-dev-mira-demo-docs/FQHC-bg-20210101.pdf'}

In [125]:
# Convert the document to a dict again and keep only its
# derived_struct_data portion.
document_data = proto.Message.to_dict(document)["derived_struct_data"]

In [126]:
# GCS URI of the source file behind this result.
document_data["link"]

'gs://lramsey-dev-mira-demo-docs/FQHC-bg-20210101.pdf'

In [127]:
# Text of the first snippet, read from the derived data.
document_data["snippets"][0]["snippet"]

'96 | FEDERALLY <b>QUALIFIED</b> ... See the Physician-Related Services/Healthcare Professional Services <b>Medicaid Billing</b> Guide. How do I <b>bill</b> for more than one <b>encounter</b>&nbsp;...'