In [1]:
import os

# Target GCP project and region for this notebook session.
PROJECT_ID = "lramsey-dev"
REGION = "us-central1"

# Also expose the project id through the process environment
# (presumably read by modules imported in later cells -- confirm).
os.environ.update({"PROJECT_ID": PROJECT_ID})

In [2]:
#! gcloud auth login

In [3]:
import sys

# Append the two local source directories to the import search path,
# in the same order as before.
sys.path.extend(["../../common/src", "../src"])

In [4]:
import tempfile
import os
from typing import List, Optional, Tuple, Dict
from common.utils.logging_handler import Logger
from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,
                           QueryReference, QueryDocumentChunk, BatchJobModel)
from common.utils.errors import (ResourceNotFoundException,
                                 ValidationError)
from common.utils.http_exceptions import InternalServerError
from utils.errors import NoDocumentsIndexedException
from google.cloud import storage
from google.cloud import storage
from services import llm_generate, embeddings
from services.query import query_prompts
from services.query.vector_store import VectorStore
from services.query.data_source import DataSource
    
from config import (PROJECT_ID, DEFAULT_QUERY_CHAT_MODEL,
                        DEFAULT_QUERY_EMBEDDING_MODEL)
import spacy

INFO: [config/config.py:57 - <module>()] Namespace File not found, setting job namespace as default
INFO: [config/config.py:107 - <module>()] ENABLE_GOOGLE_LLM = True
INFO: [config/config.py:108 - <module>()] ENABLE_OPENAI_LLM = True
INFO: [config/config.py:109 - <module>()] ENABLE_COHERE_LLM = True
INFO: [config/config.py:110 - <module>()] ENABLE_GOOGLE_MODEL_GARDEN = True
INFO: [config/config.py:111 - <module>()] ENABLE_TRUSS_LLAMA2 = True
INFO: [config/vector_store_config.py:40 - <module>()] Default vector store = [matching_engine]
INFO: [config/vector_store_config.py:49 - <module>()] PG_HOST = [localhost]
INFO: [config/vector_store_config.py:50 - <module>()] PG_DBNAME = [pgvector]
ERROR: [config/vector_store_config.py:77 - <module>()] Cannot connect to pgvector instance at localhost: (psycopg2.OperationalError) connection to server at "localhost" (::1), port 5432 failed: FATAL:  database "pgvector" does not exist

(Background on this error at: https://sqlalche.me/e/14/e3q8)
INFO: [

In [5]:
# do this once per vm environment
#!python -m spacy download en_core_web_sm

# Load the "en_core_web_sm" spaCy pipeline (needs the one-off download above).
nlp = spacy.load("en_core_web_sm")
# Cloud Storage client for PROJECT_ID. Note that PROJECT_ID was re-imported
# from `config` in the imports cell, which rebinds the value set in cell 1.
storage_client = storage.Client(project=PROJECT_ID)

In [6]:
# Parameters of the query engine under test.
query_engine = "lukman-test-vs-6"
query_description = "test vertex search"

# Bucket whose documents are imported further down.
doc_url = "gs://lramsey-dev-mira-demo-docs"

# Creator id recorded on the engine; `is_public` is not referenced by any
# other cell in the visible notebook.
user_id = "epKVwPZeIBz1rrqcDm1B"
is_public = True

In [7]:
#query_engine = "lukman-test-html"
#doc_url = "https://health.ny.gov/health_care/medicaid/rates/manual/apg_provider_manual_december.htm"
#user_id = "epKVwPZeIBz1rrqcDm1B"
#query_description="test depth limit"
#storage_client = storage.Client(project=PROJECT_ID)

In [8]:
from common.models.llm_query import QE_TYPE_VERTEX_SEARCH, QE_TYPE_LLM_SERVICE

In [9]:
# Create and persist the QueryEngine record; the Discovery Engine data store
# and engine created in later cells reuse its name as their id.
params = {"depth_limit": 1}
query_engine_type = QE_TYPE_VERTEX_SEARCH
embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL
# llm_type names the chat/generation model. It was previously set to the
# embedding model, leaving the imported DEFAULT_QUERY_CHAT_MODEL unused.
llm_type = DEFAULT_QUERY_CHAT_MODEL
vector_store_type = "langchain_pgvector"
q_engine = QueryEngine(name=query_engine,
                       created_by=user_id,
                       query_engine_type=query_engine_type,
                       llm_type=llm_type,
                       description=query_description,
                       embedding_type=embedding_type,
                       vector_store=vector_store_type,
                       params=params,
                       doc_url=doc_url)
q_engine.save()

<common.models.llm_query.QueryEngine at 0x138a491c0>

In [10]:
from google.cloud import discoveryengine_v1alpha as discoveryengine

In [11]:
# Discovery Engine location passed to branch_path() and create_client() below.
location = "global"
# Lower-case alias of PROJECT_ID used when building resource paths.
project_id = PROJECT_ID

In [12]:
# Collection path under which the data store is created. Built from
# project_id/location rather than a hard-coded project so it follows the
# configuration cells (same form as the path used for engine creation).
parent = f"projects/{project_id}/locations/{location}/collections/default_collection"

In [15]:
# Describe the Data Store to create; the create request itself is issued in
# a later cell.
dss_client = discoveryengine.DataStoreServiceClient()

# The data store id and its display name both reuse the query engine name.
data_store_id = q_engine.name
ds_parent = parent

content_config = discoveryengine.DataStore.ContentConfig.CONTENT_REQUIRED
data_store_fields = {
  "display_name": q_engine.name,
  "industry_vertical": "GENERIC",
  "solution_types": ["SOLUTION_TYPE_SEARCH"],
  "content_config": content_config,
}
data_store = discoveryengine.DataStore(**data_store_fields)

In [16]:
# Echo the DataStore message to check its fields before sending the request.
data_store

display_name: "lukman-test-vs-6"
industry_vertical: GENERIC
solution_types: SOLUTION_TYPE_SEARCH
content_config: CONTENT_REQUIRED

In [17]:
# Ask Discovery Engine to create the data store under the collection path.
ds_request = discoveryengine.CreateDataStoreRequest(parent=ds_parent,
                                                    data_store_id=data_store_id,
                                                    data_store=data_store)

operation = dss_client.create_data_store(request=ds_request)

# create_data_store returns a long-running operation. Block until it finishes
# so the document import further down cannot run against a data store that is
# not ready yet (e.g. on Restart & Run All). result() also raises here if the
# creation failed, instead of surfacing later as a confusing import error.
created_data_store = operation.result()


In [18]:
# Status check of the data store creation operation (True in the recorded run).
operation.done()

True

In [19]:
# Import the PDFs found under doc_url into the data store's default branch.
ds_client = discoveryengine.DocumentServiceClient()

gcs_uri = doc_url
gcs_uris = [
  f"{gcs_uri}/*.pdf",
#  f"{gcs_uri}/*.html",
]

# NOTE: this rebinds `parent` from the collection path to the branch path.
parent = ds_client.branch_path(
  project=project_id,
  location=location,
  data_store=data_store_id,
  branch="default_branch",
)

gcs_source = discoveryengine.GcsSource(input_uris=gcs_uris,
                                       data_schema="content")
# Options: `FULL`, `INCREMENTAL`
reconciliation_mode = (
  discoveryengine.ImportDocumentsRequest.ReconciliationMode.INCREMENTAL
)
request = discoveryengine.ImportDocumentsRequest(
  parent=parent,
  gcs_source=gcs_source,
  reconciliation_mode=reconciliation_mode,
)

# Make the request
operation = ds_client.import_documents(request=request)

In [20]:
# Wait for the import operation's result and keep it for inspection.
response = operation.result()

In [21]:
# Echo the import response (the recorded output holds only an error_config
# GCS prefix).
response

error_config {
  gcs_prefix: "gs://944045413892_us_import_content/errors18351720666327887083"
}

In [22]:
# Wrap the operation's metadata in the ImportDocumentsMetadata message type.
metadata = discoveryengine.ImportDocumentsMetadata(operation.metadata)

In [23]:
# Echo the import metadata (the recorded run reports success_count: 4).
metadata

create_time {
  seconds: 1707773899
  nanos: 151629000
}
update_time {
  seconds: 1707774144
  nanos: 128862000
}
success_count: 4

In [25]:
# Describe a search engine backed by the data store; the engine reuses the
# data store id as its own id. `parent` is reset to the collection path.
parent = f"projects/{project_id}/locations/global/collections/default_collection"
engine = discoveryengine.Engine(
  display_name=q_engine.name,
  solution_type="SOLUTION_TYPE_SEARCH",
  data_store_ids=[data_store_id],
)
request = discoveryengine.CreateEngineRequest(
  parent=parent,
  engine_id=data_store_id,
  engine=engine,
)

In [26]:
# Submit the engine creation request; the returned operation object is
# polled with done() in the next cell.
es_client = discoveryengine.EngineServiceClient()
operation = es_client.create_engine(request=request)

In [27]:
# Status check of the engine creation operation (True in the recorded run).
operation.done()

True

In [None]:
from services.query.vertex_search import create_client, import_documents_gcs

In [None]:
# Obtain a client and parent path from the project's vertex_search helper
# (this rebinds `parent` once more).
client, parent = create_client(PROJECT_ID, location, data_store_id)

In [None]:
# Echo the parent path returned by create_client().
parent

In [None]:
# Run the import through the project helper instead of the hand-built request
# above; the return value is kept in `operation`.
operation = import_documents_gcs(doc_url, client, parent)

In [None]:
from services import llm_generate, embeddings
from services.query import query_prompts
from services.query.vector_store import VectorStore
from services.query.data_source import DataSource
from services.query.query_service import datasource_from_url
from config import (PROJECT_ID, DEFAULT_QUERY_CHAT_MODEL,
                    DEFAULT_QUERY_EMBEDDING_MODEL)
import spacy

In [None]:
# Unpack the first (doc_name, index_doc_url, doc_filepath) entry.
# NOTE(review): `doc_filepaths` is not defined anywhere in the visible
# notebook, so this cell presumably depends on leftover kernel state -- confirm.
doc_name, index_doc_url, doc_filepath = doc_filepaths[0]

In [None]:
# Split the selected document into text chunks.
# NOTE(review): no `data_source` instance is created in the visible notebook
# (only the DataSource class is imported) -- confirm where it comes from.
text_chunks = data_source.chunk_document(doc_name, index_doc_url, doc_filepath)

In [None]:
# Number of chunks produced for the document.
len(text_chunks)

In [None]:
# Spot-check one chunk (index 8 is an arbitrary sample and assumes at least
# nine chunks exist).
text_chunks[8]

In [None]:
# Run the data source's sentence splitter on the chunk at index 6.
# NOTE(review): `data_source` is not defined in the visible notebook.
data_source.text_to_sentence_list(text_chunks[6])

In [None]:
# Request embeddings for all chunks with the embedding model chosen when the
# query engine was created; the whole return value is echoed as cell output.
embeddings.get_embeddings(text_chunks, embedding_type)

In [None]:
from utils.html_helper import html_to_text, html_to_sentence_list
# Longer description text; this rebinds the `query_description` used when the
# query engine was created. Adjacent literals concatenate to the same string
# (the double spaces after sentence ends are intentional and preserved).
query_description = (
  "Policies and guidance on billing for dental procedures using CDT codes "
  "for Federally Qualified Health Centers (FQHC).  "
  "Includes information on state agency reporting of billing against "
  "current dental terminology (CDT) codes.  "
  "Also includes information on billing, coding and other guidelines that "
  "support the implementation of the CY 2023 Medicare Physician Fee "
  "Schedule Final Rule on Dental Services."
)

In [None]:
from services.query.query_service import process_documents, vector_store_from_query_engine

# Obtain the vector store for this query engine via the query_service helper.
vector_store = vector_store_from_query_engine(q_engine)

In [None]:
# Echo the raw chunks before cleaning (prints the entire list).
text_chunks

In [None]:
from w3lib.html import replace_escape_chars
import re

def clean_text(text: str) -> str:
  """Reduce a text chunk to printable ASCII.

  Steps, in order: drop NUL characters, turn newline / tab / carriage-return
  characters into spaces, then remove every remaining character outside the
  range 0x20-0x7E.

  The whitespace escapes are replaced by a space rather than deleted:
  deleting them (the w3lib default) fuses the words on either side of a line
  break, e.g. "first line\nsecond" became "first linesecond".

  Args:
    text: chunk text to clean.

  Returns:
    The cleaned text. Note that all non-ASCII characters are removed too.
  """
  # Replace specific unprocessable characters
  without_nul = text.replace("\x00", "")

  # replace escape characters (\n, \t, \r) with a space so adjacent words
  # stay separated
  spaced = replace_escape_chars(without_nul, replace_by=" ")

  # remove everything that is not printable ASCII
  return re.sub(r"[^\x20-\x7E]", "", spaced)

In [None]:
# Clean every chunk, rebinding `text_chunks` to the cleaned list.
text_chunks = list(map(clean_text, text_chunks))

In [None]:
# Echo the chunks again after cleaning (prints the entire list).
text_chunks

In [None]:
# Delete the QueryEngine record created earlier.
# NOTE(review): cells below still use q_engine (q_engine.id and
# VectorStore(q_engine)); confirm this cleanup is meant to run before them.
QueryEngine.delete_by_id(q_engine.id)

In [None]:
# Build a QueryDocumentChunk model for every text chunk.
# NOTE(review): `query_doc` and `index_base` are not defined in the visible
# notebook, and the chunk objects are neither saved nor collected here.
for i, chunk_text in enumerate(text_chunks):
  # Use a distinct local name: rebinding `clean_text` to a string shadowed
  # the clean_text() helper and broke re-running the cleaning cell.
  chunk_clean_text = html_to_text(chunk_text)
  sentences = html_to_sentence_list(chunk_text)
  query_doc_chunk = QueryDocumentChunk(
                        query_engine_id=q_engine.id,
                        query_document_id=query_doc.id,
                        index=i+index_base,
                        text=chunk_text,
                        clean_text=chunk_clean_text,
                        sentences=sentences)

In [None]:
# Cloud Storage client for PROJECT_ID (same construction as in the setup cell).
storage_client = storage.Client(project=PROJECT_ID)

# NOTE(review): rebinds `vector_store`, previously obtained from
# vector_store_from_query_engine(q_engine), to a directly constructed
# VectorStore(q_engine).
vector_store = VectorStore(q_engine)

In [None]:
# NOTE(review): `docs_processed` is never assigned in the visible notebook;
# presumably left over from a removed process_documents() call -- confirm.
docs_processed

In [None]:
# NOTE(review): `docs_not_processed` is never assigned in the visible
# notebook; presumably left over from a removed process_documents() call --
# confirm.
docs_not_processed