In [1]:
PROJECT_ID = "gcp-mira-develop"
REGION = "us-central1"
import os
os.environ["PROJECT_ID"] = PROJECT_ID

In [2]:
#! gcloud auth login

In [3]:
import sys
sys.path.append("../../common/src")
sys.path.append("../src")

In [5]:
import tempfile
import os
from typing import List, Optional, Tuple, Dict
from common.utils.logging_handler import Logger
from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,
                           QueryReference, QueryDocumentChunk, BatchJobModel)
from common.utils.errors import (ResourceNotFoundException,
                                 ValidationError)
from common.utils.http_exceptions import InternalServerError
from utils.errors import NoDocumentsIndexedException
from google.cloud import storage
from services import llm_generate, embeddings
from services.query import query_prompts
from services.query.vector_store import VectorStore
from services.query.data_source import DataSource

from config import (PROJECT_ID, DEFAULT_QUERY_CHAT_MODEL,
                    DEFAULT_QUERY_EMBEDDING_MODEL)

In [6]:
query_engine = "lukman-test"
doc_url = "gs://mira-demo-docs"
user_id = "RkRwdej4IlTvdHdZWCie"
is_public=True
storage_client = storage.Client(project=PROJECT_ID)

In [7]:
from services.query.query_service import datasource_from_url

In [8]:
data_source = datasource_from_url(doc_url, storage_client)

In [11]:
temp_dir = "/tmp"
doc_filepaths = data_source.download_documents(doc_url, temp_dir)

INFO: [query/data_source.py:55 - download_documents()] downloading gs://mira-demo-docs from bucket mira-demo-docs


In [13]:
doc_name, index_doc_url, doc_filepath = doc_filepaths[0]

In [41]:
text_chunks = data_source.chunk_document(doc_name, index_doc_url, doc_filepath)

INFO: [query/data_source.py:89 - chunk_document()] generating index data for A59449_20240106.pdf
INFO: [query/data_source.py:148 - read_doc()] Reading pdf file A59449_20240106.pdf with 9 pages
INFO: [query/data_source.py:151 - read_doc()] Finished reading pdf file A59449_20240106.pdf


In [16]:
len(text_chunks)

9

In [27]:
text_chunks[8]

'REVISION HISTORY DATE REVISION HISTORY NUMBER REVISION HISTORY EXPLANATION\nAssociated Documents\nRelated Local Coverage Documents\nN/A\nRelated National Coverage Documents\nN/A\nStatutory Requirements URLs\nN/A\nRules and Regulations URLs\nN/A\nCMS Manual Explanations URLs\nN/A\nOther URLs\nN/A\nPublic Versions\nUPDATED ON EFFECTIVE DATES STATUS\n06/01/2023 06/08/2023 - N/A Currently in Effect (This Version)\nKeywords\nDental Services•\nCreated on 01/06/2024. Page 9 of \n9'

In [28]:
embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL
embeddings.get_embeddings(text_chunks, embedding_type)

INFO: [services/embeddings.py:59 - get_embeddings()] generating embeddings with VertexAI-Embedding
INFO: [services/embeddings.py:116 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding
INFO: [services/embeddings.py:116 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding


([True, True, True, True, True, True, True, True, True],
 array([[ 0.03835627, -0.06541958, -0.00505753, ...,  0.0473529 ,
         -0.05531731, -0.00435076],
        [ 0.04384657, -0.03203368, -0.02900794, ...,  0.02707246,
         -0.03703633,  0.0042127 ],
        [ 0.0202451 , -0.02688655, -0.02893464, ...,  0.06515977,
         -0.07371525,  0.00849769],
        ...,
        [-0.00973345, -0.04267366, -0.02435035, ...,  0.0152466 ,
         -0.05134102,  0.00717109],
        [ 0.02009973, -0.05496827, -0.04531021, ...,  0.00680298,
         -0.0600516 ,  0.0019963 ],
        [ 0.05907924, -0.02999166, -0.03159067, ...,  0.01713085,
         -0.04988874, -0.00134143]]))

In [32]:
from utils.html_helper import html_to_text, html_to_sentence_list
query_description = "Policies and guidance on billing for dental procedures using CDT codes for Federally Qualified Health Centers (FQHC).  Includes information on state agency reporting of billing against current dental terminology (CDT) codes.  Also includes information on billing, coding and other guidelines that support the implementation of the CY 2023 Medicare Physician Fee Schedule Final Rule on Dental Services."

In [33]:
# create q_engine
llm_type = DEFAULT_QUERY_EMBEDDING_MODEL
vector_store_type = "langchain_pgvector"
q_engine = QueryEngine(name=query_engine,
                     created_by=user_id,
                     llm_type=llm_type,
                     description=query_description,
                     embedding_type=embedding_type,
                     vector_store=vector_store_type,
                     is_public=is_public,
                     doc_url=doc_url)
q_engine.save()

<common.models.llm_query.QueryEngine at 0x136f52a90>

In [34]:
from services.query.query_service import process_documents, vector_store_from_query_engine

vector_store = vector_store_from_query_engine(q_engine)

Exception: Failed to create vector extension: (psycopg2.OperationalError) connection to server at "127.0.0.1", port 5432 failed: FATAL:  database "pgvector" does not exist

(Background on this error at: https://sqlalche.me/e/14/e3q8)

In [35]:
text_chunks

['Billing and Coding Article\nArticle - Billing and Coding: Dental Services (A59449)\nLinks in PDF documents are not guaranteed to work. To follow a web link, please use the MCD Website.\nNOT AN LCD REFERENCE ARTICLE\nThis article is not in direct support of an LCD. Learn more\nContractor Information\nCONTRACTOR NAME CONTRACT TYPE CONTRACT NUMBER JURISDICTION STATES\nPalmetto GBA A and B MAC 10111 - MAC A J - J Alabama  \nPalmetto GBA A and B MAC 10112 - MAC B J - J Alabama  \nPalmetto GBA A and B MAC 10211 - MAC A J - J Georgia  \nPalmetto GBA A and B MAC 10212 - MAC B J - J Georgia  \nPalmetto GBA A and B MAC 10311 - MAC A J - J Tennessee  \nPalmetto GBA A and B MAC 10312 - MAC B J - J Tennessee  \nPalmetto GBA A and B and HHH MAC 11201 - MAC A J - M South Carolina  \nPalmetto GBA A and B and HHH MAC 11202 - MAC B J - M South Carolina  \nPalmetto GBA A and B and HHH MAC 11301 - MAC A J - M Virginia  \nPalmetto GBA A and B and HHH MAC 11302 - MAC B J - M Virginia  \nPalmetto GBA A and

In [40]:
from w3lib.html import replace_escape_chars
import re

def clean_text(text):
  # Replace specific unprocessable characters
  cleaned_text = text.replace("\x00", "")

  # replace escape characters
  cleaned_text = replace_escape_chars(cleaned_text)

  # remove all non-printable characters
  cleaned_text = re.sub(r"[^\x20-\x7E]", "", cleaned_text)

  return cleaned_text

In [42]:
text_chunks = [clean_text(x) for x in text_chunks]

In [43]:
text_chunks

['Billing and Coding ArticleArticle - Billing and Coding: Dental Services (A59449)Links in PDF documents are not guaranteed to work. To follow a web link, please use the MCD Website.NOT AN LCD REFERENCE ARTICLEThis article is not in direct support of an LCD. Learn moreContractor InformationCONTRACTOR NAME CONTRACT TYPE CONTRACT NUMBER JURISDICTION STATESPalmetto GBA A and B MAC 10111 - MAC A J - J Alabama  Palmetto GBA A and B MAC 10112 - MAC B J - J Alabama  Palmetto GBA A and B MAC 10211 - MAC A J - J Georgia  Palmetto GBA A and B MAC 10212 - MAC B J - J Georgia  Palmetto GBA A and B MAC 10311 - MAC A J - J Tennessee  Palmetto GBA A and B MAC 10312 - MAC B J - J Tennessee  Palmetto GBA A and B and HHH MAC 11201 - MAC A J - M South Carolina  Palmetto GBA A and B and HHH MAC 11202 - MAC B J - M South Carolina  Palmetto GBA A and B and HHH MAC 11301 - MAC A J - M Virginia  Palmetto GBA A and B and HHH MAC 11302 - MAC B J - M Virginia  Palmetto GBA A and B and HHH MAC 11401 - MAC A J - M

In [44]:
QueryEngine.delete_by_id(q_engine.id)

In [30]:
for i in range(0, len(text_chunks)):
  clean_text = html_to_text(text_chunks[i])
  sentences = html_to_sentence_list(text_chunks[i])
  query_doc_chunk = QueryDocumentChunk(
                        query_engine_id=q_engine.id,
                        query_document_id=query_doc.id,
                        index=i+index_base,
                        text=text_chunks[i],
                        clean_text=clean_text,
                        sentences=sentences)

NameError: name 'q_engine' is not defined

In [7]:
storage_client = storage.Client(project=PROJECT_ID)

vector_store = VectorStore(q_engine)

In [10]:
docs_processed

[<common.models.llm_query.QueryDocument at 0x10b68c8e0>]

In [11]:
docs_not_processed

[]