In [1]:
import os

# GCP project id and region for this notebook session.
PROJECT_ID = "gcp-mira-develop"
REGION = "us-central1"

# Also publish the project id through the process environment.
os.environ.update(PROJECT_ID=PROJECT_ID)

In [2]:
# Uncomment and run the next line to authenticate with gcloud if needed.
#! gcloud auth login

In [3]:
import sys

# Put the two local source trees on the import path. The entries are relative,
# so they resolve against the notebook's working directory.
sys.path.extend(["../../common/src", "../src"])

In [4]:
import tempfile
import os
from typing import List, Optional, Tuple, Dict
from common.utils.logging_handler import Logger
from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,
                           QueryReference, QueryDocumentChunk, BatchJobModel)
from common.utils.errors import (ResourceNotFoundException,
                                 ValidationError)
from common.utils.http_exceptions import InternalServerError
from utils.errors import NoDocumentsIndexedException
from google.cloud import storage
from services import llm_generate, embeddings
from services.query import query_prompts
from services.query.vector_store import VectorStore
from services.query.data_source import DataSource

from config import (PROJECT_ID, DEFAULT_QUERY_CHAT_MODEL,
                    DEFAULT_QUERY_EMBEDDING_MODEL)
import spacy

INFO: [config/config.py:55 - <module>()] Namespace File not found, setting job namespace as default
INFO: [config/config.py:105 - <module>()] ENABLE_GOOGLE_LLM = True
INFO: [config/config.py:106 - <module>()] ENABLE_OPENAI_LLM = True
INFO: [config/config.py:107 - <module>()] ENABLE_COHERE_LLM = True
INFO: [config/config.py:108 - <module>()] ENABLE_GOOGLE_MODEL_GARDEN = True
INFO: [config/config.py:109 - <module>()] ENABLE_TRUSS_LLAMA2 = True
INFO: [config/vector_store_config.py:40 - <module>()] Default vector store = [langchain_pgvector]
INFO: [config/vector_store_config.py:49 - <module>()] PG_HOST = [127.0.0.1]
INFO: [config/vector_store_config.py:50 - <module>()] PG_DBNAME = [pgvector]


In [5]:
from spacy.lang.en import English

# Instantiate spaCy's English language class and attach the "sentencizer"
# component. add_pipe() is the cell's last expression, which is why the
# output shows a Sentencizer instance.
nlp_en = English()
nlp_en.add_pipe("sentencizer")

<spacy.pipeline.sentencizer.Sentencizer at 0x13526d900>

In [6]:
# One-time setup per VM/environment: uncomment the next line to download the
# "en_core_web_sm" model before loading it.
#!python -m spacy download en_core_web_sm

# Load the "en_core_web_sm" spaCy pipeline.
nlp = spacy.load("en_core_web_sm")

In [7]:
# Settings for the query engine built further down in this notebook.
query_engine = "lukman-test-gs"   # passed as the QueryEngine name
doc_url = "gs://mira-demo-docs"   # location of the documents to index
user_id = "RkRwdej4IlTvdHdZWCie"  # passed as the engine's created_by
is_public = True

# Storage client handed to datasource_from_url().
storage_client = storage.Client(project=PROJECT_ID)

In [8]:
from services.query.query_service import datasource_from_url

In [9]:
# Build the data source object for doc_url (a gs:// bucket URL here).
data_source = datasource_from_url(doc_url, storage_client)

In [10]:
# Download the documents behind doc_url into the platform's temp directory.
# tempfile.gettempdir() honours TMPDIR and works on non-Linux hosts, unlike a
# hard-coded "/tmp".
temp_dir = tempfile.gettempdir()
doc_filepaths = data_source.download_documents(doc_url, temp_dir)

INFO: [query/data_source.py:57 - download_documents()] downloading gs://mira-demo-docs from bucket mira-demo-docs


In [11]:
# Continue with the first downloaded document only. Each entry is unpacked
# into three fields; indexing [0] fails if nothing was downloaded.
doc_name, index_doc_url, doc_filepath = doc_filepaths[0]

In [12]:
# Read the downloaded document and split it into text chunks.
text_chunks = data_source.chunk_document(doc_name, index_doc_url, doc_filepath)

INFO: [query/data_source.py:91 - chunk_document()] generating index data for A59449_20240106.pdf
INFO: [query/data_source.py:171 - read_doc()] Reading pdf file A59449_20240106.pdf with 9 pages
INFO: [query/data_source.py:174 - read_doc()] Finished reading pdf file A59449_20240106.pdf


In [13]:
# Number of chunks produced. The 9-page PDF yields 9 chunks, so chunking
# looks page-based here (unconfirmed).
len(text_chunks)

9

In [14]:
# Peek at the last chunk (index 8 of the 9 chunks).
text_chunks[8]

'REVISION HISTORY DATE REVISION HISTORY NUMBER REVISION HISTORY EXPLANATIONAssociated DocumentsRelated Local Coverage DocumentsN/ARelated National Coverage DocumentsN/AStatutory Requirements URLsN/ARules and Regulations URLsN/ACMS Manual Explanations URLsN/AOther URLsN/APublic VersionsUPDATED ON EFFECTIVE DATES STATUS06/01/2023 06/08/2023 - N/A Currently in Effect (This Version)KeywordsDental ServicesCreated on 01/06/2024. Page 9 of 9'

In [15]:
# Split one chunk into sentences to eyeball the splitter's output.
data_source.text_to_sentence_list(text_chunks[6])

['If a dentist believes that Medicare will deny some or all the services or items because of medical necessity or an inextricable link may not be present, an Advance Beneficiary Notice of Noncoverage (ABN) should be issued in writing to the Medicare beneficiary.',
 'The ABN is optional when Medicare never covers a service, for example, a benefit category denial, but should be used if Medicare does cover the service for some diagnoses, but the dentist believes it will not be covered for a particular situation.',
 'To learn more about the ABN process, visit this A/B MACs website at Interactive ABN (palmettogba.com)',
 '.This billing and coding article is not to be construed nor imply coverage of dental screening services, dental prophylaxis, treatment of simple dental caries, routine tooth extractions, dental prosthetics/splints/dentures/oral appliances, nor definitive reconstruction or restoration of dental structures because of the removal of identified infection and/or the source.',
 

In [None]:
# Embed all chunks with the default query embedding model. The result is only
# displayed, not stored; embedding_type is reused when the QueryEngine is
# created.
embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL
embeddings.get_embeddings(text_chunks, embedding_type)

In [None]:
from utils.html_helper import html_to_text, html_to_sentence_list
# Free-text description of the document set; passed to QueryEngine as `description`.
query_description = "Policies and guidance on billing for dental procedures using CDT codes for Federally Qualified Health Centers (FQHC).  Includes information on state agency reporting of billing against current dental terminology (CDT) codes.  Also includes information on billing, coding and other guidelines that support the implementation of the CY 2023 Medicare Physician Fee Schedule Final Rule on Dental Services."

In [None]:
# Create and persist the QueryEngine record.
# llm_type names the chat/generation model, so it comes from
# DEFAULT_QUERY_CHAT_MODEL; the embedding model is configured separately via
# embedding_type and must not be reused here.
llm_type = DEFAULT_QUERY_CHAT_MODEL
vector_store_type = "langchain_pgvector"
q_engine = QueryEngine(name=query_engine,
                       created_by=user_id,
                       llm_type=llm_type,
                       description=query_description,
                       embedding_type=embedding_type,
                       vector_store=vector_store_type,
                       is_public=is_public,
                       doc_url=doc_url)
q_engine.save()

In [None]:
from services.query.query_service import process_documents, vector_store_from_query_engine

# Obtain the vector store object for this engine through the query service helper.
vector_store = vector_store_from_query_engine(q_engine)

In [None]:
# Show the chunks as produced by chunk_document(), before clean_text() is applied.
text_chunks

In [None]:
from w3lib.html import replace_escape_chars
import re

def clean_text(text):
  """Reduce `text` to printable ASCII (code points 0x20-0x7E).

  Steps, in order: drop NUL characters, run w3lib's replace_escape_chars
  (which by default removes newline, tab and carriage-return characters),
  then delete every character outside the printable-ASCII range. The last
  step also removes all non-ASCII characters (accented letters, typographic
  dashes and quotes, ...), not only control characters.

  Args:
    text: the string to clean.

  Returns:
    The cleaned string.
  """
  without_nul = text.replace("\x00", "")
  without_escapes = replace_escape_chars(without_nul)
  return re.sub(r"[^\x20-\x7E]", "", without_escapes)

In [None]:
# Clean every chunk, rebinding text_chunks to the cleaned list (the raw
# chunks are no longer available under this name afterwards).
text_chunks = [clean_text(x) for x in text_chunks]

In [None]:
# Show the chunks again after cleaning.
text_chunks

In [None]:
# Delete the QueryEngine record by id.
# NOTE(review): the following cell still reads q_engine.id; confirm whether
# this cleanup is meant to run last.
QueryEngine.delete_by_id(q_engine.id)

In [None]:
# Build a QueryDocumentChunk model for every text chunk.
# NOTE(review): `query_doc` and `index_base` are not assigned in any visible
# cell, so this fails on a fresh kernel; define them before running. The
# chunk objects are only constructed here (each iteration rebinds
# query_doc_chunk) and are not saved.
for i, chunk_text in enumerate(text_chunks):
  # Named chunk_clean_text so the loop does not rebind (and thereby destroy)
  # the clean_text() helper function defined earlier in this notebook.
  chunk_clean_text = html_to_text(chunk_text)
  sentences = html_to_sentence_list(chunk_text)
  query_doc_chunk = QueryDocumentChunk(
      query_engine_id=q_engine.id,
      query_document_id=query_doc.id,
      index=i+index_base,
      text=chunk_text,
      clean_text=chunk_clean_text,
      sentences=sentences)

In [None]:
# NOTE(review): storage_client is re-created here with the same arguments as
# in an earlier cell, and vector_store (previously obtained from
# vector_store_from_query_engine) is rebound to a directly constructed
# VectorStore. Confirm that VectorStore can be instantiated directly and that
# this override is intended.
storage_client = storage.Client(project=PROJECT_ID)

vector_store = VectorStore(q_engine)

In [None]:
# NOTE(review): docs_processed is never assigned in the visible cells, so this
# raises NameError on a fresh kernel. Presumably it should come from
# process_documents() -- confirm.
docs_processed

In [None]:
# NOTE(review): docs_not_processed is never assigned in the visible cells, so
# this raises NameError on a fresh kernel. Presumably it should come from
# process_documents() -- confirm.
docs_not_processed