In [1]:
PROJECT_ID = "gcp-mira-develop"
REGION = "us-central1"
import os
os.environ["PROJECT_ID"] = PROJECT_ID

In [2]:
#! gcloud auth login

In [3]:
import sys
sys.path.append("../../common/src")
sys.path.append("../src")

In [4]:
import tempfile
import os
from typing import List, Optional, Tuple, Dict
from common.utils.logging_handler import Logger
from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,
                           QueryReference, QueryDocumentChunk, BatchJobModel)
from common.utils.errors import (ResourceNotFoundException,
                                 ValidationError)
from common.utils.http_exceptions import InternalServerError
from utils.errors import NoDocumentsIndexedException
from google.cloud import storage
from services import llm_generate, embeddings
from services.query import query_prompts
from services.query.vector_store import VectorStore
from services.query.data_source import DataSource

from config import (PROJECT_ID, DEFAULT_QUERY_CHAT_MODEL,
                    DEFAULT_QUERY_EMBEDDING_MODEL)
import spacy

INFO: [config/config.py:55 - <module>()] Namespace File not found, setting job namespace as default
INFO: [config/config.py:105 - <module>()] ENABLE_GOOGLE_LLM = True
INFO: [config/config.py:106 - <module>()] ENABLE_OPENAI_LLM = True
INFO: [config/config.py:107 - <module>()] ENABLE_COHERE_LLM = True
INFO: [config/config.py:108 - <module>()] ENABLE_GOOGLE_MODEL_GARDEN = True
INFO: [config/config.py:109 - <module>()] ENABLE_TRUSS_LLAMA2 = True
INFO: [config/vector_store_config.py:40 - <module>()] Default vector store = [langchain_pgvector]
INFO: [config/vector_store_config.py:49 - <module>()] PG_HOST = [127.0.0.1]
INFO: [config/vector_store_config.py:50 - <module>()] PG_DBNAME = [pgvector]
INFO: [utils/text_helper.py:37 - <module>()] loaded spacy model


In [5]:
# do this once per vm environment
#!python -m spacy download en_core_web_sm

nlp = spacy.load("en_core_web_sm")

In [6]:
#query_engine = "lukman-test-gs"
#doc_url = "gs://mira-demo-docs"
#user_id = "RkRwdej4IlTvdHdZWCie"
#is_public=True
#storage_client = storage.Client(project=PROJECT_ID)

In [7]:
query_engine = "lukman-test-html"
doc_url = "https://health.ny.gov/health_care/medicaid/rates/manual/apg_provider_manual_december.htm"
user_id = "RkRwdej4IlTvdHdZWCie"
query_description="test depth limit"
storage_client = storage.Client(project=PROJECT_ID)

In [8]:
from services.query.query_service import datasource_from_url

In [9]:
# create q_engine
params = {"depth_limit": 1}
embedding_type = DEFAULT_QUERY_EMBEDDING_MODEL
llm_type = DEFAULT_QUERY_EMBEDDING_MODEL
vector_store_type = "langchain_pgvector"
q_engine = QueryEngine(name=query_engine,
                     created_by=user_id,
                     llm_type=llm_type,
                     description=query_description,
                     embedding_type=embedding_type,
                     vector_store=vector_store_type,
                     params=params,
                     doc_url=doc_url)
q_engine.save()

<common.models.llm_query.QueryEngine at 0x136a9a0d0>

In [10]:
data_source = datasource_from_url(doc_url, q_engine, storage_client)
data_source

INFO: [query/query_service.py:533 - datasource_from_url()] creating WebDataSource with depth limit [1]


<services.query.web_datasource.WebDataSource at 0x138c57c40>

In [11]:
temp_dir = "/tmp"
doc_filepaths = data_source.download_documents(doc_url, temp_dir)
doc_filepaths

2024-01-07 20:44:09 [scrapy.utils.log] INFO: Scrapy 2.11.0 started (bot: scrapybot)
2024-01-07 20:44:09 [scrapy.utils.log] INFO: Versions: lxml 5.0.1.0, libxml2 2.12.3, cssselect 1.2.0, parsel 1.8.1, w3lib 2.1.2, Twisted 22.10.0, Python 3.9.13 (v3.9.13:6de2ca5339, May 17 2022, 11:37:23) - [Clang 13.0.0 (clang-1300.0.29.30)], pyOpenSSL 23.3.0 (OpenSSL 3.1.4 24 Oct 2023), cryptography 41.0.7, Platform macOS-14.2.1-x86_64-i386-64bit
2024-01-07 20:44:09 [scrapy.addons] INFO: Enabled addons:
[]


See the documentation of the 'REQUEST_FINGERPRINTER_IMPLEMENTATION' setting for information on how to handle this deprecation.
  return cls(crawler)

2024-01-07 20:44:09 [scrapy.extensions.telnet] INFO: Telnet Password: 2f9776d62320d56f
2024-01-07 20:44:10 [scrapy.middleware] INFO: Enabled extensions:
['scrapy.extensions.corestats.CoreStats',
 'scrapy.extensions.telnet.TelnetConsole',
 'scrapy.extensions.memusage.MemoryUsage',
 'scrapy.extensions.logstats.LogStats']
2024-01-07 20:44:10 [scrapy.craw

INFO: [query/web_datasource.py:72 - save_content()] Saving health_care_medicaid_rates_manual_apg_provider_manual_december.htm to /tmp
INFO: [query/web_datasource.py:76 - save_content()] 108529 bytes written
INFO: [query/web_datasource.py:239 - _item_scraped()] Downloaded Response URL: https://health.ny.gov/health_care/medicaid/rates/manual/apg_provider_manual_december.htm
INFO: [query/web_datasource.py:72 - save_content()] Saving nysvets_web_.html to /tmp
INFO: [query/web_datasource.py:76 - save_content()] 4182 bytes written
INFO: [query/web_datasource.py:239 - _item_scraped()] Downloaded Response URL: https://apps.health.ny.gov/nysvets/web/
INFO: [query/web_datasource.py:72 - save_content()] Saving facilities_.html to /tmp
INFO: [query/web_datasource.py:76 - save_content()] 13878 bytes written
INFO: [query/web_datasource.py:239 - _item_scraped()] Downloaded Response URL: https://health.ny.gov/facilities/
INFO: [query/web_datasource.py:72 - save_content()] Saving facilities_home_care_.

2024-01-07 20:44:24 [scrapy.core.engine] INFO: Closing spider (finished)
2024-01-07 20:44:24 [scrapy.statscollectors] INFO: Dumping Scrapy stats:
{'downloader/request_bytes': 47897,
 'downloader/request_count': 119,
 'downloader/request_method_count/GET': 119,
 'downloader/response_bytes': 4130566,
 'downloader/response_count': 119,
 'downloader/response_status_count/200': 107,
 'downloader/response_status_count/301': 2,
 'downloader/response_status_count/403': 9,
 'downloader/response_status_count/404': 1,
 'dupefilter/filtered': 13,
 'elapsed_time_seconds': 14.043162,
 'finish_reason': 'finished',
 'finish_time': datetime.datetime(2024, 1, 8, 1, 44, 24, 278441, tzinfo=datetime.timezone.utc),
 'httpcompression/response_bytes': 57117,
 'httpcompression/response_count': 10,
 'item_scraped_count': 106,
 'log_count/INFO': 10,
 'memusage/max': 472641536,
 'memusage/startup': 472641536,
 'request_depth_max': 1,
 'response_received_count': 117,
 'scheduler/dequeued': 119,
 'scheduler/dequeue

finished
https://health.ny.gov/health_care/medicaid/rates/manual/apg_provider_manual_december.htm
https://apps.health.ny.gov/nysvets/web/
https://health.ny.gov/facilities/
https://health.ny.gov/facilities/home_care/
https://health.ny.gov/facilities/hospital/
https://health.ny.gov/related/
https://health.ny.gov/health_care/medicaid/rates/apg/index.htm
https://health.ny.gov/facilities/school_based_health_centers/
https://health.ny.gov/facilities/nursing/
https://health.ny.gov/about/
https://health.ny.gov/contact/translate.htm
https://health.ny.gov/contact/accessibility.htm
https://health.ny.gov/about/disclaim.htm
https://health.ny.gov/professionals/
https://health.ny.gov/permits/
https://health.ny.gov/professionals/protocols_and_guidelines/
https://health.ny.gov/about/privacy.htm
https://health.ny.gov/facilities/adult_care/
https://health.ny.gov/help/file_types.htm
https://health.ny.gov/help/text_size.htm
https://health.ny.gov/environmental/wastewater.htm
https://health.ny.gov/prevention

[('health_care_medicaid_rates_manual_apg_provider_manual_december.htm',
  'https://health.ny.gov/health_care/medicaid/rates/manual/apg_provider_manual_december.htm',
  '/tmp/health_care_medicaid_rates_manual_apg_provider_manual_december.htm'),
 ('nysvets_web_.html',
  'https://apps.health.ny.gov/nysvets/web/',
  '/tmp/nysvets_web_.html'),
 ('facilities_.html',
  'https://health.ny.gov/facilities/',
  '/tmp/facilities_.html'),
 ('facilities_home_care_.html',
  'https://health.ny.gov/facilities/home_care/',
  '/tmp/facilities_home_care_.html'),
 ('facilities_hospital_.html',
  'https://health.ny.gov/facilities/hospital/',
  '/tmp/facilities_hospital_.html'),
 ('related_.html', 'https://health.ny.gov/related/', '/tmp/related_.html'),
 ('health_care_medicaid_rates_apg_index.htm',
  'https://health.ny.gov/health_care/medicaid/rates/apg/index.htm',
  '/tmp/health_care_medicaid_rates_apg_index.htm'),
 ('facilities_school_based_health_centers_.html',
  'https://health.ny.gov/facilities/school_

In [12]:
doc_filepaths

[('health_care_medicaid_rates_manual_apg_provider_manual_december.htm',
  'https://health.ny.gov/health_care/medicaid/rates/manual/apg_provider_manual_december.htm',
  '/tmp/health_care_medicaid_rates_manual_apg_provider_manual_december.htm')]

In [None]:
doc_name, index_doc_url, doc_filepath = doc_filepaths[0]

In [None]:
text_chunks = data_source.chunk_document(doc_name, index_doc_url, doc_filepath)

In [None]:
len(text_chunks)

In [None]:
text_chunks[8]

In [None]:
data_source.text_to_sentence_list(text_chunks[6])

In [None]:
embeddings.get_embeddings(text_chunks, embedding_type)

In [None]:
from utils.html_helper import html_to_text, html_to_sentence_list
query_description = "Policies and guidance on billing for dental procedures using CDT codes for Federally Qualified Health Centers (FQHC).  Includes information on state agency reporting of billing against current dental terminology (CDT) codes.  Also includes information on billing, coding and other guidelines that support the implementation of the CY 2023 Medicare Physician Fee Schedule Final Rule on Dental Services."

In [None]:
from services.query.query_service import process_documents, vector_store_from_query_engine

vector_store = vector_store_from_query_engine(q_engine)

In [None]:
text_chunks

In [None]:
from w3lib.html import replace_escape_chars
import re

def clean_text(text):
  # Replace specific unprocessable characters
  cleaned_text = text.replace("\x00", "")

  # replace escape characters
  cleaned_text = replace_escape_chars(cleaned_text)

  # remove all non-printable characters
  cleaned_text = re.sub(r"[^\x20-\x7E]", "", cleaned_text)

  return cleaned_text

In [None]:
text_chunks = [clean_text(x) for x in text_chunks]

In [None]:
text_chunks

In [None]:
QueryEngine.delete_by_id(q_engine.id)

In [None]:
for i in range(0, len(text_chunks)):
  clean_text = html_to_text(text_chunks[i])
  sentences = html_to_sentence_list(text_chunks[i])
  query_doc_chunk = QueryDocumentChunk(
                        query_engine_id=q_engine.id,
                        query_document_id=query_doc.id,
                        index=i+index_base,
                        text=text_chunks[i],
                        clean_text=clean_text,
                        sentences=sentences)

In [None]:
storage_client = storage.Client(project=PROJECT_ID)

vector_store = VectorStore(q_engine)

In [None]:
docs_processed

In [None]:
docs_not_processed