In [1]:
import os
import sys

# Make the shared `common` package and this service's own sources importable.
# The paths are relative, so they resolve against the kernel's working directory.
sys.path.extend(("../../common/src", "../src"))

In [2]:
# Environment configuration for this session.
# NOTE: a former `!export PROJECT_ID=...` line was dropped on purpose. Each `!`
# command runs in its own short-lived subshell, so an `export` there never reaches
# this kernel's environment. Setting os.environ is what takes effect.
project = "nasa-genie-dev"
os.environ["PROJECT_ID"] = project
# NOTE(review): hard-coded private (10.x) address of the Postgres host. Confirm it
# matches the environment you are running against.
os.environ["PG_HOST"] = "10.10.1.2"

In [4]:
from services.query.vector_store import PostgresVectorStore

In [5]:
from common.models import (UserQuery, QueryResult, QueryEngine, QueryDocument,
                           QueryReference, QueryDocumentChunk, BatchJobModel)

In [6]:
# Load the query engines returned by QueryEngine.fetch_all() and list them as
# (name, id) pairs so one can be picked by name in the next cell.
qe_list = QueryEngine.fetch_all()
[(engine.name, engine.id) for engine in qe_list]

[('nasa-demo-rbac-qe', 'bAXnpVXD8Txh0Ri3fqes'),
 ('Test Manifest', '34KBiwqLDqHMG8RYf5kt'),
 ('nasa-demo-parent-qe', 'PNjLCtT9htwrH9jrvUR7'),
 ('nasa-demo-shpt', 'GGlB2In4OQnSkOwD9Dy4'),
 ('nasa-demo-gcs', 'k6A7rBXmEjtzRwaWySmn'),
 ('nasa-search-integrated-v8', 'fcYZwEthyVcclBwwLpmQ'),
 ('nasa-search-shpt-v7', 'qZaOrVSGl8rvOsDCjAoi'),
 ('nasa-search-gcs-v7', 'qqJkXMwHB2Kb0jiJRXEn'),
 ('nasa-search-integrated-v5', 'nqIU7Oi4qRIOgKllDqsY'),
 ('nasa-search-shpt-v5', 'bteIzSiaaCd3t8vjzDLw'),
 ('nasa-search-gcs-v5', 'FZnCyfyWNsk2gcHErMVR')]

In [7]:
# Select the engine used by all later cells (name taken from the listing above).
q_engine = QueryEngine.find_by_name('Test Manifest')
# Fail here, with a clear message, rather than with an attribute error in a later
# cell if the lookup yields nothing.
# NOTE(review): assumes a missing engine comes back as None rather than raising.
# Confirm against QueryEngine.find_by_name.
assert q_engine is not None, "query engine 'Test Manifest' not found"

In [8]:
# Build the Postgres-backed vector store wrapper for the selected engine, passing
# the engine's own embedding type.
langchain_vector_store = PostgresVectorStore(q_engine, q_engine.embedding_type)

In [9]:
# Example: run a JSON filter string through the store's parse_filter helper and
# display the parsed result.
filter_str = '{"topics": {"$contains":"plasma"}}'
parsed_filter = langchain_vector_store.parse_filter(filter_str)
parsed_filter

{'topics': {'$contains': 'plasma'}}

In [36]:
# Free-text question that is embedded and used as the similarity-search query below.
query_prompt = "what is some recent research in cosmology?"

In [37]:
from services import embeddings
_, query_embeddings = \
      await embeddings.get_embeddings([query_prompt], q_engine.embedding_type)

INFO: [services/embeddings.py:65 - get_embeddings()] generating embeddings with VertexAI-Embedding
INFO: [services/embeddings.py:149 - generate_embeddings()] generating embeddings for embedding type VertexAI-Embedding
INFO: [config/model_config.py:488 - get_provider_value()] Get provider value:
INFO: [config/model_config.py:489 - get_provider_value()] provider_id=Vertex
INFO: [config/model_config.py:490 - get_provider_value()] model_id=VertexAI-Embedding
INFO: [services/embeddings.py:208 - get_vertex_embeddings()] generating Vertex embeddings for 1 chunk(s) embedding model text-embedding-004


In [38]:
# Only one prompt was submitted, so take the first entry.
# NOTE(review): presumably get_embeddings returns one embedding per input text. Confirm.
query_embedding = query_embeddings[0]

In [39]:
# Similarity search through the project wrapper, with no metadata filter passed.
# The output of the next cell shows the result is a list of integer indexes.
match_indexes_list = langchain_vector_store.similarity_search(q_engine, query_embedding)

In [40]:
# Display the indexes returned by the similarity search.
match_indexes_list

[1318, 1380, 700, 680, 1389]

In [41]:
# Look up the QueryDocumentChunk for each matched index within this engine.
qdoc_chunks = [
    QueryDocumentChunk.find_by_index(q_engine.id, chunk_index)
    for chunk_index in match_indexes_list
]

In [42]:
# Display the looked-up chunk objects (they only have the default object repr).
qdoc_chunks

[<common.models.llm_query.QueryDocumentChunk at 0x7f5f6b54b990>,
 <common.models.llm_query.QueryDocumentChunk at 0x7f5f6b531e10>,
 <common.models.llm_query.QueryDocumentChunk at 0x7f5f6b530850>,
 <common.models.llm_query.QueryDocumentChunk at 0x7f5f6b54b6d0>,
 <common.models.llm_query.QueryDocumentChunk at 0x7f5f6b542650>]

In [43]:
# Fetch the parent QueryDocument of every matched chunk, keeping the match order
# (the same document may therefore appear more than once).
qdocs = [
    QueryDocument.find_by_id(chunk.query_document_id)
    for chunk in qdoc_chunks
]

In [44]:
# Show the source URL of each matched document.
[document.doc_url for document in qdocs]

['https://storage.googleapis.com/nasa-genie-dev-source-docs-02/s11214-015-0150-2.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/s11214-015-0150-2.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/PhysRevLett.112.091302.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/PhysRevLett.112.091302.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/s11214-015-0150-2.pdf']

In [51]:
# Filter on an exact document title using the un-prefixed "eq" operator key.
filter_str = (
    '{"title": {"eq":"Establishing the Context for Reconnection Diffusion Region '
    'Encounters and Strategies for the Capture and Transmission of Diffusion Region '
    'Burst Data by MMS"}}'
)
# Alternative form without an operator, kept for reference:
#filter_str = '{"title": "Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS"}'
parsed_filter = langchain_vector_store.parse_filter(filter_str)
parsed_filter

{'title': {'eq': 'Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS'}}

In [52]:
# Number of matches to request from the store.
NUM_MATCH_RESULTS = 5
# Call the wrapped `lc_vector_store` object directly so the parsed metadata filter
# can be passed along with the query embedding.
match_list = langchain_vector_store.lc_vector_store.similarity_search_with_score_by_vector(
    embedding=query_embedding, k=NUM_MATCH_RESULTS, filter=parsed_filter
)

In [53]:
# Inspect the filtered matches. The output shows 3-tuples of (Document, score, string).
# NOTE(review): the third element looks like the chunk index as a string. Confirm.
match_list

[(Document(page_content='Space Sci. ', metadata={'authors': ['T.D. Phan', 'M.A. Shay', 'J.P. Eastwood', 'V. Angelopoulos', 'M. Oieroset', 'M. Oka', 'M. Fujimoto'], 'title': 'Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS', 'publish_date': '04/15/2015', 'topics': ['magnetic reconnection', 'diffusion region', 'magnetopause', 'magnetotail']}),
  0.417557154015501,
  '1380'),
 (Document(page_content='Space Sci. ', metadata={'authors': ['T.D. Phan', 'M.A. Shay', 'J.P. Eastwood', 'V. Angelopoulos', 'M. Oieroset', 'M. Oka', 'M. Fujimoto'], 'title': 'Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS', 'publish_date': '04/15/2015', 'topics': ['magnetic reconnection', 'diffusion region', 'magnetopause', 'magnetotail']}),
  0.417557154015501,
  '1318'),
 (Document(page_content='Space P

In [35]:
# List the source URL of every document returned for this query engine.
[query_doc.doc_url for query_doc in QueryDocument.find_by_query_engine_id(q_engine.id)]

['https://storage.googleapis.com/nasa-genie-dev-source-docs-02/tc-11-2773-2017.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/stz697.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/stz640.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/stz003.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/s11214-015-0150-2.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/PhysRevLett.112.091302.pdf',
 'https://storage.googleapis.com/nasa-genie-dev-source-docs-02/JGR%20Solid%20Earth%20-%202015%20-%20Watkins%20-%20Improved%20methods%20for%20observing%20Earth%20s%20time%20variable%20mass%20distribution%20with%20GRACE%20using.pdf']

In [55]:
# Filter operator names, written without a "$" prefix.
# NOTE(review): these look like the operator names used by LangChain's community
# PGVector store. Confirm against the installed version.
IN = "in"
NIN = "nin"
BETWEEN = "between"
GT = "gt"
LT = "lt"
NE = "ne"
EQ = "eq"
LIKE = "like"
CONTAINS = "contains"
OR = "or"
AND = "and"

In [62]:
# Same title filter as before, but with the "$"-prefixed operator key "$eq".
filter_str = (
    '{"title": {"$eq":"Establishing the Context for Reconnection Diffusion Region '
    'Encounters and Strategies for the Capture and Transmission of Diffusion Region '
    'Burst Data by MMS"}}'
)
# Alternative form without an operator, kept for reference:
#filter_str = '{"title": "Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS"}'
parsed_filter = langchain_vector_store.parse_filter(filter_str)
parsed_filter

{'title': {'$eq': 'Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS'}}

In [63]:
# Pull out the operator dict for the "title" field so its keys can be checked
# against the operator constants defined above.
value = parsed_filter['title']

In [64]:
# Is the "in" operator among the lower-cased keys of `value`?
# (Iterating a dict yields its keys.)
IN in (str.lower(key) for key in value)

False

In [65]:
# Is the "eq" operator among the lower-cased keys of `value`?
# A key spelled "$eq" does not compare equal to "eq", so such a filter is not matched.
EQ in (str.lower(key) for key in value)

False

In [66]:
# Show the operator dict itself. The output below shows that its only key is "$eq".
value

{'$eq': 'Establishing the Context for Reconnection Diffusion Region Encounters and Strategies for the Capture and Transmission of Diffusion Region Burst Data by MMS'}