In [1]:
import os

from langchain.chains import RetrievalQA
from langchain.document_loaders import PyMuPDFLoader
from langchain.retrievers import BM25Retriever, EnsembleRetriever
from langchain.retrievers.contextual_compression import ContextualCompressionRetriever
from langchain.retrievers.document_compressors import CrossEncoderReranker
from langchain.text_splitter import RecursiveCharacterTextSplitter
from langchain.vectorstores import Chroma as ChromaVectorStore
from langchain_chroma import Chroma as ChromaLoader
from langchain_community.cross_encoders import HuggingFaceCrossEncoder
from langchain_core.prompts.prompt import PromptTemplate
from langchain_ollama import OllamaEmbeddings, OllamaLLM

In [2]:
# --- Model selection -------------------------------------------------------
# Ollama embedding model (alternatives noted: snowflake-arctic-embed2, nomic-embed-text).
OLLAMA_EMBEDDINGS_MODEL = "snowflake-arctic-embed2"
# Ollama generation model (alternative noted: zephyr:7b-beta).
OLLAMA_LLM_MODEL = "llama3.2:1b"
# Hugging Face cross-encoder used for re-ranking
# (alternatives noted: BAAI/bge-reranker-large, BAAI/bge-reranker-base).
HF_RERANKER = "BAAI/bge-reranker-large"

# --- Prompt ----------------------------------------------------------------
# One list entry per output line; "" entries are blank separator lines.
# `{question}` and `{context}` are the two placeholders filled in at query time.
QA_PROMPT_TEMPLATE = "\n".join(
    [
        "You are a technical assistant for u-blox.",
        "Answer the question using only the provided context.",
        (
            "- If the answer is not explicitly contained in the context, respond only with: "
            '"The document does not specify." Do not add anything else.'
        ),
        "- Provide concise, technical answers. Use bullet points for lists.",
        "- If the context contains conflicting information, state that clearly without guessing.",
        "",
        "Question: {question}",
        "",
        "Context:",
        "{context}",
    ]
)

In [3]:
# Source PDF, relative to the notebook's working directory.
doc_path = "data/InterfaceDescription.pdf"

# Read the PDF into a list of LangChain documents.
pdf_loader = PyMuPDFLoader(doc_path)
docs = pdf_loader.load()

In [4]:
# Cut the loaded documents into overlapping pieces so that each piece is
# small enough to embed and retrieve on its own.
splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,  # upper bound on the size of one chunk
    chunk_overlap=250,  # amount shared between neighbouring chunks
)
chunks = splitter.split_documents(docs)

In [5]:
persist_dir = "chroma_store"
collection_name = "ubx_docs"
embeddings_model = OllamaEmbeddings(model=OLLAMA_EMBEDDINGS_MODEL)

# Reuse the on-disk Chroma collection when one already exists; otherwise embed
# all chunks and persist them. Both branches go through the same
# `langchain_chroma` class so `vectorstore` has one type regardless of which
# branch ran.
#
# NOTE: the cache check only looks at whether the directory is non-empty. It
# does not notice a changed PDF, chunking setup or embedding model — delete
# `persist_dir` to force a rebuild after changing any of those.
if os.path.isdir(persist_dir) and os.listdir(persist_dir):
    vectorstore = ChromaLoader(
        persist_directory=persist_dir,
        collection_name=collection_name,
        embedding_function=embeddings_model,
    )
else:
    vectorstore = ChromaLoader.from_documents(
        documents=chunks,
        embedding=embeddings_model,
        persist_directory=persist_dir,
        collection_name=collection_name,
    )

In [None]:
# Dense (embedding similarity) retriever over the Chroma store, top 5 hits.
vectorstore_retriever = vectorstore.as_retriever(search_kwargs={"k": 5})

# Sparse (BM25 keyword) retriever over the same chunks, top 5 hits.
# `k` must be passed as its own keyword: `from_documents` forwards extra
# keywords to the retriever constructor, so the previous `kwargs={"k": 5}`
# supplied an argument literally named "kwargs" and never set `k`.
keyword_retriever = BM25Retriever.from_documents(chunks, k=5)

In [7]:
# Cross-encoder that re-scores the retrieved candidates and keeps the best 5.
reranker = CrossEncoderReranker(
    model=HuggingFaceCrossEncoder(model_name=HF_RERANKER), top_n=5
)

# Hybrid retrieval: merge dense and BM25 results. Weights follow the order of
# `retrievers`, i.e. 0.3 for the vector store and 0.7 for BM25.
ensemble_retriever = EnsembleRetriever(
    retrievers=[vectorstore_retriever, keyword_retriever], weights=[0.3, 0.7]
)

# "stuff" QA chain: the re-ranked chunks are inserted into the prompt's
# `{context}` slot and the user query into `{question}`.
#
# `chain_type_kwargs` belongs to `RetrievalQA.from_chain_type`; it was
# previously passed to `ContextualCompressionRetriever`, so the custom prompt
# was never applied. The `search_kwargs={"score_threshold": 0.2}` argument
# that was passed alongside it has been dropped: the compression retriever
# only takes `base_compressor` and `base_retriever`, so it had no effect.
hybrid_chain = RetrievalQA.from_chain_type(
    llm=OllamaLLM(model=OLLAMA_LLM_MODEL, temperature=0.2),
    chain_type="stuff",
    retriever=ContextualCompressionRetriever(
        base_compressor=reranker,
        base_retriever=ensemble_retriever,
    ),
    chain_type_kwargs={"prompt": PromptTemplate.from_template(QA_PROMPT_TEMPLATE)},
)

In [8]:
# Example query run through the full retrieve -> re-rank -> generate pipeline.
question = (
    "Which UBX message to use to get satellites in view? I need their IDs and constellations."
)
response = hybrid_chain.invoke(question)

# The chain returns a mapping; the generated answer is stored under "result".
print(response["result"])

To determine which UBX message to use to get satellites in view, you should look for a message that contains information about the GNSS (Global Navigation Satellite System) satellites. 

The NMEA-Standard-GGA is mentioned in the context of getting global positioning fix data, but it does not provide information about satellite constellations.

On the other hand, the UBX-NAV-SIG message is specifically designed to report on signal identifications and constellation information for a given set of GNSS satellites. This message can be used to get information about the satellites in view, including their IDs and constellations.

Therefore, you should use the UBX-NAV-SIG message to get satellites in view.
