In [48]:
import dotenv

# Load variables from a local .env file into the process environment (python-dotenv).
# NOTE(review): presumably these are the credentials/endpoints the later cells need — confirm.
dotenv.load_dotenv()

True

## Init Client

In [46]:
from rag import weaviate_utils

# Weaviate client shared by the cells below; built by the project helper in rag.weaviate_utils.
client = weaviate_utils.get_weaviate_client()

## Load Markdown

In [75]:
from llama_index import SimpleDirectoryReader
from llama_index import Document
import os

# Folder holding the mission documentation; the path is relative to the
# notebook's working directory.
dir_path: str = "../mission-docs"

# Reader restricted to Markdown files in that folder.
reader = SimpleDirectoryReader(input_dir=dir_path, required_exts=[".md"])

In [78]:
# Read every matching file from disk into llama_index Document objects.
# NOTE(review): the sample output below shows a Document whose text is only a heading while
# file_size is 41188, which suggests each file is split into several Documents — confirm.
documents: list[Document] = reader.load_data()

# Peek at the first document to check its text and metadata.
documents[0]

Document(id_='46d3e062-7b64-4985-aee3-5a1df53f2948', embedding=None, metadata={'file_path': '../mission-docs/console.md', 'file_name': 'console.md', 'file_type': 'text/markdown', 'file_size': 41188, 'creation_date': '2024-02-10', 'last_modified_date': '2024-02-10', 'last_accessed_date': '2024-02-10'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\n3 Console\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

In [40]:
def join_docs(docs: list[Document]) -> list[Document]:
    """Merge all documents that come from the same file into one document.

    Documents are grouped by their ``metadata["file_name"]`` value. Files keep
    the order in which they are first seen, and documents keep their original
    order within each file.

    Args:
        docs: Documents to merge; each must carry a ``"file_name"`` metadata key.

    Returns:
        One new ``Document`` per distinct file name. Its text is the texts of
        that file's documents joined by a blank line, and its only metadata
        entry, ``"chapter"``, is the stripped text of the file's first document.
    """
    # Bucket the documents by source file.
    by_file: dict[str, list[Document]] = {}
    for doc in docs:
        by_file.setdefault(doc.metadata["file_name"], []).append(doc)

    # Build one combined document per bucket.
    merged: list[Document] = []
    for parts in by_file.values():
        combined_text = "\n\n".join(part.text for part in parts)
        merged.append(
            Document(
                text=combined_text,
                metadata={"chapter": parts[0].text.strip()},
            )
        )
    return merged

## Import into Weaviate

In [94]:
from llama_index.vector_stores import WeaviateVectorStore

# Name of the Weaviate class that stores the document chunks.
CLASS_NAME: str = "DocsChunk"

# Vector store bound to that class; chunk text is kept under the "chunk" property.
# NOTE(review): the original comment says this step creates the class — presumably the
# constructor does so when the class is missing; confirm.
vector_store = WeaviateVectorStore(
    weaviate_client=client, index_name=CLASS_NAME, text_key="chunk"
)

In [95]:
from weaviate import Client

def is_populated(weaviate_client: Client, class_name: str) -> bool:
    """Report whether a Weaviate class exists and contains at least one object.

    Args:
        weaviate_client: Client used for both the schema check and the count query.
        class_name: Name of the Weaviate class to inspect.

    Returns:
        ``False`` if the class is missing from the schema or its aggregate
        object count is zero, ``True`` otherwise.
    """
    # Use the client that was passed in (not the notebook-global `client`), so the
    # schema check runs against the same instance as the count query below.
    if not weaviate_client.schema.exists(class_name):
        return False
    result = weaviate_client.query.aggregate(class_name).with_meta_count().do()
    return result["data"]["Aggregate"][class_name][0]["meta"]["count"] != 0
    
# Check whether the class already holds objects before deciding whether to import.
is_populated(client, class_name=CLASS_NAME)

False

In [96]:
from llama_index import VectorStoreIndex, StorageContext

# Build the index: embed and store the documents when the class is still empty,
# otherwise attach to what is already stored in Weaviate.
if not is_populated(client, class_name=CLASS_NAME):
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    index = VectorStoreIndex(
        nodes=documents,
        storage_context=storage_context,
        show_progress=True,
    )
else:
    index = VectorStoreIndex.from_vector_store(vector_store)

Generating embeddings:   0%|          | 0/133 [00:00<?, ?it/s]

## Query Test

In [97]:
# Query engine over the index, created with no arguments (library defaults).
query_engine = index.as_query_engine()

In [98]:
# End-to-end check: ask a question that the mission docs should be able to answer.
response = query_engine.query("Who operates the Aegis Athena mission?")

# Print only the answer text.
print(response.response)

The esteemed Terrestrial Human's Republic operates the Aegis Athena mission.


In [109]:
# Query Weaviate directly (bypassing the llama_index query engine) to inspect which
# stored chunks a near-text search returns for the question.
# NOTE: this rebinds `response`, which held the query-engine answer in the previous cell.
response = client.query.get(
    CLASS_NAME,
    properties=["chunk"]
).with_near_text({
    "concepts": ["Who operate the Aegis Athena mission?"]
}).do()

# Read the results back under CLASS_NAME, the same class that was queried, instead of
# a hard-coded copy of its value, so the lookup keeps working if the constant changes.
for idx, doc in enumerate(response["data"]["Get"][CLASS_NAME][:5]):
    chunk = doc["chunk"]
    print(f"Chunk #{idx}: {chunk}\n\n")

Chunk #0: 

3.1 Introduction

The Aegis Athena space mission is an illustrious epitome of such a pursuit. Primarily destined to accomplish a monumental milestone of delivering the first human astronaut to the lunar surface, this visionary mission is governed by a sophisticated piece of technology known as the S.P.A.C.E.C.R.A.F.T. console.

This highly innovative system adopts a dual-purpose role, integrating the essential features of both a command model and a service module. This strategically harmonious blend of utilitarian efficiency and comforting resources paves the way for an environment conducive for the astronaut-trio executing this expedition, turning it into a virtual sanctuary amidst the vast cosmos. The S.P.A.C.E.C.R.A.F.T. console operates with a plethora of commands, designed to modify the inherent system configurations and retrieve state-of-the-art status information. Such telemetry data is indubitably critical in assessing the spacecraft's performance and securing the t