In [2]:
from utils import environment_utils

# Initialise the environment through the project helper before any client is created
# (presumably API keys / endpoints for the services used below — TODO confirm).
# NOTE(review): the Streamlit "command:" notice in this cell's output suggests that
# load_env() goes through Streamlit — confirm that is intended inside a notebook kernel.
environment_utils.load_env()

2024-03-09 13:59:50.234 
  command:

    streamlit run /home/kangonaut/data/aegis-athena/venv/lib/python3.11/site-packages/ipykernel_launcher.py [ARGUMENTS]


## Load Markdown

In [3]:
from llama_index.core import Document

from rag import load_utils

# Load the mission documentation as llama-index `Document` objects.
# The path is relative, so the kernel must be started from a directory that sits
# next to `mission-docs` (the recorded output shows it was run from `experiments/`).
documents: list[Document] = load_utils.load_data_dir(dir_path="../mission-docs")

In [4]:
# Exclude `file_path` metadata from the LLM context and from the embeddings.
# The membership check keeps the cell idempotent: re-running it on the same
# `documents` no longer appends a duplicate "file_path" entry to either list.
for document in documents:
    for excluded_keys in (
        document.excluded_embed_metadata_keys,
        document.excluded_llm_metadata_keys,
    ):
        if "file_path" not in excluded_keys:
            excluded_keys.append("file_path")

In [5]:
# Spot-check the first document: "file_path" should now appear in both
# `excluded_embed_metadata_keys` and `excluded_llm_metadata_keys`.
documents[0]

Document(id_='0569b862-1661-42c5-bb17-d15825ebdeb1', embedding=None, metadata={'file_path': '/home/kangonaut/data/aegis-athena/experiments/../mission-docs/console.md', 'file_name': '/home/kangonaut/data/aegis-athena/experiments/../mission-docs/console.md', 'file_type': 'text/markdown', 'file_size': 40904, 'creation_date': '2024-02-27', 'last_modified_date': '2024-02-27', 'last_accessed_date': None, 'section_title': '3 Console'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date', 'file_path'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date', 'file_path'], relationships={}, text='3 Console\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

## Parse Nodes into Sentences and Reference Parent Block

In [6]:
from rag.node_parser.mock_node_parser import MockNodeParser
from rag.node_parser.individual_sentence_node_parser import IndividualSentenceNodeParser
from llama_index.core.node_parser import HierarchicalNodeParser

# Parsers keyed by hierarchy-level id. The dict's insertion order is reused as the
# level order handed to the hierarchical parser: "mock" first, then "sentences".
level_parsers = {
    "mock": MockNodeParser(),
    "sentences": IndividualSentenceNodeParser.from_defaults(),
}

node_parser = HierarchicalNodeParser.from_defaults(
    node_parser_ids=list(level_parsers),
    node_parser_map=level_parsers,
)

In [7]:
from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes

# Run every document through the two-level parser.
hierarchical_nodes = node_parser.get_nodes_from_documents(documents)

# Split the hierarchy into its two ends: roots go to the docstore,
# leaves go to the vector store (see the import sections below).
root_nodes = get_root_nodes(hierarchical_nodes)
leaf_nodes = get_leaf_nodes(hierarchical_nodes)

print(
    f"#leaf_nodes: {len(leaf_nodes)}",
    f"#root_nodes: {len(root_nodes)}",
    sep="\n",
)

#leaf_nodes: 474
#root_nodes: 133


## Import Root Nodes into MongoDB

In [8]:
from llama_index.storage.docstore.mongodb import MongoDocumentStore
from rag import mongodb_utils

# Create the MongoDB client and wrap it as a document store via the project helpers
# (presumably a `MongoDocumentStore`, given this cell's import — TODO confirm;
# connection settings are not visible in this notebook).
mongodb_client = mongodb_utils.get_client()
mongodb_docstore = mongodb_utils.as_docstore(mongodb_client)

In [9]:
# Delete previous data so that re-running the notebook does not leave stale
# root nodes from an earlier run in the docstore.
# Only the ids are needed, so snapshot them up front: this avoids the unused
# `document` loop variable (which rebound the notebook-level `document` name)
# and keeps the deletion loop independent of the mapping being emptied.
stale_doc_ids: list[str] = list(mongodb_docstore.docs)
for doc_id in stale_doc_ids:
    mongodb_docstore.delete_document(doc_id)

counter: int = len(stale_doc_ids)
print(f"{counter} documents deleted")

133 documents deleted


In [10]:
# Persist the root (parent) nodes in the docstore; this docstore is later handed
# to the auto-merging retriever through `mongodb_storage_context`.
mongodb_docstore.add_documents(root_nodes)

## Import Leaf Nodes into Weaviate

In [11]:
from rag import weaviate_utils

# Name of the Weaviate class that holds the leaf-node chunks.
WEAVIATE_CLASS_NAME = "AutoMergingDocsChunk"

# Connect to Weaviate and expose the class as a vector store via the project helpers.
# NOTE(review): this cell's output carries a "please consider upgrading" notice from
# the Weaviate client — the installed client looks outdated; confirm before changing APIs.
client = weaviate_utils.get_weaviate_client()
vector_store = weaviate_utils.as_vector_store(client, WEAVIATE_CLASS_NAME)

            Please consider upgrading to the latest version. See https://weaviate.io/developers/weaviate/client-libraries/python for details.


In [12]:
# Delete previous data: drop the whole Weaviate class so the index is rebuilt from scratch.
# NOTE(review): verify how `delete_class` behaves when the class does not exist yet
# (first run against a fresh Weaviate instance).
client.schema.delete_class(WEAVIATE_CLASS_NAME)

In [13]:
from llama_index.core import StorageContext, ServiceContext
from llama_index.embeddings.openai import OpenAIEmbedding
from rag import index_utils

# Embedding model for the leaf nodes; configured with an embedding batch size of 50.
embed_model = OpenAIEmbedding(embed_batch_size=50)

# Storage context that points the index at the Weaviate-backed vector store.
weaviate_storage_context = StorageContext.from_defaults(vector_store=vector_store)

# Populate: embed the leaf nodes and write them into the index.
weaviate_index = index_utils.populate_index(
    embed_model=embed_model,
    storage_context=weaviate_storage_context,
    docs=leaf_nodes,
)

Generating embeddings:   0%|          | 0/474 [00:00<?, ?it/s]

## Testing

In [14]:
from llama_index.core import StorageContext

# Storage context backed by the MongoDB docstore, i.e. the store that received
# the root (parent) nodes earlier in the notebook.
mongodb_storage_context = StorageContext.from_defaults(
    docstore=mongodb_docstore,
)

In [15]:
from llama_index.core.retrievers import AutoMergingRetriever

# Plain vector search over the leaf nodes (top 6 hits). It is also queried on its
# own further down as the baseline for comparison.
weaviate_retriever = weaviate_index.as_retriever(similarity_top_k=6)

# Auto-merging wrapper around the vector retriever; parent nodes are looked up in
# the MongoDB-backed storage context. `verbose=True` prints each merge decision.
# NOTE(review): `simple_ratio_thresh=0.3` is presumably the share of a parent's
# children that must be retrieved before they are merged — confirm against the docs.
retriever = AutoMergingRetriever(
    weaviate_retriever,
    mongodb_storage_context,
    simple_ratio_thresh=0.3,
    verbose=True,
)

In [16]:
from llama_index.core.schema import BaseNode


def display_nodes(nodes: list[BaseNode]) -> None:
    """Print every node's text under a numbered ``Node #<index>:`` header.

    Args:
        nodes: Items exposing a ``text`` attribute; they are printed in the
            order given, numbered from 0.

    Returns:
        None. The output goes to stdout, one block per node followed by a
        blank line.
    """
    position = 0
    for node in nodes:
        print(f"Node #{position}:\n{node.text}\n")
        position += 1

In [17]:
query: str = "What is the main objective of the mission?"

# Run the same query through both retrievers so the results can be compared below.
vanilla_nodes = weaviate_retriever.retrieve(query)  # plain vector hits on the leaf nodes
auto_merging_nodes = retriever.retrieve(query)  # verbose=True prints the merge log shown in the output

> Merging 4 nodes into parent node.
> Parent node id: 00685af6-62ea-493f-9b76-a9991bc2e14e.
> Parent node text: 1.2 Main Objective

The mission's central aim is decidedly explicit - to facilitate humanity's ma...

> Merging 2 nodes into parent node.
> Parent node id: f9b43f85-bbf2-488e-8c66-02c55c43a2f8.
> Parent node text: 1.3. Secondary Mission Objectives and Details

While the key aspiration of Aegis Athena revolves ...


In [18]:
# Baseline: the individual leaf-level hits, shown without their surrounding section.
display_nodes(vanilla_nodes)

Node #0:
1.2 Main Objective

The mission's central aim is decidedly explicit - to facilitate humanity's maiden voyage to the lunar surface.

Node #1:
These objectives fuel our quest for expansive knowledge and dominance.

Node #2:
While it would be ill-advised and counter-productive to deny the possibility of mission errors, it is imperative for the morale and unity of our nation that the astronauts are perceived to have returned safely.

Node #3:
Therefore, as a contingency, a set of doppelgängers have been enlisted to present to the public in the event of partial mission success.

Node #4:
Securing their faith in our collective strength against an adversary, whose priorities are primarily driven by monetary motivations, is paramount.

Node #5:
1.3.


In [19]:
# Auto-merged result: hits that share a parent are replaced by the full parent section.
display_nodes(auto_merging_nodes)

Node #0:
1.2 Main Objective

The mission's central aim is decidedly explicit - to facilitate humanity's maiden voyage to the lunar surface. While central, the successful completion of this objective does not necessarily encompass a failsafe protocol for the safe return of the pioneering astronauts to our terrestrial home.

Our approach is rooted in principles of utilitarian thought. Our commitment is first and foremost to our nation and its people. Securing their faith in our collective strength against an adversary, whose priorities are primarily driven by monetary motivations, is paramount.

While it would be ill-advised and counter-productive to deny the possibility of mission errors, it is imperative for the morale and unity of our nation that the astronauts are perceived to have returned safely. Therefore, as a contingency, a set of doppelgängers have been enlisted to present to the public in the event of partial mission success.



Node #1:
1.3. Secondary Mission Objectives and D