In [56]:
import dotenv

# Load variables from a .env file into the process environment; the `True`
# shown as cell output is the return value of load_dotenv().
# NOTE(review): presumably this supplies the credentials/URLs the later
# client and embedding cells rely on — confirm against the rag helpers.
dotenv.load_dotenv()

True

## Init Client

In [57]:
from rag import weaviate_utils

# Weaviate client handle; it is reused below to build the vector store and to
# check whether the target class is already populated.
# NOTE(review): the cell output shows the client library recommending an
# upgrade to its v4 client.
client = weaviate_utils.get_weaviate_client()

            Consider upgrading to the new and improved v4 client instead!
            See here for usage: https://weaviate.io/developers/weaviate/client-libraries/python
            


## Load Markdown

In [58]:
from llama_index import Document

from rag import load_utils

# Load the mission documentation found under ../mission-docs as Documents.
# NOTE(review): the first Document shown below contains only a heading, so the
# loader presumably emits several Documents per file — confirm in load_utils.
documents: list[Document] = load_utils.load_data_dir(dir_path="../mission-docs")

# Display the first Document as a quick sanity check of text and metadata.
documents[0]

Document(id_='ff5a31a5-3424-44a8-867d-19beebd40054', embedding=None, metadata={'file_path': '../mission-docs/console.md', 'file_name': 'console.md', 'file_type': 'text/markdown', 'file_size': 41188, 'creation_date': '2024-02-10', 'last_modified_date': '2024-02-10', 'last_accessed_date': '2024-02-17'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={}, text='\n\n3 Console\n\n', start_char_idx=None, end_char_idx=None, text_template='{metadata_str}\n\n{content}', metadata_template='{key}: {value}', metadata_seperator='\n')

## Group and Join File Documents

In [59]:
# Group the loaded Documents by their "file_name" metadata and join every
# group into a single Document.
grouped_documents = load_utils.group_documents(documents, group_by="file_name")

# Keep the joined mapping under its own name instead of rebinding
# `joined_documents` from a mapping to a list, so that each name always refers
# to one kind of object when cells are inspected or re-run.
# NOTE(review): keys are presumably the file names — confirm in load_utils.
joined_by_file_name = load_utils.join_grouped_documents(grouped_documents)
joined_documents = list(joined_by_file_name.values())

# Display the first joined Document as a sanity check.
joined_documents[0]

Document(id_='47b12233-2ccc-4796-b22c-a8bab2848906', embedding=None, metadata={}, excluded_embed_metadata_keys=[], excluded_llm_metadata_keys=[], relationships={}, text='\n\n3 Console\n\n\n\n\n\n3.1 Introduction\n\nThe Aegis Athena space mission is an illustrious epitome of such a pursuit. Primarily destined to accomplish a monumental milestone of delivering the first human astronaut to the lunar surface, this visionary mission is governed by a sophisticated piece of technology known as the S.P.A.C.E.C.R.A.F.T. console.\n\nThis highly innovative system adopts a dual-purpose role, integrating the essential features of both a command model and a service module. This strategically harmonious blend of utilitarian efficiency and comforting resources paves the way for an environment conducive for the astronaut-trio executing this expedition, turning it into a virtual sanctuary amidst the vast cosmos. The S.P.A.C.E.C.R.A.F.T. console operates with a plethora of commands, designed to modify th

## Split & Parse Nodes with Context Window

In [60]:
from llama_index.node_parser import SentenceWindowNodeParser

# Parser used to split the joined documents into nodes that carry a context
# window in their metadata (see the node inspected further down).
node_parser = SentenceWindowNodeParser.from_defaults(
    window_size=5,  # size of the neighbouring-sentence window — exact semantics: TODO confirm in llama_index docs
    window_metadata_key="window",  # metadata key under which the context window is stored
    original_text_metadata_key="original_text",  # metadata key under which the node's own sentence is stored
)

In [61]:
# Run the sentence-window parser over the joined documents, with a progress bar.
sentence_window_nodes = node_parser.get_nodes_from_documents(joined_documents, show_progress=True)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

In [69]:
# Inspect the context window stored under the "window" metadata key of the first node.
sentence_window_nodes[0].metadata["window"]

'\n\n3 Console\n\n\n\n\n\n3.1 Introduction\n\nThe Aegis Athena space mission is an illustrious epitome of such a pursuit.  Primarily destined to accomplish a monumental milestone of delivering the first human astronaut to the lunar surface, this visionary mission is governed by a sophisticated piece of technology known as the S.P.A.C.E.C.R.A.F.T.  console.\n\n This highly innovative system adopts a dual-purpose role, integrating the essential features of both a command model and a service module.  This strategically harmonious blend of utilitarian efficiency and comforting resources paves the way for an environment conducive for the astronaut-trio executing this expedition, turning it into a virtual sanctuary amidst the vast cosmos. '

## Import into Weaviate

In [63]:
from rag import weaviate_utils

# Name of the Weaviate class that holds this notebook's chunks.
CLASS_NAME = "LargerSentenceWindowDocsChunk"
# Wrap the client and class name as a vector store; it is passed to the
# index-building cell below.
vector_store = weaviate_utils.as_vector_store(client, CLASS_NAME)

In [42]:
# Manual reset, kept commented out: uncommenting presumably drops the class
# named by CLASS_NAME so that the population cell re-imports the nodes —
# confirm before use, since this would discard the stored data.
# client.schema.delete_class(CLASS_NAME)

In [64]:
from llama_index import VectorStoreIndex, StorageContext, ServiceContext, OpenAIEmbedding
from rag import weaviate_utils, index_utils

# Build `index` exactly once per Weaviate class: when the class holds no data
# yet, embed and upload the sentence-window nodes; otherwise attach to what is
# already stored.
if not weaviate_utils.is_populated(client, class_name=CLASS_NAME):
    # First run: write through the Weaviate-backed storage context, embedding
    # with OpenAI in batches of 50.
    storage_context = StorageContext.from_defaults(vector_store=vector_store)
    service_context = ServiceContext.from_defaults(
        embed_model=OpenAIEmbedding(embed_batch_size=50)
    )
    index = index_utils.populate_index(
        sentence_window_nodes,
        storage_context=storage_context,
        service_context=service_context,
    )
else:
    # Class already populated: reuse the existing vectors.
    index = VectorStoreIndex.from_vector_store(vector_store)

Generating embeddings:   0%|          | 0/518 [00:00<?, ?it/s]

In [65]:
# Character count of the first node's own text, for comparison with the much
# longer context window stored in its metadata.
len(sentence_window_nodes[0].text)

111

In [66]:
# Display the full first node, including its "window" and "original_text" metadata.
sentence_window_nodes[0]

TextNode(id_='30b8b708-46f2-4c30-9ef0-f74427e796d6', embedding=None, metadata={'window': '\n\n3 Console\n\n\n\n\n\n3.1 Introduction\n\nThe Aegis Athena space mission is an illustrious epitome of such a pursuit.  Primarily destined to accomplish a monumental milestone of delivering the first human astronaut to the lunar surface, this visionary mission is governed by a sophisticated piece of technology known as the S.P.A.C.E.C.R.A.F.T.  console.\n\n This highly innovative system adopts a dual-purpose role, integrating the essential features of both a command model and a service module.  This strategically harmonious blend of utilitarian efficiency and comforting resources paves the way for an environment conducive for the astronaut-trio executing this expedition, turning it into a virtual sanctuary amidst the vast cosmos. ', 'original_text': '\n\n3 Console\n\n\n\n\n\n3.1 Introduction\n\nThe Aegis Athena space mission is an illustrious epitome of such a pursuit. '}, excluded_embed_metadat