# 自动合并检索

## 准备

In [2]:
%%time
%%capture

!pip install llama-index

CPU times: user 9.59 ms, sys: 6.51 ms, total: 16.1 ms
Wall time: 3.5 s


## 加载文档

In [5]:
%%time

from llama_index.core import SimpleDirectoryReader

documents = SimpleDirectoryReader(input_files=["data/故乡.txt"]).load_data()

CPU times: user 9.41 ms, sys: 4.16 ms, total: 13.6 ms
Wall time: 13.2 ms


## 生成 root 节点和叶子节点

In [6]:
%%time

from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)

node_parser = HierarchicalNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(documents)


len(nodes)

CPU times: user 270 ms, sys: 20.3 ms, total: 290 ms
Wall time: 290 ms


111

In [7]:
%%time

from llama_index.core.node_parser import get_leaf_nodes, get_root_nodes

leaf_nodes = get_leaf_nodes(nodes)

CPU times: user 58 µs, sys: 7 µs, total: 65 µs
Wall time: 67.2 µs


In [8]:
# Display how many leaf nodes were extracted from the full node list.
len(leaf_nodes)

88

In [10]:
# Select the root nodes from the full node list and display how many there are.
# `get_root_nodes` was imported in the leaf-node cell above.
root_nodes = get_root_nodes(nodes)
len(root_nodes)

4

## 存储文档

In [16]:
%%time

# define storage context
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext

docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

CPU times: user 18.8 ms, sys: 322 µs, total: 19.1 ms
Wall time: 18.5 ms


In [17]:
%time

# 加载llm和embeddings
%run ../utils2.py

from llama_index.core import Settings

Settings.llm=get_llm()
Settings.embed_model = get_embedding()

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs


In [18]:
%%time

## Load index into vector index
from llama_index.core import VectorStoreIndex

base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
)

CPU times: user 374 ms, sys: 18.4 ms, total: 392 ms
Wall time: 9.29 s


## AutoMergingRetriever

In [20]:
%time

from llama_index.core.retrievers import AutoMergingRetriever

base_retriever = base_index.as_retriever(similarity_top_k=6)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.58 µs


In [26]:
%time

query_str = (
    "闰土是谁"
)

nodes = retriever.retrieve(query_str)
base_nodes = base_retriever.retrieve(query_str)
                                     
len(nodes)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.34 µs


6

In [27]:
# Display the number of results returned by the plain vector retriever.
len(base_nodes)

6

In [28]:
# Inspect the first result returned by the auto-merging retriever.
nodes[0]

NodeWithScore(node=TextNode(id_='56e82bf8-d532-401f-bdae-86eca2343be5', embedding=None, metadata={'file_path': 'data/故乡.txt', 'file_name': '故乡.txt', 'file_type': 'text/plain', 'file_size': 17811, 'creation_date': '2024-07-16', 'last_modified_date': '2024-07-16'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='dd3cc72d-293a-4ba9-9033-65a09d61cd9e', node_type=<ObjectType.TEXT: '1'>, metadata={'file_path': 'data/故乡.txt', 'file_name': '故乡.txt', 'file_type': 'text/plain', 'file_size': 17811, 'creation_date': '2024-07-16', 'last_modified_date': '2024-07-16'}, hash='4c188c1948cc5baea6b04bc0bf5ae4d6e59e2dd579f02e25c27d8a2c1f7b979d'), <NodeRelationship.PREVIOUS: '2'>: RelatedNodeInfo(node_id='97ad5472-3b41-47fd-a5a