# 自动合并检索 - 官方示例

In [1]:
%%time
%%capture

# Install the PDF reader integration and PyMuPDF into the kernel's environment.
# %%capture (which must stay directly under %%time) hides the installer output.
# NOTE(review): versions are unpinned — consider pinning for reproducibility.
%pip install llama-index-readers-file pymupdf

# Enable autoreload so edited local modules are picked up without a restart.
%load_ext autoreload
%autoreload 2

CPU times: user 9.33 ms, sys: 14.8 ms, total: 24.1 ms
Wall time: 3.43 s


In [3]:
%%time
%%capture

!mkdir -p 'data/'
!apt-get update && apt-get install wget -y -qq
!wget --user-agent "Mozilla" "https://arxiv.org/pdf/2307.09288.pdf" -O "data/llama2.pdf"

CPU times: user 95.4 ms, sys: 12 ms, total: 107 ms
Wall time: 43.6 s


## 加载数据

In [4]:
%%time

from pathlib import Path

from llama_index.readers.file import PDFReader
from llama_index.readers.file import PyMuPDFReader

In [6]:
%%time

loader = PyMuPDFReader()
# docs0 = loader.load_data(file=Path("./data/llama2.pdf"))
docs0 = loader.load(file_path=Path("./data/llama2.pdf"))


from llama_index.core import Document

doc_text = "\n\n".join([d.get_content() for d in docs0])
docs = [Document(text=doc_text)]

CPU times: user 426 ms, sys: 68 ms, total: 494 ms
Wall time: 493 ms


## 解析数据

In [7]:
%%time

from llama_index.core.node_parser import (
    HierarchicalNodeParser,
    SentenceSplitter,
)

node_parser = HierarchicalNodeParser.from_defaults()
nodes = node_parser.get_nodes_from_documents(docs)
len(nodes)

CPU times: user 742 ms, sys: 24.2 ms, total: 766 ms
Wall time: 765 ms


1009

In [8]:
from llama_index.core.node_parser import (
    get_leaf_nodes,
    get_root_nodes,
)

# Select the leaf nodes from the full node list; these are what the vector
# index is built from later in the notebook.
leaf_nodes = get_leaf_nodes(nodes)
len(leaf_nodes)

783

In [9]:
# Select the root nodes from the full node list and show how many there are.
root_nodes = get_root_nodes(nodes)
len(root_nodes)

47

## 存储

In [10]:
%time

# 加载llm和embeddings
%run ../utils2.py

from llama_index.core import Settings

Settings.llm=get_llm()
Settings.embed_model = get_embedding()

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs


In [11]:
%time

# define storage context
from llama_index.core.storage.docstore import SimpleDocumentStore
from llama_index.core import StorageContext

docstore = SimpleDocumentStore()

# insert nodes into docstore
docstore.add_documents(nodes)

# define storage context (will include vector store by default too)
storage_context = StorageContext.from_defaults(docstore=docstore)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 5.01 µs


In [12]:
%time

## Load index into vector index
from llama_index.core import VectorStoreIndex

base_index = VectorStoreIndex(
    leaf_nodes,
    storage_context=storage_context,
)

CPU times: user 2 µs, sys: 0 ns, total: 2 µs
Wall time: 3.1 µs


## 定义检索器

In [14]:
%time

from llama_index.core.retrievers import AutoMergingRetriever

base_retriever = base_index.as_retriever(similarity_top_k=6)
retriever = AutoMergingRetriever(base_retriever, storage_context, verbose=True)

query_str = (
    "What could be the potential outcomes of adjusting the amount of safety"
    " data used in the RLHF stage?"
)

nodes = retriever.retrieve(query_str)
base_nodes = base_retriever.retrieve(query_str)

CPU times: user 1 µs, sys: 0 ns, total: 1 µs
Wall time: 3.1 µs
> Merging 1 nodes into parent node.
> Parent node id: 518f2da9-9eec-46d2-b40f-7323f4413e77.
> Parent node text: However, it’s crucial to be aware of the benchmarks’ limitations in evaluating safety. Most of th...



In [15]:
# Number of results returned by the auto-merging retriever for the query.
len(nodes)

6

In [16]:
# Number of results returned by the plain base retriever (similarity_top_k=6).
len(base_nodes)

6