# 评估 DocumentSummaryIndex 的性能

## 加载数据

In [1]:
%%time

from llama_index.core import SimpleDirectoryReader

documents=SimpleDirectoryReader(
    input_dir="./data",
    filename_as_id=True,
).load_data()

len(documents)

CPU times: user 2.43 s, sys: 398 ms, total: 2.83 s
Wall time: 1.92 s


3

## 设置全局环境

In [3]:
%%time

from llama_index.core import Settings
from llama_index.llms.ollama import Ollama
from llama_index.embeddings.ollama import OllamaEmbedding

Settings.llm = Ollama(
    base_url="http://ape:11434",
    model="qwen2",
    is_chat_model=True,
    temperature=0.1,
    request_timeout=60.0
)

Settings.embed_model = OllamaEmbedding(
    model_name="rjmalagon/gte-qwen2-1.5b-instruct-embed-f16",
    base_url="http://ape:11434",
    # -mirostat N 使用 Mirostat 采样。
    ollama_additional_kwargs={"mirostat": 0},
)

# Settings.chunk_size = 512
# Settings.chunk_overlap = 50

CPU times: user 611 ms, sys: 39.3 ms, total: 650 ms
Wall time: 650 ms


## 创建文档摘要索引

In [6]:
%%time

from llama_index.core import get_response_synthesizer
from llama_index.core import DocumentSummaryIndex

# default mode of building the index
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize", 
    use_async=True,
)

index = DocumentSummaryIndex.from_documents(
    documents,
    response_synthesizer=response_synthesizer,
    show_progress=True,
)

Parsing nodes:   0%|          | 0/3 [00:00<?, ?it/s]

Summarizing documents:   0%|          | 0/3 [00:00<?, ?it/s]

current doc id: /root/notebook/jupyterlab-demos/document-summary-index/data/天问三号将在2028年前后实现火星采样返回.txt
current doc id: /root/notebook/jupyterlab-demos/document-summary-index/data/网传“中国高铁一公里耗一万度电”，权威解读.txt
current doc id: /root/notebook/jupyterlab-demos/document-summary-index/data/设计时速350公里，杭温高铁9月6日开通运营.txt


Generating embeddings:   0%|          | 0/3 [00:00<?, ?it/s]

CPU times: user 204 ms, sys: 19.2 ms, total: 223 ms
Wall time: 27.7 s


## 高级查询

In [8]:
%%time

query_engine = index.as_query_engine(
    response_mode="tree_summarize", 
    streaming=True,
    # similarity_top_k=5
)

CPU times: user 161 μs, sys: 23 μs, total: 184 μs
Wall time: 188 μs


In [9]:
%%time

response = query_engine.query("天问三号原计划是何时发射？")
response.print_response_stream()

天问三号任务的原计划是在2030年前后实施两次发射任务。CPU times: user 39.3 ms, sys: 8.05 ms, total: 47.3 ms
Wall time: 1.23 s


In [10]:
%%time

response = query_engine.query("高铁一公里耗电一万度是否属实？")
response.print_response_stream()

网传“中国高铁一公里耗一万度电”的说法不属实。这一说法偷换了概念，与事实不符。以现有CRH380A、CRH380B、CR400AF、CR400BF等4个8编组的主力车型为例，其轮周功率分别为：9360kW、9200kW、9750kW、10140kW。通过在京沪高铁（往返距离2636公里）达速350km/h运行条件下的能耗测试数据显示，其耗电量分别为：56931度、61861度、51364度、55490度，据此推算出平均每公里耗电量分别为：21.6度、23.5度、19.5度、21.1度。因此，网传“一公里耗一万度电”是错误的，实际平均每公里耗电量远低于这一数值。CPU times: user 245 ms, sys: 141 ms, total: 386 ms
Wall time: 5.02 s


## 基于嵌入的检索和查询

### 检索

In [18]:
%%time

from llama_index.core.indices.document_summary import DocumentSummaryIndexEmbeddingRetriever

retriever = DocumentSummaryIndexEmbeddingRetriever(
    index,
    similarity_top_k=2, # 默认是1
)

retrieved_nodes = retriever.retrieve("天问三号原计划是何时发射？")

len(retrieved_nodes)

CPU times: user 9.11 ms, sys: 0 ns, total: 9.11 ms
Wall time: 125 ms


5

In [19]:
# Inspect a single retrieved node (index 3); this assumes the retrieval call
# returned at least four nodes, otherwise the lookup raises IndexError.
retrieved_nodes[3]

NodeWithScore(node=TextNode(id_='884695e4-0b3f-4c84-a681-22254bde1680', embedding=None, metadata={'file_path': '/root/notebook/jupyterlab-demos/document-summary-index/data/设计时速350公里，杭温高铁9月6日开通运营.txt', 'file_name': '设计时速350公里，杭温高铁9月6日开通运营.txt', 'file_type': 'text/plain', 'file_size': 3014, 'creation_date': '2024-09-05', 'last_modified_date': '2024-09-05'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='/root/notebook/jupyterlab-demos/document-summary-index/data/设计时速350公里，杭温高铁9月6日开通运营.txt', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'file_path': '/root/notebook/jupyterlab-demos/document-summary-index/data/设计时速350公里，杭温高铁9月6日开通运营.txt', 'file_name': '设计时速350公里，杭温高铁9月6日开通运营.txt', 'file_type': 'text/plain', 

### 查询

In [20]:
%%time

from llama_index.core.query_engine import RetrieverQueryEngine

retriever = DocumentSummaryIndexEmbeddingRetriever(
    index,
)

response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    streaming=True
)

query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

CPU times: user 151 μs, sys: 26 μs, total: 177 μs
Wall time: 181 μs


In [21]:
%%time

response = query_engine.query("高铁一公里耗电一万度是否属实？")
response.print_response_stream()

网传“中国高铁一公里耗一万度电”的说法不属实。这一说法偷换了概念，与事实不符。以现有CRH380A、CRH380B、CR400AF、CR400BF等4个8编组的主力车型为例，其轮周功率分别为：9360kW、9200kW、9750kW、10140kW。通过在京沪高铁（往返距离2636公里）达速350km/h运行条件下的能耗测试数据显示，其耗电量分别为：56931度、61861度、51364度、55490度，据此推算出平均每公里耗电量分别为：21.6度、23.5度、19.5度、21.1度。因此，网传“一公里耗一万度电”是错误的，实际上中国高铁列车每公里耗电量远低于这一数值。CPU times: user 278 ms, sys: 109 ms, total: 387 ms
Wall time: 8.9 s


## 索引的保存和刷新

### 保存

In [22]:
%%time

index.storage_context.persist()

CPU times: user 21.8 ms, sys: 3.55 ms, total: 25.3 ms
Wall time: 24.1 ms


### 重新加载索引

In [26]:
%%time

from llama_index.core import StorageContext, load_index_from_storage

storage_context = StorageContext.from_defaults(persist_dir="./storage")
index = load_index_from_storage(storage_context)

CPU times: user 41.7 ms, sys: 3.39 ms, total: 45.1 ms
Wall time: 44.2 ms


### 追加文档

In [23]:
%%time

new_document=SimpleDirectoryReader(
    input_dir="./data2",
    filename_as_id=True,
).load_data()[0]

documents.append(new_document)

CPU times: user 2.9 ms, sys: 554 μs, total: 3.45 ms
Wall time: 2.91 ms


### 刷新索引

In [27]:
%%time

# Refresh the index against the full document list. The call returns one
# boolean per input document (the recorded run shows [False, False, False, True]);
# presumably True marks a document that was inserted or updated -- confirm.
index.refresh_ref_docs(documents)

current doc id: /root/notebook/jupyterlab-demos/document-summary-index/data2/世界气象组织：中国和欧洲PM2.5浓度低于全球平均水平.txt
CPU times: user 16.7 ms, sys: 4.17 ms, total: 20.9 ms
Wall time: 11.7 s


[False, False, False, True]

In [28]:
%%time

# 保存刷新后的索引
index.storage_context.persist()

CPU times: user 29.1 ms, sys: 2.08 ms, total: 31.2 ms
Wall time: 30 ms
