# 基于嵌入和 BM25 混合检索

## 准备工作

In [1]:
%%time
%%capture

# 安装所需的库

!pip install llama-index-vector-stores-qdrant
!pip install qdrant_client
!pip install trafilatura

!pip install rank_bm25
!pip install nltk jieba

!pip install llama-index-retrievers-bm25==0.1.3
!pip install rank_bm25
!pip install jieba

CPU times: user 72.8 ms, sys: 19.9 ms, total: 92.7 ms
Wall time: 27.1 s


In [2]:
%time

# 加载llm和embeddings

%run ../utils2.py

from llama_index.core import Settings

Settings.llm=get_llm()
Settings.embed_model = get_embedding()

CPU times: user 2 µs, sys: 1e+03 ns, total: 3 µs
Wall time: 3.1 µs


## 加载文档

In [3]:
%%time

from llama_index.readers.web import TrafilaturaWebReader

documents = TrafilaturaWebReader().load_data(
    ["https://baike.baidu.com/item/固态电池"]
)

len(documents)

CPU times: user 332 ms, sys: 4.82 ms, total: 336 ms
Wall time: 386 ms


1

In [4]:
# Preview the first 500 characters of the first loaded document.
documents[0].text[:500]

'收藏\n查看我的收藏\n0有用+1\n- 中文名\n- 固态电池\n- 外文名\n- Solid-state batteries\n- 领 域\n- 硬件\n- 能量密度\n- 锂离子电池的2倍\n- 性 质\n- 一种使用固体电极和固体电解质的电池\n- 特 点\n- 功率密度较低，能量密度较高\n2030年，锂离子电池将不再是电动汽车电池主流，但其在某些电子原件领域仍有一席之地。 [1]据SNE Researchd的测算，2025年我国固态电池市场空间有望达30亿元，2030年有望达到200亿元。 [3]\n在2010年，丰田就曾推出过续航里程可超过1000KM的固态电池。而包括QuantumScape以及Sakti3所做的努力也都是在试图用固态电池来取代传统的液态锂电池。\n加拿大Avestor公司也曾尝试过研发固态锂电池，最终2006年正式申请破产。Avestor公司使用一种高分子聚合物分离器，代替电池中的液体电解质，但一直没有解决安全问题，在北美地区发生过几起电池燃烧或者爆炸事件。\n2015年3月中旬，真空吸尘器的发明者、英国戴森公司（Dyson）创始人詹姆斯·戴森将其首笔1500万美元的投资投向了固态电池公'

In [5]:
%%time

from llama_index.core.node_parser import SentenceSplitter

splitter = SentenceSplitter(chunk_size=128, chunk_overlap=10)
nodes = splitter.get_nodes_from_documents(documents)

len(nodes)

CPU times: user 266 ms, sys: 12.2 ms, total: 278 ms
Wall time: 278 ms


31

In [6]:
%%time

from llama_index.core.storage.docstore import SimpleDocumentStore

# Store the chunked nodes in a standalone docstore; the BM25 retriever built
# later in this notebook is given this docstore as its corpus.
docstore = SimpleDocumentStore()
docstore.add_documents(nodes)

CPU times: user 2.49 ms, sys: 0 ns, total: 2.49 ms
Wall time: 2.5 ms


## 创建向量索引

In [7]:
%%time

# 启用 Qdrant 作为向量存储

import qdrant_client
from llama_index.vector_stores.qdrant import QdrantVectorStore
from qdrant_client.models import Distance, VectorParams
from llama_index.core import StorageContext

client = qdrant_client.QdrantClient(
    location=":memory:",
    vectors_config=VectorParams(
        size=1024, 
        distance=Distance.COSINE
    ),
)

vector_store = QdrantVectorStore(client=client, collection_name="simple")
storage_context = StorageContext.from_defaults(vector_store=vector_store)

CPU times: user 773 ms, sys: 24 ms, total: 797 ms
Wall time: 797 ms


In [8]:
%%time

from llama_index.core import VectorStoreIndex

index = VectorStoreIndex(
    nodes=nodes, 
    storage_context=storage_context
)

CPU times: user 142 ms, sys: 9.23 ms, total: 151 ms
Wall time: 2.94 s


In [9]:
%%time

import nest_asyncio

# Presumably needed so asyncio-based calls can run inside Jupyter's
# already-running event loop — confirm whether later cells still rely on it.
nest_asyncio.apply()

CPU times: user 881 µs, sys: 81 µs, total: 962 µs
Wall time: 907 µs


In [10]:
%%time

# 下载停用词

# 设置 HTTP 代理环境变量
# https://github.com/nltk/nltk_data/issues/154#issuecomment-2144880495
http_proxy="http://192.168.0.134:7890"

import nltk
nltk.set_proxy(f'{http_proxy}')
nltk.download('stopwords')

CPU times: user 8.53 ms, sys: 1.93 ms, total: 10.5 ms
Wall time: 482 ms


[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


True

In [13]:
%%time
import jieba
from typing import List
from nltk.corpus import stopwords

def chinese_tokenizer(text: str) -> List[str]:
    """Segment ``text`` into words with jieba.

    Args:
        text: The string to segment.

    Returns:
        The segments produced by jieba, in order, as a list of strings.
    """
    # lcut is jieba's list-returning counterpart of cut.
    tokens = jieba.lcut(text)
    return tokens

CPU times: user 14 µs, sys: 0 ns, total: 14 µs
Wall time: 15.7 µs


In [16]:
%%time

from llama_index.core.retrievers import QueryFusionRetriever
from llama_index.retrievers.bm25 import BM25Retriever

retriever = QueryFusionRetriever(
    [
        index.as_retriever(similarity_top_k=2),
        BM25Retriever.from_defaults(
            # docstore=index.docstore, 
            docstore=docstore,
            similarity_top_k=2,
            tokenizer=chinese_tokenizer,
        ),
    ],
    num_queries=1,
    use_async=True,
)

Building prefix dict from the default dictionary ...
Loading model from cache /tmp/jieba.cache
Loading model cost 0.449 seconds.
Prefix dict has been built successfully.


CPU times: user 446 ms, sys: 20 ms, total: 466 ms
Wall time: 460 ms


In [17]:
%%time

from llama_index.core.query_engine import RetrieverQueryEngine

query_engine = RetrieverQueryEngine(retriever)

CPU times: user 33.9 ms, sys: 166 µs, total: 34.1 ms
Wall time: 33.6 ms


In [18]:
%%time

# Ask the query engine a question and print its answer.
response = query_engine.query("固态电池是啥?")
print(response)

AttributeError: 'NoneType' object has no attribute 'search'

In [15]:
%%time

from llama_index.core import get_response_synthesizer

# Response synthesizer in "tree_summarize" mode with streaming output enabled.
response_synthesizer = get_response_synthesizer(
    response_mode="tree_summarize",
    streaming=True,
)

CPU times: user 130 µs, sys: 14 µs, total: 144 µs
Wall time: 147 µs


In [16]:
%%time

from llama_index.core.query_engine import RetrieverQueryEngine

# Rebuild the query engine so it uses the streaming response synthesizer.
# Requires `retriever` from the fusion-retriever cell to exist in the kernel.
query_engine = RetrieverQueryEngine(
    retriever=retriever,
    response_synthesizer=response_synthesizer,
)

NameError: name 'retriever' is not defined

In [17]:
%%time

# Ask about the difference between solid-state and liquid batteries and print
# the response object.
response = query_engine.query("固态电池和液态电池的主要区别是啥?")
print(response)

NameError: name 'query_engine' is not defined

In [18]:
%%time

# Run the query and print the answer through the response's streaming printer.
streaming_response = query_engine.query("固态电池和液态电池的主要区别是啥")
streaming_response.print_response_stream()

NameError: name 'query_engine' is not defined