# install

```
pip install sentence-transformers BCEmbedding llama-index llama-index-llms-huggingface llama-index-embeddings-huggingfacellama-index-embeddings-instructor llama-index-vector-stores-faiss
```

In [3]:
import torch
from llama_index.core import Document, Settings, SimpleDirectoryReader, VectorStoreIndex, StorageContext, load_index_from_storage
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.postprocessor import SimilarityPostprocessor
from llama_index.vector_stores.faiss import FaissVectorStore
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
import faiss

In [19]:
DATA_PATH: str = "./data"
EMBEDDING_MODEL_PATH: str = "./models/bce-embedding-base_v1"
RERANKER_MODEL_PATH: str = "./models/bce-reranker-base_v1"
PERSIST_DIRECTORY: str = "./vector_db/llama-index-faiss"
SIMILARITY_TOP_K: int = 4
SCORE_THRESHOLD: float = 0.15
ALLOW_SUFFIX: tuple[str] = (".txt", ".md", ".docx", ".doc", ".pdf")

# SimpleDirectoryReader 读取文档

In [20]:
# https://docs.llamaindex.ai/en/stable/module_guides/loading/simpledirectoryreader/
reader = SimpleDirectoryReader(
    input_dir = DATA_PATH,
    input_files = None, # 指定文件,可以提前去重文件
    recursive = True,
    required_exts = ALLOW_SUFFIX,
)
documents = reader.load_data()
len(documents)



3217

# Embeddings
https://docs.llamaindex.ai/en/stable/module_guides/models/embeddings/

In [21]:
embed_model = HuggingFaceEmbedding(
    model_name = EMBEDDING_MODEL_PATH,
    normalize = True,
    device = "cuda",
)
embed_model._model.half()

# 可以这样初始化默认模型，也能在后面传递参数
# Settings.embed_model = embed_model

06/12/2024 19:47:23 - [INFO] -sentence_transformers.SentenceTransformer->>>    Load pretrained SentenceTransformer: ./models/bce-embedding-base_v1
06/12/2024 19:47:24 - [INFO] -sentence_transformers.SentenceTransformer->>>    2 prompts are loaded, with the keys: ['query', 'text']


SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: XLMRobertaModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': True, 'pooling_mode_mean_tokens': False, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
  (2): Normalize()
)

# VectorStoreIndex.from_documents

https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/

In [22]:
# index: VectorStoreIndex = VectorStoreIndex.from_documents(
#     documents = documents,
#     embed_model = embed_model,
# )
# index

# SentenceSplitter

https://docs.llamaindex.ai/en/stable/module_guides/loading/documents_and_nodes/

In [23]:
parser = SentenceSplitter(chunk_size=512, chunk_overlap=32)
parser

SentenceSplitter(include_metadata=True, include_prev_next_rel=True, callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001CC06CEAAD0>, id_func=<function default_id_func at 0x000001CB1196FE20>, chunk_size=512, chunk_overlap=32, separator=' ', paragraph_separator='\n\n\n', secondary_chunking_regex='[^,.;。？！]+[,.;。？！]?')

In [24]:
nodes = parser.get_nodes_from_documents(documents = documents)
len(nodes)

9230

# VectorStoreIndex from nodes

In [25]:
# index: VectorStoreIndex = VectorStoreIndex(
#     nodes = nodes,
#     embed_model = embed_model
# )
# index

# faiss

https://docs.llamaindex.ai/en/stable/examples/vector_stores/FaissIndexDemo/

## FaissVectorStore

In [26]:
# dimensions of bce-embedding-base_v1
d = 768
faiss_index = faiss.IndexFlatL2(d)
faiss_index

<faiss.swigfaiss_avx2.IndexFlatL2; proxy of <Swig Object of type 'faiss::IndexFlatL2 *' at 0x000001CC06CEA2B0> >

In [27]:
vector_store = FaissVectorStore(faiss_index = faiss_index)
vector_store

FaissVectorStore(stores_text=False, is_embedding_query=True)

## StorageContext

In [28]:
storage_context: StorageContext = StorageContext.from_defaults(vector_store = vector_store)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x000001CC0F3A7550>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x000001CC0F3A74F0>, vector_stores={'default': FaissVectorStore(stores_text=False, is_embedding_query=True), 'image': SimpleVectorStore(stores_text=False, is_embedding_query=True, data=SimpleVectorStoreData(embedding_dict={}, text_id_to_ref_doc_id={}, metadata_dict={}))}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x000001CC0F3A71F0>, property_graph_store=None)

## VectorStoreIndex

In [29]:
index = VectorStoreIndex(
    nodes = nodes,
    embed_model = embed_model,
    storage_context = storage_context,
)
index

<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1cc0f3a7460>

## save index to disk

In [32]:
index.storage_context.persist(persist_dir = PERSIST_DIRECTORY)

## load index from disk

In [33]:
vector_store: FaissVectorStore = FaissVectorStore.from_persist_dir(persist_dir = PERSIST_DIRECTORY)
vector_store

06/12/2024 19:55:29 - [INFO] -root->>>    Loading llama_index.vector_stores.faiss.base from ./vector_db/llama-index-faiss\default__vector_store.json.


FaissVectorStore(stores_text=False, is_embedding_query=True)

In [34]:
storage_context = StorageContext.from_defaults(
    vector_store = vector_store,
    persist_dir = PERSIST_DIRECTORY,
)
storage_context

StorageContext(docstore=<llama_index.core.storage.docstore.simple_docstore.SimpleDocumentStore object at 0x000001CC11B99BD0>, index_store=<llama_index.core.storage.index_store.simple_index_store.SimpleIndexStore object at 0x000001CC72503BB0>, vector_stores={'default': FaissVectorStore(stores_text=False, is_embedding_query=True)}, graph_store=<llama_index.core.graph_stores.simple.SimpleGraphStore object at 0x000001CC0F40A770>, property_graph_store=None)

In [35]:
index = load_index_from_storage(
    storage_context = storage_context,
    embed_model = embed_model,
)
index

06/12/2024 19:55:32 - [INFO] -llama_index.core.indices.loading->>>    Loading all indices.


<llama_index.core.indices.vector_store.base.VectorStoreIndex at 0x1cc06a99300>

In [None]:
# 不能这样载入
# ValueError: Cannot initialize from a vector store that does not store text.
# index = VectorStoreIndex.from_vector_store(
#     vector_store = vector_store,
#     embed_model = embed_model,
# )
# index

# Retriever

https://docs.llamaindex.ai/en/stable/module_guides/querying/retriever/

https://docs.llamaindex.ai/en/stable/module_guides/querying/retriever/retriever_modes/

In [45]:
retriever = index.as_retriever(
    retriever_mode = "embedding", # embedding, llm
    choice_batch_size = SIMILARITY_TOP_K * 2,
)
retriever

<llama_index.core.indices.vector_store.retrievers.retriever.VectorIndexRetriever at 0x1cc06d805e0>

In [66]:
query = "Eye Pressure Lowering Effect of Vitamin C"
# query = "吃了吗"

In [71]:
nodes = retriever.retrieve(query)
nodes

Batches:   0%|          | 0/1 [00:00<?, ?it/s]

[NodeWithScore(node=TextNode(id_='41e00465-e27a-4cbc-8b31-6a2d18f19b00', embedding=None, metadata={'page_label': '1', 'file_name': 'JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf', 'file_path': 'd:\\AI\\14_llm\\HealthcareAgent\\data\\healthcare_data6.2\\JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf', 'file_type': 'application/pdf', 'file_size': 66128, 'creation_date': '2024-06-03', 'last_modified_date': '2024-05-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f8203772-9bc8-4149-96f4-3122c4d6ad2b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf', 'file_path': 'd:\\AI\\

# SimilarityPostprocessor

https://docs.llamaindex.ai/en/stable/module_guides/querying/node_postprocessors/

In [62]:
# similarity postprocessor: filter nodes below 0.25 similarity score
similarity_processor = SimilarityPostprocessor(similarity_cutoff = SCORE_THRESHOLD)
similarity_processor

SimilarityPostprocessor(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001CC06AB4430>, similarity_cutoff=0.15)

In [68]:
similarity_nodes = similarity_processor.postprocess_nodes(nodes = nodes)
similarity_nodes

[NodeWithScore(node=TextNode(id_='41e00465-e27a-4cbc-8b31-6a2d18f19b00', embedding=None, metadata={'page_label': '1', 'file_name': 'JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf', 'file_path': 'd:\\AI\\14_llm\\HealthcareAgent\\data\\healthcare_data6.2\\JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf', 'file_type': 'application/pdf', 'file_size': 66128, 'creation_date': '2024-06-03', 'last_modified_date': '2024-05-26'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='f8203772-9bc8-4149-96f4-3122c4d6ad2b', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'JOM_1995_10_3-4_08_Eye_Pressure_Lowering_Effect_of_Vitamin_C.pdf', 'file_path': 'd:\\AI\\

# BCERerank

https://github.com/netease-youdao/BCEmbedding/blob/master/BCEmbedding/tools/llama_index/bce_rerank.py

In [64]:
from BCEmbedding.tools.llama_index import BCERerank

In [58]:
reranker = BCERerank(
    top_n = SIMILARITY_TOP_K,
    model = RERANKER_MODEL_PATH,
    device = 'cuda',
    use_fp16 = True
)
reranker

06/12/2024 19:58:38 - [INFO] -BCEmbedding.models.RerankerModel->>>    Loading from `./models/bce-reranker-base_v1`.
06/12/2024 19:58:38 - [INFO] -BCEmbedding.models.RerankerModel->>>    Execute device: cuda;	 gpu num: 1;	 use fp16: True


BCERerank(callback_manager=<llama_index.core.callbacks.base.CallbackManager object at 0x000001CC06E210C0>, model='./models/bce-reranker-base_v1', top_n=4)

In [69]:
reranked_nodes = reranker.postprocess_nodes(nodes = similarity_nodes, query_str = query)
reranked_nodes

[NodeWithScore(node=TextNode(id_='6473578e-c258-408c-812c-e3cf2630abd9', embedding=None, metadata={'page_label': '1', 'file_name': 'Eye Pressure Lowering Effect of Vitamin C.pdf', 'file_path': 'd:\\AI\\14_llm\\HealthcareAgent\\data\\FM docs 2024.3\\Eye Pressure Lowering Effect of Vitamin C.pdf', 'file_type': 'application/pdf', 'file_size': 66128, 'creation_date': '2024-05-08', 'last_modified_date': '2024-01-27'}, excluded_embed_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], excluded_llm_metadata_keys=['file_name', 'file_type', 'file_size', 'creation_date', 'last_modified_date', 'last_accessed_date'], relationships={<NodeRelationship.SOURCE: '1'>: RelatedNodeInfo(node_id='8264aa79-59a4-4d2d-b181-ce95c5c9f52f', node_type=<ObjectType.DOCUMENT: '4'>, metadata={'page_label': '1', 'file_name': 'Eye Pressure Lowering Effect of Vitamin C.pdf', 'file_path': 'd:\\AI\\14_llm\\HealthcareAgent\\data\\FM docs 2024.3\\Eye Pressure L