In [1]:
from pymilvus.model.hybrid import BGEM3EmbeddingFunction
from tqdm import tqdm
from pymilvus import (
    connections,
    utility,
    FieldSchema,
    CollectionSchema,
    DataType,
    client,
    Collection
)
def read_txt_to_list(file_path, max_length=4096):
    """Read a UTF-8 text file into a list of non-empty, length-capped strings.

    Each line is stripped of leading/trailing whitespace. Lines that are empty
    after stripping are skipped, so no empty "document" reaches the embedding
    or insertion steps. A line longer than ``max_length`` characters is split
    into consecutive chunks of at most ``max_length`` characters each.

    Args:
        file_path: Path of the text file to read (decoded as UTF-8).
        max_length: Maximum number of characters (not bytes) per returned
            string. Defaults to 4096.

    Returns:
        list[str]: The chunks, in file order.

    Raises:
        ValueError: If ``max_length`` is not a positive integer.
        OSError: If the file cannot be opened.
    """
    if max_length <= 0:
        # A non-positive chunk size could never make progress through a line.
        raise ValueError("max_length must be a positive integer")

    lines = []

    with open(file_path, 'r', encoding='utf-8') as file:
        for raw_line in file:
            # Drop the trailing newline and any surrounding whitespace.
            line = raw_line.strip()
            if not line:
                # Blank lines carry no content worth embedding or storing.
                continue
            # Cut the line into consecutive windows of at most max_length characters.
            lines.extend(
                line[start:start + max_length]
                for start in range(0, len(line), max_length)
            )

    return lines
# Load the corpus: one list entry per stripped, length-capped line of the text file.
docs = read_txt_to_list(
    '/home/lihao/projects/RAG/medqa/textbooks/zh_sentence/all_books.txt'
)

# Build the BGE-M3 embedding function from its options.
embedding_options = {
    "model_name": '/data/model/BAAI/bge-m3',  # Path/name of the model to load
    "device": 'cuda',  # Device to run on, e.g., 'cpu' or 'cuda:0'
}
ef = BGEM3EmbeddingFunction(**embedding_options)

# Embed the whole corpus in a single call; the insertion cell later reads the
# "sparse" and "dense" entries of this result.
doc_embeddings = ef(docs)

# Open the pymilvus connection to the database at this URI.
connections.connect(uri="./medqa_all_book_sentence.db")

Inference Embeddings: 100%|██████████| 19411/19411 [26:29<00:00, 12.21it/s]  


In [2]:
# Dimensionality of the dense vectors stored in "dense_vector".
dense_dim = 1024
fields = [
    # Auto-generated VARCHAR primary key.
    # NOTE(review): the field is named "answer" although it only holds the
    # generated id; the name is kept so the existing schema stays compatible.
    FieldSchema(
        name="answer", dtype=DataType.VARCHAR, is_primary=True, auto_id=True, max_length=65535
    ),
    # Store the original text so it can be returned with search hits.
    # Milvus counts VARCHAR max_length in bytes, while read_txt_to_list caps
    # each entry at 4096 *characters*. UTF-8 needs up to 4 bytes per character
    # (3 for most Chinese characters), so the field is sized for the worst case.
    FieldSchema(name="text", dtype=DataType.VARCHAR, max_length=4096 * 4),
    # Milvus supports both sparse and dense vectors; each is stored in its own
    # field so a hybrid search can be run over both.
    FieldSchema(name="sparse_vector", dtype=DataType.SPARSE_FLOAT_VECTOR),
    FieldSchema(name="dense_vector", dtype=DataType.FLOAT_VECTOR, dim=dense_dim),
]
schema = CollectionSchema(fields)

col_name = "medqa_all_book_sentence"
# Start from a clean slate: drop any existing collection with the same name
# before creating it again with the schema above.
if utility.has_collection(col_name):
    Collection(col_name).drop()
col = Collection(col_name, schema, consistency_level="Strong")

In [3]:
# Index parameters for the two vector fields; both use inner product ("IP").
sparse_index = {
    "index_type": "SPARSE_INVERTED_INDEX",
    "metric_type": "IP",
}
dense_index = {
    "index_type": "AUTOINDEX",
    "metric_type": "IP",
}

# Create the sparse index first, then the dense one, then load the collection.
for field_name, index_params in (
    ("sparse_vector", sparse_index),
    ("dense_vector", dense_index),
):
    col.create_index(field_name, index_params)
col.load()

In [4]:
# Insert the corpus in fixed-size batches. Each batch is column-oriented and
# follows the schema's non-auto fields in order: text, sparse_vector, dense_vector.
batch_size = 50
for start in tqdm(range(0, len(docs), batch_size)):
    window = slice(start, start + batch_size)
    col.insert(
        [
            docs[window],
            doc_embeddings["sparse"][window],
            doc_embeddings["dense"][window],
        ]
    )
# NOTE(review): num_entities may lag behind unflushed inserts on some
# deployments — confirm whether a flush is needed before trusting this count.
print("Number of entities inserted:", col.num_entities)

100%|██████████| 6212/6212 [03:12<00:00, 32.31it/s]


Number of entities inserted: 310568
