为了让代码更通用、更具复用价值，这里构建了一个 Chroma 向量数据库，用于存储处理过程中生成的向量。

In [None]:
# !pip install -qU langchain-core chromadb langchain-chroma

In [1]:
import os
import pandas as pd
from tqdm import tqdm

# Number of nearest reference names retrieved for each query name; it also
# determines how many ref_company_i / score_i column pairs the output has.
TOP_K = 5

In [2]:
from langchain_chroma import Chroma
from langchain_core.documents import Document

In [3]:
from modelscope import snapshot_download

# Fetch the BAAI/bge-m3 model through ModelScope; the returned path is later
# passed to the embedding wrapper as its model_name.
model_dir = snapshot_download("BAAI/bge-m3")

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model to directory: C:\Users\1\.cache\modelscope\hub\BAAI/bge-m3


In [4]:
from langchain_huggingface import HuggingFaceEmbeddings


# Embedding function shared by every vector store built below; it loads the
# model from the directory downloaded above rather than by hub name.
hf_embedding = HuggingFaceEmbeddings(
    model_name=model_dir,
    # model_kwargs={"trust_remote_code": True}
)

In [5]:
def delete_words(text):
    """Remove generic legal-form words from a company name.

    Every occurrence of each listed word is removed (not only a trailing
    one), so that only the distinctive part of the name is left for
    embedding and similarity search.

    Args:
        text: Company name as a ``str``.

    Returns:
        The name with all listed words removed.
    """
    delete_word_list = [
        "有限公司",
        "有限责任公司",
        "股份有限公司",
        "控股集团公司",
        "控股有限责任公司",
        "控股股份有限公司",
        "集团",
        "公司",
        "合作社",
    ]
    # Strip the longest words first. Several entries contain shorter ones
    # (e.g. "股份有限公司" contains "有限公司"); removing the short form first
    # would leave residue such as "股份" or "控股" and the longer entries
    # would never match.
    for word in sorted(delete_word_list, key=len, reverse=True):
        text = text.replace(word, "")
    return text

In [6]:
# Result workbooks are written into ./output; create it up front and do not
# fail when it already exists.
os.makedirs("output", exist_ok=True)

In [7]:
fold = "data/"

# Header of every result sheet: the query name followed by TOP_K
# (matched name, score) column pairs.
result_columns = ["raw_company"] + [
    column
    for i in range(1, TOP_K + 1)
    for column in (f"ref_company_{i}", f"score_{i}")
]

# For each workbook in `fold`: index the names in column "名称2", look up the
# TOP_K closest ones for every name in column "名称1", and write the matches to
# a workbook of the same name under ./output.
# Sorted so the workbooks are processed in a deterministic order.
for file in sorted(os.listdir(fold)):
    file_name = os.path.join(fold, file)
    # Only .xlsx workbooks are processed. Files starting with "~$" are the
    # lock files Excel keeps next to an open workbook; they cannot be parsed.
    if not file_name.endswith(".xlsx") or file.startswith("~$"):
        continue
    tmp_df = pd.read_excel(file_name)
    raw_companies = tmp_df["名称1"].to_list()
    ref_companies = tmp_df["名称2"].to_list()

    # Empty cells are read as NaN (float); skip them because delete_words
    # only works on strings. idx is still the row position so ids stay stable.
    documents = [
        Document(
            page_content=delete_words(ref_company),
            id=str(idx),
            metadata={"ref_company": ref_company},
        )
        for idx, ref_company in enumerate(ref_companies)
        if isinstance(ref_company, str)
    ]
    if not documents:
        print(f"No reference names found in {file_name}, skipped")
        continue

    # One collection per workbook. No persist_directory is given, so nothing
    # is written to disk by the store.
    # NOTE(review): the file name is used as the collection name — confirm it
    # satisfies Chroma's collection-name rules for every input file.
    vector_store = Chroma(
        collection_name=file,
        embedding_function=hf_embedding,
    )

    vector_store.add_documents(documents=documents)
    data = []
    for raw_company in tqdm(raw_companies, desc=file):
        # Skip empty / non-text cells in the query column.
        if not isinstance(raw_company, str):
            continue
        # NOTE(review): the returned score is presumably a distance
        # (lower = more similar) — confirm before ranking on it.
        relevant_companies = vector_store.similarity_search_with_score(
            delete_words(raw_company), k=TOP_K
        )
        ans = [raw_company]
        for relevant_company, score in relevant_companies:
            ans.extend((relevant_company.metadata["ref_company"], score))
        # Fewer than TOP_K hits are returned when the collection is small;
        # pad so every row matches the header width.
        ans.extend([None] * (len(result_columns) - len(ans)))
        data.append(ans)

    result_df = pd.DataFrame(data, columns=result_columns)

    output_file = os.path.join("output", file)
    result_df.to_excel(output_file, index=False)
    print(f"Output saved to {output_file}")

100%|██████████| 559/559 [00:05<00:00, 103.52it/s]


Output saved to output\A21.xlsx


100%|██████████| 624/624 [00:05<00:00, 106.59it/s]


Output saved to output\A31.xlsx


100%|██████████| 669/669 [00:10<00:00, 65.88it/s]


Output saved to output\A41.xlsx


100%|██████████| 556/556 [00:13<00:00, 41.23it/s]


Output saved to output\A51.xlsx


100%|██████████| 2317/2317 [00:33<00:00, 69.09it/s]


Output saved to output\AA.xlsx
