In [7]:
import json
from collections import defaultdict
import pandas as pd
from langchain.vectorstores import FAISS  
from langchain.embeddings import HuggingFaceBgeEmbeddings

# Embedding model handed to the FAISS store: BAAI/bge-large-zh-v1.5, run on
# CPU, with normalize_embeddings enabled for encoding.
embeddings = HuggingFaceBgeEmbeddings(
    model_name='BAAI/bge-large-zh-v1.5',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

# Forward slashes keep this relative path valid on Windows as well as on
# Linux/macOS; a backslash-separated literal only resolves on Windows.
FAISS_INDEX_DIR = "output/v1/FAISS/bge_large_v1.5/faiss_index"

# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index data, which can execute arbitrary code. Only load index folders that
# come from a trusted source.
vector_db = FAISS.load_local(
    FAISS_INDEX_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)
# ASCII control characters (everything below 0x20 except tab, LF and CR) mapped
# to None, so str.translate() removes them before text is placed in a worksheet
# cell (openpyxl refuses to write such characters).
_XLSX_CONTROL_CHARS = dict.fromkeys(c for c in range(32) if c not in (9, 10, 13))


def _preview(text, limit=100):
    """Return ``text`` cut to ``limit`` characters, with "..." appended if it was cut."""
    return (text[:limit] + "...") if len(text) > limit else text


def _xlsx_safe(value):
    """Return a worksheet-friendly cell value: plain scalars pass through, anything else becomes cleaned text."""
    if value is None or isinstance(value, (bool, int, float)):
        return value
    return str(value).translate(_XLSX_CONTROL_CHARS)


def _write_excel_report(report, export_excel):
    """Write ``report`` to an .xlsx workbook: summary, duplicates and unknown_source sheets."""
    summary_rows = [
        {"metric": key, "value": report[key]}
        for key in ("total_documents", "duplicates_found", "unknown_source_files")
    ]
    # One row per document inside each duplicate group; "group" ties together
    # the rows that share the same content.
    duplicate_rows = [
        {
            "group": group_no,
            "content_sample": _xlsx_safe(group["content_sample"]),
            "uuid": _xlsx_safe(match["uuid"]),
            "source_file": _xlsx_safe(match["source_file"]),
        }
        for group_no, group in enumerate(report["duplicate_content_docs"], start=1)
        for match in group["matched_docs"]
    ]
    unknown_rows = [
        {"uuid": _xlsx_safe(entry["uuid"]), "content": _xlsx_safe(entry["content"])}
        for entry in report["unknown_source_docs"]
    ]

    # Explicit column lists keep the header row even when a section is empty.
    with pd.ExcelWriter(export_excel) as writer:
        pd.DataFrame(summary_rows, columns=["metric", "value"]).to_excel(
            writer, sheet_name="summary", index=False
        )
        pd.DataFrame(
            duplicate_rows, columns=["group", "content_sample", "uuid", "source_file"]
        ).to_excel(writer, sheet_name="duplicates", index=False)
        pd.DataFrame(unknown_rows, columns=["uuid", "content"]).to_excel(
            writer, sheet_name="unknown_source", index=False
        )


def detect_duplicates_and_missing_metadata(vector_db, export_json="dedup_report.json", export_excel="dedup_report.xlsx"):
    """Report documents with identical content and documents lacking a source file.

    Every document in ``vector_db.docstore`` is grouped by its stripped
    ``page_content``. Groups holding more than one document are reported as
    duplicates. A document is reported as having an unknown source when its
    ``source_file`` metadata is absent, ``None`` or empty.

    Args:
        vector_db: Vector store whose ``docstore._dict`` maps ids to documents
            exposing ``page_content`` and ``metadata``.
        export_json: Path of the JSON report (always written, UTF-8).
        export_excel: Path of the Excel report. Pass ``None`` or ``""`` to skip
            it. If no Excel writer engine is installed the Excel export is
            skipped with a message and the JSON report is still produced.

    Returns:
        None. The results are written to the report file(s).
    """
    # Fetch every stored document.
    # NOTE(review): this reads the docstore's private ``_dict`` mapping, so it
    # presumably only works for an in-memory docstore - confirm if the store
    # type ever changes.
    print("Loading all documents from vector_db...")
    all_docs = list(vector_db.docstore._dict.values())

    print(f"Total documents loaded: {len(all_docs)}")

    # Mapping: stripped content -> [{uuid, source_file}, ...]
    content_to_docs = defaultdict(list)
    unknown_source_docs = []

    for doc in all_docs:
        content = doc.page_content.strip()
        metadata = doc.metadata or {}  # tolerate metadata=None
        uuid = metadata.get("uuid", "unknown")
        # `or` also catches a key that exists but holds None / "".
        source_file = metadata.get("source_file") or "unknown"

        content_to_docs[content].append({
            "uuid": uuid,
            "source_file": source_file
        })

        # Record documents whose source file is missing.
        if source_file == "unknown":
            unknown_source_docs.append({
                "uuid": uuid,
                "content": _preview(content)
            })

    # Content shared by more than one document counts as a duplicate group.
    duplicate_content_docs = [
        {"content_sample": _preview(content), "matched_docs": docs}
        for content, docs in content_to_docs.items()
        if len(docs) > 1
    ]

    # Summary.
    report = {
        "total_documents": len(all_docs),
        "duplicates_found": len(duplicate_content_docs),
        "unknown_source_files": len(unknown_source_docs),
        "duplicate_content_docs": duplicate_content_docs,
        "unknown_source_docs": unknown_source_docs
    }

    # Export the JSON file first so it exists even if the Excel export is skipped.
    with open(export_json, "w", encoding="utf-8") as f:
        json.dump(report, f, ensure_ascii=False, indent=2)
    print(f"Duplicate & metadata report saved to {export_json}")

    # Export the Excel workbook (the parameter used to be accepted but ignored).
    if export_excel:
        try:
            _write_excel_report(report, export_excel)
        except ImportError as exc:
            # A missing Excel engine (e.g. openpyxl) must not fail the audit.
            print(f"Excel export skipped: {exc}")
        else:
            print(f"Duplicate & metadata report saved to {export_excel}")


# Audit the loaded index, using the function's default report paths.
detect_duplicates_and_missing_metadata(vector_db)


Loading all documents from vector_db...
Total documents loaded: 1038
Duplicate & metadata report saved to dedup_report.json


In [1]:
from langchain.vectorstores import FAISS
from langchain.embeddings import HuggingFaceBgeEmbeddings
from tqdm import tqdm
import json
import os

# Initialization: embedding model handed to the FAISS store
# (BAAI/bge-large-zh-v1.5 on CPU, normalize_embeddings enabled).
embeddings = HuggingFaceBgeEmbeddings(
    model_name='BAAI/bge-large-zh-v1.5',
    model_kwargs={'device': 'cpu'},
    encode_kwargs={'normalize_embeddings': True},
)

# Forward slashes keep this relative path valid on Windows as well as on
# Linux/macOS; a backslash-separated literal only resolves on Windows.
FAISS_INDEX_DIR = "output/v1/FAISS/bge_large_v1.5/faiss_index"

# SECURITY: allow_dangerous_deserialization=True permits unpickling the stored
# index data, which can execute arbitrary code. Only load index folders that
# come from a trusted source.
vector_db = FAISS.load_local(
    FAISS_INDEX_DIR,
    embeddings,
    allow_dangerous_deserialization=True,
)

# Load the test data: a JSON list of records that each carry a "question" and
# an "answer" field.
# NOTE(review): this is a machine-specific absolute path; the notebook cannot
# be re-run elsewhere until it is made relative or configurable.
TEST_DATA_PATH = r'D:\desktop\code\QA\test.json'
with open(TEST_DATA_PATH, mode='r', encoding='utf-8') as f:
    test_data = json.loads(f.read())

# Two parallel lists, in file order: the questions and their reference answers.
test_queries = [record["question"] for record in test_data]
correct_answers = [record["answer"] for record in test_data]

# For each test query, collect the 20 documents returned by similarity_search
# and store them, ranked from 1, next to the reference answer.
# Note: the loop runs over every test query, not only the first 20.
export_data = []

for query, correct_answer in tqdm(
    zip(test_queries, correct_answers),
    total=len(test_queries),
    desc="导出TOP20",
):
    results = vector_db.similarity_search(query, k=20)
    retrieved_documents = [
        {"rank": rank, "content": hit.page_content}
        for rank, hit in enumerate(results, start=1)
    ]
    export_data.append({
        "query": query,
        "correct_answer": correct_answer,
        "retrieved_documents": retrieved_documents,
    })

# Save: make sure the output folder exists, then write the export as
# pretty-printed UTF-8 JSON (non-ASCII text kept as-is, not escaped).
export_dir = "./manual_check"
os.makedirs(export_dir, exist_ok=True)
export_file = f"{export_dir}/sample_top20_export.json"
with open(export_file, "w", encoding="utf-8") as f:
    f.write(json.dumps(export_data, ensure_ascii=False, indent=2))

print(f"导出完成：{export_file}")


  embeddings = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm
导出TOP20: 100%|██████████| 97/97 [00:12<00:00,  7.53it/s]

导出完成：./manual_check/sample_top20_export.json



