In [7]:
import os
import json
from pathlib import Path
from typing import List, Dict
from llama_index.core import Document, VectorStoreIndex, StorageContext
from llama_index.core.vector_stores import SimpleVectorStore
from llama_index.core.extractors import BaseExtractor
from llama_index.core.node_parser import SentenceSplitter
from llama_index.core.ingestion import IngestionPipeline
from llama_index.embeddings.openai import OpenAIEmbedding
from llama_index.embeddings.dashscope import DashScopeEmbedding
import nest_asyncio
from llama_index.retrievers.bm25 import BM25Retriever


class UnifiedEmbedding:
    """Thin factory that builds either a DashScope or an OpenAI embedding model.

    The constructed model is stored on ``self.embedder`` and returned by
    :meth:`get_embed_model`.
    """

    # Value of the ``model_name`` default. It names a DashScope model, so it is
    # treated as "caller did not choose a model" when the OpenAI backend is used.
    _DASHSCOPE_DEFAULT_MODEL = "text-embedding-v2"

    def __init__(
        self,
        model_type: str = "dashscope",
        model_name: str = "text-embedding-v2",
        dashscope_text_type: str = "document",
        api_key: str = None,
        base_url: str = None
    ):
        """
        Args:
            model_type: Backend selector. ``"dashscope"`` builds a
                ``DashScopeEmbedding``; any other value builds an
                ``OpenAIEmbedding``.
            model_name: Embedding model name. Always forwarded to DashScope.
                Forwarded to OpenAI only when it differs from the DashScope
                default, otherwise the OpenAI library default is used.
            dashscope_text_type: DashScope text type (used by the DashScope
                backend only).
            api_key: API key passed to the selected backend.
            base_url: Service endpoint (used by the OpenAI backend only, as
                ``api_base``).
        """
        self.model_type = model_type

        if model_type == "dashscope":
            self.embedder = DashScopeEmbedding(
                model_name=model_name,
                text_type=dashscope_text_type,
                api_key=api_key
            )
        else:
            openai_kwargs = {"api_key": api_key, "api_base": base_url}
            # Previously model_name was silently dropped here, so a caller asking
            # for e.g. "text-embedding-3-large" still got the library default.
            # The signature default is a DashScope name and is deliberately not
            # forwarded, so callers relying on defaults keep the old behavior.
            # NOTE(review): OpenAIEmbedding is expected to reject model names it
            # does not know -- confirm against the installed version.
            if model_name and model_name != self._DASHSCOPE_DEFAULT_MODEL:
                openai_kwargs["model"] = model_name
            self.embedder = OpenAIEmbedding(**openai_kwargs)

    def get_embed_model(self):
        """Return the embedding model instance created in ``__init__``."""
        return self.embedder

class CustomExtractor(BaseExtractor):
    """Metadata extractor that re-emits the title, source and publication date of each node."""

    async def aextract(self, nodes) -> List[Dict]:
        """Return one metadata dict per node, in the same order as ``nodes``.

        Each dict carries the ``title``, ``source`` and ``published_at`` values
        read from the node's existing metadata; a missing key raises ``KeyError``.
        """
        extracted: List[Dict] = []
        for node in nodes:
            meta = node.metadata
            extracted.append({
                "title": meta["title"],
                "source": meta["source"],
                "published_at": meta["published_at"],
            })
        return extracted

def _list_json_files(input_path: str) -> list:
    """Return the JSON files to index: the ``*.json`` files directly inside a directory, or the path itself."""
    if os.path.isdir(input_path):
        # Path.glob yields entries in filesystem order; sort so the processing
        # order and the "(i/n)" progress log are reproducible across machines.
        return sorted(str(p) for p in Path(input_path).glob("*.json"))
    return [input_path]


def _load_documents(json_path: str) -> list:
    """Read a JSON array of records and wrap each record in a ``Document``."""
    with open(json_path, 'r', encoding='utf-8') as f:
        records = json.load(f)

    # Each record must provide 'body', 'title', 'published_at' and 'source'.
    return [
        Document(
            text=record['body'],
            metadata={
                "title": record['title'],
                "published_at": record['published_at'],
                "source": record['source']
            }
        ) for record in records
    ]


def _split_into_nodes(documents: list, chunk_size: int, chunk_overlap: int) -> list:
    """Run the sentence splitter followed by ``CustomExtractor`` over the documents."""
    pipeline = IngestionPipeline(transformations=[
        SentenceSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap
        ),
        CustomExtractor()
    ])
    return pipeline.run(documents=documents)


def _make_persist_dir(output_dir: str, json_path: str) -> str:
    """Create (if needed) and return ``<output_dir>/index_<json file stem>``."""
    persist_dir = os.path.join(output_dir, f"index_{Path(json_path).stem}")
    os.makedirs(persist_dir, exist_ok=True)
    return persist_dir


def build_vector_indices(
    input_path: str,
    output_dir: str,
    model_type: str = "dashscope",
    model_name: str = "text-embedding-v2",
    chunk_size: int = 256,
    chunk_overlap: int = 20,
    api_key: str = None,
    base_url: str = None
):
    """
    Build and persist one vector index per input JSON file.

    Args:
        input_path: A single JSON file, or a folder whose ``*.json`` files are
            processed in sorted order.
        output_dir: Root output directory; each file is persisted to
            ``<output_dir>/index_<file stem>``.
        model_type: Embedding backend (openai/dashscope).
        model_name: Embedding model name.
        chunk_size: Chunk size passed to the sentence splitter.
        chunk_overlap: Chunk overlap passed to the sentence splitter.
        api_key: API key for the embedding backend.
        base_url: Service endpoint for the embedding backend.
    """
    # Patch asyncio so nested event loops are allowed
    # (presumably needed because the notebook kernel already runs one).
    nest_asyncio.apply()

    embed_model = UnifiedEmbedding(
        model_type=model_type,
        model_name=model_name,
        api_key=api_key,
        base_url=base_url
    ).get_embed_model()

    json_files = _list_json_files(input_path)
    if not json_files:
        # Make an empty or mistyped folder visible instead of finishing silently.
        print(f"No JSON files found under: {input_path}")
        return

    for idx, json_path in enumerate(json_files):
        print(f"\n处理 {json_path} ({idx+1}/{len(json_files)})")

        nodes = _split_into_nodes(_load_documents(json_path), chunk_size, chunk_overlap)

        # A fresh store per file keeps the persisted indices independent.
        storage_context = StorageContext.from_defaults(vector_store=SimpleVectorStore())

        # Constructed for its effect on storage_context, which is persisted below;
        # the index object itself is not needed afterwards.
        VectorStoreIndex(
            nodes,
            storage_context=storage_context,
            embed_model=embed_model,
            show_progress=True
        )

        persist_dir = _make_persist_dir(output_dir, json_path)
        storage_context.persist(persist_dir=persist_dir)
        print(f"Index persisted at: {persist_dir}")


def build_bm25_indices(
    input_path: str,
    output_dir: str,
    chunk_size: int = 256,
    chunk_overlap: int = 20,
):
    """
    Build and persist one BM25 retriever per input JSON file.

    Args:
        input_path: A single JSON file, or a folder whose ``*.json`` files are
            processed in sorted order.
        output_dir: Root output directory; each file is persisted to
            ``<output_dir>/index_<file stem>``.
        chunk_size: Chunk size passed to the sentence splitter.
        chunk_overlap: Chunk overlap passed to the sentence splitter.
    """
    # Patch asyncio so nested event loops are allowed
    # (presumably needed because the notebook kernel already runs one).
    nest_asyncio.apply()

    json_files = _list_json_files(input_path)
    if not json_files:
        # Make an empty or mistyped folder visible instead of finishing silently.
        print(f"No JSON files found under: {input_path}")
        return

    for idx, json_path in enumerate(json_files):
        print(f"\n处理 {json_path} ({idx+1}/{len(json_files)})")

        nodes = _split_into_nodes(_load_documents(json_path), chunk_size, chunk_overlap)

        bm25retriever = BM25Retriever.from_defaults(nodes=nodes)
        persist_dir = _make_persist_dir(output_dir, json_path)
        bm25retriever.persist(path=persist_dir)
        print(f"Index persisted at: {persist_dir}")

In [None]:
# DashScope usage example: build vector indices for the "balance_3" split.
# The API key is read from the environment rather than hardcoded.
build_vector_indices(
    model_type="dashscope",
    model_name="text-embedding-v2",
    api_key=os.getenv("DASHSCOPE_API_KEY"),
    base_url="https://dashscope.aliyuncs.com/compatible-mode/v1",
    input_path="splits/balance_3",
    output_dir="./embeddings/dashscope/balance_3",
)

In [None]:
# OpenAI usage example: build vector indices for the "balance_3" split.
# Both the API key and the endpoint are read from the environment.
build_vector_indices(
    model_type="openai",
    model_name="text-embedding-3-large",
    api_key=os.getenv("OPENAI_API_KEY"),
    base_url=os.getenv("OPENAI_API_BASE"),
    input_path="splits/balance_3",
    output_dir="./embeddings/openai/balance_3",
)

In [10]:
# BM25 usage example: build one BM25 index per JSON file of the "balance_2" split.
build_bm25_indices(
    output_dir="embeddings/bm25/balance_2",
    input_path="splits/balance_2",
)


处理 splits\balance_2\group_0.json (1/2)


Finding newlines for mmindex:   0%|          | 0.00/12.1M [00:00<?, ?B/s]

Index persisted at: embeddings/bm25/balance_2\index_group_0

处理 splits\balance_2\group_1.json (2/2)


Finding newlines for mmindex:   0%|          | 0.00/10.8M [00:00<?, ?B/s]

Index persisted at: embeddings/bm25/balance_2\index_group_1
