In [2]:
from llama_index.core import VectorStoreIndex, SimpleDirectoryReader, Settings
from llama_index.embeddings.huggingface import HuggingFaceEmbedding
from llama_index.llms.huggingface import HuggingFaceLLM
from llama_index.core.postprocessor import SentenceTransformerRerank
from llama_index.core.query_engine import RetrieverQueryEngine
from llama_index.core.retrievers import VectorIndexRetriever
from llama_index.core.postprocessor import SimilarityPostprocessor
import torch
from transformers import AutoTokenizer, AutoModelForCausalLM

from llama_index.embeddings.huggingface import HuggingFaceEmbedding


Looking in indexes: https://mirrors.aliyun.com/pypi/simple/


ERROR: Could not find a version that satisfies the requirement llama_index.postprocessor.sentence_transformer_rerank (from versions: none)
ERROR: No matching distribution found for llama_index.postprocessor.sentence_transformer_rerank


In [None]:
# 1. 配置基础 LLM 模型 (Qwen3-7B)
def setup_base_llm(model_path="./model/qwen3-7b"):
    """Build the wrapper around the local Qwen3-7B base model.

    Args:
        model_path: Location of the model; used as both the model name and
            the tokenizer name.

    Returns:
        The configured ``HuggingFaceLLM`` instance.
    """
    # Sampling settings, passed through as ``generate_kwargs``.
    sampling_options = {
        "temperature": 0.1,
        "do_sample": True,
        "top_p": 0.9,
    }
    # Loading settings, passed through as ``model_kwargs``: half-precision
    # weights and "auto" device mapping.
    loading_options = {
        "torch_dtype": torch.float16,
        "device_map": "auto",
    }
    tokenizer_options = {"trust_remote_code": True}

    return HuggingFaceLLM(
        model_name=model_path,
        tokenizer_name=model_path,
        context_window=8192,
        max_new_tokens=2048,
        generate_kwargs=sampling_options,
        model_kwargs=loading_options,
        tokenizer_kwargs=tokenizer_options,
    )

# 2. 配置 Embedding 模型
def setup_embedding_model(model_path="./model/qwen-embedding"):
    """Build the wrapper around the local Qwen embedding model.

    Args:
        model_path: Location of the embedding model handed to
            ``HuggingFaceEmbedding``.

    Returns:
        The configured ``HuggingFaceEmbedding`` instance.
    """
    # Use the GPU when torch reports CUDA as available, otherwise the CPU.
    if torch.cuda.is_available():
        target_device = "cuda"
    else:
        target_device = "cpu"

    return HuggingFaceEmbedding(
        model_name=model_path,
        trust_remote_code=True,
        device=target_device,
    )

# 3. 配置 Rerank 模型
def setup_rerank_model(model_path="./model/qwen-rerank", top_n=5):
    """Build the reranker backed by the local Qwen rerank model.

    Args:
        model_path: Location of the rerank model.
        top_n: Value forwarded as the reranker's ``top_n`` setting.

    Returns:
        The configured ``SentenceTransformerRerank`` instance.
    """
    has_gpu = torch.cuda.is_available()
    reranker_options = {
        "model": model_path,
        "top_n": top_n,
        # Run on the GPU when CUDA is available, otherwise on the CPU.
        "device": "cuda" if has_gpu else "cpu",
    }
    return SentenceTransformerRerank(**reranker_options)

# 4. 主要配置函数
def setup_rag_system(
    data_path="./data",
    similarity_top_k=20,
    rerank_top_n=5,
    *,
    similarity_cutoff=0.7,
):
    """Assemble the complete RAG pipeline and return its query engine.

    Loads the LLM, embedding and rerank models, installs the LLM and the
    embedding model on the global ``Settings`` object, indexes every document
    found under ``data_path`` and wires a retriever plus post-processors into
    a query engine.

    Args:
        data_path: Directory read by ``SimpleDirectoryReader``.
        similarity_top_k: Number of nodes requested from the vector retriever
            (first retrieval stage).
        rerank_top_n: ``top_n`` passed to the rerank model (final stage).
        similarity_cutoff: Cutoff given to ``SimilarityPostprocessor``, which
            runs before the reranker. Defaults to ``0.7``, the value that was
            previously hard-coded. Pass ``None`` to skip the similarity
            filter and send all retrieved nodes straight to the reranker.

    Returns:
        The configured ``RetrieverQueryEngine``.

    Side effects:
        Overwrites ``Settings.llm`` and ``Settings.embed_model`` and prints
        progress messages to standard output.
    """
    # Initialise the models.
    llm = setup_base_llm()
    embed_model = setup_embedding_model()
    rerank_model = setup_rerank_model(top_n=rerank_top_n)

    # Install them as the global defaults.
    Settings.llm = llm
    Settings.embed_model = embed_model

    # Load the documents.
    print("加载文档...")
    documents = SimpleDirectoryReader(data_path).load_data()

    # Build the vector index.
    print("构建向量索引...")
    index = VectorStoreIndex.from_documents(documents)

    # First-stage retriever.
    retriever = VectorIndexRetriever(
        index=index,
        similarity_top_k=similarity_top_k,
    )

    # Post-processing chain: optional similarity filter, then the reranker.
    # NOTE(review): a suitable cutoff depends on the score range of the
    # embedding model in use; if it is set too high the reranker may receive
    # no nodes at all, which is why it is configurable.
    node_postprocessors = []
    if similarity_cutoff is not None:
        node_postprocessors.append(
            SimilarityPostprocessor(similarity_cutoff=similarity_cutoff)
        )
    node_postprocessors.append(rerank_model)

    return RetrieverQueryEngine(
        retriever=retriever,
        node_postprocessors=node_postprocessors,
    )

# 5. 使用示例
def main():
    """Run an interactive question-answering loop on the console.

    Builds the RAG query engine once, then repeatedly reads a question from
    standard input and prints the answer followed by a separator line.

    The loop ends when the user enters ``quit`` (case-insensitive, with
    surrounding whitespace ignored), or when the input stream is closed or
    interrupted at the prompt. Blank input is ignored instead of being sent
    to the query engine.
    """
    # Initialise the RAG system.
    query_engine = setup_rag_system(
        data_path="./data",   # directory holding the documents
        similarity_top_k=20,  # nodes fetched by the first retrieval stage
        rerank_top_n=5,       # nodes kept after reranking
    )

    while True:
        try:
            question = input("请输入问题 (输入 'quit' 退出): ").strip()
        except (EOFError, KeyboardInterrupt):
            # End of input (e.g. Ctrl-D) or Ctrl-C at the prompt: finish the
            # current terminal line and leave without a traceback.
            print()
            break

        if not question:
            # Nothing was typed; ask again rather than querying with "".
            continue
        if question.lower() == "quit":
            break

        print("检索中...")
        response = query_engine.query(question)
        print(f"回答: {response}")
        print("-" * 50)

# 6. 高级配置 - 自定义 Prompt 模板
def setup_custom_prompt():
    """Create the custom question-answering prompt template.

    The template text (in Chinese) presents the retrieved context, asks for an
    answer based on that context rather than prior knowledge, and asks the
    model to say so when the context does not contain the answer.

    Returns:
        A ``PromptTemplate`` with ``{context_str}`` and ``{query_str}``
        placeholders.
    """
    from llama_index.core import PromptTemplate

    # One entry per line of the prompt; joined with newlines below. The last
    # entry deliberately has no trailing newline.
    template_lines = [
        "上下文信息如下:",
        "---------------------",
        "{context_str}",
        "---------------------",
        "根据上下文信息而不是先验知识，回答以下问题。如果上下文中没有相关信息，请说明无法从提供的信息中找到答案。",
        "问题: {query_str}",
        "回答: ",
    ]
    return PromptTemplate("\n".join(template_lines))

# 7. 批量处理函数
def batch_query(query_engine, questions):
    """Run every question through the query engine and collect the results.

    Args:
        query_engine: Object exposing ``query(question)``. Each response it
            returns must be convertible with ``str()`` and have a
            ``source_nodes`` attribute.
        questions: Iterable of questions. Any iterable is accepted, including
            generators; it is materialized once so that the total count can
            be shown in the progress output.

    Returns:
        A list with one dict per question, in input order, holding the keys
        ``"question"``, ``"answer"`` (the response as a string) and
        ``"source_nodes"``. An empty input yields an empty list.
    """
    # Materialize first: generators and other unsized iterables have no len().
    pending = list(questions)
    total = len(pending)

    results = []
    for position, question in enumerate(pending, 1):
        print(f"处理问题 {position}/{total}: {question}")
        response = query_engine.query(question)
        results.append({
            "question": question,
            "answer": str(response),
            "source_nodes": response.source_nodes,
        })
    return results

# Start the interactive loop only when this file is executed as a script.
if __name__ == "__main__":
    main()