In [2]:
from langchain_community.document_loaders import DirectoryLoader, UnstructuredFileLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
from langchain_community.vectorstores import FAISS
from langchain_core.output_parsers import StrOutputParser
from langchain_core.prompts import ChatPromptTemplate
from langchain_core.runnables import RunnablePassthrough
#from langchain_huggingface import HuggingFaceEmbeddings

In [None]:
# 1. Load knowledge-base documents (Word / PDF / TXT)
def load_documents(file_path, glob="**/*.docx"):
    """Load every file under ``file_path`` whose path matches ``glob``.

    Args:
        file_path: Directory that holds the knowledge-base files.
        glob: Pattern selecting which files to load. Defaults to
            ``"**/*.docx"``; pass e.g. ``"**/*.pdf"`` or ``"**/*.txt"``
            to load other formats without editing this function.

    Returns:
        The result of ``DirectoryLoader.load()`` for the matched files.
    """
    loader = DirectoryLoader(
        file_path,
        glob=glob,
        loader_cls=UnstructuredFileLoader,
        show_progress=True,
    )
    return loader.load()

In [None]:
# 2. Split the documents into chunks
def split_documents(docs):
    """Split ``docs`` into overlapping chunks.

    The splitter is configured with ``chunk_size=500`` and
    ``chunk_overlap=50`` and tries, in order, blank lines, newlines and
    the Chinese sentence terminators as split points.

    Args:
        docs: Documents to split, as accepted by
            ``RecursiveCharacterTextSplitter.split_documents``.

    Returns:
        The chunks produced by the splitter.
    """
    splitter_options = {
        "chunk_size": 500,  # tune to the content being indexed
        "chunk_overlap": 50,
        "separators": ["\n\n", "\n", "。", "！", "？"],
    }
    splitter = RecursiveCharacterTextSplitter(**splitter_options)
    pieces = splitter.split_documents(docs)
    return pieces

In [None]:
# 3. Build the vector store
def build_vector_store(chunks, model_name="BAAI/bge-small-zh-v1.5", index_path="faiss_index"):
    """Embed ``chunks`` into a FAISS index and save the index to disk.

    Args:
        chunks: Non-empty sequence of document chunks to index.
        model_name: Embedding model passed to ``HuggingFaceEmbeddings``.
            The default is a Chinese embedding model.
        index_path: Folder the index is written to via ``save_local`` so
            it can be reused later.

    Returns:
        The FAISS vector store built from ``chunks``.

    Raises:
        ValueError: If ``chunks`` is empty; there is nothing to index.
    """
    # Fail early with a clear message instead of an obscure error from
    # inside the vector-store constructor.
    if not chunks:
        raise ValueError("chunks is empty: no documents to index")

    # The file-level import of HuggingFaceEmbeddings is commented out, so
    # resolve the class here. Prefer the dedicated package and fall back to
    # the langchain_community implementation when it is not installed.
    try:
        from langchain_huggingface import HuggingFaceEmbeddings
    except ImportError:
        from langchain_community.embeddings import HuggingFaceEmbeddings

    embeddings = HuggingFaceEmbeddings(model_name=model_name)
    vector_db = FAISS.from_documents(chunks, embeddings)
    vector_db.save_local(index_path)  # persist the index for later reuse
    return vector_db

In [None]:
def _format_docs(docs):
    """Join the text of the retrieved documents into one context string."""
    return "\n\n".join(doc.page_content for doc in docs)


def create_qa_chain(vector_db, model="qwen:7b", k=3):
    """Build a retrieval-augmented question-answering chain.

    The chain retrieves the ``k`` most relevant chunks from ``vector_db``,
    inserts their text into the prompt together with the question, sends
    the prompt to a local Ollama chat model and returns the reply as a
    plain string.

    Args:
        vector_db: Vector store exposing ``as_retriever``.
        model: Name of the Ollama model to use (Ollama must be installed
            and serving that model). Other models such as llama3 or
            mistral can be passed instead.
        k: Number of chunks to retrieve per question.

    Returns:
        A runnable chain; call ``.invoke(question)`` with the question
        string to get the answer string.
    """
    # Local LLM served by Ollama. Imported here so the notebook can be
    # loaded even when this optional backend is not in use.
    from langchain_community.chat_models import ChatOllama
    llm = ChatOllama(model=model)

    # Alternative: OpenAI API (requires an API key)
    # from langchain_openai import ChatOpenAI
    # llm = ChatOpenAI(model="gpt-3.5-turbo")

    # Prompt template
    prompt_template = """
    基于以下上下文信息回答问题：
    {context}
    
    问题：{question}
    请用中文给出专业、清晰的回答，如果无法找到答案请说明
    """

    retriever = vector_db.as_retriever(search_kwargs={"k": k})
    prompt = ChatPromptTemplate.from_template(prompt_template)

    # Pipe the retriever through _format_docs so the prompt receives the
    # passages' text rather than the string form of a list of Document
    # objects.
    return (
        {"context": retriever | _format_docs, "question": RunnablePassthrough()}
        | prompt
        | llm
        | StrOutputParser()
    )

In [None]:
# Main workflow
if __name__ == "__main__":
    # Load the knowledge base and split it into chunks.
    documents = load_documents("./knowledge_base/")  # knowledge-base directory
    chunks = split_documents(documents)

    # Build the vector store. To reuse a previously saved index instead of
    # rebuilding it, load it with FAISS.load_local("faiss_index", embeddings),
    # where `embeddings` must be the same embedding model used to build the
    # index (no such variable is defined in this cell).
    # NOTE(review): recent langchain_community releases appear to also require
    # allow_dangerous_deserialization=True for load_local; confirm before use.
    vector_db = build_vector_store(chunks)

    # Set up the question-answering chain.
    qa_chain = create_qa_chain(vector_db)

    # Interactive question loop; type "exit" to quit.
    while True:
        try:
            question = input("\n用户提问：").strip()
        except (EOFError, KeyboardInterrupt):
            # End the session cleanly when input is closed or interrupted.
            break
        if not question:
            # Skip blank input rather than spending a retrieval + LLM call on it.
            continue
        if question.lower() == "exit":
            break
        print("AI回答：", qa_chain.invoke(question))