In [8]:
import PyPDF2
from sentence_transformers import SentenceTransformer
import numpy as np
import faiss
from langchain_community.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS
from langchain.docstore.document import Document


In [9]:
def extract_text_by_lines(pdf_path):
    """使用 PyPDF2 从 PDF 文件中提取文本，并按行分割。"""
    reader = PyPDF2.PdfReader(pdf_path)
    lines = []
    for page in reader.pages:
        text = page.extract_text()
        if text:
            lines.extend(text.splitlines())  # 按行分割
    return lines


def process_pdf_with_langchain(pdf_path, hf_model):
    """使用LangChain处理PDF文件，按行分割文本并向量化。"""
    lines = extract_text_by_lines(pdf_path)
    documents = [Document(page_content=line) for line in lines if line.strip()]  # 过滤掉空行
    return documents


In [10]:
def store_vectors_with_langchain(documents, hf_model, faiss_db_path):
    """将向量化的内容存储到FAISS中，并保存FAISS索引。"""
    # 使用模型生成向量
    embeddings = hf_model.embed_documents([doc.page_content for doc in documents])
    
    # 创建FAISS VectorStore
    vectorstore = FAISS.from_documents(documents, hf_model)
    
    # 将FAISS索引保存到文件
    vectorstore.save_local(faiss_db_path)

    return vectorstore


In [11]:
def process_and_store_pdf(pdf_path, hf_model, faiss_db_path):
    """处理PDF文件，向量化并存储到FAISS中。"""
    # Step 1: 使用 LangChain 处理PDF并生成文档列表
    documents = process_pdf_with_langchain(pdf_path, hf_model)
    
    # Step 2: 使用 LangChain 存储向量到 FAISS
    vectorstore = store_vectors_with_langchain(documents, hf_model, faiss_db_path)
    
    return vectorstore


In [12]:
# 示例用法
local_em_model_path = "E:\\AAAAWork\\python\\models\\EMB\\bce-embedding-base_v1"  # 替换为你的本地模型路径
hf = HuggingFaceEmbeddings(
    model_name=local_em_model_path,
    model_kwargs={"device": "cpu"},  # 如果使用GPU，将 "cpu" 替换为 "cuda"
    encode_kwargs={"normalize_embeddings": True},
)

pdf_path = R"E:\AAAAWork\python\LLM_RAG\C2012\demo\contents.pdf"  # 替换为你的PDF文件路径
faiss_db_path = "contents.index"  # 指定保存FAISS索引的路径

vectorstore = process_and_store_pdf(pdf_path, hf, faiss_db_path)

In [13]:
# 导入必须的包
from langchain.document_loaders import UnstructuredExcelLoader, Docx2txtLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter
# from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import DashScopeEmbeddings
from langchain.vectorstores import FAISS
# 设置LLM
from langchain_community.chat_models import ChatTongyi
# 引入上下文压缩
from langchain.retrievers import ContextualCompressionRetriever # 上下文压缩检索
from langchain.retrievers.document_compressors import LLMChainExtractor
from langchain.prompts import ChatPromptTemplate

import os


In [14]:
api_key = os.getenv("KEY_TONGYI")
serp_api_key = os.getenv("KEY_SEARCH") #搜索平台Serp的API KEY
os.environ["SERPAPI_API_KEY"] = serp_api_key

# LLM
llm = ChatTongyi(
    dashscope_api_key=api_key,
    temperature=0 # 0-1，越小越倾向与输入一致
)

In [19]:
class ChatDoc:
    def __init__(self, faiss_db_path):
        self.faiss_db_path = faiss_db_path
        self.template = [
            ("system", "你是一个处理文档的秘书,你会根据下面提供的上下文内容来继续回答问题,你从不说自己是一个大模型或者AI助手.\n上下文内容\n{context}\n"),
            ("human", "你好!\n"),
            ("ai", "你好!"),
            ("human", "{question}\n"),
        ]
        self.prompt = ChatPromptTemplate.from_messages(self.template)

    # 加载向量数据库
    def load_vector_db(self):
        # 加载已经保存的 FAISS 索引
        hf = HuggingFaceEmbeddings(
            model_name="E:\\AAAAWork\\python\\models\\EMB\\bce-embedding-base_v1",  # 替换为实际的模型路径
            model_kwargs={"device": "cpu"},
            encode_kwargs={"normalize_embeddings": True},
        )
        db = FAISS.load_local(self.faiss_db_path, hf, allow_dangerous_deserialization=True)
        return db

    # 提问并找到相关文本块
    def askAndFindFiles(self, question):
        db = self.load_vector_db()  # 加载已有的向量数据库
        # 采用上下文压缩的方式
        retriever = db.as_retriever()
        compressor = LLMChainExtractor.from_llm(llm=llm)
        compressor_retriever = ContextualCompressionRetriever(
            base_compressor=compressor, base_retriever=retriever
        )
        return compressor_retriever.get_relevant_documents(query=question)

    # 用自然语言和文档聊天
    def chatWithDoc(self, question):
        _context = ""
        context = self.askAndFindFiles(question)
        for i in context:
            _context += i.page_content
        message = self.prompt.format_messages(context=_context, question=question)
        return llm.invoke(message)


In [20]:
# 创建 ChatDoc 实例，并指定 FAISS 数据库的路径
chat_doc = ChatDoc(faiss_db_path="contents.index")  # 替换为实际的路径

# 进行问题查询
response = chat_doc.chatWithDoc("第一条内容是什么")
print(response)


RuntimeError: Error in __cdecl faiss::FileIOReader::FileIOReader(const char *) at D:\a\faiss-wheels\faiss-wheels\faiss\faiss\impl\io.cpp:68: Error: 'f' failed: could not open content.index\index.faiss for reading: No such file or directory