In [5]:
import os
import os
from dotenv import load_dotenv
# Load environment variables from the openai.env file into the process
# environment.
load_dotenv("openai.env")

# Read the OpenAI settings back from the environment. Either value is None
# when the variable is defined neither in openai.env nor in the shell.
api_key = os.getenv("OPENAI_API_KEY")
api_base = os.getenv("OPENAI_API_BASE")

# os.environ only accepts str values: assigning None raises TypeError, which
# used to abort this cell whenever a variable was missing. Only write back the
# values that are actually set.
if api_key is not None:
    os.environ["OPENAI_API_KEY"] = api_key
if api_base is not None:
    os.environ["OPENAI_API_BASE"] = api_base

### ChatDoc:和文件聊天

In [6]:
# 导入必须的包
from langchain.document_loaders import UnstructuredExcelLoader, Docx2txtLoader, PyPDFLoader
from langchain.text_splitter import CharacterTextSplitter

from langchain.vectorstores import Chroma
# 导入聊天的模块
from langchain_ollama import ChatOllama, OllamaEmbeddings
from langchain.prompts import ChatPromptTemplate

# Ollama model tag; ChatDoc passes it to both OllamaEmbeddings and ChatOllama.
model_name = "qwen2.5:7b"
# Define ChatDoc


class ChatDoc:
    """Answer questions about a single local document (docx, pdf or xlsx).

    Typical use: assign a file path to ``doc``, call ``splitSentences()`` to
    load and chunk the file, then call ``chatWithDoc(question)``.

    Attributes:
        doc: Path of the document to chat with; ``None`` until the caller
            sets it.
        splitText: Chunks produced by the last successful ``splitSentences()``.
        template: Message tuples the chat prompt is built from.
        prompt: ``ChatPromptTemplate`` created from ``template``.
    """

    def __init__(self):
        self.doc = None
        self.splitText = []  # chunks of the loaded document
        self.template = [
            ('system',
             "你是一个处理文档的秘书,你从不说自己是一个大模型或者AI助手,你会根据下面提供的上下文内容来继续回答问题.\n 上下文内容\n {context} \n"),
            ("human", "你好！"),
            ("ai", "你好"),
            ("human", "{question}")
        ]
        self.prompt = ChatPromptTemplate.from_messages(self.template)
        # Vector-store cache: the store itself plus the chunk list (and its
        # length) it was built from, so a changed splitText triggers a rebuild.
        self._db = None
        self._db_source = None
        self._db_size = 0

    def getFile(self):
        """Load ``self.doc`` with the loader that matches its file extension.

        Returns:
            The list of documents returned by the loader, or ``None`` when no
            document is set, the extension is unsupported, or loading fails.
            Every failure is reported with ``print`` rather than raised.
        """
        doc = self.doc
        if not doc:
            # Previously this fell through to doc.split() and raised
            # AttributeError on the initial None value.
            print("No document set: assign a file path to `doc` first")
            return None

        loaders = {
            "docx": Docx2txtLoader,
            "pdf": PyPDFLoader,
            "xlsx": UnstructuredExcelLoader,
        }
        # Lower-case the extension so "REPORT.DOCX" is handled like
        # "report.docx".
        file_extension = str(doc).split(".")[-1].lower()
        loader_class = loaders.get(file_extension)
        if loader_class is None:
            print(f"Unsupported file extension: {file_extension}")
            return None

        try:
            return loader_class(doc).load()
        except Exception as e:
            # Best effort: report the problem and let the caller see None.
            print(f"Error loading {file_extension} files:{e}")
            return None

    # Document processing
    def splitSentences(self, chunk_size=141, chunk_overlap=20):
        """Load the document and store its chunks in ``self.splitText``.

        Args:
            chunk_size: ``chunk_size`` passed to ``CharacterTextSplitter``.
            chunk_overlap: ``chunk_overlap`` passed to
                ``CharacterTextSplitter``.

        When the document cannot be loaded, ``self.splitText`` is left
        untouched.
        """
        full_text = self.getFile()
        if full_text is None:
            return

        text_split = CharacterTextSplitter(
            chunk_size=chunk_size,
            chunk_overlap=chunk_overlap,
        )
        self.splitText = text_split.split_documents(full_text)

    # Embedding and vector storage
    def embeddingAndVectorDB(self):
        """Return a Chroma vector store built from ``self.splitText``.

        The store is cached and reused for as long as ``self.splitText`` is
        the same list with the same length, so the chunks are not sent through
        ``Chroma.from_documents`` again for every question.

        Returns:
            The Chroma vector store.

        Raises:
            ValueError: If there are no chunks to index.
        """
        if not self.splitText:
            raise ValueError(
                "No text chunks to index: set `doc` and call "
                "splitSentences() first")

        cached = getattr(self, "_db", None)
        if (cached is not None
                and self._db_source is self.splitText
                and self._db_size == len(self.splitText)):
            return cached

        embeddings = OllamaEmbeddings(model=model_name)
        db = Chroma.from_documents(
            documents=self.splitText,
            embedding=embeddings,
        )
        self._db = db
        self._db_source = self.splitText
        self._db_size = len(self.splitText)
        return db

    # Ask a question and find the relevant chunks
    def askAndFindFiles(self, question):
        """Return the chunks the MMR retriever selects for ``question``.

        Args:
            question: The user's question as a string.

        Returns:
            The list of documents returned by the retriever.
        """
        db = self.embeddingAndVectorDB()
        # Alternative retrieval mode that was tried here:
        # search_type="similarity_score_threshold" with
        # search_kwargs={"score_threshold": .5, "k": 1}.
        retriever = db.as_retriever(search_type="mmr")
        # Retrievers are runnables; invoke() replaces the deprecated
        # get_relevant_documents().
        return retriever.invoke(question)

    def chatWithDoc(self, question):
        """Answer ``question`` using the retrieved chunks as context.

        Args:
            question: The user's question as a string.

        Returns:
            Whatever ``ChatOllama.invoke`` returns for the formatted prompt.
        """
        context = self.askAndFindFiles(question)
        _content = "".join(chunk.page_content for chunk in context)

        messages = self.prompt.format_messages(
            context=_content, question=question)
        chat = ChatOllama(
            model=model_name,
            temperature=0.0,
        )
        return chat.invoke(messages)


# Create the assistant, point it at the sample document and chunk it.
chat_doc = ChatDoc()
chat_doc.doc = "example/fake.docx"
chat_doc.splitSentences()

# Ask about the company's result; the returned message is the cell output.
chat_doc.chatWithDoc(question="公司是亏损还是盈利？给出具体数额")

AIMessage(content='根据提供的信息，宏图科技发展有限公司在最近一个会计年度出现了亏损，具体数额为800万元人民币。去年同期则是盈利200万元人民币。', additional_kwargs={}, response_metadata={'model': 'qwen2.5:7b', 'created_at': '2025-01-07T04:27:13.22642Z', 'done': True, 'done_reason': 'stop', 'total_duration': 1529640875, 'load_duration': 11721000, 'prompt_eval_count': 334, 'prompt_eval_duration': 732000000, 'eval_count': 36, 'eval_duration': 780000000, 'message': Message(role='assistant', content='', images=None, tool_calls=None)}, id='run-1da299b3-05c9-4815-92c1-61666fc9bdc9-0', usage_metadata={'input_tokens': 334, 'output_tokens': 36, 'total_tokens': 370})