In [3]:
from langchain_chroma import Chroma
from langchain_modelscope import ModelScopeEmbeddings
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter

import logging
logging.getLogger('modelscope').setLevel(logging.ERROR)

# 1. lodad documents.
file_path = 'data/pdf/10803900.pdf' #BYD's 2024 Annual Report.
loader = PyPDFLoader(file_path)
docs = loader.load()

# 2. split documents.
text_splitter = RecursiveCharacterTextSplitter(chunk_size=500, chunk_overlap=50, 
                                               separators=["\n\n", "\n", "。", "！", "？", "；", "……"])
all_splits = text_splitter.split_documents(docs)

# 3. make embeddings.
embedding_model_id = "iic/nlp_gte_sentence-embedding_chinese-large"
embeddings = ModelScopeEmbeddings(model_id=embedding_model_id)

# 4. make vector store.
vector_store = Chroma(collection_name="byd_annual_report", embedding_function=embeddings, persist_directory="./chroma_db")
ids = vector_store.add_documents(all_splits)
print(ids)

Downloading Model from https://www.modelscope.cn to directory: /home/liule/.cache/modelscope/hub/models/iic/nlp_gte_sentence-embedding_chinese-large




['ff1cf99e-6642-4822-8729-a5c64bc3fc1e', '0b3eca2f-b9d5-42cf-9add-3728ddd8fe18', 'd0d06bc5-849c-4248-8e66-b462b84336a6', 'c5d6dfe4-9446-4bd4-85a0-7b7157d83e2f', '2419ea37-7a54-43fd-a40b-2a94d1028e28', 'bbdbdb4f-5f7e-47ce-8fc6-ae96fac5f037', 'fe69f528-27f5-4867-8064-8ea990395c06', 'e5ea4196-607c-42ff-9cf7-ec1a4a5b7cd8', '38e82be0-6704-43dc-a939-7c7fb5f45d41', '48b64e69-6495-42ce-ab9c-a93e4bc65d99', '650eb29a-9306-4104-8d15-46c903b16f9d', '19e2bf04-3d8c-4919-9445-eddda507f02d', '97012712-e24f-405b-97ce-3d19a60760f7', '47e594c7-84c5-4c29-9db6-bd5d2b62a6e1', '05bda851-ff40-4c3e-9d23-102a8ceb2a2a', '00934c80-e97f-445b-871e-ef89ff017336', 'fcc8c923-0817-425c-87d5-9df7b8233701', '5e76c515-09e7-4f83-91ee-744f36fbc121', '0f25584e-f7d4-411d-8fd1-249cf7fb838a', 'ff6a1cc2-7be9-4690-a9e7-a83f29b89a48', '0f79e9d1-39d8-417a-b91e-01f1d2315448', '32af7193-bf1d-408c-a77c-bf632e60425f', '17dd25ef-01e6-4a3c-a65f-012b206e7f55', 'ffefd26b-d604-4de7-a134-68ac1585d28c', 'f86e8141-98e8-46ba-bed7-1a2440783925',

In [12]:
from langchain_deepseek import ChatDeepSeek
from langchain_modelscope import ModelScopeEmbeddings
from langchain_core.prompts import ChatPromptTemplate

embedding_model_id = "iic/nlp_gte_sentence-embedding_chinese-large"
embeddings = ModelScopeEmbeddings(model_id=embedding_model_id)

vector_store = Chroma(collection_name="byd_annual_report", embedding_function=embeddings, persist_directory="./chroma_db")

question = "比亚迪的主营业务是什么？"
docs = vector_store.similarity_search(question, k=3)
context = "\n".join([doc.page_content for doc in docs])

llm = ChatDeepSeek(model='deepseek-chat')

prompt = ChatPromptTemplate.from_messages([
    ("system", 
     """
    请根据以下上下文回答问题。如果无法从中得到答案，请说 “根据已知信息无法回答该问题” 或 “没有提供足够的相关信息”，不允许在答案中添
    加编造成分，答案请使用中文。
    """),
    ("user", 
     """
    上下文: {context}
    问题: {question}
    """)
])

messages = prompt.invoke({"context": context, "question": question}).to_messages()
ai_mesage = llm.invoke(messages)
answer = ai_mesage.content
print(answer)




Downloading Model from https://www.modelscope.cn to directory: /home/liule/.cache/modelscope/hub/models/iic/nlp_gte_sentence-embedding_chinese-large




比亚迪的主营业务包括：锂离子电池以及其他电池、充电器、电子产品、仪器仪表、柔性线路板、五金制品、液晶显示器、手机零配件、模具、塑胶制品及其相关附件的生产、销售；3D眼镜、GPS导航产品的研发、生产及销售；作为比亚迪汽车有限公司比亚迪品牌乘用车、电动车的总经销商，从事上述品牌的乘用车、电动车及其零部件的营销、批发和出口，提供售后服务；电池管理系统、换流柜、逆变柜/器、汇流箱、开关柜、储能机组的销售。
