In [None]:
!pip install transformers modelscope langchain sentence_transformers langchain-huggingface onnxruntime

## 根据langchain的文档，安装chroma
## https://python.langchain.com/docs/integrations/vectorstores/chroma/
!pip install -qU "langchain-chroma>=0.1.2"

In [1]:
import os
from langchain_huggingface import HuggingFaceEmbeddings
from langchain_community.document_loaders.text import TextLoader
from langchain_text_splitters import CharacterTextSplitter
from langchain_chroma import Chroma

# embedding 模型下载
from modelscope import snapshot_download
model_dir = snapshot_download('BAAI/bge-m3')

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model from https://www.modelscope.cn to directory: /home/jie/.cache/modelscope/hub/models/BAAI/bge-m3


In [5]:
# load the document and split it into chunks
chunks = []
PREFIX = "以下是{file_name}年政府工作报告的节选部分:"
fold = "data"
for file in os.listdir(fold):
    file_name = os.path.join(fold, file)
    if file.endswith(".txt"):
        prefix = PREFIX.format(file_name=file.split(".")[0])
    else:
        continue
    loader = TextLoader(file_name)
    documents = loader.load()
    
    # split it into chunks
    text_splitter = CharacterTextSplitter(chunk_size=200, chunk_overlap=20)
    tmp_chunks = (text_splitter.split_documents(documents))
    for chunk in tmp_chunks:
        chunk.page_content = prefix + chunk.page_content
    
    chunks.extend(tmp_chunks)

Created a chunk of size 593, which is longer than the specified 200
Created a chunk of size 903, which is longer than the specified 200
Created a chunk of size 396, which is longer than the specified 200
Created a chunk of size 584, which is longer than the specified 200
Created a chunk of size 398, which is longer than the specified 200
Created a chunk of size 763, which is longer than the specified 200
Created a chunk of size 791, which is longer than the specified 200
Created a chunk of size 857, which is longer than the specified 200
Created a chunk of size 1169, which is longer than the specified 200
Created a chunk of size 380, which is longer than the specified 200
Created a chunk of size 290, which is longer than the specified 200
Created a chunk of size 470, which is longer than the specified 200
Created a chunk of size 1892, which is longer than the specified 200
Created a chunk of size 662, which is longer than the specified 200
Created a chunk of size 1295, which is longer 

In [None]:
len(chunks)

In [3]:
chunks[0], chunks[-1]

(Document(metadata={'source': 'data/七台河市2024.txt'}, page_content='以下是七台河市2024年政府工作报告的节选部分:七台河市政府工作报告（2024年1月12日 李兵）\n2024年04月01日 17:21   来源：七台河市政府网   \n[手机看新闻][字号 大 中 小][打印本稿]\n\u3000\u3000——2024年1月12日在七台河市第十二届人民代表大会第三次会议上\n\n\u3000\u3000李兵\n\n\u3000\u3000各位代表:\n\n\u3000\u3000现在,我代表市政府向大会报告工作,请予审议,并请政协委员提出意见。\n\n\u3000\u3000一、2023年工作回顾'),
 Document(metadata={'source': 'data/内蒙古自治区2024.txt'}, page_content='以下是内蒙古自治区2024年政府工作报告的节选部分:“七个作模范”：在感党恩听党话、紧跟习近平总书记奋进新征程上作模范，在铸牢中华民族共同体意识上作模范，在民族地区推进中国式现代化建设中作模范，在边疆民族地区走向共同富裕的道路上作模范，在兴边稳边固边上作模范，在边疆地区联通国内国际双循环上作模范，在弘扬新风正气上作模范。'))

In [None]:
print(chunks[0].page_content)

In [None]:
print(chunks[-1].page_content)

In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
!pip install sentence-transformers

In [7]:
model_dir

'/home/jie/.cache/modelscope/hub/models/BAAI/bge-m3'

In [2]:
# create the open-source embedding function
bge_m3_embedding = HuggingFaceEmbeddings(
    model_name=model_dir, model_kwargs={
        "trust_remote_code": True, 
        }
)

In [3]:
import torch

torch.cuda.is_available()

True

In [None]:
!pip install --upgrade transformers

In [None]:
# https://python.langchain.com/docs/how_to/vectorstores/
vector_store = Chroma(
    collection_name="demo_db",
    embedding_function=bge_m3_embedding,
    # Where to save data locally, remove if not necessary
    persist_directory="./chroma_langchain_db",  
)

# 默认使用GPU计算，除非torch不能使用cuda
vector_store.add_documents(chunks)

In [None]:
# 从本地加载


In [None]:
!pip install torch torchvision torchaudio --index-url https://download.pytorch.org/whl/cu126

In [None]:
vector_store.similarity_search("生产总值")

In [None]:
query = "上海市2024年工作报告中提到的国民生成总值是多少？"# 新增就业岗位60.6万个
relevant_docs = vector_store.similarity_search(query)
relevant_docs

In [None]:
query = "上海市2024年工作报告中提到新增就业岗位多少个？"# 新增就业岗位60.6万个
relevant_docs = vector_store.similarity_search(query)
relevant_docs

## LLM

In [None]:
from langchain_community.llms import Ollama
from langchain.schema import SystemMessage, HumanMessage

llm = Ollama(model="deepseek-r1:7b", temperature=0)

# prompt = [SystemMessage("You are a helpful assistant"), HumanMessage("Who are you?")]
# print(llm.invoke(prompt))

### 手动实现

In [None]:
context = "\n\n".join([doc.page_content for doc in relevant_docs])
PROMPT = f"""
你是一个智能助手，擅长从提供的背景知识中提取关键信息来回答问题。
请根据以下提供的背景知识回答问题，如果无法找到答案，请直接说“我不知道”。

背景知识：
{context}

问题：
{query}

答案：
"""

query = "上海市2024年工作报告中提到新增就业岗位多少个？"# 新增就业岗位60.6万个
relevant_docs = vector_store.similarity_search(query)
context = "\n\n".join([doc.page_content for doc in relevant_docs])
answer = llm.invoke(
    PROMPT.format(context=context, query=query).lstrip()
)

print(answer)

### langchain封装RAG问答模块

In [None]:
from langchain.chains import RetrievalQA


# 创建 RAG QA Chain
qa = RetrievalQA.from_chain_type(llm=llm, chain_type="stuff", retriever=vector_store.as_retriever())

# 直接提问
query = "上海市2024年工作报告中提到新增就业岗位多少个？"
answer = qa.invoke(query)

print(answer)


In [None]:
from pprint import pprint

In [None]:
print({'query': '上海市2024年工作报告中提到新增就业岗位多少个？', 'result': '<think>\n好的，我现在需要回答用户的问题：上海市2024年政府工作报告中提到新增就业岗位多少个？\n\n首先，我查看了提供的上下文。在2024年的报告中有两段提到了新增就业岗位的情况。\n\n第一段提到：“完善创业担保贷款、职业培训补贴等稳就业政策，城镇新增就业55万人以上。”这里明确指出新增就业岗位超过55万。\n\n接着，在同一报告的另一部分中说：“公共就业招聘新平台上线运行，新增就业岗位60.6万个。”这进一步确认了新增就业岗位的数量是60.6万个。\n\n现在，我需要将这两个数字整合起来。虽然第一段用了“以上”，但第二段给出了具体的数字，所以更准确的是以60.6万为准。\n</think>\n\n上海市2024年政府工作报告中提到新增就业岗位60.6万个。'}["result"]
)

In [None]:
1