完了 https://qwen.readthedocs.io/zh-cn/latest/framework/Langchain.html

In [1]:
from langchain_community.document_loaders.csv_loader import CSVLoader
from langchain_community.document_loaders.text import TextLoader
from langchain_community.document_loaders import PyPDFLoader
from langchain_text_splitters import RecursiveCharacterTextSplitter
# from langchain.embeddings import HuggingFaceBgeEmbeddings
from langchain_community.embeddings import HuggingFaceBgeEmbeddings
from langchain_chroma import Chroma
# from langchain.llms import Qwen
from langchain.prompts import ChatPromptTemplate


In [2]:
## 1 文档读取
# 读取csv，返回list
def load_csv(path):
    # 每条记录为一个元素
    # https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.csv_loader.CSVLoader.html
    loader = CSVLoader(
        file_path=path,
        encoding='utf-8',# 编码
        csv_args={
                'delimiter': ',',
                'quotechar': '"',
                'fieldnames': ['Index', 'Height', 'Weight'] # CSV 文件应该包含这三个字段
                }
    )
    data = loader.load()
    return data 
# 读取pdf，返回list
def load_pdf(path):
    # 是以每页为一个元素的
    loader = PyPDFLoader(path)
    pages = loader.load_and_split() #why
    return pages
def load_txt(path):
    # https://api.python.langchain.com/en/latest/document_loaders/langchain_community.document_loaders.text.TextLoader.html
    loader=TextLoader(file_path=path,encoding='utf-8',)
    data=loader.load()
    return data
    
## 2 文档切块
text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=200, # 指定每个文本块的目标大小，这里设置为200个字符。
    chunk_overlap=50, # 指定文本块之间的重叠字符数，这里设置为50个字符。
    length_function=len, # 用于测量文本长度的函数，这里使用Python内置的`len`函数。
    is_separator_regex=False, # 指定`separators`中的分隔符是否应被视为正则表达式，这里设置为False，表示分隔符是字面字符。
    separators=["\n\n",  "\n",   " ",    ".",    ",",     "，",  "。", ] # 定义用于分割文本的分隔符列表。
)
pages = load_txt("/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/chatbot/dataset/hlm.txt")
# texts = text_splitter.split_documents([pages[0].page_content])
texts = text_splitter.split_documents(pages)
# type(texts)=list
# texts[int]={page_content="...",metadata={'source': '/kaggle/input/the-dream-of-red-mansion/hlm.txt'}}


In [3]:
## 3 向量化
# reference: https://python.langchain.com/api_reference/community/embeddings/langchain_community.embeddings.huggingface.HuggingFaceBgeEmbeddings.html
# model_name = "BAAI/bge-large-en-v1.5"
import torch
print(f"CUDA available: {torch.cuda.is_available()}")

model_name = "BAAI/bge-large-zh"
model_kwargs = {'device': 'cuda'}
encode_kwargs = {'normalize_embeddings': True} # 当向量都被规范化（归一化）后，它们的范数都是1。

# model_name = "BAAI/bge-large-en-v1.5"
# model_kwargs = {'device': 'cuda'}
# encode_kwargs = {'normalize_embeddings': True}
hf = HuggingFaceBgeEmbeddings(
    model_name=model_name,
    model_kwargs=model_kwargs,
    encode_kwargs=encode_kwargs
)
# embedding = hf.embed_query("贾宝玉是谁")
# print(embedding)



CUDA available: True


  hf = HuggingFaceBgeEmbeddings(
  from .autonotebook import tqdm as notebook_tqdm


In [4]:
## 4 创建向量数据库
# 快速创建数据库
# reference: https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.chroma.Chroma.html#langchain_community.vectorstores.chroma.Chroma.from_documents
vectorstore = Chroma.from_documents(
   documents = texts, 
   embedding = hf, #第三步得到的用于计算embedding的模型
   ids = None,
   collection_metadata = {"hnsw:space": "cosine"}, # 算余弦相似度
   )
# todo: 保存vectorstore
# 相似度方法通过查询文本检索数据 示例
query = "贾宝玉是谁"
docs = vectorstore.similarity_search(query)
print(docs[0].page_content)

.这政老爹的夫人王氏，头胎生的公子，名唤贾珠，十四岁进学，不到二十岁就娶了妻生了子，一病死了.第二胎生了一位小姐，生在大年初一，这就奇了，不想后来又生一位公子，说来更奇，一落胎胞，嘴里便衔下一块五彩晶莹的玉来，上面还有许多字迹，就取名叫作宝玉.你道是新奇异事不是？”


In [12]:

import torch
from transformers import AutoTokenizer, AutoModelForCausalLM
import re
import os
# 设置可见的 GPU 设备为 cuda:0
os.environ["CUDA_VISIBLE_DEVICES"] = "0"

def load_single_model(model_path,torch_dtype,trust_remote_code,device_map,use_cache):
    return AutoModelForCausalLM.from_pretrained(
            pretrained_model_name_or_path=model_path, 
            torch_dtype=torch_dtype,
            trust_remote_code=trust_remote_code,  # Qwen模型需要这个参数
            device_map=device_map,  # 可选，用于自动处理模型加载到设备
            use_cache=use_cache
        )

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model_path = "/ssd/xiaxinyuan/code/CS3602_NLP_Final_Project/output/peft_3b/checkpoint-30000"
max_position_embeddings = 4096 # 模型支持的最大长度
tokenizer = AutoTokenizer.from_pretrained(model_path,device_map="auto" )
model = load_single_model(model_path,"bfloat16",True,"auto",False)
model = model.to(device)

Loading checkpoint shards: 100%|██████████| 2/2 [00:02<00:00,  1.25s/it]


In [None]:
# 定义生成回复的函数
def generate_response(model, tokenizer, conversation_history, user_input):
    """
    根据用户输入和对话历史生成模型回复。
    """
    # 更新对话历史②
    conversation_history.append({"role": "user", "content": user_input})
    print(user_input)
    
    text=tokenizer.apply_chat_template(conversation_history,tokenize=False,add_generation_prompt=True)
    inputs=tokenizer([text],return_tensors="pt").to(model.device)
    if len(inputs["input_ids"][0])>max_position_embeddings:
        # 不移除system 移除一轮对话
        conversation_history.pop(1) 
        conversation_history.pop(2)
        text=tokenizer.apply_chat_template(conversation_history,tokenize=False,add_generation_prompt=True)
        inputs=tokenizer([text],return_tensors="pt").to(model.device)

    # print(text)查看输入的所有prompt
    
    outputs = model.generate(**inputs,pad_token_id=tokenizer.eos_token_id, #在生成时用eos填充序列
                            max_new_tokens=100, #新生成文本长度
                            # num_beams=5,
                            # temperature=0.7,
                            # top_k=50,
                            # top_p=0.95,
                            # repetition_penalty=1.2
                            )
    # print(outputs)
    response = tokenizer.decode(outputs[:, inputs['input_ids'].shape[-1]:][0], skip_special_tokens=True) #在解码过程中跳过特殊符号如eos pad
    print("Bot:",response)
    # 添加对话历史②
    conversation_history.pop()
    last_round_content=conversation_history[-1]["content"]
    match = re.search(r'\[Round (\d+)\]', last_round_content)
    if match:
        last_round = int(match.group(1))
    else:
        last_round = 0
    conversation_history.append({"role": "user", "content": f"[Round {last_round+1}]:{user_input}"})
    conversation_history.append({"role": "assistant", "content": f"[Round {last_round+1}]:{response.strip()}"})
    return response.strip()

In [19]:
## 5
# 创建一个检索器 https://api.python.langchain.com/en/latest/vectorstores/langchain_community.vectorstores.chroma.Chroma.html#langchain_community.vectorstores.chroma.Chroma.as_retriever
from langchain_core.output_parsers.string import StrOutputParser
from typing import Sequence
from langchain.schema import Document
from langchain.schema.document import Document

retriever = vectorstore.as_retriever(search_kwargs={"k": 5}) # k 表示要检索的结果数量
# query = "萧炎是谁?"
# https://api.python.langchain.com/en/latest/vectorstores/langchain_core.vectorstores.base.VectorStoreRetriever.html#langchain_core.vectorstores.base.VectorStoreRetriever
query = "你好"
docs = retriever.invoke(query)

# 实例化自定义模型
llm = Qwen()
PROMPT_TEMPLATE = "请根据提示回答问题"
template = """基于下列红楼梦的背景，回答问题。
红楼梦的背景：{context}
问题：{question}
"""

def format_docs(docs: Sequence[Document]) -> str:
    formatted_docs = []
    for i, doc in enumerate(docs):
        doc_string = f"<doc id='{i}'>{doc.page_content}</doc>"
        formatted_docs.append(doc_string)
    return "\n".join(formatted_docs)
prompt = ChatPromptTemplate.from_template(PROMPT_TEMPLATE)
# https://python.langchain.com/api_reference/core/output_parsers/langchain_core.output_parsers.string.StrOutputParser.html

output_parser = StrOutputParser()
# 构建 chain
chain = prompt | llm | output_parser
res = chain.invoke(
                    {
                    "context": format_docs(docs),
                    "question": query
                    }
                    )

print(res)
# 为什么《西游记》里孙悟空叫唐僧师父

Setting `pad_token_id` to `eos_token_id`:None for open-end generation.


Assistant:很抱歉，我无法回答这个问题。因为提示中没有提供足够的信息，我无法理解您的问题。如果您能提供更多的信息，我将尽力为您提供帮助。


In [17]:
print(prompt)

input_variables=[] input_types={} partial_variables={} messages=[HumanMessagePromptTemplate(prompt=PromptTemplate(input_variables=[], input_types={}, partial_variables={}, template='请根据提示回答问题'), additional_kwargs={})]
