### ChatGLM2-6B

前置依赖

In [None]:
# Install the runtime dependencies into the kernel's own environment (%pip targets the running kernel).
# The torch requirement is quoted on purpose: unquoted, the shell treats `>=2.0` as an output
# redirect into a file named "=2.0" and pip installs torch with no version constraint at all.
%pip install protobuf transformers==4.30.2 cpm_kernels "torch>=2.0" gradio mdtex2html sentencepiece accelerate

运行模型

In [None]:
from transformers import AutoTokenizer, AutoModel

# Local checkpoint directory shared by the tokenizer and the model.
CHATGLM_DIR = "/home/lpc/models/chatglm3-6b/"

tokenizer = AutoTokenizer.from_pretrained(CHATGLM_DIR, trust_remote_code=True)
# Load the checkpoint, cast it with .half(), move it with .cuda(), then call .eval().
model = (
    AutoModel.from_pretrained(CHATGLM_DIR, trust_remote_code=True)
    .half()
    .cuda()
)
model = model.eval()
# Conversation history handed to the chat calls below; starts empty.
history_ = []

In [None]:
# Iterate over the (response, history) pairs produced by stream_chat for a single question.
question = "你的任务是什么?"
reply_stream = model.stream_chat(tokenizer, question, history=history_)
for response, history in reply_stream:
    # Remember the most recent history so a later turn can continue from it.
    history_ = history
    print(response)

### Predictor封装方法

In [None]:
import sys
from main.predictor.chatglm import Predictor

# Build the project's Predictor wrapper from the local checkpoint directory.
checkpoint_dir = "/home/lpc/models/chatglm3-6b/"
predictor = Predictor(model_from_pretrained=checkpoint_dir)

In [None]:
# One-shot call through the wrapper with an empty history.
greeting = "你好?"
res = predictor(greeting, history=[])
print(res)

In [None]:
# Stream the answer onto a single console line.
answer_stream = predictor.stream_chat("你的任务是什么?", history=[])
for res in answer_stream:
    partial_text = res[0]
    # The leading carriage return moves the cursor back to the start of the line,
    # so each write overprints the previous one.
    sys.stdout.write('\r' + partial_text)
    sys.stdout.flush()

In [None]:
# Use the whole content of ./a.txt as the prompt for a single prediction.
with open('./a.txt', encoding='utf-8') as prompt_file:
    ask_content = prompt_file.read()

res = predictor(ask_content, history=[])
print(res)

### ChatGLM_LoRA

In [None]:
import os
# Set before the trainer/transformers imports below, matching the original cell order.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from main.trainer.chatglm_lora import Trainer
from transformers import AutoTokenizer, AutoConfig

# Tokenizer and config are both read from the same local checkpoint directory.
base_model_dir = "/home/lpc/models/chatglm3-6b/"
tokenizer = AutoTokenizer.from_pretrained(base_model_dir, trust_remote_code=True)
config = AutoConfig.from_pretrained(base_model_dir, trust_remote_code=True)

# LoRA trainer for the FDEX2 task.
trainer = Trainer(
    tokenizer=tokenizer,
    config=config,
    from_pretrained='/home/lpc/models/chatglm3-6b/',
    loader_name='ChatGLM_Chat',
    data_path='FDEX2',
    max_length=3600,
    batch_size=4,
    task_name='FDEX2',
)

In [None]:
# Calling the trainer returns an iterable; consuming it drives the 5-epoch run.
training_run = trainer(num_epochs=5)
for i in training_run:
    # Keep only the most recently yielded item.
    a = i

#### 推理预测
- 方式一: 调用原生方法Chat预测

In [None]:
from main.predictor.chatglm_lora import Predictor

# Predictor built from the base checkpoint plus the saved FDEX2 LoRA directory.
pred = Predictor(
    model_from_pretrained='/home/lpc/models/chatglm3-6b/',
    resume_path='./save_model/FDEX2/ChatGLM_5409',
)

In [2]:
# Start the conversation with an empty history.
history = list()

In [None]:
# Prompt = retrieved knowledge wrapped in <rag>...</rag>, an instruction line, then the question.
rag_prompt = (
    '<rag>检索增强知识: \n1.《政府采购代理机构管理暂行办法》(财库[2018]2号)\n'
    '第十三条 代理机构受采购人委托办理采购事宜，应当与采购人签订委托代理协议，明确采购代理范围、权限、期限、档案保存、代理费用收取方式及标准、协议解除及终止、违约责任等具体事项，约定双方权利义务。</rag>\n'
    '请根据以上检索增强知识回答以下问题\n'
    '采购人委托采购代理机构代理采购项目，发布招标公告后，有权更换采购代理机构吗?'
)
result = pred.chat(rag_prompt, max_length=3000, history=history)
# result[0] is printed as the reply; result[1] replaces the running history.
history = result[1]
print(result[0])

- 方式二: 调用重写方法 (支持批量)

In [None]:
from main.predictor.chatglm_lora import Predictor

# Same base checkpoint and FDEX2 LoRA directory, this time used through the overridden call.
predictor_kwargs = {
    'model_from_pretrained': '/home/lpc/models/chatglm3-6b/',
    'resume_path': './save_model/FDEX2/ChatGLM_5409',
}
pred = Predictor(**predictor_kwargs)

In [None]:
# Two prompts sent in a single call to the predictor.
batch_prompts = [
    'Instrcution: 请识别该商品的要素: 理光（Ricoh） M2700/M2701/2702多功能黑白激光复合机 a3复合机打印机一体机办公 M 2702(网络+双面+输稿器+7寸触屏) 官方标配\n Answer:',
    '你好啊',
]
result = pred(batch_prompts, max_length=512, build_message=True)
print(result)

In [None]:
# Feed the full text of ./a.txt to the LoRA predictor as one prompt.
with open('./a.txt', encoding='utf-8') as prompt_file:
    ask_content = prompt_file.read()

result = pred(ask_content, max_length=512)
print(result)

### ChatGLM_LoRA RAG 推理

In [None]:
# Create or load the chromadb client and the collection used for retrieval.
import chromadb
from chromadb.utils import embedding_functions

DB_SAVE_DIR = './chroma_data'   # directory passed to the persistent client
DB_NAME = 'FDQA'                # collection name
N_RESULTS = 1                   # hits requested per query

client = chromadb.PersistentClient(DB_SAVE_DIR)
sentence_transformer_ef = embedding_functions.SentenceTransformerEmbeddingFunction(
    model_name="DMetaSoul/sbert-chinese-general-v2",
)
# get_or_create: open the collection if present, otherwise create it with the "cosine" hnsw space.
collection = client.get_or_create_collection(
    DB_NAME,
    embedding_function=sentence_transformer_ef,
    metadata={"hnsw:space": "cosine"},
)

In [None]:
from main.predictor.chatglm_lora import Predictor

# Predictor for the RAG demo: base checkpoint plus the FDRAG LoRA directory.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/FDRAG/ChatGLM_44136',
)

In [None]:
# Fresh, empty history for the RAG conversation.
history = list()

In [None]:
user_question = '采购人委托采购代理机构代理采购项目，发布招标公告后，有权更换采购代理机构吗?'
res = collection.query(query_texts=[user_question], n_results=N_RESULTS)

# Assume there is no usable clue; only a hit whose distance is below 0.1 overrides this.
clue = False
first_query_metadatas = res['metadatas'][0]
if len(first_query_metadatas) > 0:
    distance = res['distances'][0][0]
    if distance < 0.1:
        clue = first_query_metadatas[0]['clue']

# With a clue, wrap it in <rag> tags ahead of the question; without one, ask the bare question.
rag_user_question = (
    f'<rag>检索增强知识: \n{clue}</rag>\n请根据以上检索增强知识回答以下问题\n{user_question}'
    if clue
    else user_question
)

result = pred.chat(rag_user_question, history=history)
history = result[1]
print(result[0])

### ChatGLM_LoRA_RLHF

In [None]:
import os
# Set before the trainer/transformers imports below, matching the original cell order.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from main.trainer.chatglm_rlhf import Trainer
from transformers import AutoTokenizer, AutoConfig

# Tokenizer and config are both read from the same local checkpoint directory.
policy_model_dir = "/home/lpc/models/chatglm3-6b/"
tokenizer = AutoTokenizer.from_pretrained(policy_model_dir, trust_remote_code=True)
config = AutoConfig.from_pretrained(policy_model_dir, trust_remote_code=True)

# RLHF trainer: the base model plus a separate checkpoint passed as reward_from_pretrained.
trainer = Trainer(
    tokenizer=tokenizer,
    config=config,
    from_pretrained='/home/lpc/models/chatglm3-6b/',
    reward_from_pretrained='/home/lpc/models/text2vec-base-chinese/',
    loader_name='ChatGLM_RLHF',
    data_path='ID',
    max_length=1200,
    batch_size=2,
    task_name='ID',
)

In [None]:
# Consume the iterable returned by the RLHF trainer to run 5 epochs.
rlhf_run = trainer(num_epochs=5)
for i in rlhf_run:
    # Keep only the most recently yielded item.
    a = i

### Qianwen_LoRA

运行前请参阅[Qianwen-14B-Chat-Int4](https://huggingface.co/Qwen/Qwen-14B-Chat-Int4)安装相关依赖.

In [None]:
from main.trainer.qianwen_lora import Trainer
from transformers import AutoTokenizer, AutoConfig

# Tokenizer and config come from the local Qwen-14B-Chat-Int4 directory.
qwen_dir = "model/Qwen-14B-Chat-Int4"
tokenizer = AutoTokenizer.from_pretrained(qwen_dir, trust_remote_code=True)
config = AutoConfig.from_pretrained(qwen_dir, trust_remote_code=True)
# Flag set on the config before it is handed to the trainer.
config.disable_exllama = True

trainer = Trainer(
    tokenizer=tokenizer,
    config=config,
    from_pretrained='./model/Qwen-14B-Chat-Int4',
    loader_name='Qianwen_Chat',
    data_path='FD',
    max_length=512,
    batch_size=1,
    task_name='FD_Qianwen',
)

In [None]:
# Consume the iterable returned by the Qianwen trainer to run 5 epochs.
qianwen_run = trainer(num_epochs=5)
for i in qianwen_run:
    # Keep only the most recently yielded item.
    a = i

使用Accelerator分布式训练加速

In [None]:
# Shell escape: start run_qianwen_lora.py through the `accelerate launch` command-line tool.
! accelerate launch run_qianwen_lora.py

#### Chat预测

In [None]:
from main.predictor.qianwen_lora import Predictor

# Qianwen predictor: quantised base model directory plus the saved LoRA directory.
pred = Predictor(
    model_from_pretrained='./model/Qwen-14B-Chat-Int4',
    resume_path='./save_model/FDQALaw_Qianwen/Qwen_39000',
)

In [None]:
# Ask one question and print the first element of the returned value.
legal_question = 'hello,我想问下中华人民共和国民法典中第三条是什么?'
result = pred.chat(legal_question)
print(result[0])

##### 预测文本

In [None]:
from main.predictor.chatglm_lora import Predictor

# Predictor using the BossCondition LoRA directory.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/BossCondition/ChatGLM_25108',
)

In [None]:
# Single product-title prompt in the "Instrcution: ... Answer:" layout used throughout this notebook.
product_prompt = 'Instrcution: 请识别该商品的要素: 理光（Ricoh） M2700/M2701/2702多功能黑白激光复合机 a3复合机打印机一体机办公 M 2702(网络+双面+输稿器+7寸触屏) 官方标配\n Answer:'
result = pred(product_prompt, max_length=512)
print(result)

In [None]:
import json
import os
from tqdm import tqdm

# Marker searched for in the model output when building the progress-bar preview.
ANSWER_MARKER = 'Answer:'
OUTPUT_PATH = './data_record/BertPred_ChatGLMLoRA/复印机.json'

# Make sure the output directory exists before the (long) inference loop starts,
# so a missing directory cannot throw away the results at the very end.
os.makedirs(os.path.dirname(OUTPUT_PATH), exist_ok=True)

with open('./data/Boss/BertPred/复印机_retrieved.tsv', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# A trailing newline leaves one empty record at the end; drop it.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

result = []
# Named `progress` rather than `iter` so the builtin stays usable in later cells.
progress = tqdm(ori_list)
for item in progress:
    item = item.split('\t')
    # The third tab-separated column is the text inserted into the prompt.
    res = pred(f'Instrcution: 请识别该商品的要素: {item[2]}\n Answer:', max_length=512)
    res_item = {
        'pred': res
    }
    answer_index = res.find(ANSWER_MARKER)
    # str.find returns -1 when the marker is absent; slicing from -1 + 7 would silently
    # chop the first characters off the output, so show the whole output in that case.
    if answer_index == -1:
        preview = res
    else:
        preview = res[answer_index + len(ANSWER_MARKER):]
    progress.set_postfix(pred=json.dumps(preview, ensure_ascii=False))
    result.append(res_item)

# One JSON object per line, written as UTF-8 without ASCII escaping.
with open(OUTPUT_PATH, 'w', encoding='utf-8') as f:
    for item in result:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

#### 推理文本
建议采用`Predictor`中的默认方法, 以便支持批量生成.

In [None]:
import os
# Set before the project imports below, matching the original cell order.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from main.evaluation.inferences import inference_with_data_path
from main.predictor.chatglm_lora import Predictor

pred = Predictor(
    model_from_pretrained='/home/lpc/models/chatglm3-6b/',
    resume_path='./save_model/FDEX2/ChatGLM_5409',
)


def batcher(item):
    """Run the predictor on one batch item.

    `item` is unpacked as keyword arguments for `pred`; the fixed generation
    settings (max_length=1024, temperature=0, build_message=True) are added on top.
    Returns whatever `pred` returns.
    """
    generation_settings = dict(max_length=1024, temperature=0, build_message=True)
    return pred(**item, **generation_settings)


inference_with_data_path(
    data_path='test',
    batcher=batcher,
    save_path='./outputs.txt',
    batch_size=4,
)

# To feed your own data instead, the inference_with_data iterator can be used;
# note that every record must have the form {"query": "", "history": []}

#### VLLM加速推理

目前ChatGLM在`transformers==4.45.x`上还存在bug.

强烈建议在新的conda环境下安装, 目前可行的版本`transformers==4.43.2 vllm==0.5.4`

同时, LoRA保存的目录下需包含`config.json`, `modeling_chatglm.py`等文件, 可以从`Chatglm`模型目录里找.

In [None]:
import os
# Set before vllm is imported, matching the original cell order.
os.environ["CUDA_VISIBLE_DEVICES"] = "1"
from vllm import LLM, SamplingParams
from vllm.lora.request import LoRARequest

# Engine on the base checkpoint with LoRA support switched on.
llm = LLM(
    model="/home/lpc/models/chatglm3-6b/",
    enable_lora=True,
    trust_remote_code=True,
)

sampling_params = SamplingParams(temperature=0, max_tokens=256)

prompts = [
     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_74 (icao VARCHAR, airport VARCHAR)\n\n question: Name the ICAO for lilongwe international airport [/user] [assistant]",
     "[user] Write a SQL query to answer the question based on the table schema.\n\n context: CREATE TABLE table_name_11 (nationality VARCHAR, elector VARCHAR)\n\n question: When Anchero Pantaleone was the elector what is under nationality? [/user] [assistant]",
]

# The saved LoRA directory is attached to this request under the name "lora" with id 1.
fdex2_lora = LoRARequest("lora", 1, './save_model/FDEX2/ChatGLM_5409')
outputs = llm.generate(prompts, sampling_params, lora_request=fdex2_lora)

In [None]:
# Display the text of the first candidate returned for the second prompt.
second_prompt_result = outputs[1]
second_prompt_result.outputs[0].text

#### 计算生成文本与参考文本的评估指标

- 单例计算

In [None]:
from main.evaluation.metrics import evaluate_all_metrics
from transformers import AutoTokenizer

# Toy example: two reference sentences and one generated sentence.
reference_text = ["I love this cat.", "I really love this cat."]
generated_text = "hahaha I love this cat."

tokenizer = AutoTokenizer.from_pretrained("/home/lpc/models/chatglm3-6b/", trust_remote_code=True)
# Set intensive=True when the texts are Chinese.
scores = evaluate_all_metrics(
    tokenizer,
    reference_text,
    generated_text,
    intensive=False,
)
print(scores)

- 批量计算

In [None]:
from main.evaluation.metrics import evaluate_generation

import json
from transformers import AutoTokenizer

# Both files are JSON-lines. They are read as UTF-8 explicitly (the locale default is not
# UTF-8 everywhere), and blank lines are skipped so a stray empty line cannot crash json.loads.
with open('./data_record/FDEX2/outputs.txt', encoding='utf-8') as f:
    outputs = [json.loads(line) for line in f if line.strip()]

with open('./data/FD/data/ex2/dev.jsonl', encoding='utf-8') as f:
    data = [json.loads(line) for line in f if line.strip()]
# A record is either a dict carrying a 'conversations' list or the conversation list itself.
data = [item['conversations'] if 'conversations' in item else item for item in data]
# Keep the content of the last message of each conversation, wrapped in a one-element list.
# NOTE(review): presumably this is the reference answer expected by evaluate_generation - confirm.
data = [[item[-1]['content']] for item in data]

tokenizer = AutoTokenizer.from_pretrained("/home/lpc/models/chatglm3-6b/", trust_remote_code=True)
# Only as many entries as there are generated outputs are scored.
# Set intensive=True when the texts are Chinese.
scores = evaluate_generation(tokenizer, data[:len(outputs)], outputs, intensive=True)
print(scores)

##### 预测商品蕴涵关系

In [None]:
from main.predictor.chatglm_lora import Predictor

# Predictor using the BossRTE LoRA directory.
pred = Predictor(
    model_from_pretrained='./model/chatglm3-6b',
    resume_path='./save_model/BossRTE/ChatGLM_22264',
)

In [None]:
# One Source/Target product pair in the Instruction / Context / Answer prompt layout.
pair_prompt = 'Instruction: 请判断以下两个商品是否为同款商品\nContext: Source: 联想(Lenovo）启天 M415-B114 台式计算机 I3-7100/8G/1T/无光驱/15L机箱/21.5寸显示器 5288\nTarget: 戴尔（DELL） I3-6100 戴尔（DELL）成就3667-R1308商用台式电脑整机（i3-6100 4G 1T WIFI 蓝牙 三年上门 硬盘保留 Win10）19.5英寸 3455\nAnswer: '
result = pred(pair_prompt, max_length=512)
print(result)

STS数据集

In [None]:
import json
from tqdm import tqdm

# Text that precedes the 0/1 label in both the model output and the gold 'target' field.
RESULT_MARKER = '预测结果: '

with open('./data/Boss/RTE/dev.json', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# A trailing newline leaves one empty record at the end; drop it.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

# Named `progress` rather than `iter` so the builtin stays usable in later cells.
progress = tqdm(ori_list)
tp = 0
fp = 0
tn = 0
fn = 0
# Defined up front so the final print never reads stale or undefined metrics
# when the file holds no records.
p = r = f1 = 0
for item in progress:
    item = json.loads(item)
    res = pred(item['context'], max_length=512)
    res = res.split(RESULT_MARKER)
    if len(res) < 2:
        # No label found in the output: counted as a positive prediction.
        res = 1
    else:
        res = int(res[1])
    gold = int(item['target'].split(RESULT_MARKER)[1])
    if res == 1:
        if res == gold:
            tp += 1
        else:
            fp += 1
    else:
        if res == gold:
            tn += 1
        else:
            fn += 1
    # Guard every denominator: until the first positive prediction / positive gold label
    # is seen these are zero, and the unguarded divisions raised ZeroDivisionError.
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if p + r == 0 else 2 * p * r / (p + r)
    progress.set_postfix(F1=f1, p=p, r=r)

print(f1, p, r)

全样本环境预测

In [None]:
import json
from tqdm import tqdm

with open('./data/Boss/BertPred/复印机_retrieved.tsv', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# A trailing newline leaves one empty record at the end; drop it.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

# `progress` / `group_id` replace the original `iter` / `id`, which shadowed the builtins
# for every later cell in the kernel.
progress = tqdm(ori_list)
tp = 0
fp = 0
tn = 0
fn = 0
# Defined up front so the final print never reads stale or undefined metrics
# when every row is skipped (or the file is empty).
p = r = f1 = 0
current_id = 0
current_index = 0
for idx, item in enumerate(progress):
    item = item.split('\t')
    group_id = item[0]
    # A change in the first column starts a new group; remember where it begins.
    if group_id != current_id:
        current_id = group_id
        current_index = idx
    # The first row of the current group is the source every other row is compared with.
    ori_item = ori_list[current_index]
    ori_item = ori_item.split('\t')
    # Skip rows whose third column equals the source's (this includes the source row itself).
    if ori_item[2] == item[2]:
        continue
    # Rows with fewer than four columns get a placeholder 1 so index 3 exists for the prompt.
    if len(item) < 4:
        item.append(1)
    if len(ori_item) < 4:
        ori_item.append(1)
    res = pred(f"Instruction: 请判断以下两个商品是否为同款商品\nContext: Source: {ori_item[2]} {ori_item[3]}\nTarget: {item[2]} {item[3]}\nAnswer: ", max_length=512)
    res = res.split('预测结果: ')
    if len(res) < 2:
        # No label found in the output: counted as a positive prediction.
        res = 1
    else:
        res = int(res[1])
    # The gold label is read from the sixth column; rows without it default to 1.
    gold = int(item[5]) if len(item) > 5 else 1
    if res == 1:
        if res == gold:
            tp += 1
        else:
            fp += 1
    else:
        if res == gold:
            tn += 1
        else:
            fn += 1
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if p + r == 0 else 2 * p * r / (p + r)
    progress.set_postfix(F1=f1, p=p, r=r)

print(f1, p, r)

In [None]:
# Recompute P/R/F1 after adding fixed offsets (988 / 2 and 52 / 2) to the TP and FP counts
# left behind by the evaluation loop.
# NOTE(review): the origin of 988 and 52 is not shown in this notebook - confirm.
tp_, fp_ = tp + 988 / 2, fp + 52 / 2
p, r = tp_ / (tp_ + fp_), tp_ / (tp_ + fn)
f1 = 2 * p * r / (p + r)
print(f'F1: {f1}, P: {p}, R: {r}')

### 使用ChatGLM3-6B对商品进行要素抽取

In [None]:
import sys
from main.predictor.chatglm import Predictor

# Base-model predictor (no LoRA directory) used for element extraction.
predictor = Predictor(
    model_name="ChatGLM2-6B",
    model_from_pretrained="model/chatglm3-6b",
)

In [None]:
import json
from tqdm import tqdm

# Read as UTF-8 explicitly: the result file below is written as UTF-8, and the
# locale default used by a bare open() is not UTF-8 on every platform.
with open('./data/Boss/train.json', encoding='utf-8') as f:
    ori_data = f.read().split('\n')

# A trailing newline leaves one empty record at the end; drop it.
if ori_data[-1] == '':
    ori_data.pop()

result = []
for item in tqdm(ori_data):
    data = json.loads(item)
    item_id = data['item_id']
    context = data['context']
    # Remove the '\n Answer: ' text from the stored context before asking the base model.
    question = context.replace('\n Answer: ', '')
    res = predictor.chat(question, max_length=1024)
    result.append({'item_id': item_id, 'question': question, 'answer': res})

# One JSON object per line, written as UTF-8 without ASCII escaping.
with open('./data/Boss/train_result.json', encoding='utf-8', mode='w') as f:
    for item in result:
        f.write(json.dumps(item, ensure_ascii=False) + '\n')

### 使用ChatGLM3-6B预测商品蕴涵关系

In [None]:
import sys
from main.predictor.chatglm import Predictor

# Base-model predictor (no LoRA directory) used for the entailment check.
predictor = Predictor(
    model_name="ChatGLM2-6B",
    model_from_pretrained="model/chatglm3-6b",
)

In [None]:
import json
from tqdm import tqdm

with open('./data/Boss/BertPred/台式计算机.tsv', encoding='utf-8') as f:
    ori_list = f.read().split('\n')

# A trailing newline leaves one empty record at the end; drop it.
if ori_list[-1] == '':
    ori_list = ori_list[:-1]

# `progress` / `group_id` replace the original `iter` / `id`, which shadowed the builtins
# for every later cell in the kernel.
progress = tqdm(ori_list)
tp = 0
fp = 0
tn = 0
fn = 0
# Defined up front so the final print never reads stale or undefined metrics
# when every row is skipped (or the file is empty).
p = r = f1 = 0
current_id = 0
current_index = 0
for idx, item in enumerate(progress):
    item = item.split('\t')
    group_id = item[0]
    # A change in the first column starts a new group; remember where it begins.
    if group_id != current_id:
        current_id = group_id
        current_index = idx
    # The first row of the current group is the source every other row is compared with.
    ori_item = ori_list[current_index]
    ori_item = ori_item.split('\t')
    # Skip rows whose third column equals the source's (this includes the source row itself).
    if ori_item[2] == item[2]:
        continue
    res = predictor.chat(f"请判断以下两个商品是否为同款商品，直接回答“同款”或“非同款”即可。\n文本1： {ori_item[2]} {ori_item[3]}\n文本2： {item[2]} {item[3]}\n回答：", max_length=1024)
    # Any answer containing '非同款' is a negative prediction; everything else counts as positive.
    if '非同款' in res:
        res = 0
    else:
        res = 1
    # The gold label is read from the sixth column; rows without it default to 1.
    gold = int(item[5]) if len(item) > 5 else 1
    if res == 1:
        if res == gold:
            tp += 1
        else:
            fp += 1
    else:
        if res == gold:
            tn += 1
        else:
            fn += 1
    p = 0 if tp + fp == 0 else tp / (tp + fp)
    r = 0 if tp + fn == 0 else tp / (tp + fn)
    f1 = 0 if p + r == 0 else 2 * p * r / (p + r)
    progress.set_postfix(F1=f1, p=p, r=r)

print(f1, p, r)

In [None]:
# Recompute P/R/F1 after adding fixed offsets (988 / 2 and 52 / 2) to the TP and FP counts
# left behind by the evaluation loop.
# NOTE(review): the origin of 988 and 52 is not shown in this notebook - confirm.
tp_, fp_ = tp + 988 / 2, fp + 52 / 2
p, r = tp_ / (tp_ + fp_), tp_ / (tp_ + fn)
f1 = 2 * p * r / (p + r)
print(f'F1: {f1}, P: {p}, R: {r}')