<a href="https://colab.research.google.com/github/JoeWang3/Create_Coding_Midterm_Joe_Wang/blob/main/bge_evaluate.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
# Step 1: Install necessary libraries
!pip install transformers datasets sentence-transformers

Collecting datasets
  Using cached datasets-2.20.0-py3-none-any.whl.metadata (19 kB)
Collecting sentence-transformers
  Using cached sentence_transformers-3.0.1-py3-none-any.whl.metadata (10 kB)
Collecting pyarrow>=15.0.0 (from datasets)
  Using cached pyarrow-17.0.0-cp310-cp310-manylinux_2_28_x86_64.whl.metadata (3.3 kB)
Collecting dill<0.3.9,>=0.3.0 (from datasets)
  Using cached dill-0.3.8-py3-none-any.whl.metadata (10 kB)
Collecting xxhash (from datasets)
  Using cached xxhash-3.4.1-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (12 kB)
Collecting multiprocess (from datasets)
  Using cached multiprocess-0.70.16-py310-none-any.whl.metadata (7.2 kB)
Collecting fsspec<=2024.5.0,>=2023.1.0 (from fsspec[http]<=2024.5.0,>=2023.1.0->datasets)
  Using cached fsspec-2024.5.0-py3-none-any.whl.metadata (11 kB)
Collecting nvidia-cuda-nvrtc-cu12==12.1.105 (from torch>=1.11.0->sentence-transformers)
  Using cached nvidia_cuda_nvrtc_cu12-12.1.105-py3-none-manylinux1_x86_64.wh

In [None]:
# Step 2: Import libraries
import os
import json
import torch
from datasets import load_dataset
from sentence_transformers import SentenceTransformer
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim

In [None]:
# Step 3: Load a dataset (AFQMC)
# We will format AFQMC to simulate an information retrieval task
dataset = load_dataset("clue", "afqmc")

Downloading readme:   0%|          | 0.00/21.5k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/211k [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/1.89M [00:00<?, ?B/s]

Downloading data:   0%|          | 0.00/240k [00:00<?, ?B/s]

Generating test split:   0%|          | 0/3861 [00:00<?, ? examples/s]

Generating train split:   0%|          | 0/34334 [00:00<?, ? examples/s]

Generating validation split:   0%|          | 0/4316 [00:00<?, ? examples/s]

In [None]:
# For simplicity, we will take a subset of the data
queries = {str(i): q for i, q in enumerate(dataset['train']['sentence1'][:100])}
corpus = {str(i): d for i, d in enumerate(dataset['train']['sentence2'][:100])}
# Assuming sentence1 matches sentence2 in the same index
relevant_docs = {str(i): [str(i)] for i in range(100)}

In [None]:
# Step 4: Save the dataset in JSON format (simulating the local data loading)
data = {
    "queries": queries,
    "corpus": corpus,
    "relevant_docs": relevant_docs
}

with open("doc_qa.json", "w", encoding="utf-8") as f:
    json.dump(data, f, ensure_ascii=False, indent=4)

In [None]:
# Step 5: Load the BGE model
model_name = "BAAI/bge-base-zh-v1.5"  # Assuming this is the correct Hugging Face model ID
model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded")

modules.json:   0%|          | 0.00/349 [00:00<?, ?B/s]

config_sentence_transformers.json:   0%|          | 0.00/124 [00:00<?, ?B/s]

README.md:   0%|          | 0.00/27.8k [00:00<?, ?B/s]

sentence_bert_config.json:   0%|          | 0.00/52.0 [00:00<?, ?B/s]

config.json:   0%|          | 0.00/998 [00:00<?, ?B/s]

pytorch_model.bin:   0%|          | 0.00/409M [00:00<?, ?B/s]

tokenizer_config.json:   0%|          | 0.00/366 [00:00<?, ?B/s]

vocab.txt:   0%|          | 0.00/110k [00:00<?, ?B/s]

tokenizer.json:   0%|          | 0.00/439k [00:00<?, ?B/s]

special_tokens_map.json:   0%|          | 0.00/125 [00:00<?, ?B/s]

1_Pooling/config.json:   0%|          | 0.00/190 [00:00<?, ?B/s]

Model loaded


In [None]:
# Step 6: Evaluate the model
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name=model_name.split('/')[-1],  # Use the model name as part of the evaluation report
    score_functions={"cosine": cos_sim}
)

In [None]:
# Evaluate the model
result = evaluator(model)
print(result)

{'bge-base-zh-v1.5_cosine_accuracy@1': 0.7, 'bge-base-zh-v1.5_cosine_accuracy@3': 0.86, 'bge-base-zh-v1.5_cosine_accuracy@5': 0.89, 'bge-base-zh-v1.5_cosine_accuracy@10': 0.94, 'bge-base-zh-v1.5_cosine_precision@1': 0.7, 'bge-base-zh-v1.5_cosine_precision@3': 0.2866666666666666, 'bge-base-zh-v1.5_cosine_precision@5': 0.17799999999999996, 'bge-base-zh-v1.5_cosine_precision@10': 0.09399999999999999, 'bge-base-zh-v1.5_cosine_recall@1': 0.7, 'bge-base-zh-v1.5_cosine_recall@3': 0.86, 'bge-base-zh-v1.5_cosine_recall@5': 0.89, 'bge-base-zh-v1.5_cosine_recall@10': 0.94, 'bge-base-zh-v1.5_cosine_ndcg@10': 0.8253342776967313, 'bge-base-zh-v1.5_cosine_mrr@10': 0.7883452380952383, 'bge-base-zh-v1.5_cosine_map@100': 0.79087055736273}


In [None]:
import json

# 加载数据集
with open("doc_qa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

queries = data['queries']
corpus = data['corpus']
relevant_docs = data['relevant_docs']

# 打印一些样本
for i in range(5):
    query_id = list(queries.keys())[i]
    print(f"Query: {queries[query_id]}")
    relevant_doc_ids = relevant_docs[query_id]
    for doc_id in relevant_doc_ids:
        print(f"Relevant Document: {corpus[doc_id]}")
    print("-" * 50)

Query: 蚂蚁借呗等额还款可以换成先息后本吗
Relevant Document: 借呗有先息到期还本吗
--------------------------------------------------
Query: 蚂蚁花呗说我违约一次
Relevant Document: 蚂蚁花呗违约行为是什么
--------------------------------------------------
Query: 帮我看一下本月花呗账单有没有结清
Relevant Document: 下月花呗账单
--------------------------------------------------
Query: 蚂蚁借呗多长时间综合评估一次
Relevant Document: 借呗得评估多久
--------------------------------------------------
Query: 我的花呗账单是***，还款怎么是***
Relevant Document: 我的花呗，月结出来说让我还***元，我自己算了一下详细名单我应该还***元
--------------------------------------------------


In [13]:
import json
import os
from datasets import Dataset

# 假设 doc_qa.json 文件已经上传到 Colab 环境中
with open("doc_qa.json", "r", encoding="utf-8") as f:
    data = json.load(f)

# 从 doc_qa.json 中提取 queries, corpus 和 relevant_docs
queries = data['queries']
corpus = data['corpus']
relevant_docs = data['relevant_docs']

# 生成训练对 (anchor 和 positive)
train_anchor, train_positive = [], []
for query_id, doc_ids in relevant_docs.items():
    train_anchor.append(queries[query_id])
    train_positive.append(corpus[doc_ids[0]])  # 假设每个查询只有一个相关文档

# 创建 datasets 格式的训练数据集
train_dataset = Dataset.from_dict({"positive": train_positive, "anchor": train_anchor})

# 打印数据集样本，确认加载正确
print(train_dataset[0:5])

{'positive': ['借呗有先息到期还本吗', '蚂蚁花呗违约行为是什么', '下月花呗账单', '借呗得评估多久', '我的花呗，月结出来说让我还***元，我自己算了一下详细名单我应该还***元'], 'anchor': ['蚂蚁借呗等额还款可以换成先息后本吗', '蚂蚁花呗说我违约一次', '帮我看一下本月花呗账单有没有结清', '蚂蚁借呗多长时间综合评估一次', '我的花呗账单是***，还款怎么是***']}


In [14]:
from sentence_transformers import SentenceTransformer

model_name = 'BAAI/bge-base-zh-v1.5'
model = SentenceTransformer(model_name, device="cuda" if torch.cuda.is_available() else "cpu")
print("Model loaded")

Model loaded


In [15]:
from sentence_transformers.evaluation import InformationRetrievalEvaluator
from sentence_transformers.util import cos_sim
from sentence_transformers.losses import MultipleNegativesRankingLoss

# 设置评估器
evaluator = InformationRetrievalEvaluator(
    queries=queries,
    corpus=corpus,
    relevant_docs=relevant_docs,
    name=model_name.split('/')[-1],  # 使用模型名作为评估名称
    score_functions={"cosine": cos_sim}
)

# 设置损失函数
train_loss = MultipleNegativesRankingLoss(model)

In [21]:
from sentence_transformers import SentenceTransformerTrainingArguments

args = SentenceTransformerTrainingArguments(
    output_dir=f"ft_{model_name}",  # 输出目录
    num_train_epochs=10,             # 训练轮数
    per_device_train_batch_size=2,  # 训练批次大小
    gradient_accumulation_steps=2,  # 累积梯度步数
    per_device_eval_batch_size=4,   # 评估批次大小
    warmup_ratio=0.1,               # 学习率预热比例
    learning_rate=2e-5,             # 学习率
    lr_scheduler_type="cosine",     # 学习率调度类型
    optim="adamw_torch_fused",      # 优化器类型
    tf32=True,                      # 使用 TF32 精度
    bf16=True,                      # 使用 BF16 精度
    eval_strategy="epoch",          # 每个 epoch 后进行评估
    save_strategy="epoch",          # 每个 epoch 后保存模型
    logging_steps=10,               # 每 10 步记录一次日志
    save_total_limit=3,             # 最多保存 3 个模型
    load_best_model_at_end=True,    # 训练结束时加载最优模型
    metric_for_best_model="eval_bge-base-zh-v1.5_cosine_ndcg@10",  # 最优模型的评价指标
)

In [22]:
from sentence_transformers import SentenceTransformerTrainer

trainer = SentenceTransformerTrainer(
    model=model,                          # 预训练模型
    args=args,                            # 训练参数
    train_dataset=train_dataset,          # 训练数据集
    loss=train_loss,                      # 损失函数
    evaluator=evaluator                   # 评估器
)

# 开始训练
trainer.train()

# 保存模型
trainer.save_model()

print("Training complete and model saved.")

Epoch,Training Loss,Validation Loss,Bge-base-zh-v1.5 Cosine Accuracy@1,Bge-base-zh-v1.5 Cosine Accuracy@3,Bge-base-zh-v1.5 Cosine Accuracy@5,Bge-base-zh-v1.5 Cosine Accuracy@10,Bge-base-zh-v1.5 Cosine Precision@1,Bge-base-zh-v1.5 Cosine Precision@3,Bge-base-zh-v1.5 Cosine Precision@5,Bge-base-zh-v1.5 Cosine Precision@10,Bge-base-zh-v1.5 Cosine Recall@1,Bge-base-zh-v1.5 Cosine Recall@3,Bge-base-zh-v1.5 Cosine Recall@5,Bge-base-zh-v1.5 Cosine Recall@10,Bge-base-zh-v1.5 Cosine Ndcg@10,Bge-base-zh-v1.5 Cosine Mrr@10,Bge-base-zh-v1.5 Cosine Map@100
1,0.0,No log,0.88,0.97,0.98,1.0,0.88,0.323333,0.196,0.1,0.88,0.97,0.98,1.0,0.94601,0.92819,0.92819
2,0.0,No log,0.79,0.95,0.98,1.0,0.79,0.316667,0.196,0.1,0.79,0.95,0.98,1.0,0.905956,0.874667,0.874667
3,0.0001,No log,0.78,0.92,0.93,0.98,0.78,0.306667,0.186,0.098,0.78,0.92,0.93,0.98,0.889457,0.859778,0.860964
4,0.0019,No log,0.8,0.97,1.0,1.0,0.8,0.323333,0.2,0.1,0.8,0.97,1.0,1.0,0.915374,0.8865,0.8865
5,0.04,No log,0.85,0.95,1.0,1.0,0.85,0.316667,0.2,0.1,0.85,0.95,1.0,1.0,0.932879,0.910333,0.910333
6,0.0087,No log,0.85,0.96,1.0,1.0,0.85,0.32,0.2,0.1,0.85,0.96,1.0,1.0,0.931825,0.909,0.909
7,0.0456,No log,0.89,0.98,1.0,1.0,0.89,0.326667,0.2,0.1,0.89,0.98,1.0,1.0,0.951469,0.935,0.935
8,0.0002,No log,0.87,0.98,1.0,1.0,0.87,0.326667,0.2,0.1,0.87,0.98,1.0,1.0,0.942779,0.923333,0.923333
9,0.0097,No log,0.87,0.98,1.0,1.0,0.87,0.326667,0.2,0.1,0.87,0.98,1.0,1.0,0.942779,0.923333,0.923333
10,0.0045,No log,0.87,0.98,1.0,1.0,0.87,0.326667,0.2,0.1,0.87,0.98,1.0,1.0,0.944088,0.925,0.925


Training complete and model saved.


In [23]:
# 查看训练数据集的大小
dataset_size = len(train_dataset)
print(f"训练数据集大小: {dataset_size} 条样本")

训练数据集大小: 100 条样本


In [24]:
!pip install onnx onnxruntime onnxruntime-tools transformers onnxconverter-common

Collecting onnx
  Downloading onnx-1.16.2-cp310-cp310-manylinux_2_17_x86_64.manylinux2014_x86_64.whl.metadata (16 kB)
Collecting onnxruntime
  Downloading onnxruntime-1.18.1-cp310-cp310-manylinux_2_27_x86_64.manylinux_2_28_x86_64.whl.metadata (4.3 kB)
Collecting onnxruntime-tools
  Downloading onnxruntime_tools-1.7.0-py3-none-any.whl.metadata (14 kB)
Collecting onnxconverter-common
  Downloading onnxconverter_common-1.14.0-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting coloredlogs (from onnxruntime)
  Downloading coloredlogs-15.0.1-py2.py3-none-any.whl.metadata (12 kB)
Collecting py3nvml (from onnxruntime-tools)
  Downloading py3nvml-0.2.7-py3-none-any.whl.metadata (13 kB)
Collecting protobuf>=3.20.2 (from onnx)
  Downloading protobuf-3.20.2-cp310-cp310-manylinux_2_12_x86_64.manylinux2010_x86_64.whl.metadata (679 bytes)
Collecting humanfriendly>=9.1 (from coloredlogs->onnxruntime)
  Downloading humanfriendly-10.0-py2.py3-none-any.whl.metadata (9.2 kB)
Collecting xmltodict (from py3n

In [25]:
import os

# 根据你之前的设置，模型应该保存在以下路径
model_path = f"ft_{model_name}"

# 检查路径是否存在
if os.path.exists(model_path):
    print(f"模型保存路径为: {model_path}")
    print("目录内容如下:")
    print(os.listdir(model_path))
else:
    print("未找到模型保存路径，请检查路径设置。")

模型保存路径为: ft_BAAI/bge-base-zh-v1.5
目录内容如下:
['tokenizer.json', 'config.json', 'checkpoint-225', 'vocab.txt', 'README.md', 'checkpoint-250', 'modules.json', 'training_args.bin', 'tokenizer_config.json', 'sentence_bert_config.json', 'runs', 'special_tokens_map.json', '2_Normalize', 'model.safetensors', 'checkpoint-175', '1_Pooling', 'config_sentence_transformers.json']


In [27]:
from sentence_transformers import SentenceTransformer

# 加载微调后的模型
model = SentenceTransformer(model_path)

In [34]:
import torch
from transformers import AutoTokenizer, AutoModel

# 初始化 tokenizer 和 PyTorch 模型
tokenizer = AutoTokenizer.from_pretrained(model_path)
pytorch_model = model._first_module().auto_model

# 准备 ONNX 的输入示例
dummy_input = tokenizer("示例输入文本", return_tensors="pt", padding=True, truncation=True)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
pytorch_model.to(device)  # 将模型移动到设备（通常是 GPU）

# 移动输入张量到同一设备
dummy_input = {
    "input_ids": dummy_input["input_ids"].to(device),
    "attention_mask": dummy_input["attention_mask"].to(device)
}

# 导出为 ONNX 格式
torch.onnx.export(
    pytorch_model,                                      # 需要导出的 PyTorch 模型
    (dummy_input["input_ids"], dummy_input["attention_mask"]),  # 模型的输入张量
    "bge_base_zh_v1.5.onnx",                            # 输出 ONNX 文件名
    input_names=["input_ids", "attention_mask"],        # 输入张量名称
    output_names=["output"],                            # 输出张量名称
    dynamic_axes={"input_ids": {0: "batch_size"}, "attention_mask": {0: "batch_size"}},  # 动态批次大小
    opset_version=14                                    # ONNX opset 版本
)

In [36]:
import onnxruntime as ort

# 加载 ONNX 模型
onnx_model_path = "bge_base_zh_v1.5.onnx"
ort_session = ort.InferenceSession(onnx_model_path)

# 准备推理输入
def preprocess(text, max_length=8):
    inputs = tokenizer(text, return_tensors="pt", padding=True, truncation=True, max_length=max_length)
    return {
        "input_ids": inputs["input_ids"].cpu().numpy(),
        "attention_mask": inputs["attention_mask"].cpu().numpy()
    }

# 进行推理
def infer(text):
    inputs = preprocess(text)
    outputs = ort_session.run(None, inputs)
    return outputs

# 测试推理
output = infer("这是一段测试文本")
print(output)

[array([[[ 0.4769148 , -0.28393015, -0.43231583, ...,  0.4167677 ,
         -0.52408266, -0.19929872],
        [ 0.04043505,  0.09434471,  0.29566997, ..., -0.359766  ,
         -0.04143882, -0.35307646],
        [ 0.04177262, -0.17315501, -0.14003229, ..., -0.2051987 ,
         -0.2597228 , -0.33541787],
        ...,
        [ 0.5436055 ,  0.07704913, -0.4569545 , ..., -0.71320224,
         -0.46993056, -0.30751562],
        [ 0.3175323 , -0.39069545, -0.54021007, ..., -0.48639128,
         -0.5634594 , -0.35589206],
        [ 0.4769148 , -0.2839303 , -0.4323159 , ...,  0.41676772,
         -0.5240828 , -0.19929864]]], dtype=float32), array([[-2.24004716e-01,  2.97416896e-01,  1.30188987e-01,
        -6.02790602e-02, -2.50664175e-01,  1.55318975e-01,
        -8.92245695e-02, -1.31961286e-01, -8.15235898e-02,
         1.38905600e-01,  3.21851559e-02, -1.23419315e-01,
         1.91661287e-02, -9.79901776e-02,  1.60287037e-01,
         1.11388564e-01,  2.34656796e-01, -2.83867925e-01,
  

In [37]:
import numpy as np

output = infer("这是一段测试文本")
output_array = np.array(output[0])

print(f"输出形状: {output_array.shape}")
print(f"输出内容: {output_array}")

输出形状: (1, 8, 768)
输出内容: [[[ 0.4769148  -0.28393015 -0.43231583 ...  0.4167677  -0.52408266
   -0.19929872]
  [ 0.04043505  0.09434471  0.29566997 ... -0.359766   -0.04143882
   -0.35307646]
  [ 0.04177262 -0.17315501 -0.14003229 ... -0.2051987  -0.2597228
   -0.33541787]
  ...
  [ 0.5436055   0.07704913 -0.4569545  ... -0.71320224 -0.46993056
   -0.30751562]
  [ 0.3175323  -0.39069545 -0.54021007 ... -0.48639128 -0.5634594
   -0.35589206]
  [ 0.4769148  -0.2839303  -0.4323159  ...  0.41676772 -0.5240828
   -0.19929864]]]
