In [None]:
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U optimum
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U onnxruntime
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple -U onnxruntime-gpu
# !pip install -i https://pypi.tuna.tsinghua.edu.cn/simple onnx==1.16.1

In [1]:
import warnings; warnings.filterwarnings("ignore")
import os
import sys
import numpy as np
import pandas as pd
import torch as th
from transformers import (AutoTokenizer, AutoModel, AutoModelForSequenceClassification)
from optimum.onnxruntime import ORTModelForSequenceClassification
from sentence_transformers import (SentenceTransformer, util, CrossEncoder)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import (FAISS, LanceDB)
from cross_encoder_ort import CrossEncoderOrt

In [2]:
device = th.device("cuda" if th.cuda.is_available() else "cpu")
devive_cnt = th.cuda.device_count()
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.2.0+cu118
11.8


In [3]:
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_data = os.path.join(os.path.dirname(path_project), "data")
path_model = "F:/LLM/sentence-transformers"
path_output = os.path.join(os.path.dirname(path_project), "output")

## step-1: 设置 checkpoint 和样本

In [8]:
checkpoint = "m3e-base"

In [4]:
corpus = [
    "南京师范大学",
    "南京大学",
    "南京中医药大学",
    "南京医科大学",
    "南京林业大学",
    "北京师范大学"
]

In [5]:
query = ["南师大"]

## step-2: Using Transformers

In [9]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)

In [10]:
print(f"checkpoint = {checkpoint}, vocab_size = {tokenizer.vocab_size}")  # checkpoint = m3e-base, vocab_size = 21128

checkpoint = m3e-base, vocab_size = 21128


In [14]:
model = AutoModel.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map=device,
    # torch_dtype=th.bfloat16
)

In [15]:
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSdpaSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False

In [None]:
for i, (name, parm) in enumerate(model.named_parameters()):
    print(f"{i}  name: {name};  shape: {parm.shape};  dtype: {parm.dtype};  device: {parm.device}")

In [131]:
inputs_corpus = tokenizer(corpus, padding=True, truncation=True, return_tensors="pt").to(device)

model.eval()
with th.inference_mode():
    # model_outputs = model(**inputs_corpus)
    model_outputs = model(
        input_ids=inputs_corpus.input_ids,
        attention_mask=inputs_corpus.attention_mask,
        token_type_ids=inputs_corpus.token_type_ids
    )
    
embeddings_corpus = model_outputs.last_hidden_state[:, 0, :]
embeddings_corpus = th.nn.functional.normalize(embeddings_corpus, p=2, dim=1)
print(embeddings_corpus)

tensor([[ 0.0150,  0.0156,  0.0373,  ..., -0.0377, -0.0717, -0.0500],
        [ 0.0097,  0.0089,  0.0240,  ..., -0.0496, -0.0785, -0.0489],
        [-0.0049,  0.0205,  0.0228,  ..., -0.0178, -0.0746, -0.0377],
        [-0.0036,  0.0248,  0.0386,  ..., -0.0368, -0.0726, -0.0307],
        [ 0.0206,  0.0230,  0.0008,  ..., -0.0454, -0.0869, -0.0372],
        [ 0.0115,  0.0254,  0.0153,  ..., -0.0032, -0.0607, -0.0427]],
       device='cuda:0')


In [132]:
inputs_query = tokenizer(query, padding=True, truncation=True, return_tensors="pt").to(device)

model.eval()
with th.inference_mode():
    # model_outputs = model(**inputs_corpus)
    model_outputs = model(
        input_ids=inputs_query.input_ids,
        attention_mask=inputs_query.attention_mask,
        token_type_ids=inputs_query.token_type_ids
    )
    
embeddings_query = model_outputs.last_hidden_state[:, 0, :]
embeddings_query = th.nn.functional.normalize(embeddings_query, p=2, dim=1)

In [133]:
# sim = embeddings_corpus @ embeddings_query.T
sim = util.cos_sim(embeddings_corpus, embeddings_query)
print(sim)

tensor([[0.9358],
        [0.8700],
        [0.8453],
        [0.8400],
        [0.8443],
        [0.8788]], device='cuda:0')


## step-3: Using Sentence-Transformers

In [16]:
model = SentenceTransformer(
    model_name_or_path=os.path.join(path_model, checkpoint),
    cache_folder=path_model,
    local_files_only=True,
    device=device
)

In [10]:
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [11]:
embeddings_corpus = model.encode(sentences=corpus, convert_to_tensor=True, normalize_embeddings=True).to(device)
print(embeddings_corpus)

tensor([[ 0.0121,  0.0173,  0.0365,  ..., -0.0422, -0.0780, -0.0438],
        [ 0.0066,  0.0114,  0.0244,  ..., -0.0549, -0.0730, -0.0457],
        [-0.0117,  0.0155,  0.0126,  ..., -0.0173, -0.0743, -0.0304],
        [-0.0159,  0.0265,  0.0340,  ..., -0.0397, -0.0739, -0.0302],
        [ 0.0146,  0.0214,  0.0036,  ..., -0.0383, -0.0861, -0.0367],
        [ 0.0092,  0.0280,  0.0147,  ..., -0.0139, -0.0672, -0.0323]],
       device='cuda:0')


In [12]:
embeddings_query = model.encode(sentences=query, convert_to_tensor=True, normalize_embeddings=True).to(device)

In [14]:
sim = util.cos_sim(embeddings_corpus, embeddings_query)
print(sim)

tensor([[0.9308],
        [0.8779],
        [0.8395],
        [0.8445],
        [0.8517],
        [0.8650]], device='cuda:0')


In [20]:
hits = util.semantic_search(query_embeddings=embeddings_query, corpus_embeddings=embeddings_corpus, top_k=1)
print(hits)

[[{'corpus_id': 0, 'score': 0.9308120012283325}]]


## step-4: Using Langchain

In [145]:
model = HuggingFaceEmbeddings(
    model_name=os.path.join(path_model, checkpoint),
    cache_folder=os.path.join(path_model, checkpoint),
    model_kwargs={"device": "cuda"},
    encode_kwargs={"normalize_embeddings": True}
    )

In [146]:
print(model)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
) model_name='C:/my_project/MyGit/Machine-Learning-Column\\model/sentence-transformers\\m3e-base' cache_folder='C:/my_project/MyGit/Machine-Learning-Column\\model/sentence-transformers\\m3e-base' model_kwargs={'device': 'cuda'} encode_kwargs={'normalize_embeddings': True} multi_process=False show_progress=False


## step-5: Testing Onnx

https://blog.csdn.net/pipisorry/article/details/131519708?ops_request_misc=%257B%2522request%255Fid%2522%253A%25225B8B1CBE-4598-41BE-BDAF-33C4615991B4%2522%252C%2522scm%2522%253A%252220140713.130102334.pc%255Fall.%2522%257D&request_id=5B8B1CBE-4598-41BE-BDAF-33C4615991B4&biz_id=0&utm_medium=distribute.pc_search_result.none-task-blog-2~all~first_rank_ecpm_v1~rank_v31_ecpm-5-131519708-null-null.142^v100^pc_search_result_base8&utm_term=optimum%20%E6%8E%A8%E7%90%86%E5%8A%A0%E9%80%9F&spm=1018.2226.3001.4187

HuggingFace Optimum 是 Transformers 的扩展，它提供了性能优化工具的统一 API，以实现在加速硬件上训练和运行模型的最高效率，包括在Graphcore IPU和Habana Gaudi上优化性能的工具包。Optimum可通过其模块将模型从 PyTorch 或 TensorFlow 导出为序列化格式，例如 ONNX 和 TFLite exporters。

In [24]:
device = th.device("cpu")
# device = th.device("cuda")

In [25]:
checkpoint = "BAAI/bge-reranker-large"

In [26]:
tokenizer = AutoTokenizer.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True
)

In [13]:
model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map=device,
    # torch_dtype=th.float32
)

In [19]:
model_ort = ORTModelForSequenceClassification.from_pretrained(
    model_id=os.path.join(path_model, checkpoint),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    use_merged=True,  # 如下
    # file_name=os.path.join(path_model, checkpoint, "model.onnx"),
)
model_ort = model_ort.to(device)

use_merged (`Optional[bool]`, defaults to `None`):
    whether or not to use a single ONNX that handles both the decoding without and with past key values reuse. This option defaults
    to `True` if loading from a local repository and a merged decoder is found. When exporting with `export=True`,
    defaults to `False`. This option should be set to `True` to minimize memory usage.

In [20]:
pairs = [
	("酒店名称：格林豪泰酒店上海虹桥机场动物园地铁站店；酒店地址：上海长宁区哈密路1721号", "酒店名称：格林豪泰虹桥动物园店；酒店地址：位于长宁区哈密路1721号，近青溪路，距离地铁10号线上海动物园站动物园不远"),
	("酒店名称：格林豪泰酒店上海虹桥机场动物园地铁站店；酒店地址：上海长宁区哈密路1721号", "酒店名称：格林豪泰动物园地铁站店原上海虹桥店；酒店地址：长宁区哈密路"),
	("酒店名称：格林豪泰酒店上海虹桥机场动物园地铁站店；酒店地址：上海长宁区哈密路1721号", "酒店名称：格林豪泰上海虹桥机场店原名动物园地铁站店；酒店地址：长宁区哈密路1721号，青溪路与哈密路交叉口")
	]

In [27]:
model_inputs = tokenizer(pairs, padding=True, truncation=True, return_tensors="pt").to(device)

In [106]:
%%timeit
with th.inference_mode():
    scores = model(**model_inputs, return_dict=True).logits.view(-1, ).float()

623 ms ± 18.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [28]:
%%timeit
with th.inference_mode():
    scores_ort = model_ort(**model_inputs, return_dict=True).logits.view(-1, ).float()

408 ms ± 24.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [48]:
# 推理结果相同
print(scores)
print(scores_ort)

tensor([7.4298, 2.8237, 8.6251])
tensor([7.4298, 2.8237, 8.6251])


## step-6: Testing Onnx On CrossEncoder

In [3]:
device = th.device("cpu")
# device = th.device("cuda")

In [4]:
checkpoint = "BAAI/bge-reranker-large"

In [15]:
model = CrossEncoder(
    model_name=os.path.join(path_model, checkpoint),
    device=device,
    local_files_only=True,
    # trust_remote_code=False,
    # automodel_args={"torch_dtype": th.float32},
)

In [5]:
model_ort = CrossEncoderOrt(
    model_name=os.path.join(path_model, checkpoint),
    device=device,
    local_files_only=True,
    # trust_remote_code=False,
    # automodel_args={"torch_dtype": th.float32},
)

In [6]:
a = "酒店名称：格林豪泰酒店上海虹桥机场动物园地铁站店；酒店地址：上海长宁区哈密路1721号"
b_list = [
    "酒店名称：格林豪泰虹桥动物园店；酒店地址：位于长宁区哈密路1721号，近青溪路，距离地铁10号线上海动物园站动物园不远",
    "酒店名称：格林豪泰动物园地铁站店原上海虹桥店；酒店地址：长宁区哈密路",
    "酒店名称：格林豪泰上海虹桥机场店原名动物园地铁站店；酒店地址：长宁区哈密路1721号，青溪路与哈密路交叉口"
]

In [17]:
%%timeit
response = model.rank(query=a, documents=b_list, top_k=1)

659 ms ± 15.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [10]:
%%timeit
response_ort = model_ort.rank(query=a, documents=b_list, top_k=1)

416 ms ± 16 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


device = cpu  
- use AutoModelForSequenceClassification = 640 ms  
- use ORTModelForSequenceClassification = 425 ms
- use CrossEncoder.rank = 642 ms
- use CrossEncoderOrt.rank = 413 ms

## step-7: Testing GPU Used

In [4]:
cached_memory_t0 = th.cuda.memory_cached(device)
print(f"已缓存的显存：{cached_memory_t0 / 1024**3:.2f}G")

已缓存的显存：0.00G


In [5]:
embedding_model = AutoModel.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, "BAAI/bge-base-zh-v1.5"),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map=device,
    # torch_dtype=th.bfloat16
)

In [6]:
cached_memory_t1 = th.cuda.memory_cached(device)
print(f"Embedding Model 消耗显存：{(cached_memory_t1 - cached_memory_t0) / 1024**3:.2f}G")

Embedding Model 消耗显存：0.43G


In [7]:
ranking_model = AutoModelForSequenceClassification.from_pretrained(
    pretrained_model_name_or_path=os.path.join(path_model, "BAAI/bge-reranker-large"),
    cache_dir=path_model,
    force_download=False,
    local_files_only=True,
    device_map=device,
    # torch_dtype=th.bfloat16
)

In [8]:
cached_memory_t2 = th.cuda.memory_cached(device)
print(f"Ranking Model 消耗显存：{(cached_memory_t2 - cached_memory_t1 - cached_memory_t0) / 1024**3:.2f}G")

Ranking Model 消耗显存：2.08G
