向量embedding模型的底层解释

加载 bge-m3 模型

In [None]:
!pip install langchain-huggingface sentence-transformers

In [1]:
# Download a popular embedding model (BAAI/bge-m3) from ModelScope.
# `model_dir` is reused by later cells as the local model path.
from modelscope import snapshot_download
model_dir = snapshot_download('BAAI/bge-m3')

  from .autonotebook import tqdm as notebook_tqdm


Downloading Model from https://www.modelscope.cn to directory: /Users/jie/.cache/modelscope/hub/models/BAAI/bge-m3


In [2]:
# https://python.langchain.com/docs/integrations/vectorstores/

from langchain_huggingface import HuggingFaceEmbeddings

# Wrap the downloaded model directory in LangChain's HuggingFace embeddings interface.
embeddings = HuggingFaceEmbeddings(model_name=model_dir)

In [3]:
# Sample sentence that every later cell embeds or tokenizes.
text = "hello world!"

In [4]:
# Embed the sample text as a query and show only the first 10 components.
print(embeddings.embed_query(text)[:10])

[-0.03437407687306404, 0.029861997812986374, -0.041066382080316544, 0.0028893225826323032, -0.020066309720277786, -0.03695249184966087, -0.038797527551651, -0.05372311547398567, 0.011357544921338558, -0.0034280770923942327]


In [5]:
# embed_documents takes a list of texts; show the first 10 components of the
# first (only) vector. The printed values are identical to the embed_query output.
print(embeddings.embed_documents([text])[0][:10])

[-0.03437407687306404, 0.029861997812986374, -0.041066382080316544, 0.0028893225826323032, -0.020066309720277786, -0.03695249184966087, -0.038797527551651, -0.05372311547398567, 0.011357544921338558, -0.0034280770923942327]


## BERT

In [6]:
# bert 加载 
import torch
from transformers import AutoModel, AutoTokenizer

In [7]:
# Load the bare transformer encoder and its tokenizer from the same local model directory.
bge_m3_bert = AutoModel.from_pretrained(model_dir)
bge_m3_tokenizer = AutoTokenizer.from_pretrained(model_dir)

In [8]:
# Tokenize the sample text, returning PyTorch tensors (includes "attention_mask", used later).
tokens = bge_m3_tokenizer(text, return_tensors="pt")

In [9]:
# Forward pass through the encoder; the bare `output` displays the returned fields
# (last_hidden_state, pooler_output, ...).
output = bge_m3_bert(**tokens)
output

BaseModelOutputWithPoolingAndCrossAttentions(last_hidden_state=tensor([[[-0.9034,  0.7848, -1.0793,  ...,  0.6585, -0.9023,  0.2505],
         [ 0.3974, -0.1502, -0.8096,  ...,  1.5423, -0.6413,  0.4494],
         [ 0.0902,  0.2658, -0.8696,  ...,  1.3465, -0.2546,  0.5970],
         [-0.6971,  0.5654, -0.2306,  ...,  1.0564, -0.2211,  0.0615],
         [ 0.1637,  0.3918, -0.7434,  ...,  1.4434, -0.7046,  0.3979],
         [-0.0837,  0.7006, -0.8004,  ...,  1.2811, -1.2167,  0.4276]]],
       grad_fn=<NativeLayerNormBackward0>), pooler_output=tensor([[-0.8678,  0.2324,  0.1832,  ..., -0.0287,  0.6174,  0.1894]],
       grad_fn=<TanhBackward0>), hidden_states=None, past_key_values=None, attentions=None, cross_attentions=None)

In [10]:
# Shape of the per-token hidden states: (batch, tokens, hidden size).
output.last_hidden_state.shape

torch.Size([1, 6, 1024])

In [11]:
# CLS pooling: take the hidden state of the first token of every sequence.
cls_tensor = output.last_hidden_state[:, 0, :]
cls_tensor.shape

torch.Size([1, 1024])

In [12]:
# L2-normalize the CLS vector along the hidden dimension; the leading values
# match the embed_query output printed earlier.
torch.nn.functional.normalize(cls_tensor, dim=-1)

tensor([[-0.0344,  0.0299, -0.0411,  ...,  0.0251, -0.0343,  0.0095]],
       grad_fn=<DivBackward0>)

## Mean pooling（平均池化）

In [13]:
# Inputs for mean pooling: the attention mask marks which token positions count,
# and the hidden states hold one vector per token.
mask = tokens["attention_mask"]
hidden_state = output.last_hidden_state

print("hidden_state.shape", hidden_state.shape)
print(mask.shape)

hidden_state.shape torch.Size([1, 6, 1024])
torch.Size([1, 6])


In [None]:
# `*` multiplies element-wise; multiplying by the mask zeroes out padding tokens
# so they do not take part in the sum.
s = (hidden_state * mask.unsqueeze(-1).float()).sum(dim=1)
# Number of tokens per sequence that the mask keeps.
d = mask.sum(dim=1, keepdim=True).float()

(s / d).shape

torch.Size([1, 1024])

In [53]:
import torch
from torch import nn

# 下述代码参考自：FlagEmbedding 包的实现


class EncoderModel(nn.Module):
    """Sentence encoder: a Hugging Face backbone followed by a pooling step.

    Adapted from the FlagEmbedding package. The backbone's last hidden state
    is reduced to one vector per input sequence, either by averaging the
    vectors of the tokens kept by the attention mask (``"mean"``) or by taking
    the first token's vector (``"cls"``), and is optionally L2-normalized.

    Args:
        model_name: Model id or local directory passed to
            ``AutoModel.from_pretrained``.
        normlized: If True, ``encode`` L2-normalizes every embedding along the
            last dimension. (The misspelled name is kept so existing callers
            keep working.)
        sentence_pooling_method: ``"mean"`` or ``"cls"``.

    Raises:
        ValueError: If ``sentence_pooling_method`` is not supported.
    """

    SUPPORTED_POOLING_METHODS = ("mean", "cls")

    def __init__(
        self,
        model_name,
        normlized: bool = False,
        sentence_pooling_method: str = "mean",
    ):
        super().__init__()
        # Fail fast, before the (expensive) weight loading, instead of letting
        # an unknown method surface later as an obscure error inside encode().
        if sentence_pooling_method not in self.SUPPORTED_POOLING_METHODS:
            raise ValueError(
                f"Unsupported sentence_pooling_method {sentence_pooling_method!r}; "
                f"expected one of {self.SUPPORTED_POOLING_METHODS}"
            )
        self.model = AutoModel.from_pretrained(model_name)
        # self.cross_entropy = nn.CrossEntropyLoss(reduction="mean")
        self.sentence_pooling_method = sentence_pooling_method
        self.normlized = normlized

    def sentence_embedding(self, hidden_state, mask):
        """Pool per-token hidden states into one vector per sequence.

        Args:
            hidden_state: Tensor indexed as (batch, tokens, hidden).
            mask: Attention mask indexed as (batch, tokens); positions with 0
                are excluded from the mean.

        Returns:
            Tensor of shape (batch, hidden).

        Raises:
            ValueError: If ``self.sentence_pooling_method`` is not supported.
        """
        if self.sentence_pooling_method == "mean":
            # Broadcasting the mask over the hidden dimension zeroes out the
            # masked (padding) tokens so they do not contribute to the sum.
            weights = mask.unsqueeze(-1).float()
            summed = torch.sum(hidden_state * weights, dim=1)
            # Clamp the token count so a fully masked row yields zeros rather
            # than NaN from 0 / 0; ordinary rows (count >= 1) are unaffected.
            counts = mask.sum(dim=1, keepdim=True).float().clamp(min=1e-9)
            return summed / counts
        if self.sentence_pooling_method == "cls":
            # The first token's hidden state represents the whole sequence.
            return hidden_state[:, 0]
        # The attribute may have been reassigned after construction; never
        # fall through and return None silently.
        raise ValueError(
            f"Unsupported sentence_pooling_method {self.sentence_pooling_method!r}; "
            f"expected one of {self.SUPPORTED_POOLING_METHODS}"
        )

    def encode(self, features):
        """Run the backbone on tokenized inputs and return sentence embeddings.

        Args:
            features: Mapping of tokenizer outputs; it is unpacked into the
                backbone call and must contain ``"attention_mask"``. ``None``
                is passed through unchanged.

        Returns:
            A contiguous (batch, hidden) tensor, L2-normalized when
            ``self.normlized`` is True, or ``None`` if ``features`` is ``None``.
        """
        if features is None:
            return None
        psg_out = self.model(**features, return_dict=True)
        p_reps = self.sentence_embedding(
            psg_out.last_hidden_state, features["attention_mask"]
        )
        if self.normlized:
            p_reps = torch.nn.functional.normalize(p_reps, dim=-1)
        return p_reps.contiguous()

In [54]:
# Reference values from the LangChain wrapper, for comparison with the EncoderModel outputs below.
embeddings.embed_documents([text])[0][:10]

[-0.03437407687306404,
 0.029861997812986374,
 -0.041066382080316544,
 0.0028893225826323032,
 -0.020066309720277786,
 -0.03695249184966087,
 -0.038797527551651,
 -0.05372311547398567,
 0.011357544921338558,
 -0.0034280770923942327]

In [57]:
# CLS pooling + normalization: the leading values match the HuggingFaceEmbeddings output above.
EncoderModel(model_dir, sentence_pooling_method="cls", normlized=True).encode(
    bge_m3_tokenizer(text, return_tensors="pt")
)

tensor([[-0.0344,  0.0299, -0.0411,  ...,  0.0251, -0.0343,  0.0095]],
       grad_fn=<DivBackward0>)

In [58]:
# Mean pooling + normalization: yields a different vector than the CLS-pooled one.
EncoderModel(model_dir, sentence_pooling_method="mean", normlized=True).encode(
    bge_m3_tokenizer(text, return_tensors="pt")
)

tensor([[-0.0068,  0.0168, -0.0298,  ...,  0.0483, -0.0259,  0.0144]],
       grad_fn=<DivBackward0>)