In [3]:
import os
import sys
import warnings; warnings.filterwarnings("ignore")
import numpy as np
import pandas as pd
import torch as th
from transformers import (AutoTokenizer, AutoModel)
from sentence_transformers import (SentenceTransformer, util)
from langchain.embeddings import HuggingFaceEmbeddings
from langchain.vectorstores import FAISS

In [4]:
# Prefer the GPU when CUDA is usable; otherwise run everything on the CPU.
if th.cuda.is_available():
    device = th.device("cuda")
else:
    device = th.device("cpu")

# NOTE(review): "devive_cnt" is presumably a typo for "device_cnt". The name is
# kept because it is a notebook-level variable and is part of the printed text.
devive_cnt = th.cuda.device_count()

# Report the selected device plus the torch version and its CUDA version string.
print(f"device = {device}; devive_cnt = {devive_cnt}")
print(th.__version__)
print(th.version.cuda)

device = cuda; devive_cnt = 1
2.0.1+cu118
11.8


In [5]:
# Folder of this sub-project. The data / model / output folders are siblings,
# i.e. they live under the parent directory of `path_project`.
# NOTE(review): this is an absolute, machine-specific path; adjust it per machine.
path_project = "C:/my_project/MyGit/Machine-Learning-Column/hugging_face"
path_data, path_model, path_output = (
    os.path.join(os.path.dirname(path_project), sub_dir)
    for sub_dir in ("data", "model/sentence-transformers", "output")
)

## step-1: 设置 checkpoint 和样本

In [6]:
# Name of the model checkpoint folder; it is joined onto `path_model` by every
# model-loading cell in this notebook.
checkpoint = "m3e-base"

In [7]:
# Candidate texts. Each one is embedded and scored against the query.
corpus = [
    "南京师范大学",
    "南京大学",
    "南京中医药大学",
    "南京医科大学",
    "南京林业大学",
    "北京师范大学"
]

In [8]:
# Single query text, kept in a list so it is encoded as a batch of one.
query = ["南师大"]

## step-2: Using Transformers

In [113]:
# Load the tokenizer that belongs to `checkpoint` from the local model folder.
# local_files_only=True together with force_download=False restricts loading to
# files already on disk.
tokenizer = AutoTokenizer.from_pretrained(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    force_download=False,
    cache_dir=path_model,
)

In [114]:
# Report which checkpoint is in use and the tokenizer's vocabulary size.
print(f"checkpoint = {checkpoint}, vocab_size = {tokenizer.vocab_size}")  # e.g. checkpoint = m3e-base, vocab_size = 21128

checkpoint = m3e-base, vocab_size = 21128


In [128]:
# Load the bare transformer for `checkpoint` from local files only and place it
# on `device` in the same statement.
# Optional: add torch_dtype=th.bfloat16 to the call to load the weights in bfloat16.
model = AutoModel.from_pretrained(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    force_download=False,
    cache_dir=path_model,
).to(device)

In [8]:
# Print the model's text representation (its nested module structure).
print(model)

BertModel(
  (embeddings): BertEmbeddings(
    (word_embeddings): Embedding(21128, 768, padding_idx=0)
    (position_embeddings): Embedding(512, 768)
    (token_type_embeddings): Embedding(2, 768)
    (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
    (dropout): Dropout(p=0.1, inplace=False)
  )
  (encoder): BertEncoder(
    (layer): ModuleList(
      (0-11): 12 x BertLayer(
        (attention): BertAttention(
          (self): BertSelfAttention(
            (query): Linear(in_features=768, out_features=768, bias=True)
            (key): Linear(in_features=768, out_features=768, bias=True)
            (value): Linear(in_features=768, out_features=768, bias=True)
            (dropout): Dropout(p=0.1, inplace=False)
          )
          (output): BertSelfOutput(
            (dense): Linear(in_features=768, out_features=768, bias=True)
            (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
            (dropout): Dropout(p=0.1, inplace=False)
  

In [None]:
# Walk over every named parameter of the model and print its running index,
# name, shape, dtype and the device it lives on.
for idx, named_param in enumerate(model.named_parameters()):
    param_name, param = named_param
    print(f"{idx}  name: {param_name};  shape: {param.shape};  dtype: {param.dtype};  device: {param.device}")

In [131]:
# Tokenize the whole corpus as one padded / truncated batch of PyTorch tensors
# and move that batch to `device`.
inputs_corpus = tokenizer(
    corpus,
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(device)

# Forward pass with the model in eval mode and inside inference_mode.
model.eval()
with th.inference_mode():
    model_outputs = model(
        input_ids=inputs_corpus["input_ids"],
        attention_mask=inputs_corpus["attention_mask"],
        token_type_ids=inputs_corpus["token_type_ids"],
    )

# Sentence vector = last-layer hidden state at sequence position 0, scaled to
# unit L2 norm along the feature axis.
# NOTE(review): position 0 is presumably the [CLS] token. The SentenceTransformer
# printout later in this notebook reports mean-token pooling for this checkpoint,
# so these vectors are not expected to match that section exactly - confirm intent.
embeddings_corpus = th.nn.functional.normalize(
    model_outputs.last_hidden_state[:, 0, :],
    p=2,
    dim=1,
)
print(embeddings_corpus)

tensor([[ 0.0150,  0.0156,  0.0373,  ..., -0.0377, -0.0717, -0.0500],
        [ 0.0097,  0.0089,  0.0240,  ..., -0.0496, -0.0785, -0.0489],
        [-0.0049,  0.0205,  0.0228,  ..., -0.0178, -0.0746, -0.0377],
        [-0.0036,  0.0248,  0.0386,  ..., -0.0368, -0.0726, -0.0307],
        [ 0.0206,  0.0230,  0.0008,  ..., -0.0454, -0.0869, -0.0372],
        [ 0.0115,  0.0254,  0.0153,  ..., -0.0032, -0.0607, -0.0427]],
       device='cuda:0')


In [132]:
# Tokenize the query batch with the same settings as the corpus (padding,
# truncation, PyTorch tensors) and move it to `device`.
inputs_query = tokenizer(
    query,
    padding=True,
    truncation=True,
    return_tensors="pt",
).to(device)

# Forward pass with the model in eval mode and inside inference_mode.
model.eval()
with th.inference_mode():
    model_outputs = model(
        input_ids=inputs_query["input_ids"],
        attention_mask=inputs_query["attention_mask"],
        token_type_ids=inputs_query["token_type_ids"],
    )

# Same pooling as for the corpus: hidden state at sequence position 0, then
# L2-normalized along the feature axis.
embeddings_query = th.nn.functional.normalize(
    model_outputs.last_hidden_state[:, 0, :],
    p=2,
    dim=1,
)

In [133]:
# Score every corpus embedding against the query embedding with cosine similarity.
# Both tensors were L2-normalized when they were built, so the plain matrix
# product `embeddings_corpus @ embeddings_query.T` should give the same scores
# up to floating-point rounding.
sim = util.cos_sim(embeddings_corpus, embeddings_query)
print(sim)

tensor([[0.9358],
        [0.8700],
        [0.8453],
        [0.8400],
        [0.8443],
        [0.8788]], device='cuda:0')


## step-3: Using Sentence-Transformers

In [9]:
# Build the SentenceTransformer pipeline from the local checkpoint folder and
# place it on `device` in the same statement.
# NOTE: this rebinds `model`, which the Transformers section above used for the
# AutoModel instance.
model = SentenceTransformer(
    os.path.join(path_model, checkpoint),
    local_files_only=True,
    cache_folder=path_model,
).to(device)

In [10]:
# Print the SentenceTransformer's text representation (its pipeline modules).
print(model)

SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
)


In [11]:
# Encode the corpus into normalized embedding tensors, then make sure the
# result sits on `device` before printing it.
embeddings_corpus = model.encode(
    sentences=corpus,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
embeddings_corpus = embeddings_corpus.to(device)
print(embeddings_corpus)

tensor([[ 0.0121,  0.0173,  0.0365,  ..., -0.0422, -0.0780, -0.0438],
        [ 0.0066,  0.0114,  0.0244,  ..., -0.0549, -0.0730, -0.0457],
        [-0.0117,  0.0155,  0.0126,  ..., -0.0173, -0.0743, -0.0304],
        [-0.0159,  0.0265,  0.0340,  ..., -0.0397, -0.0739, -0.0302],
        [ 0.0146,  0.0214,  0.0036,  ..., -0.0383, -0.0861, -0.0367],
        [ 0.0092,  0.0280,  0.0147,  ..., -0.0139, -0.0672, -0.0323]],
       device='cuda:0')


In [12]:
# Encode the query with the same options as the corpus and move the result to
# `device`.
embeddings_query = model.encode(
    sentences=query,
    convert_to_tensor=True,
    normalize_embeddings=True,
)
embeddings_query = embeddings_query.to(device)

In [14]:
# Cosine similarity of every corpus embedding against the query embedding,
# using the SentenceTransformer embeddings computed in this section.
sim = util.cos_sim(embeddings_corpus, embeddings_query)
print(sim)

tensor([[0.9308],
        [0.8779],
        [0.8395],
        [0.8445],
        [0.8517],
        [0.8650]], device='cuda:0')


In [20]:
# Retrieve the best-scoring corpus entry for the query (top_k=1).
# The printed result holds, per query, a list of {'corpus_id', 'score'} dicts.
hits = util.semantic_search(query_embeddings=embeddings_query, corpus_embeddings=embeddings_corpus, top_k=1)
print(hits)

[[{'corpus_id': 0, 'score': 0.9308120012283325}]]


## step-4: Using Langchain

In [145]:
# Wrap the same local checkpoint in LangChain's HuggingFaceEmbeddings.
# The target device is taken from the notebook-level `device` (CUDA when
# available, CPU otherwise) instead of a hard-coded "cuda", so this cell also
# runs on machines without a GPU.
# normalize_embeddings=True matches the encode settings used in the
# Sentence-Transformers section.
# NOTE: this rebinds `model` once more, now to the LangChain embeddings wrapper.
model = HuggingFaceEmbeddings(
    model_name=os.path.join(path_model, checkpoint),
    cache_folder=os.path.join(path_model, checkpoint),
    model_kwargs={"device": device.type},
    encode_kwargs={"normalize_embeddings": True},
)

In [146]:
# Print the LangChain wrapper's text representation (wrapped client and settings).
print(model)

client=SentenceTransformer(
  (0): Transformer({'max_seq_length': 512, 'do_lower_case': False}) with Transformer model: BertModel 
  (1): Pooling({'word_embedding_dimension': 768, 'pooling_mode_cls_token': False, 'pooling_mode_mean_tokens': True, 'pooling_mode_max_tokens': False, 'pooling_mode_mean_sqrt_len_tokens': False, 'pooling_mode_weightedmean_tokens': False, 'pooling_mode_lasttoken': False, 'include_prompt': True})
) model_name='C:/my_project/MyGit/Machine-Learning-Column\\model/sentence-transformers\\m3e-base' cache_folder='C:/my_project/MyGit/Machine-Learning-Column\\model/sentence-transformers\\m3e-base' model_kwargs={'device': 'cuda'} encode_kwargs={'normalize_embeddings': True} multi_process=False show_progress=False


## step-5: Using Specific