In [1]:
# Requires transformers>=4.51.0

import torch
import torch.nn.functional as F

from torch import Tensor
from transformers import AutoTokenizer, AutoModel

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
def last_token_pool(last_hidden_states: Tensor,
                 attention_mask: Tensor) -> Tensor:
    """Pick one hidden-state vector per sequence, taken at its last attended position.

    Args:
        last_hidden_states: Tensor indexed as [row, position, ...].
        attention_mask: Tensor indexed as [row, position]; the row sums are
            used as sequence lengths.

    Returns:
        ``last_hidden_states[:, -1]`` when the final mask column sums to the
        number of rows (i.e. the batch is left-padded or unpadded); otherwise,
        for each row, the hidden state at index ``attention_mask.sum(dim=1) - 1``.
    """
    num_rows = attention_mask.shape[0]
    final_column_total = attention_mask[:, -1].sum()

    # Every row attends to its final position: the last column is the answer.
    if final_column_total == num_rows:
        return last_hidden_states[:, -1]

    # Otherwise index each row at (number of attended tokens - 1).
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the two-line prompt: an ``Instruct:`` line followed by a ``Query:`` line.

    Args:
        task_description: Text placed after ``Instruct: ``.
        query: Text placed directly after ``Query:`` (no separating space).

    Returns:
        The two lines joined by a single newline.
    """
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query:{query}'
    return '\n'.join((instruct_line, query_line))

In [3]:
import torch
import torch.nn.functional as F
from transformers import AutoTokenizer, AutoModel
from torch import Tensor

def last_token_pool(last_hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
    """Return the hidden state of the last attended token of each sequence as its embedding.

    Args:
        last_hidden_states: Tensor indexed as [row, position, ...].
        attention_mask: Tensor indexed as [row, position]; the row sums are
            used as sequence lengths.

    Returns:
        ``last_hidden_states[:, -1]`` when the final mask column sums to the
        number of rows; otherwise, for each row, the hidden state at index
        ``attention_mask.sum(dim=1) - 1``.
    """
    num_rows = attention_mask.shape[0]
    final_column_total = attention_mask[:, -1].sum()

    # All rows attend to the final position, so that column holds every last token.
    if final_column_total == num_rows:
        return last_hidden_states[:, -1]

    # Use each row's attended-token count to locate its last attended token.
    last_positions = attention_mask.sum(dim=1) - 1
    row_index = torch.arange(last_hidden_states.shape[0], device=last_hidden_states.device)
    return last_hidden_states[row_index, last_positions]


def get_detailed_instruct(task_description: str, query: str) -> str:
    """Build the instruction-prefixed query text.

    Args:
        task_description: Text placed after ``Instruct: ``.
        query: Text placed directly after ``Query:`` (no separating space).

    Returns:
        The ``Instruct:`` line and the ``Query:`` line joined by a newline.
    """
    instruct_line = f'Instruct: {task_description}'
    query_line = f'Query:{query}'
    return '\n'.join((instruct_line, query_line))


# Device report - CUDA when available, otherwise CPU.
# `device` is only used for the printout below; actual weight placement is
# delegated to device_map="auto" when the model is loaded.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(f"使用设备: {device}")
if device.type == 'cuda':
    print(f"GPU名称: {torch.cuda.get_device_name(0)}")
    print(f"CUDA版本: {torch.version.cuda}")


# Task description and text data
task = 'Given a web search query, retrieve relevant passages that answer the query'

# Queries are wrapped with the task instruction
queries = [
    get_detailed_instruct(task, 'What is the capital of China?'),
    get_detailed_instruct(task, 'Explain gravity')
]

# Documents are embedded as-is (no instruction)
documents = [
    "The capital of China is Beijing.",
    "Gravity is a force that attracts two bodies towards each other. It gives weight to physical objects and is responsible for the movement of planets around the sun."
]

# Queries first, then documents; the score computation below relies on this order
input_texts = queries + documents


# Load the tokenizer and the model
model_path = '/root/lanyun-fs/models/Qwen3-Embedding-8B'  # local model directory
tokenizer = AutoTokenizer.from_pretrained(model_path, padding_side='left')

# device_map="auto" already places the weights on the available device(s).
# Do not call model.to(...) afterwards: it is redundant on a single device and
# is not supported once the model has been sharded or offloaded.
model = AutoModel.from_pretrained(
    model_path,
    torch_dtype=torch.float16,  # half precision to reduce GPU memory use
    device_map="auto"
)
model.eval()  # inference mode (disables dropout etc.)


# Tokenize and compute embeddings
max_length = 8192  # inputs longer than this many tokens are truncated

batch_dict = tokenizer(
    input_texts,
    padding=True,
    truncation=True,
    max_length=max_length,
    return_tensors="pt"
)

# Send the inputs to wherever the model was actually placed
batch_dict = batch_dict.to(model.device)

# No gradients are needed for inference
with torch.no_grad():
    outputs = model(**batch_dict)
    # Pool to one vector per input text
    embeddings = last_token_pool(outputs.last_hidden_state, batch_dict['attention_mask'])
    # L2-normalize so the dot product below is a cosine similarity
    embeddings = F.normalize(embeddings, p=2, dim=1)


# Query-vs-document similarity matrix: rows are queries, columns are documents.
# Split on len(queries) instead of a hard-coded 2 so the lists can be edited freely.
num_queries = len(queries)
scores = embeddings[:num_queries] @ embeddings[num_queries:].T

# Print the result
print("查询与文档的相似度分数:")
print(scores.tolist())

使用设备: cuda
GPU名称: NVIDIA GeForce RTX 3090
CUDA版本: 12.4


Loading checkpoint shards: 100%|██████████| 4/4 [03:34<00:00, 53.50s/it]


查询与文档的相似度分数:
[[0.75, 0.07525634765625], [0.088134765625, 0.63232421875]]


In [4]:
# Display the repr of the loaded model (its module tree) as this cell's output.
model

Qwen3Model(
  (embed_tokens): Embedding(151665, 4096)
  (layers): ModuleList(
    (0-35): 36 x Qwen3DecoderLayer(
      (self_attn): Qwen3Attention(
        (q_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (k_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (v_proj): Linear(in_features=4096, out_features=1024, bias=False)
        (o_proj): Linear(in_features=4096, out_features=4096, bias=False)
        (q_norm): Qwen3RMSNorm((128,), eps=1e-06)
        (k_norm): Qwen3RMSNorm((128,), eps=1e-06)
      )
      (mlp): Qwen3MLP(
        (gate_proj): Linear(in_features=4096, out_features=12288, bias=False)
        (up_proj): Linear(in_features=4096, out_features=12288, bias=False)
        (down_proj): Linear(in_features=12288, out_features=4096, bias=False)
        (act_fn): SiLU()
      )
      (input_layernorm): Qwen3RMSNorm((4096,), eps=1e-06)
      (post_attention_layernorm): Qwen3RMSNorm((4096,), eps=1e-06)
    )
  )
  (norm): Qwen3RMSNorm((