In [None]:
# Smoke-test the evaluation entry point on a saved result directory.
# The function is imported under an alias so that Python's builtin `eval`
# is not shadowed for every cell that runs after this one.
from eval import eval as run_eval

RESULT_DIR = 'result/40/'  # result directory handed to the evaluation function

run_eval(RESULT_DIR)

In [None]:
# 测试prompt-bert
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Load tokenizer and weights from a local checkpoint directory (this is a
# path on disk, not the 'bert-base-uncased' hub id).
# NOTE: the absolute path is machine-specific; point it at your own checkpoint.
model_name = '/root/project/Prompt-BERT/result/unsup-bert_s42/'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Example sentences: identical except for one word.
sentence1 = "I love machine learning and natural language processing."
sentence2 = "I hate machine learning and natural language processing."


def cls_embedding(sentence):
    """Encode one sentence and return its first-token vector.

    Args:
        sentence: raw text; the tokenizer adds the special tokens.

    Returns:
        The last-layer hidden state at position 0 (the [CLS] slot), indexed as
        ``last_hidden_state[:, 0, :]`` so the leading batch dimension is kept.
    """
    inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)
    # Only the final layer is used; position 0 serves as the sentence vector.
    return outputs.last_hidden_state[:, 0, :]


sentence_embedding1 = cls_embedding(sentence1)
sentence_embedding2 = cls_embedding(sentence2)

# Cosine similarity between the two sentence vectors (default dim=1).
cos_sim = F.cosine_similarity(sentence_embedding1, sentence_embedding2)

print(cos_sim.item())


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Load the tokenizer and encoder weights used for the prompt experiment.
# NOTE(review): this hub id names the unsupervised SimCSE BERT checkpoint, not a
# Prompt-BERT checkpoint — confirm which weights this cell is meant to use.
model_name = 'princeton-nlp/unsup-simcse-bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Prompt template that carries a [MASK] placeholder
def create_prompt_with_mask(sentence):
    """Wrap ``sentence`` in a fixed prompt that ends with a [MASK] slot.

    Args:
        sentence: text to insert into the template.

    Returns:
        The string ``"The meaning of the sentence: <sentence> is [MASK]."``.
    """
    # The [MASK] position is fixed by this template; edit the template to move it.
    return "The meaning of the sentence: {} is [MASK].".format(sentence)

# Example sentences: identical except for one word.
sentence1 = "I love machine learning and natural language processing."
sentence2 = "I hate machine learning and natural language processing."


def mask_embedding(sentence):
    """Return the last-layer hidden state at the prompt's [MASK] position.

    The sentence is wrapped with ``create_prompt_with_mask``, tokenized, and
    run through ``model`` without gradient tracking.

    Args:
        sentence: raw text to embed.

    Returns:
        Tensor selected as ``last_hidden_state[0, mask_positions, :]``, i.e.
        one row per [MASK] position; exactly one row is guaranteed by the
        check below.

    Raises:
        ValueError: if the tokenized prompt does not contain exactly one
            [MASK] token (for example the slot was truncated away, or the
            sentence itself contains a literal "[MASK]").
    """
    prompt = create_prompt_with_mask(sentence)
    inputs = tokenizer(prompt, return_tensors='pt', padding=True, truncation=True)

    # Positions of the [MASK] token within the single encoded sequence.
    mask_positions = (inputs['input_ids'][0] == tokenizer.mask_token_id).nonzero(as_tuple=True)[0]
    if mask_positions.numel() != 1:
        raise ValueError(
            f"expected exactly one [MASK] token in the tokenized prompt, "
            f"found {mask_positions.numel()}: {prompt!r}"
        )

    # Inference only, so skip gradient tracking.
    with torch.no_grad():
        outputs = model(**inputs)

    # Indexing with a one-element index tensor keeps a leading dimension of 1,
    # so no squeeze/unsqueeze round trip is needed before cosine_similarity.
    return outputs.last_hidden_state[0, mask_positions, :]


mask_embedding1 = mask_embedding(sentence1)
mask_embedding2 = mask_embedding(sentence2)

# Cosine similarity between the two [MASK] vectors.
cos_sim = F.cosine_similarity(mask_embedding1, mask_embedding2, dim=1)

print("余弦相似度 (使用Prompt-BERT的[MASK] token 向量):", cos_sim.item())

In [None]:
from simcse import SimCSE

# Checkpoint location handed to the SimCSE wrapper.
# NOTE(review): the doubled slash suggests a run-directory name was left out
# of this path — confirm the intended checkpoint.
checkpoint_path = "result//"
model = SimCSE(checkpoint_path)

# One sentence per side; both are passed as single-element lists.
left_sentences = ['I love machine learning and natural language processing.']
right_sentences = ['I hate machine learning and natural language processing.']

similarities = model.similarity(left_sentences, right_sentences)
print(similarities)