In [None]:
# 测试prompt-bert
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Load the tokenizer and encoder from the checkpoint directory named below
# (this is the 'result/28/' path, not the stock 'bert-base-uncased' weights).
model_name = 'result/28/'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)
mask_token_id = tokenizer.mask_token_id

# sentence1 is the raw example; sentence2 is that same text wrapped in the
# prompt template, so it ends with a [MASK] slot.
sentence1 = "I love machine learning and natural language processing."
template = 'This sentence : "{sentence}" means [MASK].'
sentence2 = template.format(sentence=sentence1)

# Turn each text into token ids; the tokenizer adds [CLS] and [SEP] itself.
tokenizer_kwargs = dict(return_tensors='pt', padding=True, truncation=True)
inputs1 = tokenizer(sentence1, **tokenizer_kwargs)
inputs2 = tokenizer(sentence2, **tokenizer_kwargs)

# Forward passes only, so gradient tracking is switched off.
with torch.no_grad():
    outputs1 = model(**inputs1)
    outputs2 = model(**inputs2)

# Only the hidden states of the last layer are used below.
last_hidden_state1 = outputs1.last_hidden_state
last_hidden_state2 = outputs2.last_hidden_state

# Raw sentence: the vector at position 0 ([CLS]) stands in for the sentence.
sentence_embedding1 = last_hidden_state1[:, 0]
# Prompted sentence: the vector(s) at every position whose input id equals the
# mask id, giving one row per [MASK] occurrence.
is_mask_position = inputs2['input_ids'].eq(mask_token_id)
sentence2_embedding = last_hidden_state2[is_mask_position]

# Cosine similarity along the hidden dimension; .item() below requires that
# exactly one value comes out (i.e. a single [MASK] in the prompt).
cos_sim = F.cosine_similarity(sentence_embedding1, sentence2_embedding, dim=1)

print(cos_sim.item())


In [None]:
from transformers import BertTokenizer, BertModel
import torch
import torch.nn.functional as F

# Model identifier handed to from_pretrained for both tokenizer and encoder.
model_name = 'princeton-nlp/unsup-simcse-bert-base-uncased'
tokenizer = BertTokenizer.from_pretrained(model_name)
model = BertModel.from_pretrained(model_name)

# Example sentence shared by every variant below.
sentence = "I love machine learning and natural language processing."

# Derive a sentence representation in several different ways.
# Variant 1: no prompt at all, take the position-0 ([CLS]) vector directly.
inputs = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    outputs1 = model(**inputs)
    emb1 = outputs1.last_hidden_state[:, 0]
# emb1 is compared with itself here, so this print is only a reference value.
cos_sim = F.cosine_similarity(emb1, emb1, dim=1)
print("余弦相似度 (直接使用[CLS] token 向量):", cos_sim.item())

# Variant 2: feed the whole prompt-wrapped sentence through the encoder and
# read out both its position-0 vector and its mask-position vector.
template = 'This sentence : "{sentence}" means [MASK].'
prompt_sentenct = template.format(sentence=sentence).replace('[MASK]', tokenizer.mask_token)
inputs2 = tokenizer(prompt_sentenct, return_tensors='pt', padding=True, truncation=True)
with torch.no_grad():
    outputs2 = model(**inputs2)
    prompt_hidden = outputs2.last_hidden_state
    emb2 = prompt_hidden[:, 0]  # position 0 ([CLS]) of the prompted input
    emb3 = prompt_hidden[inputs2['input_ids'].eq(tokenizer.mask_token_id)]  # mask position
# Both prompted vectors are compared against the unprompted emb1.
cos_sim = F.cosine_similarity(emb1, emb2, dim=1)
cos_sim2 = F.cosine_similarity(emb1, emb3, dim=1)
print("余弦相似度 (使用Prompt-BERT的整个句子向量):", cos_sim.item())
print("余弦相似度 (使用Prompt-BERT的[MASK] token 向量):", cos_sim2.item())



In [None]:
# Variant 3: same prompt as before, but assembled piece by piece so that the
# template tokens and the sentence tokens can get different attention-mask
# weights. Relies on `tokenizer`, `model`, `sentence` and `emb1` from the
# previous cell.
template = 'This sentence : "{sentence}" means [MASK].'.replace('[MASK]', tokenizer.mask_token)
prefix, suffix = template.split("{sentence}")
prefix_input_ids = tokenizer(prefix, return_tensors='pt', padding=True, truncation=True)['input_ids'].view(-1)
suffix_inputs_ids = tokenizer(suffix, return_tensors='pt', padding=True, truncation=True)['input_ids'].view(-1)
sentence_inputs_ids = tokenizer(sentence, return_tensors='pt', padding=True, truncation=True)['input_ids'].view(-1)

# Every tokenizer call above wraps its text in [CLS] ... [SEP]. The joined
# sequence must contain exactly one such pair, otherwise it is not the same
# input as the prompt used in variant 2.
prefix_input_ids = prefix_input_ids[:-1]         # keep leading [CLS], drop trailing [SEP]
suffix_inputs_ids = suffix_inputs_ids[1:]        # drop leading [CLS], keep trailing [SEP]
sentence_inputs_ids = sentence_inputs_ids[1:-1]  # drop both: the sentence sits in the middle
input_ids = torch.cat([prefix_input_ids, sentence_inputs_ids, suffix_inputs_ids])

# Attention-mask weight for template tokens (prefix and suffix, which also
# carry [CLS], [MASK] and [SEP]); sentence tokens always get 1. All pieces are
# created as float tensors so the concatenation never mixes dtypes.
prompt_weight = 0
attention_mask = torch.cat([
    torch.full(prefix_input_ids.size(), float(prompt_weight), dtype=torch.float),
    torch.full(sentence_inputs_ids.size(), 1.0, dtype=torch.float),
    torch.full(suffix_inputs_ids.size(), float(prompt_weight), dtype=torch.float),
])

# Add the batch dimension the model expects.
inputs3 = {'input_ids': input_ids.view(1, -1), 'attention_mask': attention_mask.view(1, -1)}
with torch.no_grad():
    outputs3 = model(**inputs3)
    # Hidden state at the mask position; note this rebinds emb3 from the previous cell.
    emb3 = outputs3.last_hidden_state[inputs3['input_ids'] == tokenizer.mask_token_id]  # mask
cos_sim3 = F.cosine_similarity(emb1, emb3)
print("引入attention mask的的相似度", cos_sim3.item())

In [4]:
# 采样数据集
import json
import random
import os
import numpy as np

dataset_path = 'data/wiki1m_for_simcse_ner.json'

# The file is parsed as a single JSON document; the code below takes its
# length and indexes element 0, so it is expected to be a non-empty list.
with open(dataset_path, 'r', encoding='utf-8') as f:
    dataset = json.load(f)

print("数据集大小:", len(dataset))
print("数据集示例:", dataset[0])

# Draw a fixed-size subset. A dedicated, seeded generator makes the subset
# identical on every run without touching the global `random` state, and the
# size is capped so a dataset smaller than SAMPLE_SIZE does not raise.
SAMPLE_SIZE = 1000
SAMPLE_SEED = 42
dataset_sample = random.Random(SAMPLE_SEED).sample(dataset, min(SAMPLE_SIZE, len(dataset)))


数据集大小: 1000000
数据集示例: {'text': 'YMCA in South Australia', 'entities': [{'text': 'YMCA', 'start_position': 0, 'end_position': 4, 'label': 'ORG', 'confidence': 0.9874159693717957}, {'text': 'South Australia', 'start_position': 8, 'end_position': 23, 'label': 'LOC', 'confidence': 0.9811530113220215}]}


In [3]:
# Read the entity dictionary file and parse it as JSON into `entity_dict`.
dict_path = 'data/wiki1m_for_simcse_ner_entity_dict.json'
with open(dict_path, mode='r', encoding='utf-8') as dict_file:
    entity_dict = json.loads(dict_file.read())


In [10]:
import hashlib
def hash(text):
    """Return the MD5 digest of ``text`` as a hexadecimal string.

    ``text`` is encoded with ``str.encode()``'s default encoding before
    hashing. Note that this definition shadows Python's builtin ``hash``
    wherever it is in scope.
    """
    digest = hashlib.md5()
    digest.update(text.encode())
    return digest.hexdigest()

# Intended destination for the enriched sample; nothing in this cell writes to it.
output_path = 'data/wiki1k_with_entity_knowledge.json'
for item in dataset_sample:
    for entity in item['entities']:
        # Dictionary keys are the MD5 hex digest of the entity text plus '.json'.
        entity_hash = f"{hash(entity['text'])}.json"
        if entity_hash not in entity_dict:
            continue
        # Attach the looked-up record to the entity in place, then echo it.
        entity['knowledge'] = entity_dict[entity_hash]
        print(entity['knowledge'])
    

[{'id': 'Q167402', 'title': 'Q167402', 'pageid': 167931, 'concepturi': 'http://www.wikidata.org/entity/Q167402', 'repository': 'wikidata', 'url': '//www.wikidata.org/wiki/Q167402', 'display': {'label': {'value': 'Australian Plant Name Index', 'language': 'en'}, 'description': {'value': 'online database', 'language': 'en'}}, 'label': 'Australian Plant Name Index', 'description': 'online database', 'match': {'type': 'label', 'language': 'en', 'text': 'Australian Plant Name Index'}}, {'id': 'Q50776', 'title': 'Q50776', 'pageid': 52849, 'concepturi': 'http://www.wikidata.org/entity/Q50776', 'repository': 'wikidata', 'url': '//www.wikidata.org/wiki/Q50776', 'display': {'label': {'value': 'Australian rules football', 'language': 'en'}, 'description': {'value': 'contact sport invented in Melbourne', 'language': 'en'}}, 'label': 'Australian rules football', 'description': 'contact sport invented in Melbourne', 'match': {'type': 'label', 'language': 'en', 'text': 'Australian rules football'}}, 

In [None]:
# Check whether apex is available: prints the value returned by
# transformers' is_apex_available().
from transformers.utils import is_apex_available
print(is_apex_available())

In [1]:
# Smoke-test the evaluation helper.
# NOTE: the imported name `eval` shadows Python's builtin `eval` from here on.
from utils.auto_eval import eval

# Evaluate the checkpoint at this path with pooler='avg'.
# NOTE(review): what `pooler` selects is defined in utils.auto_eval, not shown here.
eval('model/unsup-PromptBERT-baseline',pooler='avg')

  return np.dot(u, v) / (np.linalg.norm(u) * np.linalg.norm(v))
  results[dataset] = {'pearson': pearsonr(sys_scores, gs_scores),
  'spearman': spearmanr(sys_scores, gs_scores),
  all_pearson = pearsonr(all_sys_scores, all_gs_scores)
  all_spearman = spearmanr(all_sys_scores, all_gs_scores)


KeyboardInterrupt: 