In [3]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForMaskedLM

2023-09-25 18:09:31.861656: I tensorflow/core/util/port.cc:110] oneDNN custom operations are on. You may see slightly different numerical results due to floating-point round-off errors from different computation orders. To turn them off, set the environment variable `TF_ENABLE_ONEDNN_OPTS=0`.
2023-09-25 18:09:31.891592: I tensorflow/core/platform/cpu_feature_guard.cc:182] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.
To enable the following instructions: AVX2 AVX_VNNI FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.


In [4]:
embedding_path = './counter-fitted-vectors.txt' 
embeddings = []
with open(embedding_path, 'r') as ifile:
    for line in ifile:
        embedding = [float(num) for num in line.strip().split()[1:]]
        embeddings.append(embedding)
        
embeddings = np.array(embeddings)
# print(embeddings.T.shape) # 300*65713

# calculate all embedding-vector's l2 norm
norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
# normalication
embeddings = np.asarray(embeddings / norm, "float32")
product = np.dot(embeddings, embeddings.T)
np.save(('cos_sim_counter_fitting.npy'), product)

In [5]:
idx2word = {}
word2idx = {}

print("Building vocab...")
with open('./counter-fitted-vectors.txt', 'r') as ifile:
    for line in ifile:
        word = line.split()[0]
        if word not in idx2word:
            idx2word[len(idx2word)] = word
            word2idx[word] = len(idx2word) - 1

Building vocab...


In [6]:
# print('Load pre-computed cosine similarity matrix from {}'.format('cos_sim_counter_fitting.npy'))
cos_sim = np.load('cos_sim_counter_fitting.npy')

In [7]:
def pick_most_similar_words_batch(src_words, sim_mat, idx2word, ret_count=10, threshold=0.):
    """
    Given a list of source words (their indices), a similarity matrix, and an index-to-word mapping,
    this function returns the top `ret_count` similar words for each source word, filtered by a given threshold.
    
    Parameters:
    - src_words: List of source word indices.
    - sim_mat: Similarity matrix of shape (vocab_size, vocab_size).
    - idx2word: A mapping from word index to actual word.
    - ret_count: Number of top similar words to return for each source word.
    - threshold: A similarity threshold to filter out words.
    
    Returns:
    - sim_words: A list of lists containing similar words for each source word.
    - sim_values: A list of lists containing similarity values for each word in sim_words.
    """
    
    # 对于每个src_word，找到其与其他所有单词的相似度排名（从高到低）
    sim_order = np.argsort(-sim_mat[src_words, :])[:, 1:1 + ret_count]
    
    sim_words, sim_values = [], []  # 初始化列表以保存结果

    # 遍历src_words的每个词
    for idx, src_word in enumerate(src_words):
        # 获取对应src_word的相似度值
        sim_value = sim_mat[src_word][sim_order[idx]]
        
        # 根据阈值筛选出大于等于threshold的相似度值
        mask = sim_value >= threshold
        
        # 使用mask获取单词和其相似度值
        sim_word, sim_value = sim_order[idx][mask], sim_value[mask]
        
        # 将单词索引转换为实际的单词
        sim_word = [idx2word[id] for id in sim_word]
        
        # 保存结果
        sim_words.append(sim_word)
        sim_values.append(sim_value)

    return sim_words, sim_values  # 返回相似单词及其相似度值

In [8]:
original_sentence = "Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me."

original_sentence_list = original_sentence.split()
perturb_idxes = [6, 7]

In [9]:
words_perturb = [(idx, original_sentence_list[idx]) for idx in perturb_idxes]
words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]
synonym_words, _ = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, 10, 0.5)

In [10]:
synonym_words

[['film',
  'filmmaking',
  'films',
  'movies',
  'cinema',
  'cinematographic',
  'cinematic',
  'cinematography',
  'kino',
  'flick'],
 ['evidently',
  'clearly',
  'manifestly',
  'naturally',
  'patently',
  'apparently',
  'plainly',
  'definitely',
  'surely',
  'undoubtedly']]

In [11]:
synonyms_all = []
for idx, word in words_perturb:
    if word in word2idx:
        synonyms = synonym_words.pop(0)
        if synonyms:
            synonyms_all.append((idx, synonyms))

In [12]:
synonyms_all[0][0], synonyms_all[0][1],synonyms_all[0][1][0] # synonyms_all[0][]

(6,
 ['film',
  'filmmaking',
  'films',
  'movies',
  'cinema',
  'cinematographic',
  'cinematic',
  'cinematography',
  'kino',
  'flick'],
 'film')

In [13]:
text_prime = original_sentence_list
len_text = len(original_sentence_list)

In [14]:
new_texts_list = []
for idx, synonyms in synonyms_all:
    new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):] for synonym in synonyms]
    new_texts_list.append(new_texts)

In [15]:
new_texts_list # 仅替换一个词的句子列表

[[['Whoever',
   'wrote',
   'the',
   'screenplay',
   'for',
   'this',
   'film',
   'obviously',
   'never',
   'consulted',
   'any',
   'books',
   'about',
   'Lucille',
   'Ball,',
   'especially',
   'her',
   'autobiography.',
   "I've",
   'never',
   'seen',
   'so',
   'many',
   'mistakes',
   'in',
   'a',
   'biopic,',
   'ranging',
   'from',
   'her',
   'early',
   'years',
   'in',
   'Celoron',
   'and',
   'Jamestown',
   'to',
   'her',
   'later',
   'years',
   'with',
   'Desi.',
   'I',
   'could',
   'write',
   'a',
   'whole',
   'list',
   'of',
   'factual',
   'errors,',
   'but',
   'it',
   'would',
   'go',
   'on',
   'for',
   'pages.',
   'In',
   'all,',
   'I',
   'believe',
   'that',
   'Lucille',
   'Ball',
   'is',
   'one',
   'of',
   'those',
   'inimitable',
   'people',
   'who',
   'simply',
   'cannot',
   'be',
   'portrayed',
   'by',
   'anyone',
   'other',
   'than',
   'themselves.',
   'If',
   'I',
   'were',
   'Lucie',
   'A

In [17]:
# 1. 加载预训练的BERT模型和tokenizer
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
model.eval()

def mask_sentence(sentence, mask_idx, tokenizer):
    """Inserts a [MASK] token at the specified index of the sentence."""
    tokens = tokenizer.tokenize(sentence)
    tokens[mask_idx] = '[MASK]'
    return tokenizer.convert_tokens_to_string(tokens)


Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.weight', 'cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [18]:
word_mask_probability_list = []

for index, idx in enumerate(perturb_idxes):
    msk_sentence = mask_sentence(original_sentence, mask_idx=idx, tokenizer=tokenizer)
    inputs = tokenizer(msk_sentence, return_tensors="pt")
    mask_idx = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0].item()

    # 3. 使用BERT预测[MASK]位置的概率分布
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits[0, mask_idx].softmax(dim=0)

    word_mask_probability = {}
    
    for word in synonyms_all[index][1]:
        word_id = tokenizer.convert_tokens_to_ids(word)
        word_probability = predictions[word_id].item()
        word_mask_probability[word] = word_probability

    total_value = sum(word_mask_probability.values())
    normalized_data = {k: v / total_value for k, v in word_mask_probability.items()}

    word_mask_probability_list.append(normalized_data)        


    

In [19]:
word_mask_probability_list

[{'film': 0.9986778823714723,
  'filmmaking': 5.832526251584305e-06,
  'films': 0.0006380815453449224,
  'movies': 0.0003449605651159915,
  'cinema': 4.65305612896654e-05,
  'cinematographic': 7.458551687577721e-07,
  'cinematic': 5.908666508536942e-06,
  'cinematography': 1.197741370419556e-06,
  'kino': 7.458551687577721e-07,
  'flick': 0.0002781143123090565},
 {'evidently': 0.0701647586899207,
  'clearly': 0.08903565516000746,
  'manifestly': 0.0003112191691114321,
  'naturally': 0.017603581241432697,
  'patently': 0.0003112191691114321,
  'apparently': 0.4477781690909761,
  'plainly': 0.0008638359003986049,
  'definitely': 0.19978818133903484,
  'surely': 0.11636039484706538,
  'undoubtedly': 0.05778298539294134}]

In [20]:
new_sentences = [[" ".join(tokens) for tokens in sublist] for sublist in new_texts_list]

In [21]:
from sentence_transformers import SentenceTransformer, util
model = SentenceTransformer('bert-base-nli-mean-tokens')


word_similarity_list = []
# 对单一句子进行编码
query_embedding = model.encode(original_sentence, convert_to_tensor=True)

for i in range(len(new_sentences)):
    # 对多个句子进行编码
    sentence_embeddings = model.encode(new_sentences[i], convert_to_tensor=True)
        
    # 计算相似度
    similarity_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)
    similarity_scores = similarity_scores.to('cpu').numpy().reshape(-1)
    
    word_similarity_probability = {}
    for idx, word in enumerate(synonyms_all[i][1]):
        word_similarity_probability[word] = similarity_scores[idx]

    total_value = sum(word_similarity_probability.values())
    normalized_data = {k: v / total_value for k, v in word_similarity_probability.items()}

    word_similarity_list.append(normalized_data)    

In [22]:
word_similarity_list

[{'film': 0.1000958600757268,
  'filmmaking': 0.10002038176865653,
  'films': 0.10007075434850948,
  'movies': 0.10010088480173511,
  'cinema': 0.09993933568324213,
  'cinematographic': 0.09992467327017511,
  'cinematic': 0.10005054205992259,
  'cinematography': 0.09975876183017135,
  'kino': 0.10003000752049906,
  'flick': 0.10000879864136183},
 {'evidently': 0.10003353012568778,
  'clearly': 0.10005780362924134,
  'manifestly': 0.0999790116816042,
  'naturally': 0.1000346456688592,
  'patently': 0.09995810270397941,
  'apparently': 0.09992767283383275,
  'plainly': 0.10004715526260503,
  'definitely': 0.1000134861414312,
  'surely': 0.09998646255551386,
  'undoubtedly': 0.09996212939724523}]

In [None]:
word_mask_probability_list

[{'film': 0.9986778823714723,
  'filmmaking': 5.832526251584305e-06,
  'films': 0.0006380815453449224,
  'movies': 0.0003449605651159915,
  'cinema': 4.65305612896654e-05,
  'cinematographic': 7.458551687577721e-07,
  'cinematic': 5.908666508536942e-06,
  'cinematography': 1.197741370419556e-06,
  'kino': 7.458551687577721e-07,
  'flick': 0.0002781143123090565},
 {'evidently': 0.0701647586899207,
  'clearly': 0.08903565516000746,
  'manifestly': 0.0003112191691114321,
  'naturally': 0.017603581241432697,
  'patently': 0.0003112191691114321,
  'apparently': 0.4477781690909761,
  'plainly': 0.0008638359003986049,
  'definitely': 0.19978818133903484,
  'surely': 0.11636039484706538,
  'undoubtedly': 0.05778298539294134}]

In [23]:
result = []
for dict1, dict2 in zip(word_mask_probability_list, word_similarity_list):
    # 使用字典推导将两个字典的对应值相乘
    multiplied_dict = {key: dict1[key] * dict2[key] for key in dict1}
    result.append(multiplied_dict)

In [24]:
import math 

for d in result:
    for key, value in d.items():
        d[key] = math.log(value)

In [25]:
result

[{'film': -2.302949943798226,
  'filmmaking': -14.354441628221856,
  'films': -9.658922268466581,
  'movies': -10.273657204999589,
  'cinema': -12.278593149461582,
  'cinematographic': -16.412073043487982,
  'cinematic': -14.341170185155304,
  'cinematography': -15.940073355208451,
  'kino': -16.41101946214994,
  'flick': -10.489975443885966},
 {'evidently': -4.959158954903232,
  'clearly': -4.720725593401568,
  'manifestly': -10.377808168361867,
  'naturally': -6.341891614318058,
  'patently': -10.37801732390325,
  'apparently': -3.1067659538204215,
  'plainly': -9.356241388807426,
  'definitely': -3.9129478076472592,
  'surely': -4.453783528640738,
  'undoubtedly': -5.154024787752433}]

In [39]:
from itertools import product

result_dict = {}

# 获取所有字典的items()组合
for combination in product(*(d.items() for d in result)):
    combined_key = tuple(k for k, v in combination)
    combined_value = sum(v for k, v in combination)
    result_dict[combined_key] = combined_value

result_dict

{('film', 'evidently'): -7.262108898701459,
 ('film', 'clearly'): -7.023675537199794,
 ('film', 'manifestly'): -12.680758112160092,
 ('film', 'naturally'): -8.644841558116283,
 ('film', 'patently'): -12.680967267701476,
 ('film', 'apparently'): -5.409715897618648,
 ('film', 'plainly'): -11.659191332605651,
 ('film', 'definitely'): -6.215897751445485,
 ('film', 'surely'): -6.756733472438964,
 ('film', 'undoubtedly'): -7.456974731550659,
 ('filmmaking', 'evidently'): -19.313600583125087,
 ('filmmaking', 'clearly'): -19.075167221623424,
 ('filmmaking', 'manifestly'): -24.732249796583723,
 ('filmmaking', 'naturally'): -20.696333242539914,
 ('filmmaking', 'patently'): -24.732458952125107,
 ('filmmaking', 'apparently'): -17.461207582042277,
 ('filmmaking', 'plainly'): -23.710683017029282,
 ('filmmaking', 'definitely'): -18.267389435869116,
 ('filmmaking', 'surely'): -18.808225156862594,
 ('filmmaking', 'undoubtedly'): -19.50846641597429,
 ('films', 'evidently'): -14.618081223369813,
 ('films

In [40]:
result_dict

{('film', 'evidently'): -7.262108898701459,
 ('film', 'clearly'): -7.023675537199794,
 ('film', 'manifestly'): -12.680758112160092,
 ('film', 'naturally'): -8.644841558116283,
 ('film', 'patently'): -12.680967267701476,
 ('film', 'apparently'): -5.409715897618648,
 ('film', 'plainly'): -11.659191332605651,
 ('film', 'definitely'): -6.215897751445485,
 ('film', 'surely'): -6.756733472438964,
 ('film', 'undoubtedly'): -7.456974731550659,
 ('filmmaking', 'evidently'): -19.313600583125087,
 ('filmmaking', 'clearly'): -19.075167221623424,
 ('filmmaking', 'manifestly'): -24.732249796583723,
 ('filmmaking', 'naturally'): -20.696333242539914,
 ('filmmaking', 'patently'): -24.732458952125107,
 ('filmmaking', 'apparently'): -17.461207582042277,
 ('filmmaking', 'plainly'): -23.710683017029282,
 ('filmmaking', 'definitely'): -18.267389435869116,
 ('filmmaking', 'surely'): -18.808225156862594,
 ('filmmaking', 'undoubtedly'): -19.50846641597429,
 ('films', 'evidently'): -14.618081223369813,
 ('films