In [1]:
import numpy as np

# wordvector's embedding-path
embedding_path = './counter-fitted-vectors.txt' 


embeddings = []

with open(embedding_path, 'r') as ifile:
    for line in ifile:
        embedding = [float(num) for num in line.strip().split()[1:]]
        embeddings.append(embedding)
        
embeddings = np.array(embeddings)
print(embeddings.T.shape)

(300, 65713)


In [2]:
# calculate all embedding-vector's l2 norm
norm = np.linalg.norm(embeddings, axis=1, keepdims=True)
# normalication
embeddings = np.asarray(embeddings / norm, "float32")
product = np.dot(embeddings, embeddings.T)
np.save(('cos_sim_counter_fitting.npy'), product)

In [3]:
idx2word = {}
word2idx = {}

print("Building vocab...")
with open('./counter-fitted-vectors.txt', 'r') as ifile:
    for line in ifile:
        word = line.split()[0]
        if word not in idx2word:
            idx2word[len(idx2word)] = word
            word2idx[word] = len(idx2word) - 1

Building vocab...


In [5]:
print('Load pre-computed cosine similarity matrix from {}'.format('cos_sim_counter_fitting.npy'))
cos_sim = np.load('cos_sim_counter_fitting.npy')

Load pre-computed cosine similarity matrix from cos_sim_counter_fitting.npy


In [7]:
def pick_most_similar_words_batch(src_words, sim_mat, idx2word, ret_count=10, threshold=0.):
    """
    Given a list of source words (their indices), a similarity matrix, and an index-to-word mapping,
    this function returns the top `ret_count` similar words for each source word, filtered by a given threshold.
    
    Parameters:
    - src_words: List of source word indices.
    - sim_mat: Similarity matrix of shape (vocab_size, vocab_size).
    - idx2word: A mapping from word index to actual word.
    - ret_count: Number of top similar words to return for each source word.
    - threshold: A similarity threshold to filter out words.
    
    Returns:
    - sim_words: A list of lists containing similar words for each source word.
    - sim_values: A list of lists containing similarity values for each word in sim_words.
    """
    
    # 对于每个src_word，找到其与其他所有单词的相似度排名（从高到低）
    sim_order = np.argsort(-sim_mat[src_words, :])[:, 1:1 + ret_count]
    
    sim_words, sim_values = [], []  # 初始化列表以保存结果

    # 遍历src_words的每个词
    for idx, src_word in enumerate(src_words):
        # 获取对应src_word的相似度值
        sim_value = sim_mat[src_word][sim_order[idx]]
        
        # 根据阈值筛选出大于等于threshold的相似度值
        mask = sim_value >= threshold
        
        # 使用mask获取单词和其相似度值
        sim_word, sim_value = sim_order[idx][mask], sim_value[mask]
        
        # 将单词索引转换为实际的单词
        sim_word = [idx2word[id] for id in sim_word]
        
        # 保存结果
        sim_words.append(sim_word)
        sim_values.append(sim_value)

    return sim_words, sim_values  # 返回相似单词及其相似度值

In [31]:
text_ls = "Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me.".split()
perturb_idxes = [7]

In [32]:
words_perturb = [(idx, text_ls[idx]) for idx in perturb_idxes]

In [33]:
words_perturb

[(7, 'obviously')]

In [34]:
words_perturb_idx = [word2idx[word] for idx, word in words_perturb if word in word2idx]

In [35]:
synonym_words, _ = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, 10, 0.5)

In [36]:
synonym_words

[['evidently',
  'clearly',
  'manifestly',
  'naturally',
  'patently',
  'apparently',
  'plainly',
  'definitely',
  'surely',
  'undoubtedly']]

In [37]:
synonyms_all = []
for idx, word in words_perturb:
    if word in word2idx:
        synonyms = synonym_words.pop(0)
        if synonyms:
            synonyms_all.append((idx, synonyms))

In [38]:
synonyms_all

[(7,
  ['evidently',
   'clearly',
   'manifestly',
   'naturally',
   'patently',
   'apparently',
   'plainly',
   'definitely',
   'surely',
   'undoubtedly'])]

In [39]:
text_prime = text_ls
len_text = len(text_ls)

In [40]:
for idx, synonyms in synonyms_all:
    new_texts = [text_prime[:idx] + [synonym] + text_prime[min(idx + 1, len_text):] for synonym in synonyms]
    print(new_texts)

[['Whoever', 'wrote', 'the', 'screenplay', 'for', 'this', 'movie', 'evidently', 'never', 'consulted', 'any', 'books', 'about', 'Lucille', 'Ball,', 'especially', 'her', 'autobiography.', "I've", 'never', 'seen', 'so', 'many', 'mistakes', 'in', 'a', 'biopic,', 'ranging', 'from', 'her', 'early', 'years', 'in', 'Celoron', 'and', 'Jamestown', 'to', 'her', 'later', 'years', 'with', 'Desi.', 'I', 'could', 'write', 'a', 'whole', 'list', 'of', 'factual', 'errors,', 'but', 'it', 'would', 'go', 'on', 'for', 'pages.', 'In', 'all,', 'I', 'believe', 'that', 'Lucille', 'Ball', 'is', 'one', 'of', 'those', 'inimitable', 'people', 'who', 'simply', 'cannot', 'be', 'portrayed', 'by', 'anyone', 'other', 'than', 'themselves.', 'If', 'I', 'were', 'Lucie', 'Arnaz', 'and', 'Desi,', 'Jr.,', 'I', 'would', 'be', 'irate', 'at', 'how', 'many', 'mistakes', 'were', 'made', 'in', 'this', 'film.', 'The', 'filmmakers', 'tried', 'hard,', 'but', 'the', 'movie', 'seems', 'awfully', 'sloppy', 'to', 'me.'], ['Whoever', 'wrot

In [41]:
import torch
from transformers import BertTokenizer, BertForMaskedLM

In [42]:
# 1. 加载预训练的BERT模型和tokenizer
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
model.eval()

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight', 'bert.pooler.dense.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


BertForMaskedLM(
  (bert): BertModel(
    (embeddings): BertEmbeddings(
      (word_embeddings): Embedding(30522, 768, padding_idx=0)
      (position_embeddings): Embedding(512, 768)
      (token_type_embeddings): Embedding(2, 768)
      (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_affine=True)
      (dropout): Dropout(p=0.1, inplace=False)
    )
    (encoder): BertEncoder(
      (layer): ModuleList(
        (0-11): 12 x BertLayer(
          (attention): BertAttention(
            (self): BertSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.1, inplace=False)
            )
            (output): BertSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (LayerNorm): LayerNorm((768,), eps=1e-12, elementwise_a

In [43]:
def mask_sentence(sentence, mask_idx, tokenizer):
    """Inserts a [MASK] token at the specified index of the sentence."""
    tokens = tokenizer.tokenize(sentence)
    tokens[mask_idx] = '[MASK]'
    return tokenizer.convert_tokens_to_string(tokens)

# 2. 标记化句子并获得[MASK]的位置
sentence = "Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me."

new_sentence = mask_sentence(sentence, mask_idx=7, tokenizer=tokenizer)

In [44]:

inputs = tokenizer(new_sentence, return_tensors="pt")
mask_idx = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0].item()

# 3. 使用BERT预测[MASK]位置的概率分布
with torch.no_grad():
    outputs = model(**inputs)
    predictions = outputs.logits[0, mask_idx].softmax(dim=0)

In [45]:
word_mask_probability = {}

for idx, word_list in synonyms_all:
    for word in word_list:
        word_id = tokenizer.convert_tokens_to_ids(word)
        word_probability = predictions[word_id].item()
        word_mask_probability[word] = word_probability
        print(f"Probability of '{word}' being the masked token: {word_probability:.10f}")

Probability of 'evidently' being the masked token: 0.0001021026
Probability of 'clearly' being the masked token: 0.0001295633
Probability of 'manifestly' being the masked token: 0.0000004529
Probability of 'naturally' being the masked token: 0.0000256165
Probability of 'patently' being the masked token: 0.0000004529
Probability of 'apparently' being the masked token: 0.0006515997
Probability of 'plainly' being the masked token: 0.0000012570
Probability of 'definitely' being the masked token: 0.0002907286
Probability of 'surely' being the masked token: 0.0001693258
Probability of 'undoubtedly' being the masked token: 0.0000840849


In [46]:
# 1. 计算字典中所有值的和

total_value = sum(word_mask_probability.values())
# 2. 使用这个和来对每个值进行归一化
normalized_data = {k: v / total_value for k, v in word_mask_probability.items()}

normalized_data

{'evidently': 0.0701647586899207,
 'clearly': 0.08903565516000746,
 'manifestly': 0.0003112191691114321,
 'naturally': 0.017603581241432697,
 'patently': 0.0003112191691114321,
 'apparently': 0.4477781690909761,
 'plainly': 0.0008638359003986049,
 'definitely': 0.19978818133903484,
 'surely': 0.11636039484706538,
 'undoubtedly': 0.05778298539294134}

In [47]:
sorted_data = dict(sorted(normalized_data.items(), key=lambda item: item[1], reverse=True))


In [48]:
sorted_data

{'apparently': 0.4477781690909761,
 'definitely': 0.19978818133903484,
 'surely': 0.11636039484706538,
 'clearly': 0.08903565516000746,
 'evidently': 0.0701647586899207,
 'undoubtedly': 0.05778298539294134,
 'naturally': 0.017603581241432697,
 'plainly': 0.0008638359003986049,
 'manifestly': 0.0003112191691114321,
 'patently': 0.0003112191691114321}

In [49]:
import bert_score
def replace_word_at_index(sentence, index, replacement_dict):
    """Replace the word at the specified index if it exists in the replacement_dict."""
    tokens = sentence.split()  # Split sentence into tokens
    # if tokens[index] in replacement_dict:
    #     tokens[index] = replacement_dict[tokens[index]]  # Replace the word at the specified index
    word_replace = []

    for word in replacement_dict.keys():
        tokens[index] = word
        word_replace.append(" ".join(tokens))
        
    return word_replace# Join tokens back into a sentence

def compute_bertscore(candidates, references, lang="en", verbose=True, model_type="bert-base-uncased"):

    candidates = [candidates]
    P, R, F1 = bert_score.score(candidates, references, lang=lang, verbose=verbose, model_type=model_type)
    
    return P, R, F1
# Example usage:
candidates = "Whoever wrote the screenplay for this movie obviously never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me."
references = replace_word_at_index(sentence=candidates, index=7, replacement_dict=sorted_data)
references

["Whoever wrote the screenplay for this movie apparently never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe that Lucille Ball is one of those inimitable people who simply cannot be portrayed by anyone other than themselves. If I were Lucie Arnaz and Desi, Jr., I would be irate at how many mistakes were made in this film. The filmmakers tried hard, but the movie seems awfully sloppy to me.",
 "Whoever wrote the screenplay for this movie definitely never consulted any books about Lucille Ball, especially her autobiography. I've never seen so many mistakes in a biopic, ranging from her early years in Celoron and Jamestown to her later years with Desi. I could write a whole list of factual errors, but it would go on for pages. In all, I believe 

In [50]:
bertscore = {}
for idx, reference in enumerate(references):
    reference = [reference]

    P, R, F1 = compute_bertscore(candidates, reference)
    bertscore[list(sorted_data.keys())[idx]] = F1


calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.84 seconds, 1.19 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 64.60 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 64.42 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 55.51 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 65.38 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 58.52 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 65.48 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 64.01 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 63.39 sentences/sec
calculating scores...
computing bert embedding.


  0%|          | 0/1 [00:00<?, ?it/s]

computing greedy matching.


  0%|          | 0/1 [00:00<?, ?it/s]

done in 0.02 seconds, 62.89 sentences/sec


In [51]:
# 1. 计算字典中所有值的和

total_value = sum(bertscore.values())

In [52]:
# 2. 使用这个和来对每个值进行归一化
normalized_data = {k: v / total_value for k, v in bertscore.items()}

normalized_data

{'apparently': tensor([0.1001]),
 'definitely': tensor([0.1001]),
 'surely': tensor([0.1001]),
 'clearly': tensor([0.1002]),
 'evidently': tensor([0.1001]),
 'undoubtedly': tensor([0.1001]),
 'naturally': tensor([0.1000]),
 'plainly': tensor([0.1000]),
 'manifestly': tensor([0.0997]),
 'patently': tensor([0.0997])}