In [26]:
import numpy as np
import torch
from transformers import BertTokenizer, BertForSequenceClassification, BertForMaskedLM
import re


In [31]:
from sentence_transformers import SentenceTransformer, util


In [2]:
# 加载同义词集
cos_sim = np.load('cos_sim_counter_fitting.npy')

In [3]:
# 选择最相似的同义词
def pick_most_similar_words_batch(src_words, sim_mat, idx2word, ret_count=10, threshold=0.):
    """
    Given a list of source words (their indices), a similarity matrix, and an index-to-word mapping,
    this function returns the top `ret_count` similar words for each source word, filtered by a given threshold.
    
    Parameters:
    - src_words: List of source word indices.
    - sim_mat: Similarity matrix of shape (vocab_size, vocab_size).
    - idx2word: A mapping from word index to actual word.
    - ret_count: Number of top similar words to return for each source word.
    - threshold: A similarity threshold to filter out words.
    
    Returns:
    - sim_words: A list of lists containing similar words for each source word.
    - sim_values: A list of lists containing similarity values for each word in sim_words.
    """
    
    # 对于每个src_word，找到其与其他所有单词的相似度排名（从高到低）
    sim_order = np.argsort(-sim_mat[src_words, :])[:, 1:1 + ret_count]
    
    sim_words, sim_values = [], []  # 初始化列表以保存结果

    # 遍历src_words的每个词
    for idx, src_word in enumerate(src_words):
        # 获取对应src_word的相似度值
        sim_value = sim_mat[src_word][sim_order[idx]]
        
        # 根据阈值筛选出大于等于threshold的相似度值
        mask = sim_value >= threshold
        
        # 使用mask获取单词和其相似度值
        sim_word, sim_value = sim_order[idx][mask], sim_value[mask]
        
        # 将单词索引转换为实际的单词
        sim_word = [idx2word[id] for id in sim_word]
        
        # 保存结果
        sim_words.append(sim_word)
        sim_values.append(sim_value)

    return sim_words, sim_values  # 返回相似单词及其相似度值

In [4]:
# 计算影响最大的词
def influential_tokens(sentence, model, tokenizer, k=5):
    # 确保模型在评估模式
    model.eval()

    # 获取原始logit输出
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True)
    with torch.no_grad():
        original_logits = model(**inputs).logits

    # 为每个token计算logit差异
    token_ids = inputs["input_ids"][0].tolist()  # 获取token ids
    tokens = [tokenizer.decode([token_id]) for token_id in token_ids]  # 转换为tokens
    diffs = []

    for i, token_id in enumerate(token_ids):
        # 如果是[CLS], [SEP]或[PAD]，则跳过
        if token_id in [tokenizer.cls_token_id, tokenizer.sep_token_id, tokenizer.pad_token_id]:
            continue

        # 将当前token替换为[MASK]
        masked_input_ids = inputs["input_ids"].clone()
        masked_input_ids[0][i] = tokenizer.mask_token_id

        # 获取mask后的logits
        with torch.no_grad():
            masked_logits = model(input_ids=masked_input_ids, attention_mask=inputs["attention_mask"]).logits

        # 计算logit差异
        diff = torch.abs(original_logits - masked_logits).sum().item()
        diffs.append((tokens[i], i, diff))  # 保存token, index和差异值

    # 根据差异大小排序tokens
    sorted_tokens = sorted(diffs, key=lambda x: x[2], reverse=True)

    # 返回前k个token及其索引
    return [(token_info[0], token_info[1]) for token_info in sorted_tokens[:k]]

In [6]:
# 正则表达式去掉特殊字符
def extract_words(sentence):
    words = re.findall(r'\b\w+\b', sentence)
    return ' '.join(words)

In [7]:
# 加载模型

# 窃取的模型
model_path = '/home/ubuntu/zhc/work/adversarial_output'
tokenizer = BertTokenizer.from_pretrained(model_path)
model = BertForSequenceClassification.from_pretrained(model_path)

sentence = "This is by far the worst customer service place in the country. Good selection but sometimes I'll rather pay more elsewhere to avoid the employees there"
sentence = extract_words(sentence)
# print(len(sentence))
k = 10

top_k_tokens_and_indices = influential_tokens(sentence, model, tokenizer, k)

In [8]:
# topk个影响最大的词
top_k_tokens_and_indices

[('worst', 6),
 ('customer', 7),
 ('place', 9),
 ('service', 8),
 ('by', 3),
 ('the', 25),
 ('avoid', 24),
 ('this', 1),
 ('to', 23),
 ('is', 2)]

In [9]:
def get_token_from_encoded(sentence, index, tokenizer):
    token_ids = tokenizer.encode(sentence, add_special_tokens=True)
    token = tokenizer.decode([token_ids[index]])
    return token


perturb_idxes = []
words_perturb = []
for i in range(k):
    perturb_idxes.append(top_k_tokens_and_indices[i][1])
    words_perturb.append(top_k_tokens_and_indices[i][0])


In [12]:
perturb_idxes, words_perturb #得到替换词下标和词

([6, 7, 9, 8, 3, 25, 24, 1, 23, 2],
 ['worst',
  'customer',
  'place',
  'service',
  'by',
  'the',
  'avoid',
  'this',
  'to',
  'is'])

In [15]:
idx2word = {}
word2idx = {}

print("Building vocab...")
with open('./counter-fitted-vectors.txt', 'r') as ifile:
    for line in ifile:
        word = line.split()[0]
        if word not in idx2word:
            idx2word[len(idx2word)] = word
            word2idx[word] = len(idx2word) - 1

Building vocab...


In [16]:
words_perturb_idx = [word2idx[word] for word in words_perturb if word in word2idx]

In [18]:
words_perturb_idx # 同义词典的位置

[29711, 43347, 45368, 361, 52770, 20088, 55632, 51532, 33735, 24167]

In [19]:
words_perturb = [(idx, idx2word[idx]) for idx in words_perturb_idx]
words_perturb

[(29711, 'worst'),
 (43347, 'customer'),
 (45368, 'place'),
 (361, 'service'),
 (52770, 'by'),
 (20088, 'the'),
 (55632, 'avoid'),
 (51532, 'this'),
 (33735, 'to'),
 (24167, 'is')]

In [20]:
synonym_words, _ = pick_most_similar_words_batch(words_perturb_idx, cos_sim, idx2word, 20, 0.5)

In [22]:
# 得到top20同义词
synonym_words

[['lousiest',
  'worse',
  'meanest',
  'pire',
  'gravest',
  'hardest',
  'trickiest',
  'roughest',
  'harshest',
  'toughest',
  'biggest',
  'grandest',
  'strongest',
  'greatest',
  'hugest',
  'strictest',
  'shittiest',
  'largest',
  'ugliest',
  'finest'],
 ['client',
  'clients',
  'customers',
  'consumers',
  'clientele',
  'users',
  'shoppers',
  'buyers',
  'user',
  'patrons',
  'guests',
  'subscribers',
  'subscriber',
  'consumer',
  'beneficiaries',
  'beneficiary',
  'recipients',
  'diners',
  'receivers',
  'recipient'],
 ['placing',
  'mise',
  'stead',
  'platz',
  'site',
  'putting',
  'venue',
  'situ',
  'location',
  'placement',
  'locations',
  'spot',
  'plaza',
  'placements',
  'places',
  'put',
  'positioning',
  'scene',
  'pleasure',
  'loco'],
 ['servicing', 'services', 'serving', 'serve', 'serves', 'department'],
 ['per',
  'para',
  'paras',
  'par',
  'here',
  'via',
  'at',
  'pair',
  'through',
  'for',
  'torque',
  'under',
  'doublet'

In [24]:
synonym_words[0]

['lousiest',
 'worse',
 'meanest',
 'pire',
 'gravest',
 'hardest',
 'trickiest',
 'roughest',
 'harshest',
 'toughest',
 'biggest',
 'grandest',
 'strongest',
 'greatest',
 'hugest',
 'strictest',
 'shittiest',
 'largest',
 'ugliest',
 'finest']

In [27]:
# 1. 加载预训练的BERT模型和tokenizer
model_name = "bert-base-uncased"
model = BertForMaskedLM.from_pretrained(model_name)
tokenizer = BertTokenizer.from_pretrained(model_name)
model.eval()

def mask_sentence(sentence, mask_idx, tokenizer):
    """Inserts a [MASK] token at the specified index of the sentence."""
    tokens = tokenizer.tokenize(sentence)
    tokens[mask_idx] = '[MASK]'
    return tokenizer.convert_tokens_to_string(tokens)

Some weights of the model checkpoint at bert-base-uncased were not used when initializing BertForMaskedLM: ['cls.seq_relationship.bias', 'bert.pooler.dense.weight', 'bert.pooler.dense.bias', 'cls.seq_relationship.weight']
- This IS expected if you are initializing BertForMaskedLM from the checkpoint of a model trained on another task or with another architecture (e.g. initializing a BertForSequenceClassification model from a BertForPreTraining model).
- This IS NOT expected if you are initializing BertForMaskedLM from the checkpoint of a model that you expect to be exactly identical (initializing a BertForSequenceClassification model from a BertForSequenceClassification model).


In [28]:
word_mask_probability_list = []

for index, idx in enumerate(perturb_idxes):    
    msk_sentence = mask_sentence(sentence, mask_idx=idx-1, tokenizer=tokenizer)

    print(idx,msk_sentence)
    
    inputs = tokenizer(msk_sentence, return_tensors="pt")
    mask_idx = torch.where(inputs["input_ids"][0] == tokenizer.mask_token_id)[0].item()
    # 3. 使用BERT预测[MASK]位置的概率分布
    with torch.no_grad():
        outputs = model(**inputs)
        predictions = outputs.logits[0, mask_idx].softmax(dim=0)

    word_mask_probability = {}
    
    for word in synonym_words[index]:
        word_id = tokenizer.convert_tokens_to_ids(word)
        word_probability = predictions[word_id].item()
        word_mask_probability[word] = word_probability
        # print(word_mask_probability)
    # print("---")
    
    total_value = sum(word_mask_probability.values())
    normalized_data = {k: v / total_value for k, v in word_mask_probability.items()}

    word_mask_probability_list.append(normalized_data)      

6 this is by far the [MASK] customer service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
7 this is by far the worst [MASK] service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
9 this is by far the worst customer service [MASK] in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
8 this is by far the worst customer [MASK] place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
3 this is [MASK] far the worst customer service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
25 this is by far the worst customer service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid [MASK] employees there
24 this is by far the worst customer service place in the country goo

In [29]:
word_mask_probability_list

[{'lousiest': 3.36199143199791e-06,
  'worse': 0.00029483650906247347,
  'meanest': 3.36199143199791e-06,
  'pire': 3.36199143199791e-06,
  'gravest': 3.36199143199791e-06,
  'hardest': 0.007192660904432566,
  'trickiest': 3.36199143199791e-06,
  'roughest': 3.36199143199791e-06,
  'harshest': 3.36199143199791e-06,
  'toughest': 3.36199143199791e-06,
  'biggest': 0.36415523862722426,
  'grandest': 3.36199143199791e-06,
  'strongest': 0.00652603025452569,
  'greatest': 0.005930805948555896,
  'hugest': 3.36199143199791e-06,
  'strictest': 3.36199143199791e-06,
  'shittiest': 3.36199143199791e-06,
  'largest': 0.5721295179169974,
  'ugliest': 3.36199143199791e-06,
  'finest': 0.04372720395058579},
 {'client': 0.07052511201904697,
  'clients': 0.005505798024715439,
  'customers': 0.24449151609369407,
  'consumers': 0.05662476344170525,
  'clientele': 0.0015177043687120927,
  'users': 0.011127843131108182,
  'shoppers': 0.0015177043687120927,
  'buyers': 0.0037192539655181754,
  'user': 0.

In [30]:
def replace_token_at_index(sentence, index, replacement_word, tokenizer):
    token_ids = tokenizer.encode(sentence, add_special_tokens=False)
    replacement_ids = tokenizer.encode(replacement_word, add_special_tokens=False)
    
    if len(replacement_ids) != 1:
        # 将replacement_word设置为"[UNK]"标记
        replacement_word = "[UNK]"
        replacement_ids = tokenizer.encode(replacement_word, add_special_tokens=False)
    
    token_ids[index] = replacement_ids[0]
    return tokenizer.decode(token_ids)


def generate_nested_sentences(sentence, indices, replacement_words_nested_list, tokenizer):
    if len(indices) != len(replacement_words_nested_list):
        raise ValueError("Length of indices and replacement words nested list should be the same.")
    
    nested_sentences = []
    for idx, replacement_words_list in zip(indices, replacement_words_nested_list):
        modified_sentences_for_idx = []
        for replacement_word in replacement_words_list:
            modified_sentence = replace_token_at_index(sentence, idx, replacement_word, tokenizer)
            modified_sentences_for_idx.append(modified_sentence)
        nested_sentences.append(modified_sentences_for_idx)
    return nested_sentences

indices = perturb_idxes
replacement_words_nested_list = synonym_words

nested_sentences = generate_nested_sentences(sentence, indices, replacement_words_nested_list, tokenizer)

for i, sentences_for_index in enumerate(nested_sentences):
    print(f"For index {indices[i]}:")
    for modified_sentence in sentences_for_index:
        print(f"  - {modified_sentence}")


For index 6:
  - this is by far the worst [UNK] service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
  - this is by far the worst worse service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
  - this is by far the worst [UNK] service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
  - this is by far the worst [UNK] service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
  - this is by far the worst [UNK] service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
  - this is by far the worst hardest service place in the country good selection but sometimes i ll rather pay more elsewhere to avoid the employees there
  - this is by far the worst [UNK] service place in the country goo

In [32]:
model = SentenceTransformer('bert-base-nli-mean-tokens')


word_similarity_list = []
# 对单一句子进行编码
query_embedding = model.encode(sentences=sentence, convert_to_tensor=True)

for i in range(len(nested_sentences)):
    # 对多个句子进行编码
    sentence_embeddings = model.encode(nested_sentences[i], convert_to_tensor=True)
        
    # 计算相似度
    similarity_scores = util.pytorch_cos_sim(query_embedding, sentence_embeddings)
    similarity_scores = similarity_scores.to('cpu').numpy().reshape(-1)
    
    word_similarity_probability = {}
    for idx, word in enumerate(synonym_words[i]):    # break
        word_similarity_probability[word] = similarity_scores[idx]
    total_value = sum(word_similarity_probability.values())
    normalized_data = {k: v / total_value for k, v in word_similarity_probability.items()}

    word_similarity_list.append(normalized_data)    

In [33]:
word_similarity_list

[{'lousiest': 0.050281814278912554,
  'worse': 0.05015069382778549,
  'meanest': 0.050281814278912554,
  'pire': 0.050281814278912554,
  'gravest': 0.050281814278912554,
  'hardest': 0.04946358667217611,
  'trickiest': 0.050281814278912554,
  'roughest': 0.050281814278912554,
  'harshest': 0.050281814278912554,
  'toughest': 0.050281814278912554,
  'biggest': 0.049070851525799423,
  'grandest': 0.050281814278912554,
  'strongest': 0.049538524803608035,
  'greatest': 0.05000881538350827,
  'hugest': 0.050281814278912554,
  'strictest': 0.050281814278912554,
  'shittiest': 0.050281814278912554,
  'largest': 0.04802518614299593,
  'ugliest': 0.050281814278912554,
  'finest': 0.05007875601826349},
 {'client': 0.05008598147882964,
  'clients': 0.05006311607325184,
  'customers': 0.0500539195519733,
  'consumers': 0.04999016739599157,
  'clientele': 0.04994412483835196,
  'users': 0.0500473848660583,
  'shoppers': 0.04994412483835196,
  'buyers': 0.05006863458553143,
  'user': 0.050172685969

In [34]:
import math 
combined_probs = []
for dict1, dict2 in zip(word_similarity_list, word_mask_probability_list):
    combined_dict = {}
    for token in dict1:
        combined_dict[token] = math.log(dict1[token]) + math.log(dict2[token])
    combined_probs.append(combined_dict)

In [35]:
combined_probs

[{'lousiest': -15.593088883865258,
  'worse': -11.121812491440904,
  'meanest': -15.593088883865258,
  'pire': -15.593088883865258,
  'gravest': -15.593088883865258,
  'hardest': -7.941212594542692,
  'trickiest': -15.593088883865258,
  'roughest': -15.593088883865258,
  'harshest': -15.593088883865258,
  'toughest': -15.593088883865258,
  'biggest': -4.024665098260676,
  'grandest': -15.593088883865258,
  'strongest': -8.036961078072004,
  'greatest': -8.123151146251256,
  'hugest': -15.593088883865258,
  'strictest': -15.593088883865258,
  'shittiest': -15.593088883865258,
  'largest': -3.5944195776835803,
  'ugliest': -15.593088883865258,
  'finest': -6.123943246921323},
 {'client': -5.645800554583403,
  'clients': -8.196424303842848,
  'customers': -4.403229133232981,
  'consumers': -5.8672378177903095,
  'clientele': -9.487406771153351,
  'users': -7.493089946247938,
  'shoppers': -9.487406771153351,
  'buyers': -8.588592700836353,
  'user': -5.955698349607361,
  'patrons': -8.693

In [36]:
import heapq

def k_best_viterbi(probs, k=1):
    # 初始化路径
    paths = [(score, [token]) for token, score in probs[0].items()]
    
    for i in range(1, len(probs)):
        new_paths = []
        for prev_score, prev_path in paths:
            for next_token, next_score in probs[i].items():
                new_score = prev_score + next_score
                heapq.heappush(new_paths, (new_score, prev_path + [next_token]))
                if len(new_paths) > k:
                    heapq.heappop(new_paths)
        paths = new_paths
        
    return sorted(paths, key=lambda x: x[0], reverse=True)


all_paths = k_best_viterbi(combined_probs, k=1000)
for score, path in all_paths:
    print(path, score)


['largest', 'consumer', 'site', 'services', 'at', 'from', 'prevent', 'that', 'for', 'makes'] -34.74495742232332
['largest', 'consumer', 'site', 'services', 'at', 'from', 'bypass', 'that', 'for', 'makes'] -34.8211054285622
['biggest', 'consumer', 'site', 'services', 'at', 'from', 'prevent', 'that', 'for', 'makes'] -35.17520294290041
['largest', 'consumer', 'site', 'serving', 'at', 'from', 'prevent', 'that', 'for', 'makes'] -35.183474804104875
['biggest', 'consumer', 'site', 'services', 'at', 'from', 'bypass', 'that', 'for', 'makes'] -35.251350949139294
['largest', 'consumer', 'site', 'serving', 'at', 'from', 'bypass', 'that', 'for', 'makes'] -35.25962281034375
['largest', 'customers', 'site', 'services', 'at', 'from', 'prevent', 'that', 'for', 'makes'] -35.42236355876126
['largest', 'consumer', 'site', 'services', 'for', 'from', 'prevent', 'that', 'for', 'makes'] -35.43556011741977
['largest', 'customers', 'site', 'services', 'at', 'from', 'bypass', 'that', 'for', 'makes'] -35.498511565

In [37]:
perturb_idxes, all_paths

([6, 7, 9, 8, 3, 25, 24, 1, 23, 2],
 [(-34.74495742232332,
   ['largest',
    'consumer',
    'site',
    'services',
    'at',
    'from',
    'prevent',
    'that',
    'for',
    'makes']),
  (-34.8211054285622,
   ['largest',
    'consumer',
    'site',
    'services',
    'at',
    'from',
    'bypass',
    'that',
    'for',
    'makes']),
  (-35.17520294290041,
   ['biggest',
    'consumer',
    'site',
    'services',
    'at',
    'from',
    'prevent',
    'that',
    'for',
    'makes']),
  (-35.183474804104875,
   ['largest',
    'consumer',
    'site',
    'serving',
    'at',
    'from',
    'prevent',
    'that',
    'for',
    'makes']),
  (-35.251350949139294,
   ['biggest',
    'consumer',
    'site',
    'services',
    'at',
    'from',
    'bypass',
    'that',
    'for',
    'makes']),
  (-35.25962281034375,
   ['largest',
    'consumer',
    'site',
    'serving',
    'at',
    'from',
    'bypass',
    'that',
    'for',
    'makes']),
  (-35.42236355876126,
   

In [38]:
def replace_tokens_in_sentence(sentence, indices, replacement_tokens, tokenizer):
    # Tokenize the original sentence
    tokens = tokenizer.tokenize(sentence)
    
    # Replace tokens at specified indices
    for i, index in enumerate(indices):
        tokens[index-1] = replacement_tokens[i]

    # Decode back to sentence
    return tokenizer.decode(tokenizer.convert_tokens_to_ids(tokens))

# Example
tokenizer = BertTokenizer.from_pretrained('bert-base-uncased')

for score, path in all_paths:
    modified_sentence = replace_tokens_in_sentence(sentence, perturb_idxes, path, tokenizer)  
    
    print(modified_sentence)

that makes at far the largest consumer services site in the country good selection but sometimes i ll rather pay more elsewhere for prevent from employees there
that makes at far the largest consumer services site in the country good selection but sometimes i ll rather pay more elsewhere for bypass from employees there
that makes at far the biggest consumer services site in the country good selection but sometimes i ll rather pay more elsewhere for prevent from employees there
that makes at far the largest consumer serving site in the country good selection but sometimes i ll rather pay more elsewhere for prevent from employees there
that makes at far the biggest consumer services site in the country good selection but sometimes i ll rather pay more elsewhere for bypass from employees there
that makes at far the largest consumer serving site in the country good selection but sometimes i ll rather pay more elsewhere for bypass from employees there
that makes at far the largest customers

In [39]:


# 加载受害模型和tokenizer

black_box_path = "/home/ubuntu/zhc/work/output"
tokenizer = BertTokenizer.from_pretrained(black_box_path)
model = BertForSequenceClassification.from_pretrained(black_box_path)
model.eval()
model.to('cuda')  # if you are using GPU

def predict_sentiment(sentence, model, tokenizer):
    inputs = tokenizer(sentence, return_tensors="pt", truncation=True, padding=True, max_length=512)
    inputs = {k: v.to('cuda') for k, v in inputs.items()}  # Move to GPU if using GPU
    with torch.no_grad():
        outputs = model(**inputs)
    logits = outputs.logits
    predicted_class = torch.argmax(logits, dim=1).item()
    return predicted_class

original_sentiment = predict_sentiment(sentence, model, tokenizer)

print(original_sentiment)



0


### 上面的结果 是 负面输出 0 

In [40]:
success = False

for score, path in all_paths:
    modified_sentence = replace_tokens_in_sentence(sentence, indices, path, tokenizer)
    modified_sentiment = predict_sentiment(modified_sentence, model, tokenizer)
    print(modified_sentiment)
    if modified_sentiment != original_sentiment:
        print(f"Attack successful with sentence: {modified_sentence}")
        success = True
        break

if not success:
    print("Attack failed after trying all paths.")

0
1
Attack successful with sentence: that makes at far the largest consumer services site in the country good selection but sometimes i ll rather pay more elsewhere for bypass from employees there
