In [1]:
from tqdm.auto import tqdm, trange
import pickle
import requests
import json

from abc import ABC
from typing import List, Tuple, Callable, Dict
from fairseq.hub_utils import GeneratorHubInterface
from scipy.optimize import NonlinearConstraint, differential_evolution
from textdistance import levenshtein
import numpy as np

import torch

2024-04-13 17:52:54 | INFO | fairseq.tasks.text_to_speech | Please install tensorboardX: pip install tensorboardX


In [2]:
# Load pre-trained translation model
en2fr = torch.hub.load('pytorch/fairseq',
                       'transformer.wmt14.en-fr',
                       tokenizer='moses',
                       bpe='subword_nmt').cuda()

Using cache found in /home/ximic/.cache/torch/hub/pytorch_fairseq_main
2024-04-13 17:52:59 | INFO | fairseq.file_utils | loading archive file https://dl.fbaipublicfiles.com/fairseq/models/wmt14.en-fr.joined-dict.transformer.tar.bz2 from cache at /home/ximic/.cache/torch/pytorch_fairseq/53f403ba27ab138b06c1a8d78f5bb4f1722567ac3d3b3e41f821ec2cae2974da.7ef8ab763efda16d3c82dd8b5a574bdfe524e078bac7b444ea1a9c5d355b55ae
2024-04-13 17:53:14 | INFO | fairseq.tasks.translation | [en] dictionary: 44512 types
2024-04-13 17:53:14 | INFO | fairseq.tasks.translation | [fr] dictionary: 44512 types
2024-04-13 17:53:18 | INFO | fairseq.models.fairseq_model | {'_name': None, 'common': {'_name': None, 'no_progress_bar': False, 'log_interval': 10, 'log_format': 'json', 'log_file': None, 'aim_repo': None, 'aim_run_hash': None, 'tensorboard_logdir': None, 'wandb_project': None, 'azureml_logging': False, 'seed': 2, 'cpu': False, 'tpu': False, 'bf16': False, 'memory_efficient_bf16': False, 'fp16': True, 'memor

## Unknown Characters

Unusual characters, such as zero-width spaces and control sequences, are simply encoded as the `<unk>` character by the FairSeq implementation. This likely generalizes to many other NLP models.

In [3]:
# Define function for decoding from the source dictionary
def src_decode(sentence):
  res = []
  for idx in sentence:
    res.append(en2fr.src_dict.symbols[idx])
  return ' '.join(res)

## Invisible Characters

Certian Unicode chacacters are almost never visually rendered by design. Conveniently, they can be embedded within strings and copied + pasted on most systems. Most NLP models are not trained against these characters, making them  not present in source language dictionaries. Thus, they typically result in an `<unk>` embedded vector.

In [4]:
# Zero width space
ZWSP = chr(0x200B)
# Zero width joiner
ZWJ = chr(0x200D)
# Zero width non-joiner
ZWNJ = chr(0x200C)

print(f"{ZWSP}{ZWJ}")

​‍


## Homoglyphs

The Unicode specification defines several homoglyph documents. The following section retrieves these documents and creates mapping between homoglyph characters.

In [5]:
confusables = dict()
intentionals = dict()

# Retrieve Unicode Confusable homoglyph characters
conf_resp = requests.get("https://www.unicode.org/Public/security/latest/confusables.txt", stream=True)
for line in conf_resp.iter_lines():
  if len(line):
    line = line.decode('utf-8-sig')
    if line[0] != '#':
      line = line.replace("#*", "#")
      _, line = line.split("#", maxsplit=1)
      if line[3] not in confusables:
        confusables[line[3]] = []
      confusables[line[3]].append(line[7])

# Retrieve Unicode Intentional homoglyph characters
int_resp = requests.get("https://www.unicode.org/Public/security/latest/intentional.txt", stream=True)
for line in int_resp.iter_lines():
  if len(line):
    line = line.decode('utf-8-sig')
    if line[0] != '#':
      line = line.replace("#*", "#")
      _, line = line.split("#", maxsplit=1)
      if line[3] not in intentionals:
        intentionals[line[3]] = []
      intentionals[line[3]].append(line[7])

## Reorderings

Unicode Bidirectional (Bidi) Algorithm override characters can be used to render encodeded characters in any order. The following section defines a function which can generate 2^|n| reordered encodings of a given string of length n which are all rendered the same way in any system built on Google's Chromium.

This [site](https://www.soscisurvey.de/tools/view-chars.php) can be used to visualize the underlying encoding of the text.

In [6]:
# Unicode Bidi override characters
PDF = chr(0x202C)
LRE = chr(0x202A)
RLE = chr(0x202B)
LRO = chr(0x202D)
RLO = chr(0x202E)

PDI = chr(0x2069)
LRI = chr(0x2066)
RLI = chr(0x2067)

class Swap():
    """Represents swapped elements in a string of text."""
    def __init__(self, one, two):
        self.one = one
        self.two = two
    
    def __repr__(self):
        return f"Swap({self.one}, {self.two})"

    def __eq__(self, other):
        return self.one == other.one and self.two == other.two

    def __hash__(self):
        return hash((self.one, self.two))

def some(*els):
    """Returns the arguments as a tuple with Nones removed."""
    return tuple(filter(None, tuple(els)))

def swaps(chars: str) -> set:
    """Generates all possible swaps for a string."""
    def pairs(chars, pre=(), suf=()):
        orders = set()
        for i in range(len(chars)-1):
            prefix = pre + tuple(chars[:i])
            suffix = suf + tuple(chars[i+2:])
            swap = Swap(chars[i+1], chars[i])
            pair = some(prefix, swap, suffix)
            orders.add(pair)
            orders.update(pairs(suffix, pre=some(prefix, swap)))
            orders.update(pairs(some(prefix, swap), suf=suffix))
        return orders
    return pairs(chars) | {tuple(chars)}

def unswap(el: tuple) -> str:
    """Reverts a tuple of swaps to the original string."""
    if isinstance(el, str):
        return el
    elif isinstance(el, Swap):
        return unswap((el.two, el.one))
    else:
        res = ""
        for e in el:
            res += unswap(e)
        return res

def uniswap(els):
    res = ""
    for el in els:
        if isinstance(el, Swap):
            res += uniswap([LRO, LRI, RLO, LRI, el.one, PDI, LRI, el.two, PDI, PDF, PDI, PDF])
        elif isinstance(el, str):
            res += el
        else:
            for subel in el:
                res += uniswap([subel])
    return res

def strings_to_file(file, string):
  with open(file, 'w') as f:
      for swap in swaps(string):
          uni = uniswap(swap)
          print(uni, file=f)

def print_strings(string):
  for swap in swaps(string):
    uni = uniswap(swap)
    print(uni)

## Deletions

Unicode control characters used for deleting text can be encoded into strings. Upon rendering, these control characters are actioned and the appropriate surrounding text is not rendered. Yet, NLP models generally still "see" the surrounding text.

In [7]:
# Backspace character
BKSP = chr(0x8)
# Delete character
DEL = chr(0x7F)
# Carriage return character
CR = chr(0xD)

print(f"{CR}{BKSP}{DEL}")




## Untargeted Integrity Attacks

The performance of various NLP models can be degraded through the use of invisible character, homoglyph, reordering, and deletion attacks. The most effective attacks can be found, independent of the underlying model, using a genetic algorithm.

### Attack Setup

Each attack will be defined as an object and set of contstraints over which a genetic algorithm (differential evolution) will optimize. For these attacks, the visual representation of the input is fixed and the aim of the attack is to determine the imperceptible perturbation for which the supplied model's output will be maximally distant from the output of the unperturbed input.

Each attack will be derived from the following Objective abstract class.

In [8]:
class Objective(ABC):
  """ Abstract class representing objectives for scipy's genetic algorithms."""

  def __init__(self, model: GeneratorHubInterface, input: str, max_perturbs: int, distance: Callable[[str,str],int]):
    if not model:
      raise ValueError("Must supply model.")
    if not input:
      raise ValueError("Must supply input.")

    self.model: GeneratorHubInterface = model
    self.input: str = input
    self.max_perturbs: int = max_perturbs
    self.distance: Callable[[str,str],int] = distance
    self.output = self.model.translate(self.input)

  def objective(self) -> Callable[[List[float]], float]:
    def _objective(perturbations: List[float]) -> float:
      candidate: str = self.candidate(perturbations)
      translation: str = self.model.translate(candidate)
      return -self.distance(self.output, translation)
    return _objective

  def differential_evolution(self, print_result=True, verbose=True, maxiter=60, popsize=32, polish=False) -> str:
    result = differential_evolution(self.objective(), self.bounds(),
                                    disp=verbose, maxiter=maxiter,
                                    popsize=popsize, polish=polish)
    candidate = self.candidate(result.x)
    if (print_result):
      print(f"Result: {candidate}")
      print(f"Result Distance: {result.fun}")
      print(f"Perturbation Encoding: {result.x}")
      print(f"Input Translation: {self.output}")
      print(f"Result Translation: {self.model.translate(candidate)}")
    return candidate

  def bounds(self) -> List[Tuple[float, float]]:
    raise NotImplementedError()

  def candidate(self, perturbations: List[float]) -> str:
    raise NotImplementedError()


def natural(x: float) -> int:
    """Rounds float to the nearest natural number (positive int)"""
    return max(0, round(float(x)))

In [9]:
class InvisibleCharacterObjective(Objective):
  """Class representing an Objective which injects invisible characters."""

  def __init__(self, model: GeneratorHubInterface, input: str, max_perturbs: int = 25, invisible_chrs: List[str] = [ZWJ,ZWSP,ZWNJ], distance: Callable[[str,str],int] = levenshtein.distance, **kwargs):
    super().__init__(model, input, max_perturbs, distance)
    self.invisible_chrs: List[str] = invisible_chrs

  def bounds(self) -> List[Tuple[float, float]]:
    return [(0,len(self.invisible_chrs)-1), (-1, len(self.input)-1)] * self.max_perturbs

  def candidate(self, perturbations: List[float]) -> str:
    candidate = [char for char in self.input]
    for i in range(0, len(perturbations), 2):
      inp_index = natural(perturbations[i+1])
      if inp_index >= 0:
        inv_char = self.invisible_chrs[natural(perturbations[i])]
        candidate = candidate[:inp_index] + [inv_char] + candidate[inp_index:]
    return ''.join(candidate)

In [10]:
class HomoglyphObjective(Objective):

  def __init__(self, model: GeneratorHubInterface, input: str, max_perturbs=None, distance: Callable[[str,str],int] = levenshtein.distance, homoglyphs: Dict[str,List[str]] = intentionals, **kwargs):
    super().__init__(model, input, max_perturbs, distance)
    if not self.max_perturbs:
      self.max_perturbs = len(self.input)
    self.homoglyphs = homoglyphs
    self.glyph_map = []
    for i, char in enumerate(self.input):
      if char in self.homoglyphs:
        charmap = self.homoglyphs[char]
        charmap = list(zip([i] * len(charmap), charmap))
        self.glyph_map.extend(charmap)

  def bounds(self) -> List[Tuple[float, float]]:
    return [(-1, len(self.glyph_map)-1)] * self.max_perturbs

  def candidate(self, perturbations: List[float]) -> str:
    candidate = [char for char in self.input]  
    for perturb in map(natural, perturbations):
      if perturb >= 0:
        i, char = self.glyph_map[perturb]
        candidate[i] = char
    return ''.join(candidate)

In [11]:
class ReorderObjective(Objective):

  def __init__(self, model: GeneratorHubInterface, input: str, max_perturbs: int = 50, distance: Callable[[str,str],int] = levenshtein.distance, **kwargs):
    super().__init__(model, input, max_perturbs, distance)

  def bounds(self) -> List[Tuple[float, float]]:
    return [(-1,len(self.input)-1)] * self.max_perturbs

  def candidate(self, perturbations: List[float]) -> str:
    def swaps(els) -> str:
      res = ""
      for el in els:
          if isinstance(el, Swap):
              res += swaps([LRO, LRI, RLO, LRI, el.one, PDI, LRI, el.two, PDI, PDF, PDI, PDF])
          elif isinstance(el, str):
              res += el
          else:
              for subel in el:
                  res += swaps([subel])
      return res

    _candidate = [char for char in self.input]
    for perturb in map(natural, perturbations):
      if perturb >= 0 and len(_candidate) >= 2:
        perturb = min(perturb, len(_candidate) - 2)
        _candidate = _candidate[:perturb] + [Swap(_candidate[perturb+1], _candidate[perturb])] + _candidate[perturb+2:]

    return swaps(_candidate)

In [12]:
class DeletionObjective(Objective):
  """Class representing an Objective which injects deletion control characters."""

  def __init__(self, model: GeneratorHubInterface, input: str, max_perturbs: int = 100, distance: Callable[[str,str],int] = levenshtein.distance, del_chr: str = BKSP, ins_chr_min: str = '!', ins_chr_max: str = '~', **kwargs):
    super().__init__(model, input, max_perturbs, distance)
    self.del_chr: str = del_chr
    self.ins_chr_min: str = ins_chr_min
    self.ins_chr_max: str = ins_chr_max

  def bounds(self) -> List[Tuple[float, float]]:
    return [(-1,len(self.input)-1), (ord(self.ins_chr_min),ord(self.ins_chr_max))] * self.max_perturbs

  def candidate(self, perturbations: List[float]) -> str:
    candidate = [char for char in self.input]
    for i in range(0, len(perturbations), 2):
      idx = natural(perturbations[i])
      char = chr(natural(perturbations[i+1]))
      candidate = candidate[:idx] + [char, self.del_chr] + candidate[idx:]
      for j in range(i,len(perturbations), 2):
        perturbations[j] += 2
    return ''.join(candidate)

### SST2 Attacks

#### Attack Setup

In [13]:
# !wget https://cims.nyu.edu/~sbowman/multinli/multinli_1.0.zip
# !unzip multinli_1.0.zip
# !rm -rf __MACOSX/

# with open('multinli_1.0/multinli_1.0_dev_matched.jsonl', 'r') as f:
#   mnli_test = [json.loads(jline) for jline in f.readlines()]

# Load pre-trained translation model
# mnli = torch.hub.load('pytorch/fairseq',
#                        'roberta.large.mnli').eval().cuda()
# label_map = {'contradiction': 0, 'neutral': 1, 'entailment': 2}

In [14]:
from transformers import AutoTokenizer, AutoModelForSequenceClassification
from transformers import PreTrainedTokenizer, PreTrainedModel

tokenizer = AutoTokenizer.from_pretrained("./sst2_gpt2")
model = AutoModelForSequenceClassification.from_pretrained("./sst2_gpt2")

In [15]:
from datasets import load_dataset

dataset = load_dataset("sst2")["validation"]
dataset = dataset.to_dict()
dataset = [dict(zip(dataset, t)) for t in zip(*dataset.values())]
print(dataset[:3])

2024-04-13 17:53:34 | INFO | datasets | PyTorch version 1.13.1 available.


[{'idx': 0, 'sentence': "it 's a charming and often affecting journey . ", 'label': 1}, {'idx': 1, 'sentence': 'unflinchingly bleak and desperate ', 'label': 0}, {'idx': 2, 'sentence': 'allows us to hope that nolan is poised to embark a major career as a commercial yet inventive filmmaker . ', 'label': 1}]


In [16]:
test_input = tokenizer("This is a test!", return_tensors='pt')
print(test_input)
test_pred = model(**test_input)
print(test_pred.logits.squeeze())

sst2_model = (tokenizer, model)

{'input_ids': tensor([[1212,  318,  257, 1332,    0]]), 'attention_mask': tensor([[1, 1, 1, 1, 1]])}
tensor([-0.1196,  0.0964], grad_fn=<SqueezeBackward0>)


In [17]:
min_budget = 1
max_budget = min_budget + 1
iterations = 2
population = 16

In [18]:
class SST2Objective():

  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], input: str, label: int, max_perturbs: int):
    if not model:
      raise ValueError("Must supply model.")
    if not input:
      raise ValueError("Must supply input.")
    if label == None:
      raise ValueError("Must supply label.")
    self.model: tuple[PreTrainedTokenizer, PreTrainedModel] = model
    self.input: str = input
    self.label: int = label
    self.max_perturbs: int = max_perturbs

  def objective(self) -> Callable[[List[float]], float]:
    def _objective(perturbations: List[float]) -> float:
      candidate: str = self.candidate(perturbations)
      tokens = self.model[0](candidate, return_tensors="pt")
      predict = self.model[1](**tokens).logits.squeeze()
      if predict.argmax() != self.label:
        return -np.inf
      else:
        return predict.cpu().detach().numpy()[self.label]
    return _objective

  def differential_evolution(self, print_result=True, verbose=True, maxiter=3, popsize=32, polish=False) -> str:
    result = differential_evolution(self.objective(), self.bounds(),
                                    disp=verbose, maxiter=maxiter,
                                    popsize=popsize, polish=polish)
    candidate = self.candidate(result.x)
    if (print_result):
      print(f"Result: {candidate}")
      print(f"Correct Label Prediction: {result.fun}")
      print(f"Perturbation Encoding: {result.x}")
    return candidate

#### Invisible Character Attack

In [19]:
class InvisibleCharacterMnliObjective(SST2Objective, InvisibleCharacterObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, max_perturbs: int = 10, invisible_chrs: List[str] = [ZWJ,ZWSP,ZWNJ], **kwargs):
    super().__init__(model, input, label, max_perturbs)
    self.invisible_chrs = invisible_chrs

#### Homoglyph Attack

In [20]:
class HomoglyphMnliObjective(SST2Objective, HomoglyphObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, max_perturbs: int = 10, homoglyphs: Dict[str,List[str]] = intentionals, **kwargs):
    super().__init__(model, input, label, max_perturbs)
    self.homoglyphs = homoglyphs
    self.glyph_map = []
    for i, char in enumerate(self.input):
      if char in self.homoglyphs:
        charmap = self.homoglyphs[char]
        charmap = list(zip([i] * len(charmap), charmap))
        self.glyph_map.extend(charmap)

#### Reordering Attack

In [21]:
class ReorderMnliObjective(SST2Objective, ReorderObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, max_perturbs: int = 10, **kwargs):
    super().__init__(model, input, label, max_perturbs)

#### Deletion Attack

In [22]:
class DeletionMnliObjective(SST2Objective, DeletionObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, max_perturbs: int = 10, del_chr: str = BKSP, ins_chr_min: str = '!', ins_chr_max: str = '~', **kwargs):
    super().__init__(model, input, label, max_perturbs)
    self.del_chr: str = del_chr
    self.ins_chr_min: str = ins_chr_min
    self.ins_chr_max: str = ins_chr_max

#### Attack Performance

##### Experiment Setup

In [23]:
def sst2_experiment(model, objective, data, file, min_budget, max_budget, maxiter, popsize):
  perturbs = { '0': data }
  for budget in trange(min_budget, max_budget):
    perturbs[str(budget)] = dict()
    for test in tqdm(data, leave=False):
      obj = objective(model, test["sentence"], test["label"])
      example = obj.differential_evolution(print_result=False, verbose=False, maxiter=maxiter, popsize=popsize)
      perturbs[str(budget)][test["idx"]] = example
      with open(file, 'wb') as f:
          pickle.dump(perturbs, f)

##### Invisible Character Experiment

In [24]:
sst2_experiment(sst2_model, InvisibleCharacterMnliObjective, dataset[:20], "sst2_invisible_chars.pkl", min_budget, max_budget, iterations, population)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

##### Homoglyph Experiment

In [25]:
sst2_experiment(sst2_model, HomoglyphMnliObjective, dataset[:20], "sst2_homoglyphs.pkl", min_budget, max_budget, iterations, population)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

##### Reordering Experiment

In [26]:
sst2_experiment(sst2_model, ReorderMnliObjective, dataset[:20], "sst2_reorder.pkl", min_budget, max_budget, iterations, population)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

##### Deletion Experiment

In [27]:
sst2_experiment(sst2_model, DeletionMnliObjective, dataset[:20], "sst2_deletion.pkl", min_budget, max_budget, iterations, population)

  0%|          | 0/1 [00:00<?, ?it/s]

  0%|          | 0/20 [00:00<?, ?it/s]

## Targeted Integrity Attacks

Targeted imperceptible perturbation attacks craft imperceptible perturbations for a given input that attempt to produce a fixed output against a given model.

### SST2 Targeted Attack

These attacks target the SST2 sentiment classification task in a black box model that does have access to the resulting logits for each class during inference.

#### Experiment Setup

In [28]:
from torch.nn.functional import softmax

class SST2TargetedObjective(SST2Objective):

  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label: int, target: int, max_perturbs: int):
    super().__init__(model, input, label, max_perturbs)
    self.target = target

  def objective(self) -> Callable[[List[float]], float]:
      def _objective(perturbations: List[float]) -> float:
        candidate: str = self.candidate(perturbations)
        tokens = self.model[0](candidate, return_tensors="pt")
        predict = self.model[1](**tokens).logits.squeeze()
        return -softmax(predict).cpu().detach().numpy()[self.target]
      return _objective

  def differential_evolution(self, print_result=True, verbose=True, maxiter=3, popsize=32, polish=False) -> str:
    result = differential_evolution(self.objective(), self.bounds(),
                                    disp=verbose, maxiter=maxiter,
                                    popsize=popsize, polish=polish)
    candidate = self.candidate(result.x)
    if (print_result):
      print(f"Result: {candidate}")
      print(f"Correct Label Prediction: {result.fun}")
      print(f"Perturbation Encoding: {result.x}")
    return candidate

#### Invisible Character Attack

In [29]:
class InvisibleCharacterTargetedSST2Objective(SST2TargetedObjective, InvisibleCharacterObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, target: int, max_perturbs: int = 10, invisible_chrs: List[str] = [ZWJ,ZWSP,ZWNJ], **kwargs):
    super().__init__(model, input, label, target, max_perturbs)
    self.invisible_chrs = invisible_chrs

#### Homoglyph Attack

In [30]:
class HomoglyphTargetedSST2Objective(SST2TargetedObjective, HomoglyphObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, target: int, max_perturbs: int = 10, homoglyphs: Dict[str,List[str]] = intentionals, **kwargs):
    super().__init__(model, input, label, target, max_perturbs)
    self.homoglyphs = homoglyphs
    self.glyph_map = []
    for i, char in enumerate(self.input):
      if char in self.homoglyphs:
        charmap = self.homoglyphs[char]
        charmap = list(zip([i] * len(charmap), charmap))
        self.glyph_map.extend(charmap)

#### Reordering Attack

In [31]:
class ReorderTargetedSST2Objective(SST2TargetedObjective, ReorderObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, target: int, max_perturbs: int = 10, **kwargs):
    super().__init__(model, input, label, target, max_perturbs)

#### Deletion Attack

In [32]:
class DeletionTargetedSST2Objective(SST2TargetedObjective, DeletionObjective):
  
  def __init__(self, model: tuple[PreTrainedTokenizer, PreTrainedModel], 
                     input: str, label:int, target: int, max_perturbs: int = 10, del_chr: str = BKSP, ins_chr_min: str = '!', ins_chr_max: str = '~', **kwargs):
    super().__init__(model, input, label, target, max_perturbs)
    self.del_chr = del_chr
    self.ins_chr_min: str = ins_chr_min
    self.ins_chr_max: str = ins_chr_max

#### Attack Performance

##### Experiment Setup

In [33]:
from tqdm.auto import tqdm
from time import process_time
import pickle

def sst2_targeted_experiment(objective, model, label_map, inputs, file, min_budget = min_budget, max_budget = max_budget, maxiter = iterations, popsize = population):
  results = { '0': inputs }
  with tqdm(total=len(inputs)*(max_budget-min_budget+1)*len(label_map), desc="Adv. Examples") as pbar:
    for budget in range(min_budget, max_budget+1):
      results[str(budget)] = {}
      for input in inputs:
        results[str(budget)][input["idx"]] = []
        for label in range(len(label_map)):
          obj = objective(model, input["sentence"], input["label"], label, max_perturbs=budget)
          candidate = obj.differential_evolution(print_result=False, verbose=False, maxiter=maxiter, popsize=popsize)
          results[str(budget)][input["idx"]].append(candidate)
          with open(file, 'wb') as f:
            pickle.dump(results, f)
          pbar.update(1)

##### Invisible Character Experiment

In [34]:
sst2_targeted_experiment(InvisibleCharacterTargetedSST2Objective, sst2_model, [0, 1], dataset[:20], "sst2_invisibles_targeted.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

  return -softmax(predict).cpu().detach().numpy()[self.target]


##### Homoglyph Experiment

In [35]:
sst2_targeted_experiment(HomoglyphTargetedSST2Objective, sst2_model, [0, 1], dataset[:20], "sst2_homoglyphs_targeted.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

  return -softmax(predict).cpu().detach().numpy()[self.target]


##### Redordering Experiment

In [36]:
sst2_targeted_experiment(ReorderTargetedSST2Objective, sst2_model, [0, 1], dataset[:20], "sst2_reorderings_targeted.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

  return -softmax(predict).cpu().detach().numpy()[self.target]


##### Deletion Experiment

In [37]:
sst2_targeted_experiment(DeletionTargetedSST2Objective, sst2_model, [0, 1], dataset[:20], "sst2_deletions_targeted.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

  return -softmax(predict).cpu().detach().numpy()[self.target]


### SST2 (No Logits) Targeted Attack

This attack is identical to the SST2 targeted attack except that the adversary only has access to the predicted class and does not have access to the resulting logits for each class during inference.

#### Attack Setup

In [38]:
class SST2TargetedNoLogitsObjective(SST2TargetedObjective):

  def objective(self) -> Callable[[List[float]], float]:
      def _objective(perturbations: List[float]) -> float:
        candidate: str = self.candidate(perturbations)
        tokens = self.model[0](candidate, return_tensors="pt")
        predict = self.model[1](**tokens).logits.squeeze()
        if predict.argmax().item() == self.target:
          return -np.inf
        else:
          return np.inf
      return _objective

#### Invisible Character Attack

In [39]:
class InvisibleCharacterTargetedSST2NoLogitsObjective(SST2TargetedNoLogitsObjective, InvisibleCharacterTargetedSST2Objective):
  pass

#### Homoglyph Attack

In [40]:
class HomoglyphTargetedSST2NoLogitsObjective(SST2TargetedNoLogitsObjective, HomoglyphTargetedSST2Objective):
  pass

#### Reordering Attack

In [41]:
class ReorderTargetedSST2NoLogitsObjective(SST2TargetedNoLogitsObjective, ReorderTargetedSST2Objective):
  pass

#### Deletion Attack

In [42]:
class DeletionTargetedSST2NoLogitsObjective(SST2TargetedNoLogitsObjective, DeletionTargetedSST2Objective):
  pass

#### Attack Performance

##### Invisible Character Experiment

In [43]:
sst2_targeted_experiment(InvisibleCharacterTargetedSST2NoLogitsObjective, sst2_model, [0, 1], dataset[:20], "sst2_invisibles_targeted_nologits.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

##### Homoglyph Experiment

In [44]:
sst2_targeted_experiment(HomoglyphTargetedSST2NoLogitsObjective, sst2_model, [0, 1], dataset[:20], "sst2_homoglyphs_targeted_nologits.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

##### Reordering Experiment

In [45]:
sst2_targeted_experiment(ReorderTargetedSST2NoLogitsObjective, sst2_model, [0, 1], dataset[:20], "sst2_reorderings_targeted_nologits.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]

##### Deletion Exeriment

In [46]:
sst2_targeted_experiment(DeletionTargetedSST2NoLogitsObjective, sst2_model, [0, 1], dataset[:20], "sst2_deletions_targeted_nologits.pkl", maxiter=iterations)

Adv. Examples:   0%|          | 0/80 [00:00<?, ?it/s]