# Adversarial Attacks
- This notebook demonstrates the vulnerability of our BERT-BiLSTM model to adversarial examples
- To perform our attack, we made use of the TextAttack library: https://github.com/QData/TextAttack
- This library hosts a number of attack implementations for NLP deep neural networks (DNNs)
- To demonstrate adversarial attacks, we utilised the A2T-MLM attack: https://arxiv.org/pdf/2109.00544.pdf

In [1]:
#imports
import numpy as np
import pandas as pd

import torch
import torch.nn as nn

from sklearn.model_selection import train_test_split

import transformers
from transformers import BertModel, BertTokenizer, PreTrainedModel, PretrainedConfig

import textattack
from textattack.models.wrappers import HuggingFaceModelWrapper

In [2]:
# Define seed for reproducibility
def set_seed(seed = 0):
    '''
    Seed every random number generator this notebook can draw from.

    Seeds Python's built-in `random` module, NumPy's global generator and
    PyTorch (CPU generator plus all CUDA devices), and configures cuDNN for
    deterministic behaviour.

    Parameters
    ----------
    seed : int, default 0
        Value handed to each generator.
    '''
    # Function-scope import: `random` is not among the notebook's top-level imports.
    import random

    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)
    torch.cuda.manual_seed_all(seed)
    torch.backends.cudnn.deterministic = True
    # Autotuning may pick different kernels between runs, so keep it switched off.
    torch.backends.cudnn.benchmark = False

set_seed(42)
# Fall back to the CPU so the notebook still runs on machines without a CUDA-capable GPU.
device = 'cuda' if torch.cuda.is_available() else 'cpu'

In [3]:
# Load the IMDB reviews and hold out 30% of them as the test split (7:3 ratio).
# No random_state is passed to either split in this cell.
df = pd.read_csv("IMDB Dataset.csv")
X_train, X_test, y_train, y_test = train_test_split(
    df['review'],
    df['sentiment'],
    test_size=0.3,
    shuffle=True,
)

# Encode the string sentiment labels as integers (binary classes).
polarity_class = {"negative": 0, "positive": 1}
y_train, y_test = (
    labels.apply(lambda sentiment: polarity_class[sentiment])
    for labels in (y_train, y_test)
)

# Attacking the whole test split is computationally expensive, so only a 1/6
# sample of it (the `adv_*_test` part) is kept as the pool of texts to attack.
adv_X_train, adv_X_test, adv_y_train, adv_y_test = train_test_split(
    X_test,
    y_test,
    test_size=1/6,
    shuffle=True,
)

# Convert the label Series to plain NumPy arrays so they can be indexed by position.
adv_y_train, adv_y_test = adv_y_train.values, adv_y_test.values

In [4]:
# Generating adversarial examples is computationally expensive, so keep only the
# first 150 space-separated words of each review, paired with its integer label.
temp_text_truncated = [
    (' '.join(text.split(" ")[:150]), int(label))
    for text, label in zip(adv_X_test, adv_y_test)
]

In [5]:
# wrap model to be compatible with TextAttack library
# Minimal configuration class; BERT_Bi_Arch declares it as its `config_class`.
class MyConfig(PretrainedConfig):
    """Configuration object for the custom BERT-BiLSTM model.

    Parameters
    ----------
    important_param : default 42
        Stored on the config unchanged; it is not read anywhere in the
        visible code.
    **kwargs
        Forwarded unchanged to ``PretrainedConfig.__init__``.
    """
    model_type = 'mymodel'  # identifier string for this config type
    def __init__(self, important_param=42, **kwargs):
        super().__init__(**kwargs)
        self.important_param = important_param

# Bert Model
class BERT_Bi_Arch(PreTrainedModel): # for binary class
    """BERT encoder followed by a bidirectional LSTM and a linear layer (2 classes).

    Calling the model returns log-probabilities of shape (N, 2); the raw scores
    of the most recent call are cached on ``self.logits`` for ``__loss__``.
    """
    config_class = MyConfig

    def __init__(self, config):
        """Build the sub-modules; the encoder is loaded from 'bert-base-uncased'."""
        super().__init__(config)
        self.bert = BertModel.from_pretrained('bert-base-uncased')
        self.softmax = nn.LogSoftmax(dim=1)  # log-softmax over the class dimension
        self.lstm = nn.LSTM(768, 256, batch_first=True, bidirectional=True)
        self.linear = nn.Linear(256*2, 2)
        self.logits = 0  # placeholder until the first forward pass
        self.loss_function = nn.NLLLoss() # loss function

    def __loss__(self, labels): # get loss
        """Return the negative log-likelihood of `labels` for the most recent forward pass.

        Must be called after the model has been called at least once, because it
        reads the scores cached on ``self.logits``.
        """
        # NLLLoss expects log-probabilities, so the cached raw scores are passed
        # through the log-softmax first (previously the raw scores were used directly).
        return self.loss_function(self.softmax(self.logits), labels)

    def get_input_embeddings(self):
        """Return the input embedding module of the wrapped BERT encoder."""
        return self.bert.get_input_embeddings()

    def forward(self, input_ids, attention_mask, token_type_ids=None):
        """Compute class log-probabilities for a batch of tokenized inputs.

        Defined as ``forward`` (rather than overriding ``__call__``) so that
        ``model(...)`` goes through the regular ``nn.Module`` call machinery.

        Parameters
        ----------
        input_ids, attention_mask
            Passed to the BERT encoder.
        token_type_ids : optional
            Accepted so tokenizer output can be unpacked into the call; it is
            not forwarded to the encoder.

        Returns
        -------
        Tensor of shape (N, 2) holding log-probabilities.
        """
        sequence_output, _ = self.bert(input_ids, attention_mask=attention_mask, return_dict=False)
        # Run the BiLSTM over the encoder's per-token outputs.
        lstm_output, _ = self.lstm(sequence_output)
        # Features [:256] at the last time step and features [256:] at the first
        # time step (for a bidirectional nn.LSTM: the forward and backward
        # directions after each has read the whole sequence).
        # NOTE(review): position -1 is used regardless of attention_mask, so for
        # padded inputs it may be a padding position - confirm this matches training.
        hidden = torch.cat((lstm_output[:, -1, :256], lstm_output[:, 0, 256:]), dim=-1)
        linear_output = self.linear(hidden.view(-1, 256*2))

        self.logits = linear_output
        return self.softmax(linear_output)

In [None]:
# define model class and tokenizer
config = MyConfig(4)
model = BERT_Bi_Arch(config)
# map_location="cpu" lets weights that were saved from a GPU load on a machine without one.
# NOTE(review): torch.load unpickles the file - only load checkpoints from a trusted source.
model.load_state_dict(torch.load("./stored_weights/polarityBertBiLSTM.pth", map_location="cpu")) # load weights
tokenizer = BertTokenizer.from_pretrained("bert-base-uncased") # load tokenizer
model_wrapper = HuggingFaceModelWrapper(model, tokenizer) # wrap model and tokenizer together, textattack module needs this
# Presumably tells TextAttack to treat 150 as the model's maximum input length - TODO confirm.
model_wrapper.model.config.max_position_embeddings = 150

dataset = textattack.datasets.Dataset(temp_text_truncated)

# Attack every example in the dataset, logging results to CSV and checkpointing every 250 examples
attack = textattack.attack_recipes.A2TYoo2021.build(model_wrapper, mlm=True) # A2TYoo2021
attack_args = textattack.AttackArgs(
    query_budget=100,
    num_examples=len(dataset),
    log_to_csv="A2TYoo2021.csv",
    checkpoint_interval=250,
    checkpoint_dir="checkpoints",
    disable_stdout=True,
)
attacker = textattack.Attacker(attack, dataset, attack_args)
attacker.attack_dataset()

In [35]:
# print adversarial examples metrics
df = pd.read_csv("A2TYoo2021.csv")

# Tally the outcomes once instead of recounting the column for every metric.
result_counts = df['result_type'].value_counts()
total_results = len(df)

def _result_percentage(result_type):
    """Return the share (0-100) of logged attacks whose `result_type` equals the given label.

    A label that never occurs in the log yields 0 instead of raising KeyError,
    and an empty log yields 0.0 instead of dividing by zero.
    """
    if total_results == 0:
        return 0.0
    count = result_counts[result_type] if result_type in result_counts.index else 0
    return count / total_results * 100

succesful_attacks = _result_percentage('Successful')
print(f"Percentage of Successful Adversarial Attack: {round(succesful_attacks,2)}%")

failed_attacks = _result_percentage('Failed')
print(f"Percentage of Failed Adversarial Attack: {round(failed_attacks,2)}%")

skipped_attacks = _result_percentage('Skipped')
print(f"Percentage of Skipped Adversarial Attack: {round(skipped_attacks,2)}%")

Percentage of Successful Adversarial Attack: 51.96%
Percentage of Failed Adversarial Attack: 38.56%
Percentage of Skipped Adversarial Attack: 9.48%
