# Simple feature-space adversarial attack

We use the iris dataset to train a multi-class classification model, and then perform the simple feature-space adversarial attack on this model

0: setosa, 1: versicolor, and 2: versicolor

In [1]:
#Put all the libraries here
import numpy as np

import torch
import torch.optim as optim
import torch.nn as nn
import torch.nn.functional as F
from torch.utils.data import DataLoader, TensorDataset

from sklearn import datasets
from sklearn.metrics import accuracy_score, f1_score, confusion_matrix
from sklearn.model_selection import train_test_split

## Load iris dataset from scikit-learn 

In [2]:
iris = datasets.load_iris()

#Classe names
print("***********Classe names***********")
print(iris.target_names)
#Feature names
print("***********Feature names***********")
print(iris.feature_names)
#Data sample festure values
print("***********Data sample festure values***********")
print(iris.data)
print("***********Data size and feature size***********")
print(iris.data.shape)
#Data label values
print("***********Data label values***********")
print(iris.target)

***********Classe names***********
['setosa' 'versicolor' 'virginica']
***********Feature names***********
['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
***********Data sample festure values***********
[[5.1 3.5 1.4 0.2]
 [4.9 3.  1.4 0.2]
 [4.7 3.2 1.3 0.2]
 [4.6 3.1 1.5 0.2]
 [5.  3.6 1.4 0.2]
 [5.4 3.9 1.7 0.4]
 [4.6 3.4 1.4 0.3]
 [5.  3.4 1.5 0.2]
 [4.4 2.9 1.4 0.2]
 [4.9 3.1 1.5 0.1]
 [5.4 3.7 1.5 0.2]
 [4.8 3.4 1.6 0.2]
 [4.8 3.  1.4 0.1]
 [4.3 3.  1.1 0.1]
 [5.8 4.  1.2 0.2]
 [5.7 4.4 1.5 0.4]
 [5.4 3.9 1.3 0.4]
 [5.1 3.5 1.4 0.3]
 [5.7 3.8 1.7 0.3]
 [5.1 3.8 1.5 0.3]
 [5.4 3.4 1.7 0.2]
 [5.1 3.7 1.5 0.4]
 [4.6 3.6 1.  0.2]
 [5.1 3.3 1.7 0.5]
 [4.8 3.4 1.9 0.2]
 [5.  3.  1.6 0.2]
 [5.  3.4 1.6 0.4]
 [5.2 3.5 1.5 0.2]
 [5.2 3.4 1.4 0.2]
 [4.7 3.2 1.6 0.2]
 [4.8 3.1 1.6 0.2]
 [5.4 3.4 1.5 0.4]
 [5.2 4.1 1.5 0.1]
 [5.5 4.2 1.4 0.2]
 [4.9 3.1 1.5 0.2]
 [5.  3.2 1.2 0.2]
 [5.5 3.5 1.3 0.2]
 [4.9 3.6 1.4 0.1]
 [4.4 3.  1.3 0.2]
 [5.1 3.4 1.5 0.2]
 

## Convert the data to PyTorch tensors 

In [3]:
X, y = iris.data, iris.target

#Split the data into two sets: 75% for training and 25% for testing
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

# Convert to PyTorch tensors
X_train_tensor = torch.tensor(X_train, dtype=torch.float32)
y_train_tensor = torch.tensor(y_train, dtype=torch.long)
X_test_tensor = torch.tensor(X_test, dtype=torch.float32)
y_test_tensor = torch.tensor(y_test, dtype=torch.long)

# Create a TensorDataset for training and testing, respectively
train_dataset = TensorDataset(X_train_tensor, y_train_tensor)
test_dataset = TensorDataset(X_test_tensor, y_test_tensor)

# Create DataLoaders: here, we use mini-batch gradient descent, so need to specify the batch size
train_dataloader = DataLoader(train_dataset, batch_size=16, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=16, shuffle=False)

## Construct a multi-class classification model (a softmax regression model)

In [4]:
class MultiClassification(nn.Module):
    def __init__(self):
        super(MultiClassification, self).__init__()
        #The first "4" specifies that the feature dimension is 4, and the second "3" specifies that this is 3-class classification
        #bias=False indicates that the output is calculated as y=w.x instead of y=w.x + b
        self.fc = nn.Linear(4, 3, bias=False) 
    
    def forward(self, x):
        y = self.fc(x)
        
        return y

## Set up some hyperparameters: use cross entropy loss, gradient descent with Adam optimizer, learning rate, and epochs; Pre-define the functions for training, testing, and prediction

In [5]:
torch.manual_seed(42)
epochs = 50
learning_rate = 0.01
weight_decay = 5e-4
lossfunction = nn.CrossEntropyLoss() #Cross entropy loss for multi-class classification

#Instantiate the model from "MultiClassification" class definition
model = MultiClassification()

#Use Adam optimizer
optimizer = optim.Adam(model.parameters(), lr=learning_rate, weight_decay=weight_decay)

#Define the training function
def train(epoch, model, train_dataloader, optimizer, lossfunction):
    model.train()
    
    train_loss = 0.0
    train_total, train_correct = 0.0, 0.0 
    
    for X_batch, y_batch in train_dataloader:
        optimizer.zero_grad()

        #Get the predicted output
        predictions = model(X_batch)

        #Calculate the loss
        loss = lossfunction(predictions, y_batch)
        
        #Update the weights usning gradient descent with Adam optimizer
        loss.backward()
        optimizer.step()
        
        #Convert probabilities to multi-class predictions (reutrn the class with the maximal proability)
        _, train_predicted = torch.max(predictions.data, 1)
        
        #Calculate the training statistics
        train_loss += loss.item()
        train_total += y_batch.size(0)
        train_correct += (train_predicted == y_batch).sum().item()

    print("epoch (%d): Train accuracy: %.4f, loss: %.3f" % (epoch, train_correct/train_total, train_loss/train_total))

#Define the test function for test_dataloader
def test(model, test_dataloader):
    model.eval()
    
    test_correct, test_total = 0.0, 0.0
    y_test, y_pred = [], []
    
    with torch.no_grad():
        for X_batch, y_batch in test_dataloader:
            predictions = model(X_batch)
            
            _, test_predicted = torch.max(predictions.data, 1)
            test_total += y_batch.size(0)
            test_correct += (test_predicted == y_batch).sum().item()
            
            y_test += y_batch.tolist()
            y_pred += test_predicted.tolist()

    print('Test accuracy: %.4f' % (test_correct / test_total))
    
    return y_test, y_pred

#Define the function that returns a predicted label for a single input sample
def predict_label(model, single_input):
    model.eval()  
    with torch.no_grad():
        prediction = model(single_input)
        _, predicted_label = torch.max(prediction.data, 1)
    
    return predicted_label

#Define the function that returns predicted probabilities for a single input sample
def predict_probabilities(model, single_input):
    model.eval()  
    with torch.no_grad():
        prediction = model(single_input)
        predicted_probabilities = torch.softmax(prediction, dim=1).squeeze(0)
    
    return predicted_probabilities

#Define the function that returns model weight vector that is used to predict the target_label
def weight_vector(model, target_label):
    model.eval()
    with torch.no_grad():
        weights = list(model.parameters())[0]
    
    return weights[target_label]

## Train the model using train_dataloader

In [6]:
#Train the model
for epoch in range(1, epochs + 1):
    train(epoch, model, train_dataloader, optimizer, lossfunction)

epoch (1): Train accuracy: 0.6429, loss: 0.114
epoch (2): Train accuracy: 0.6518, loss: 0.079
epoch (3): Train accuracy: 0.6607, loss: 0.056
epoch (4): Train accuracy: 0.4911, loss: 0.052
epoch (5): Train accuracy: 0.4554, loss: 0.051
epoch (6): Train accuracy: 0.7411, loss: 0.047
epoch (7): Train accuracy: 0.6696, loss: 0.045
epoch (8): Train accuracy: 0.6786, loss: 0.043
epoch (9): Train accuracy: 0.7143, loss: 0.042
epoch (10): Train accuracy: 0.7411, loss: 0.041
epoch (11): Train accuracy: 0.7411, loss: 0.039
epoch (12): Train accuracy: 0.7321, loss: 0.038
epoch (13): Train accuracy: 0.7768, loss: 0.037
epoch (14): Train accuracy: 0.7857, loss: 0.036
epoch (15): Train accuracy: 0.8125, loss: 0.036
epoch (16): Train accuracy: 0.8214, loss: 0.035
epoch (17): Train accuracy: 0.7768, loss: 0.034
epoch (18): Train accuracy: 0.8393, loss: 0.034
epoch (19): Train accuracy: 0.8661, loss: 0.033
epoch (20): Train accuracy: 0.8304, loss: 0.032
epoch (21): Train accuracy: 0.8571, loss: 0.032
e

## Implement a simple feature-space adversarial attack

#### Randomly select an test input from test_dataset to perturb and select a target label to attack

In [7]:
#Set the random seed
np.random.seed(42)

#Number of test samples
number_of_samples = len(test_dataset)
#Get a random index from [0, number_of_samples)
index = np.random.randint(number_of_samples)

#Select the test input to perturb
test_input = test_dataset[index][0]
test_input_label = test_dataset[index][1]

#Here, we perform targeted adversarial attack: change the original_label to the target_label
original_label = test_input_label.item() #0-Setosa in this example
target_label = 2 #2-Virginica in this example

print("The index of the test input: ", index)
print("Test input feature vector: ", test_input)
print("Test input original label: ", original_label, iris.target_names[original_label])
print("Target label: ", target_label, iris.target_names[target_label])

The index of the test input:  28
Test input feature vector:  tensor([4.8000, 3.0000, 1.4000, 0.3000])
Test input original label:  0 setosa
Target label:  2 virginica


#### Search for a good instance from test_dataset for guidance

Find those test samples that are predicted as the target label and also closest to decision boundary

In [8]:
target_samples = [] #Test samples that are predicted as the target label
target_probs = []   #The prediction probabilities for each test sample

for sample, true_label in test_dataset:
    predicted_label = predict_label(model, sample.unsqueeze(0)) #unsqueeze(0) is used to ensure batch dimension
    predicted_proabilities = predict_probabilities(model, sample.unsqueeze(0))

    if predicted_label == target_label:
        target_samples.append(sample)
        target_probs.append(predicted_proabilities)

target_samples = torch.stack(target_samples)
target_probs = torch.stack(target_probs)

print(target_samples)
print(target_probs)

#Target samples that are closest to the decision boundary should be those that have the highest probability for the original label
#Rank target samples by highest probability for the original label
closest_to_boundary_indices = torch.argsort(target_probs[:, original_label], descending=True)

k = 5
top_k_boundary_indices = closest_to_boundary_indices[:k]
print(top_k_boundary_indices)

tensor([[7.7000, 2.6000, 6.9000, 2.3000],
        [6.9000, 3.1000, 5.1000, 2.3000],
        [6.2000, 2.2000, 4.5000, 1.5000],
        [6.5000, 3.2000, 5.1000, 2.0000],
        [6.5000, 3.0000, 5.8000, 2.2000],
        [6.4000, 2.8000, 5.6000, 2.2000],
        [6.1000, 3.0000, 4.9000, 1.8000],
        [6.4000, 2.8000, 5.6000, 2.1000],
        [7.9000, 3.8000, 6.4000, 2.0000],
        [6.7000, 3.0000, 5.2000, 2.3000],
        [6.7000, 2.5000, 5.8000, 1.8000],
        [6.8000, 3.2000, 5.9000, 2.3000],
        [6.3000, 2.5000, 5.0000, 1.9000]])
tensor([[1.1105e-04, 1.0091e-01, 8.9898e-01],
        [5.8958e-03, 3.4968e-01, 6.4442e-01],
        [9.5101e-03, 3.6839e-01, 6.2210e-01],
        [9.7916e-03, 4.1667e-01, 5.7353e-01],
        [1.3993e-03, 2.2372e-01, 7.7488e-01],
        [1.3151e-03, 2.0696e-01, 7.9172e-01],
        [1.1523e-02, 4.1935e-01, 5.6913e-01],
        [1.5738e-03, 2.2441e-01, 7.7402e-01],
        [5.0586e-03, 4.5233e-01, 5.4261e-01],
        [3.8024e-03, 2.9794e-01, 6.9826

Calculate the manhattan distance between test_input and target_samples and find those target_samples that are closest to test_input

In [9]:
#Calculate manhattan distance (L1 distance) between test_input and target_samples
distances = torch.sum(torch.abs(test_input - target_samples), dim=1)
print(distances)

#Target samples that are closest to the test_input should be those that have the shortest distance to the test_input
#Rank target samples by shortest distance to the test_input
nearest_neighbors_indices = torch.argsort(distances)
print(nearest_neighbors_indices)

tensor([10.8000,  7.9000,  6.5000,  7.3000,  8.0000,  7.9000,  6.3000,  7.8000,
        10.6000,  7.7000,  8.3000,  8.7000,  7.2000])
tensor([ 6,  2, 12,  3,  9,  7,  1,  5,  4, 10, 11,  8,  0])


Find a good instance for guidance

In [10]:
#The good instance is initialized as the nearest neighbor
good_instance = target_samples[nearest_neighbors_indices[0]]

#A good instance is one of the test_input's nearest neighbors that is among top k target samples close to decision boundary
for i in nearest_neighbors_indices:
    if i in top_k_boundary_indices:
        good_instance = target_samples[i]
        break

print("The found good instance is: ", good_instance)

The found good instance is:  tensor([6.1000, 3.0000, 4.9000, 1.8000])


#### Find a feature to perturb

Once we find a good instance for guidance, we can use greedy search to perturb individual feature from the most important one to the least important one

In [11]:
#Feature importance for the target label can be quantified by the weight vector to predict target label
featue_importances = weight_vector(model, target_label)

#Find the indices of features from the most important to the least important 
featue_importances_indices = torch.argsort(featue_importances, descending=True)

print("Features ordered by importance: ", [iris.feature_names[i] for i in featue_importances_indices])

Features ordered by importance:  ['petal width (cm)', 'petal length (cm)', 'sepal length (cm)', 'sepal width (cm)']


#### Perform the perturbation to generate an adversarial example

There are two ways here: <br>
(1) Directly update the value of the specific feature in the original test input to the value of that in the good instance <br>
(2) When perturbing a feature, fine-tune the perturbation by adding the feature value step by step

Here, let's look at the first way first:

In [12]:
adversarial_example = test_input.clone()
for feature in featue_importances_indices:
    adversarial_example[feature] = good_instance[feature]
    if predict_label(model, adversarial_example.unsqueeze(0)) == target_label:
        break
        
perturbation_size = torch.sum(torch.abs(adversarial_example - test_input))

print("Adversarial attack succeeds!")
print("The original test input is: ", test_input)
print("The target instance is: ", good_instance)
print("The adversarial example is: ", adversarial_example)
print("The size of perturbation is: {:.1f}".format(perturbation_size))

Adversarial attack succeeds!
The original test input is:  tensor([4.8000, 3.0000, 1.4000, 0.3000])
The target instance is:  tensor([6.1000, 3.0000, 4.9000, 1.8000])
The adversarial example is:  tensor([4.8000, 3.0000, 4.9000, 1.8000])
The size of perturbation is: 5.0


Then, let's look at the second way to fine-tune the perturbation:

In [13]:
adversarial_example = test_input.clone()
step_size = 0.1
success_flag = 0

for feature in featue_importances_indices:
    original_val = adversarial_example[feature]
    target_val = good_instance[feature]
    
    #Perturb the feature value towards the good_instance's value
    if original_val < target_val:
        while adversarial_example[feature] < target_val:
            adversarial_example[feature] += step_size
            #Ensure it doesn't go beyond the value in good_instance
            if adversarial_example[feature] > target_val:
                adversarial_example[feature] = target_val
            #Check if the model predicts the target label
            if predict_label(model, adversarial_example.unsqueeze(0)) == target_label:
                success_flag = 1
                break
    
    elif original_val > target_val:
        while adversarial_example[feature] > target_val:
            adversarial_example[feature] -= step_size
            #Ensure it doesn't go beyond the value in good_instance
            if adversarial_example[feature] < target_val:
                adversarial_example[feature] = target_val
            #Check if the model predicts the target label
            if predict_label(model, adversarial_example.unsqueeze(0)) == target_label:
                success_flag = 1
                break   
    if success_flag:
        break

perturbation_size = torch.sum(torch.abs(adversarial_example - test_input))

print("Adversarial attack succeeds!")
print("The original input is: ", test_input)
print("The target instance is: ", good_instance)
print("The adversarial example is: ", adversarial_example)
print("The size of perturbation is: {:.1f}".format(perturbation_size))

Adversarial attack succeeds!
The original input is:  tensor([4.8000, 3.0000, 1.4000, 0.3000])
The target instance is:  tensor([6.1000, 3.0000, 4.9000, 1.8000])
The adversarial example is:  tensor([4.8000, 3.0000, 4.0000, 1.8000])
The size of perturbation is: 4.1
