## Student Number: 99101321

## Name: Mohammad Taslimi

# **Notes**
In this notebook you are going to implement [NES](https://arxiv.org/pdf/1804.08598.pdf) blackbox attack and test in on Cifar10. First, you must implement the provided functions. Then you must test the attack and report the average number of queries and the success rate of the attack. You can define as many additional functions as you want. If you want to alter the structure of the provided code, please do as minimally as possible.

# **Dependencies**

In [None]:
from google.colab import drive
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch import Tensor
from typing import Type
import numpy as np
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from torch.autograd import Variable
from torch.utils.data import TensorDataset, DataLoader
from torchvision.models import resnet18

#**Defining the Model and Dataloader**
Here the link to a ResNet18 checkpoint is provided. Please use this checkpoint.

In [None]:
batch_size = 128
transform = transforms.Compose([transforms.RandomCrop(32, padding=4), transforms.RandomHorizontalFlip(), transforms.ToTensor(),])

trainset = torchvision.datasets.CIFAR10(root='/content/cifar10/', train = True, download = True, transform = transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size = batch_size, shuffle = True)

testset = torchvision.datasets.CIFAR10(root='/content/cifar10/', train = False, download = True, transform = transforms.Compose([transforms.ToTensor(),]))
testloader = torch.utils.data.DataLoader(testset, batch_size = batch_size, shuffle = False)

classes = ('plane', 'car', 'bird', 'cat', 'deer', 'dog', 'frog', 'horse', 'ship', 'truck')

Downloading https://www.cs.toronto.edu/~kriz/cifar-10-python.tar.gz to /content/cifar10/cifar-10-python.tar.gz


100%|██████████| 170498071/170498071 [02:58<00:00, 953482.80it/s]


Extracting /content/cifar10/cifar-10-python.tar.gz to /content/cifar10/
Files already downloaded and verified


In [None]:
class ResNet18(nn.Module):
  def __init__(self, num_cls):
    super().__init__()
    self.conv = nn.Sequential(
        *list(resnet18(weights=None).children())[:-2])

    self.fc = nn.Linear(512, num_cls)

  def forward(self, x):
    x = self.conv(x)
    x = torch.flatten(x, start_dim=1)
    logits = self.fc(x)

    return logits

In [None]:
!gdown 1-k2y76KAtvFxXDVPMDIzXtVa0sDgF0MQ

Downloading...
From (original): https://drive.google.com/uc?id=1-k2y76KAtvFxXDVPMDIzXtVa0sDgF0MQ
From (redirected): https://drive.google.com/uc?id=1-k2y76KAtvFxXDVPMDIzXtVa0sDgF0MQ&confirm=t&uuid=707a5a65-a169-463b-a5ee-0212558fa8c9
To: /content/resnet18_cifar10_model.pt
100% 44.8M/44.8M [00:01<00:00, 29.2MB/s]


In [None]:
device = 'cuda' if torch.cuda.is_available() else 'cpu'

CIFAR10_model_PATH = "/content/resnet18_cifar10_model.pt"
state_dict = torch.load(CIFAR10_model_PATH)


model = ResNet18(num_cls=10).to(device)
model.load_state_dict(state_dict)


total = 0
correct = 0
with torch.no_grad():
    for data in testloader:
        images, labels = data
        images = images.to(device)
        labels = labels.to(device)
        outputs = model(images)
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print(f'Accuracy of the model on the test set: {100 * correct / total}')

Accuracy of the model on the test set: 83.87


# **Natural Evolutionary Strategies (NES)**

complete the functions in the cell below.

In [None]:
def estimate_grad(model, images, labels, search_variance, n):
    """
    NES Gradient Estimate

    inputs:
            - model: the target model (only used for computing the loss)
            - images: Tensor containing images. size: [number of images, 3, image_dim, image_dim]
            - labels: Tensor containing the original labels of images.
            - search_variance: sigma
            - n: number of samples to estimate the gradient

    outputs:
            - g: estimated gradients. has similar shape as images.

    Guide:  - Define g with initial value 0.
            - for n iterations do:
                - Define a tensor of random Gaussian noise with the input image shape (use torch.randn)
                - Divide this tensor by (sqrt(image_dim*image_dim*3)). (we do this due to the properties of gaussian distribution in high-dimensional space.)
                - Compute the gradient of the loss with finite difference method using the defined noise tensor.
                - Add the comupted value to g
    """

    ###################################
    #Your Code Goes Here (15 pt.)

    g = torch.zeros_like(images)

    _, _, image_dim, _ = images.shape
    model.eval()
    with torch.no_grad():
      for _ in range(n):
          noise = torch.randn_like(images) / torch.sqrt(torch.tensor(image_dim * image_dim * 3.0))

          perturbed_images_plus = images + search_variance * noise
          perturbed_images_minus = images - search_variance * noise

          output_plus = model(perturbed_images_plus)
          output_minus = model(perturbed_images_minus)

          loss_plus = torch.nn.functional.cross_entropy(output_plus, labels)
          loss_minus = torch.nn.functional.cross_entropy(output_minus, labels)

          gradient = ((loss_plus - loss_minus) * (noise)) / (search_variance)

          g += gradient

    return g



def one_iteration_pgd_attack(grad, images, original_images, args):
     perturbation = args['epsilon'] * grad.sign()
     perturbed_images = images + perturbation
     perturbed_images = torch.clamp(perturbed_images, original_images - args['delta'], original_images + args['delta'])
     perturbed_images = torch.clamp(perturbed_images, 0, 1)

     return perturbed_images

def generate_attacks(model, images, labels, args):
    '''
    The process for generating blackbox adversarial examples. Implement for l_infty attack.

    inputs:
            - model: The target model
            - images: Tensor containing images of a batch. size: [batch_size, 3, image_dim, image_dim]
            - labels: Tensor containing the original labels of images. size: [batch_size, num_classes]

    outputs:
            - attacks: Must have the same shape as images.
            - total_queries: number of queries till a successful attack for each sample
            - success: Flag showing if each attack was successful or not.
    '''

    batch_size = images.shape[0]
    total_queries = torch.zeros(batch_size)
    success = torch.zeros(batch_size)
    attacks = images.clone()
    attacks = attacks.to(device)
    success = success.to(device)
    total_queries = total_queries.to(device)

    model.eval()
    with torch.no_grad():
      while not torch.any(total_queries > args["max_queries"]):
          unsuccessful_indices = (success == 0).nonzero(as_tuple=True)[0]
          unsuccessful_indices = unsuccessful_indices.to(device)
          if len(unsuccessful_indices) == 0:
              break
          unsuccessful_images = attacks[unsuccessful_indices]
          unsuccessful_labels = labels[unsuccessful_indices]

          grad = estimate_grad(model, unsuccessful_images, unsuccessful_labels, args['search_variance'], args['gradient_num_samples'])

          unsuccessful_attacks = one_iteration_pgd_attack(grad, unsuccessful_images, images[unsuccessful_indices], args)

          attacks[unsuccessful_indices] = unsuccessful_attacks

          outputs = model(attacks)
          _, predicted = torch.max(outputs, 1)

          successful_attacks = (predicted != labels).float()
          success = torch.max(success, successful_attacks)

          total_queries[unsuccessful_indices] += args['gradient_num_samples']*2

    return attacks, total_queries, success


# **Attack Report**

Report the success rate and average number of qureies for blackbox $l_∞$ PGD attacks for $\sigma \in \{0.001, 0.01\}$.



In [None]:
from torch.utils.data import Subset
from sklearn.model_selection import train_test_split

# Define the arguments
args = {'max_queries': 1000, # maximum number of queries for one sample
        'search_variance': 0.01,  # sigma (will be modified for each experiment)
        'epsilon': 0.01,    # step size in pgd
        'delta': 0.05,      # radius on which the attack must be projected
        'batch_size': 10,
        'gradient_num_samples': 15, # number of samples used for estimating the gradients
        }

testset = torchvision.datasets.CIFAR10(root='./cifar10/', train=False, download=True, transform=transforms.Compose([transforms.ToTensor()]))

test_indices = list(range(len(testset)))
_, subset_indices = train_test_split(test_indices, test_size=0.2, random_state=42)
subset_testset = Subset(testset, subset_indices)

testloader = torch.utils.data.DataLoader(subset_testset, batch_size=args['batch_size'], shuffle=False)


def run_attack_for_sigma(sigma):
    args['search_variance'] = sigma
    total_success = 0
    total_queries = 0
    total_samples = 0
    with torch.no_grad():
      for images, labels in testloader:
          images, labels = images.to(device), labels.to(device)
          attacks, queries, success = generate_attacks(model, images, labels, args)
          total_success += success.sum().item()
          total_queries += queries.sum().item()
          total_samples += len(labels)

    success_rate = (total_success / total_samples) * 100
    average_queries = total_queries / total_samples

    return success_rate, average_queries

sigmas = [0.01, 0.001]
results = {}

for sigma in sigmas:
    success_rate, average_queries = run_attack_for_sigma(sigma)
    results[sigma] = {'success_rate': success_rate, 'average_queries': average_queries}


for sigma, result in results.items():
    print(f"Sigma: {sigma}")
    print(f"Success Rate: {result['success_rate']:.2f}%")
    print(f"Average Number of Queries: {result['average_queries']:.2f}")

####################################

Files already downloaded and verified
Sigma: 0.01
Success Rate: 54.95%
Average Number of Queries: 611.53
Sigma: 0.001
Success Rate: 55.50%
Average Number of Queries: 610.12
