# Boilerplate

Package installation, loading, and dataloaders. There's also a simple model defined. You can change it your favourite architecture if you want.

In [1]:


import torch
import torch.nn as nn
import torch.optim as optim
import torch.nn.functional as F
import numpy as np
import time
import matplotlib.pyplot as plt

from torchvision import datasets, transforms
# from tensorboardX import SummaryWriter

use_cuda = False
device = torch.device("cuda" if use_cuda else "cpu")
batch_size = 64
np.random.seed(42)
torch.manual_seed(42)


## Dataloaders
train_dataset = datasets.MNIST('mnist_data/', train=True, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))
test_dataset = datasets.MNIST('mnist_data/', train=False, download=True, transform=transforms.Compose(
    [transforms.ToTensor()]
))

train_loader = torch.utils.data.DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(test_dataset, batch_size=batch_size, shuffle=False)

## Simple NN. You can change this if you want. If you change it, mention the architectural details in your report.
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc = nn.Linear(28*28, 200)
        self.fc2 = nn.Linear(200,10)

    def forward(self, x):
        x = x.view((-1, 28*28))
        x = F.relu(self.fc(x))
        x = self.fc2(x)
        return x

class Normalize(nn.Module):
    def forward(self, x):
        return (x - 0.1307)/0.3081

# Add the data normalization as a first "layer" to the network
# this allows us to search for adverserial examples to the real image, rather than
# to the normalized image
model = nn.Sequential(Normalize(), Net())

model = model.to(device)
model.train()

Sequential(
  (0): Normalize()
  (1): Net(
    (fc): Linear(in_features=784, out_features=200, bias=True)
    (fc2): Linear(in_features=200, out_features=10, bias=True)
  )
)

# Implement the Attacks

Functions are given a simple useful signature that you can start with. Feel free to extend the signature as you see fit.

You may find it useful to create a 'batched' version of PGD that you can use to create the adversarial attack.

In [51]:


def fgsm_untarget(model, x, y, eps):
    #TODO: implement this as an intermediate step of PGD
    # Notes: put the model in eval() mode for this function
    model.eval()

    x_prime = x.clone().detach()
    x_prime.requires_grad = True
    output = model(x_prime)
    loss = F.cross_entropy(output, y)
    loss.backward()

    gradient = x_prime.grad
    x_prime = x_prime + eps * gradient.sign()
    #x_prime = torch.clamp(x_prime, x-eps, x+eps)
    x_prime = torch.clamp(x_prime, 0, 1)

    return x_prime

def bim_untargeted(model, x, y, k=10, eps=0.1, eps_step=0.01):
    #TODO: implement this 
    # Notes: put the model in eval() mode for this function
    # x: input image
    # y: ground truth label for x
    # k: steps of FGSM
    # eps: projection region for PGD (note the need for normalization before projection, as eps values are for inputs in [0,1])
    # eps_step: step for one iteration of FGSM
    model.eval()
    x_prime = x.clone().detach()
    for i in range(k):
        x_prime = fgsm_untarget(model, x_prime, y, eps_step)
        # ensure x_primt in the l-infite ball zone
        x_prime = torch.clamp(x_prime, x + eps, x - eps)
        x_prime = torch.clamp(x_prime, 0, 1)

    return x_prime

def pgd_untargeted(model, x, y, k=10, eps=0.1, eps_step=0.01):
    #TODO: implement this 
    # Notes: put the model in eval() mode for this function
    # x: input image
    # y: ground truth label for x
    # k: steps of FGSM
    # eps: projection region for PGD (note the need for normalization before projection, as eps values are for inputs in [0,1])
    # eps_step: step for one iteration of FGSM
    model.eval()
    x_prime = x.clone().detach()
    x_prime += 2 * eps * (torch.rand_like(x_prime) - 0.5) # add a random noise to the initial x_prime
    x_prime = torch.clamp(x_prime, 0, 1)

    for i in range(k):
        x_prime = fgsm_untarget(model, x_prime, y, eps_step)
        # ensure x_primt in the l-infite ball zone
        x_prime = torch.clamp(x_prime, x + eps, x - eps)
        x_prime = torch.clamp(x_prime, 0, 1)

    return x_prime

# Implement Adversarial Training

In [43]:
def train_model(model, num_epochs, enable_defense=True, attack='pgd', eps=0.1, k=10):
    # TODO: implement this function that trains a given model on the MNIST dataset.
    # this is a general-purpose function for both standard training and adversarial training.
    # (toggle enable_defense parameter to switch between training schemes)
    opt = optim.SGD(model.parameters(), lr=0.1, momentum=0.95)
    
    for epoch in range(num_epochs):
        model.train()
        for  data, target in train_loader:
            data = data.to(device)
            target = target.to(device)
            if enable_defense:
                if attack == 'pgd':
                    data_adv = pgd_untargeted(model, data, target, k=k, eps=eps)
                else:
                    print('Attack not implemented, skipping')
                data = data_adv
            model.train()
            opt.zero_grad()
            output = model(data)
            loss = F.cross_entropy(output, target)
            loss.backward()
            opt.step()

In [47]:
def test_model_on_attacks(model, attack='pgd', eps=0.1):
    # TODO: implement this function to test the robust accuracy of the given model
    # use pgd_untargeted() within this function
    # model.eval()
    
    # correct = 0
    # tot = 0
    # for x, y in test_loader:
    #     if attack == 'pgd':
    #         data_adv = pgd_untargeted(model, x, y, eps=eps)
    #     else:
    #         print(f"Unsupported attack type: {attack}, skipping")
    #         continue
    #     output = model(data_adv)
    #     pred = output.argmax(dim=1, keepdim=True)
    #     correct += pred.eq(y.view_as(pred)).sum()
    #     tot += y.size(0)

    # robust_accuracy = correct / tot
    # print(f'Robust Accuracy: {robust_accuracy}')
    # return robust_accuracy
    model.eval()
    
    correct = 0         # Correct predictions on clean data
    correct_under_attack = 0     # Correct predictions on adversarial data
    correct_both = 0
    tot = 0             # Total number of samples
    
    for x, y in test_loader:
        with torch.no_grad():
            output = model(x)
            pred = output.argmax(dim=1, keepdim=True)
            correct += pred.eq(y.view_as(pred)).sum()
        
        # Generate adversarial examples
        if attack == 'pgd':
            x_attack = pgd_untargeted(model, x, y, eps=eps)
        elif attack == 'fgsm':
            x_attack = fgsm_untarget(model, x, y, eps)
        elif attack == 'bim':
            x_attack = bim_untargeted(model, x, y, eps=eps)
        else:
            print(f"Unsupported attack type: {attack}, skipping")
            continue
        
        # Evaluate model on adversarial examples
        with torch.no_grad():
            output_attack = model(x_attack)
            pred_attack = output_attack.argmax(dim=1, keepdim=True)
            correct_under_attack += pred_attack.eq(y.view_as(pred_attack)).sum().item()
        correct_both += sum(pred.eq(y.view_as(pred)) & pred_attack.eq(y.view_as(pred_attack)))
        tot += y.size(0)
    
    standard_accuracy = correct / tot
    roboust_accuracy = correct_both[0] / tot
    adversial_accuracy = correct_under_attack / tot
 
    
    print(f'Standard Accuracy: {standard_accuracy}')
    print(f'Robust Accuracy: {roboust_accuracy}')
    print(f"Adversial Accuracy:{adversial_accuracy}")
    

# Study Accuracy, Quality, etc.

Compare the various results and report your observations on the submission.

In [5]:
## train the original model
model = nn.Sequential(Normalize(), Net())
model = model.to(device)
model.train()

train_model(model, 20, False)
torch.save(model.state_dict(), 'weights.pt')

In [39]:
## PGD attack
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    print(eps)
    test_model_on_attacks(model, attack='pgd', eps=eps)

  model.load_state_dict(torch.load('weights.pt'))


0.05
Standard Accuracy: 0.5360999703407288
Robust Accuracy: 0.5293999910354614
Adversial Accuracy:0.5301
0.1
Standard Accuracy: 0.5360999703407288
Robust Accuracy: 0.5206999778747559
Adversial Accuracy:0.522
0.15
Standard Accuracy: 0.5360999703407288
Robust Accuracy: 0.5105999708175659
Adversial Accuracy:0.5123
0.2
Standard Accuracy: 0.5360999703407288
Robust Accuracy: 0.498199999332428
Adversial Accuracy:0.5002


In [44]:
## PGD based adversarial training
model = nn.Sequential(Normalize(), Net())
eps = 0.1
train_model(model, 20, True, 'pgd', eps, k=10)
torch.save(model.state_dict(), f'weights_AT_{eps}.pt')

In [48]:
## PGD attack
model = nn.Sequential(Normalize(), Net())
model.load_state_dict(torch.load('weights_AT_0.1.pt'))

for eps in [0.05, 0.1, 0.15, 0.2]:
    print(eps)
    test_model_on_attacks(model, attack='fgsm', eps=eps)

  model.load_state_dict(torch.load('weights_AT_0.1.pt'))


0.05
Standard Accuracy: 0.7522000074386597
Robust Accuracy: 0.7070000171661377
Adversial Accuracy:0.7074
0.1
Standard Accuracy: 0.7522000074386597
Robust Accuracy: 0.7050999999046326
Adversial Accuracy:0.7056
0.15
Standard Accuracy: 0.7522000074386597
Robust Accuracy: 0.6887000203132629
Adversial Accuracy:0.6896
0.2
Standard Accuracy: 0.7522000074386597
Robust Accuracy: 0.6269999742507935
Adversial Accuracy:0.6276
