<a href="https://colab.research.google.com/github/Gosiuniunia/uczenie-glebokie/blob/main/attacks.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

Źródła:

Gu, Jindong, et al. "A survey on transferability of adversarial examples across deep neural networks." arXiv preprint arXiv:2310.17626 (2023).

Podder, Rakesh, and Sudipto Ghosh. "Impact of white-box adversarial attacks on convolutional neural networks." 2024 International Conference on Emerging Trends in Networks and Computer Communications (ETNCC). IEEE, 2024.

Qin, Yunxiao, et al. "Training meta-surrogate model for transferable adversarial attack." Proceedings of the AAAI conference on artificial intelligence. Vol. 37. No. 8. 2023.

SU, Jiawei; VARGAS, Danilo Vasconcellos; SAKURAI, Kouichi. One pixel attack for fooling deep neural networks. IEEE Transactions on Evolutionary Computation, 2019, 23.5: 828-841.

WONG, Eric; RICE, Leslie; KOLTER, J. Zico. Fast is better than free: Revisiting adversarial training. arXiv preprint arXiv:2001.03994, 2020.

MOOSAVI-DEZFOOLI, Seyed-Mohsen; FAWZI, Alhussein; FROSSARD, Pascal. Deepfool: a simple and accurate method to fool deep neural networks. In: Proceedings of the IEEE conference on computer vision and pattern recognition. 2016. p. 2574-2582.



In [1]:
# Colab-only: mount Google Drive at /content/drive (the checkpoint and output
# paths configured later in this notebook live under /content/drive/MyDrive).
from google.colab import drive
drive.mount('/content/drive')

Mounted at /content/drive


In [2]:
!pip install torchattacks
!pip install sewar

Collecting torchattacks
  Downloading torchattacks-3.5.1-py3-none-any.whl.metadata (927 bytes)
Collecting requests~=2.25.1 (from torchattacks)
  Downloading requests-2.25.1-py2.py3-none-any.whl.metadata (4.2 kB)
Collecting chardet<5,>=3.0.2 (from requests~=2.25.1->torchattacks)
  Downloading chardet-4.0.0-py2.py3-none-any.whl.metadata (3.5 kB)
Collecting idna<3,>=2.5 (from requests~=2.25.1->torchattacks)
  Downloading idna-2.10-py2.py3-none-any.whl.metadata (9.1 kB)
Collecting urllib3<1.27,>=1.21.1 (from requests~=2.25.1->torchattacks)
  Downloading urllib3-1.26.20-py2.py3-none-any.whl.metadata (50 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m50.1/50.1 kB[0m [31m2.3 MB/s[0m eta [36m0:00:00[0m
Downloading torchattacks-3.5.1-py3-none-any.whl (142 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m142.0/142.0 kB[0m [31m7.5 MB/s[0m eta [36m0:00:00[0m
[?25hDownloading requests-2.25.1-py2.py3-none-any.whl (61 kB)
[2K   [90m━━━━━━━━━━━━━━━━━━

In [3]:
import torch
import torch.nn as nn
import torchvision
import torchvision.transforms as transforms
import matplotlib.pyplot as plt
import numpy as np
import random
import os
from PIL import Image
import sewar
import pandas as pd
from torch.utils.data import DataLoader, ConcatDataset, Dataset

# Adversarial attacks PyTorch: https://github.com/Harry24k/adversarial-attacks-pytorch/tree/master
from torchattacks import PGD, FGSM, CW, AutoAttack, DeepFool, OnePixel

# Seed every global RNG the imported libraries expose so reruns are repeatable.
seed = 42
random.seed(seed)
# NumPy keeps its own global generator, independent of `random` and torch.
np.random.seed(seed)
# Kept as the last expression so the cell still displays the torch Generator.
torch.manual_seed(seed)

<torch._C.Generator at 0x7b3d1c2c06d0>

In [4]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
if torch.cuda.is_available():
    device = torch.device("cuda")
else:
    device = torch.device("cpu")
device

device(type='cuda')

## VGG and ResNet architecture

In [5]:
# VGG model architecture
class VGG16(nn.Module):
    """VGG-style classifier assembled from 3x3 Conv -> BatchNorm -> ReLU stacks.

    Five convolutional blocks are registered, followed by a three-layer fully
    connected classifier whose first layer takes 256*4*4 features. Blocks 1-3
    end with a 2x2 max-pool; blocks 4 and 5 do not pool.

    Args:
        num_classes: size of the final linear layer's output (default 10).
    """

    @staticmethod
    def _conv_stack(channels, pool):
        """Return a Sequential of Conv-BN-ReLU triples, optionally max-pooled.

        `channels` holds the input width followed by the output width of each
        convolution, e.g. [3, 32, 32] builds two convolutions (3->32, 32->32).
        """
        layers = []
        for in_ch, out_ch in zip(channels[:-1], channels[1:]):
            layers.append(nn.Conv2d(in_ch, out_ch, 3, padding=1))
            layers.append(nn.BatchNorm2d(out_ch))
            layers.append(nn.ReLU(inplace=True))
        if pool:
            layers.append(nn.MaxPool2d(2))
        return nn.Sequential(*layers)

    def __init__(self, num_classes=10):
        super().__init__()

        self.block1 = self._conv_stack([3, 32, 32], pool=True)
        self.block2 = self._conv_stack([32, 64, 64], pool=True)
        self.block3 = self._conv_stack([64, 128, 128, 128], pool=True)
        self.block4 = self._conv_stack([128, 256, 256, 256], pool=False)
        # block5 is registered (its weights appear in the state dict) even
        # though forward() does not apply it.
        self.block5 = self._conv_stack([256, 256, 256, 256], pool=False)

        self.classifier = nn.Sequential(
            nn.Linear(256 * 4 * 4, 512),
            nn.ReLU(True),
            nn.Dropout(0.2),
            nn.Linear(512, 512),
            nn.ReLU(True),
            nn.Dropout(0.2),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Apply blocks 1-4, flatten per sample and classify (block5 is skipped)."""
        features = x
        for stage in (self.block1, self.block2, self.block3, self.block4):
            features = stage(features)
        flat = torch.flatten(features, 1)
        return self.classifier(flat)


In [6]:
# ResNet18 model architecture
class BasicBlock(nn.Module):
    """Residual block: two 3x3 Conv-BN layers plus a shortcut, then ReLU.

    The shortcut is an empty Sequential (identity) unless the stride is not 1
    or the channel count changes, in which case it is a 1x1 convolution with
    batch norm so both branches produce tensors of the same shape.

    Args:
        input_channels: channels of the incoming feature map.
        output_channels: channels produced by the block.
        stride: stride of the first convolution and of the projection shortcut.
    """
    expansion = 1

    def __init__(self, input_channels, output_channels, stride=1):
        super().__init__()

        conv3x3 = dict(kernel_size=3, padding=1, bias=False)
        self.main_path = nn.Sequential(
            nn.Conv2d(input_channels, output_channels, stride=stride, **conv3x3),
            nn.BatchNorm2d(output_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(output_channels, output_channels, stride=1, **conv3x3),
            nn.BatchNorm2d(output_channels),
        )

        # A projection is only needed when the main path changes the shape.
        shape_changes = stride != 1 or input_channels != output_channels
        if shape_changes:
            projection = [
                nn.Conv2d(input_channels, output_channels,
                          kernel_size=1, stride=stride, bias=False),
                nn.BatchNorm2d(output_channels),
            ]
        else:
            projection = []
        self.shortcut = nn.Sequential(*projection)

        self.relu = nn.ReLU(inplace=True)

    def forward(self, x):
        """Return ReLU(main_path(x) + shortcut(x))."""
        transformed = self.main_path(x)
        residual = self.shortcut(x)
        return self.relu(transformed + residual)


class ResNet18(nn.Module):
    """ResNet-18 variant with a 3x3 stride-1 stem and no initial max-pool.

    Four stages of two BasicBlocks each (64, 128, 256, 512 channels; stages
    2-4 start with a stride-2 block) are followed by global average pooling
    and a linear classifier.

    Args:
        input_channels: channels of the input images (default 3).
        num_classes: size of the final linear layer's output (default 10).
    """

    def __init__(self, input_channels=3, num_classes=10):
        super().__init__()

        self.stem = nn.Sequential(
            nn.Conv2d(input_channels, 64, kernel_size=3,
                      stride=1, padding=1, bias=False),
            nn.BatchNorm2d(64),
            nn.ReLU(inplace=True),
        )

        def make_stage(in_ch, out_ch, first_stride):
            # Only the first block of a stage may change resolution / width.
            return nn.Sequential(
                BasicBlock(input_channels=in_ch, output_channels=out_ch, stride=first_stride),
                BasicBlock(input_channels=out_ch, output_channels=out_ch, stride=1),
            )

        self.layer1 = make_stage(64, 64, 1)
        self.layer2 = make_stage(64, 128, 2)
        self.layer3 = make_stage(128, 256, 2)
        self.layer4 = make_stage(256, 512, 2)

        self.classifier = nn.Sequential(
            nn.AdaptiveAvgPool2d((1, 1)),
            nn.Flatten(),
            nn.Linear(512, num_classes),
        )

    def forward(self, x):
        """Run the stem, the four residual stages and the classifier in order."""
        out = x
        pipeline = (self.stem, self.layer1, self.layer2,
                    self.layer3, self.layer4, self.classifier)
        for module in pipeline:
            out = module(out)
        return out


## Attack generation function

In [9]:
def generate_adversarial_images(source_model, dataset, mean, std, source_name,
                                train=False, batch_size=256, output_dir=None):
    """
    Generate adversarial images with every configured attack and save them as PNGs.

    The dataset is processed in batches to avoid CUDA OOM. For each attack the
    adversarial batches are collected on the CPU, concatenated, and written with
    save_images() into "<output_dir>/<train|test>_<source_name>_<attack name>".

    Args:
        source_model: model the attacks are crafted against.
        dataset: dataset yielding (image, label) pairs.
        mean, std: per-channel statistics forwarded to each attack's
            set_normalization_used().
        source_name: tag of the source model used in the output folder name.
        train: if True the folder prefix is "train", otherwise "test".
        batch_size: number of images attacked at once.
        output_dir: root folder for the generated images. Defaults to the
            notebook-level `adv_images_folder_path`, so existing calls behave
            as before while the function no longer *requires* that global.
    """
    # Local import: APGD is not part of the notebook's top-level torchattacks import.
    from torchattacks import APGD

    if output_dir is None:
        output_dir = adv_images_folder_path

    attack_configs = [
        {"name": "FGSM", "atk": FGSM(source_model, eps=4/255)},
        {"name": "PGD", "atk": PGD(source_model, eps=4/255, alpha=2/255, steps=20, random_start=True)},
        {"name": "CW", "atk": CW(source_model, c=1, steps=300, lr=0.01)},
        # torchattacks' AutoAttack takes its component list from the `version`
        # argument and never reads an `attacks_to_run` attribute, so the former
        # post-construction assignment `atk.attacks_to_run = ['apgd-ce']` had no
        # effect and the whole ensemble was run. The intended APGD cross-entropy
        # component is instantiated directly instead (library defaults for the
        # remaining settings). The "AutoAttack" name is kept so output folder
        # names stay unchanged.
        {"name": "AutoAttack", "atk": APGD(source_model, norm="Linf", eps=4/255, loss="ce")},
        {"name": "DeepFool", "atk": DeepFool(source_model, steps=50, overshoot=0.02)},
        {"name": "OnePixel", "atk": OnePixel(source_model, pixels=1, steps=50, popsize=20)}
    ]

    # shuffle=False keeps the saved image index aligned with the dataset index.
    loader = DataLoader(dataset, batch_size=batch_size, shuffle=False)
    folder_prefix = "train" if train else "test"

    for config in attack_configs:
        attack_name = config["name"]
        atk = config["atk"]
        print(f"Generating {attack_name} attacks")

        # NOTE(review): set_normalization_used() tells torchattacks that the
        # batches passed to the attack are already normalized with (mean, std).
        # The datasets built in this notebook apply ToTensor only, and
        # save_images() multiplies the result by 255 (i.e. expects [0, 1]) —
        # TODO confirm this matches how the source models were trained.
        atk.set_normalization_used(mean, std)
        adv_images_list = []
        adv_labels_list = []

        for batch_images, batch_labels in loader:
            batch_images = batch_images.to(device)
            batch_labels = batch_labels.to(device)

            adv_batch = atk(batch_images, batch_labels)

            # Move results off the GPU immediately so only one batch lives there.
            adv_images_list.append(adv_batch.detach().cpu())
            adv_labels_list.append(batch_labels.cpu())

            del batch_images, batch_labels, adv_batch

        adv_images_all = torch.cat(adv_images_list)
        adv_labels_all = torch.cat(adv_labels_list)

        save_images(adv_images_all, adv_labels_all, output_dir,
                    f"{folder_prefix}_{source_name}_{attack_name}")


## Adversarial images saving and loading functions

In [7]:
# Save images to the designated folder

def save_images(images, labels, path, folder_name=None):
    """
    Save images with their labels as PNG files.

    Each image is written as adv_image_{index}_label_{label}.png, where index
    is the position of the image in `images`.

    Args:
        images: iterable of channel-first (C, H, W) tensors with values in
            [0, 1]; values outside that range are clipped.
        labels: iterable of labels (anything `int()` accepts), one per image.
        path: destination folder; created if missing.
        folder_name: optional sub-folder of `path` to write into.
    """
    # Explicit check instead of a bare `except: pass` around os.path.join, which
    # also hid unrelated errors (e.g. a non-string folder name).
    if folder_name is not None:
        path = os.path.join(path, folder_name)

    os.makedirs(path, exist_ok=True)

    for i, (image_tensor, label) in enumerate(zip(images, labels)):
        # (C, H, W) -> (H, W, C), the layout PIL expects.
        img = image_tensor.detach().permute(1, 2, 0).cpu().numpy()
        # Round to the nearest 8-bit level; plain truncation biases every pixel
        # downwards (e.g. 254.7 would be stored as 254).
        img = np.rint(img * 255).clip(0, 255).astype(np.uint8)
        image_pil = Image.fromarray(img)

        filename = f"adv_image_{i}_label_{int(label)}.png"
        filepath = os.path.join(path, filename)
        image_pil.save(filepath)

In [8]:
# Load all of the images in specified folder to torch dataset

class AdversarialImageDataset(Dataset):
    """
    Dataset over a folder of PNG files written by save_images().

    File names follow adv_image_{index}_label_{label}.png. Samples are ordered
    by the numeric {index}, so position i in this dataset corresponds to the
    i-th image that was saved (a plain string sort would place index 10 before
    index 2). Files whose name does not contain a numeric index are placed
    after the indexed ones, in alphabetical order.

    Args:
        root_dir: folder containing the PNG files.
        transform: optional callable applied to the loaded PIL image.
    """

    def __init__(self, root_dir, transform=None):
        self.root_dir = root_dir
        self.transform = transform

        png_files = [name for name in os.listdir(root_dir) if name.endswith(".png")]
        self.image_files = sorted(png_files, key=self._sort_key)

    @staticmethod
    def _sort_key(file_name):
        """Sort key that orders files by the integer index embedded in the name."""
        parts = file_name.split("_")
        try:
            # "adv_image_{index}_label_{label}.png" -> parts[2] is the index.
            return (0, int(parts[2]), file_name)
        except (IndexError, ValueError):
            return (1, 0, file_name)

    def __len__(self):
        return len(self.image_files)

    def __getitem__(self, idx):
        """Return (image, label); the label is parsed from the file name."""
        img_name = self.image_files[idx]
        path = os.path.join(self.root_dir, img_name)

        # Context manager so the underlying file is closed once converted.
        with Image.open(path) as handle:
            image = handle.convert("RGB")
        # The label is the last "_"-separated token, without the extension.
        label = int(img_name.split("_")[-1].split(".")[0])

        if self.transform:
            image = self.transform(image)

        return image, label

## Evaluation metrics

In [None]:
# Attack efficiency metrics

def attack_success_rate(labels, preds):
    """
    Computes the Attack Success Rate (ASR).

    Returns the percentage of samples whose prediction differs from the label,
    i.e. how many attacks succeeded. Returns 0.0 for empty input, matching the
    zero-denominator convention of the other metrics in this section.

    Args:
        labels: 1-D tensor of ground-truth class indices.
        preds: 1-D tensor of predicted class indices on adversarial images.
    """
    total = len(labels)
    if total == 0:
        return 0.0

    # Compare on the labels' device instead of relying on a notebook-level
    # `device` global (which also forced a needless transfer for CPU tensors).
    preds = preds.to(labels.device)
    succeeded = (preds != labels).sum().item()
    return 100.0 * succeeded / total

def fooling_rate(labels, source_pred, target_pred, clean_target_pred):
    """
    Computes the fooling rate for transfer attacks, as a percentage.

    - source_pred: predictions of the SOURCE model on adversarial images
    - target_pred: predictions of the TARGET model on adversarial images
    - clean_target_pred: predictions of the TARGET model on clean images

    The fooling rate is: Q / P
    where:
        P = number of samples that fooled the source model
        Q = number of samples that fooled the source model, also fooled the
            target model, and whose clean version the target classified
            correctly

    Note that P is not restricted to samples the target classified correctly
    on clean data. Returns 0.0 when P is 0.
    """
    # Align everything on the labels' device rather than a notebook-level
    # `device` global.
    dev = labels.device
    source_pred = source_pred.to(dev)
    target_pred = target_pred.to(dev)
    clean_target_pred = clean_target_pred.to(dev)

    fool_source = source_pred != labels
    fool_target = target_pred != labels
    correct_target_clean = clean_target_pred == labels

    P = fool_source.sum().item()
    Q = (fool_source & fool_target & correct_target_clean).sum().item()

    return 100.0 * Q / P if P > 0 else 0.0

def same_mistake_rate(labels, source_pred, target_pred, clean_target_pred):
    """
    Computes how often the source and target models make the SAME wrong
    prediction on adversarial samples, as a percentage.

    Only samples satisfying both conditions are considered:
    - the target model classified the CLEAN image correctly
    - the target model is fooled on the adversarial image

    Among those, the result is the share with source_pred == target_pred
    (the same incorrect class). Returns 0.0 when no sample qualifies.
    """
    # Align everything on the labels' device rather than a notebook-level
    # `device` global.
    dev = labels.device
    source_pred = source_pred.to(dev)
    target_pred = target_pred.to(dev)
    clean_target_pred = clean_target_pred.to(dev)

    correct_target_clean = clean_target_pred == labels
    fool_target = target_pred != labels
    same_mistake = source_pred == target_pred

    mask = fool_target & correct_target_clean
    denom = mask.sum().item()

    if denom == 0:
        return 0.0

    num = (mask & same_mistake).sum().item()
    return 100.0 * num / denom

# Perturbation quality metrics

def ssim(images, adv_images):
    """
    Computes the mean Structural Similarity Index (SSIM)
    between original and adversarial images.

    SSIM measures perceptual similarity considering:
    - luminance
    - contrast
    - structure

    SSIM = 1 means identical images.
    Lower values indicate stronger or more visible perturbations.

    Args:
        images: batch of original images, shape (N, C, H, W).
        adv_images: batch of adversarial images, same batch size.
            Floating-point tensors are interpreted as [0, 1] images (the
            convention save_images() uses) and rescaled to 8-bit; integer
            tensors are taken as 0-255 values already.

    Returns:
        Mean SSIM over the batch as a float (NaN for an empty batch).

    Raises:
        ValueError: if the two batches differ in size.
    """
    if images.size(0) != adv_images.size(0):
        raise ValueError(
            f"Batch size mismatch: {images.size(0)} original vs "
            f"{adv_images.size(0)} adversarial images"
        )
    if images.size(0) == 0:
        return float("nan")

    def to_uint8_hwc(image):
        # (C, H, W) -> (H, W, C) NumPy array for sewar.
        array = image.permute(1, 2, 0).detach().cpu().numpy()
        if np.issubdtype(array.dtype, np.floating):
            # Casting a [0, 1] float image straight to uint8 zeroes almost
            # every pixel, so SSIM would be computed on blank images.
            array = np.rint(array * 255)
        return array.clip(0, 255).astype(np.uint8)

    ssim_list = []
    for i in range(images.size(0)):
        ssim_val, _ = sewar.ssim(to_uint8_hwc(images[i]), to_uint8_hwc(adv_images[i]))
        ssim_list.append(ssim_val)

    return float(np.mean(ssim_list))


# Attack generation process
The procedure is as follows:

1. Attacks are generated on the CIFAR10 test set using the source model and then transferred to the target model.

2. For adversarial training, we generate attacks on the CIFAR10 train set and then use them to train the model on which they were generated.

3. Attacks generated on the original models are used for evaluation. When calculating the metrics, we consider only those adversarial images whose originals were correctly classified by the respective models.

In [11]:
# All persistent artefacts live under the mounted Drive root; the
# "/content/drive/MyDrive/" prefix is mandatory.
drive_root = "/content/drive/MyDrive"
vgg_model_path = f"{drive_root}/vgg_epoch_80.pth"
resnet_model_path = f"{drive_root}/model_ResNet18_cifar10_20251112.pth"
adv_images_folder_path = f"{drive_root}/adversarial_images"

In [10]:
# Original CIFAR10 Dataset
# Per-channel statistics; they are not applied by the transform below but are
# passed on to generate_adversarial_images().
mean = [0.4914, 0.4822, 0.4465]
std = [0.2470, 0.2435, 0.2616]

# Tensor conversion only — no normalization or augmentation.
transform = transforms.Compose([transforms.ToTensor()])

# Both splits share everything except the `train` flag.
cifar_kwargs = dict(root="./data", download=True, transform=transform)
trainset = torchvision.datasets.CIFAR10(train=True, **cifar_kwargs)
testset = torchvision.datasets.CIFAR10(train=False, **cifar_kwargs)

100%|██████████| 170M/170M [00:01<00:00, 90.1MB/s]


In [12]:
# VGG model: restore the trained weights and switch to inference mode
vgg_state_dict = torch.load(vgg_model_path, map_location=device)
model_vgg = VGG16(num_classes=10)
model_vgg.load_state_dict(vgg_state_dict)
model_vgg = model_vgg.to(device)
model_vgg.eval()

VGG16(
  (block1): Sequential(
    (0): Conv2d(3, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(32, 32, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(32, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (block2): Sequential(
    (0): Conv2d(32, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
    (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (5): ReLU(inplace=True)
    (6): MaxPool2d(kernel_size=2, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  

In [13]:
# ResNet model: restore the trained weights and switch to inference mode
resnet_state_dict = torch.load(resnet_model_path, map_location=device)
model_resnet = ResNet18(3, 10)
model_resnet.load_state_dict(resnet_state_dict)
model_resnet = model_resnet.to(device)
model_resnet.eval()

ResNet18(
  (stem): Sequential(
    (0): Conv2d(3, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
    (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
    (2): ReLU(inplace=True)
  )
  (layer1): Sequential(
    (0): BasicBlock(
      (main_path): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
        (2): ReLU(inplace=True)
        (3): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (4): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=True, track_running_stats=True)
      )
      (shortcut): Sequential()
      (relu): ReLU(inplace=True)
    )
    (1): BasicBlock(
      (main_path): Sequential(
        (0): Conv2d(64, 64, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1), bias=False)
        (1): BatchNorm2d(64, eps=1e-05, momentum=0.1, affine=Tru

In [14]:
# VGG -> Resnet: craft test-set attacks on VGG (source); ResNet is the transfer target
source_model, target_model = model_vgg, model_resnet
dataset = testset
generate_adversarial_images(source_model, dataset, mean, std, source_name="VGG")

Generating FGSM attacks
Generating PGD attacks
Generating CW attacks
Generating AutoAttack attacks
Generating DeepFool attacks
Generating OnePixel attacks


In [15]:
# Resnet -> VGG: craft test-set attacks on ResNet (source); VGG is the transfer target
source_model, target_model = model_resnet, model_vgg
dataset = testset
generate_adversarial_images(source_model, dataset, mean, std, source_name="ResNet")

Generating FGSM attacks
Generating PGD attacks
Generating CW attacks
Generating AutoAttack attacks
Generating DeepFool attacks
Generating OnePixel attacks


In [None]:
# Adversarial train set generation for VGG
source_model, dataset = model_vgg, trainset
generate_adversarial_images(source_model, dataset, mean, std, source_name="VGG", train=True)

In [None]:
# Adversarial train set generation for ResNet
source_model, dataset = model_resnet, trainset
generate_adversarial_images(source_model, dataset, mean, std, source_name="ResNet", train=True)