# AI & Security Project

**implementing_defensive_techniques.ipynb**: in this notebook we explore several defensive techniques, to make our model less prone to attacks.


In [1]:
import os
import json
import numpy as np
import torch
import torchvision
from torchvision import transforms
from torch.utils import data
from tqdm.notebook import tqdm

## Step 0: Configurations


In [2]:
# Define constants
DATASET_PATH = r"./data/TinyImageNet-sad/"  # Adjust this to the correct path where the dataset is stored
CHECKPOINT_PATH = r"./models/"
NORM_MEAN = np.array([0.485, 0.456, 0.406])
NORM_STD = np.array([0.229, 0.224, 0.225])
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")

# Define transformations
plain_transforms = transforms.Compose(
    [transforms.ToTensor(), transforms.Normalize(mean=NORM_MEAN, std=NORM_STD)]
)

# Construct the path to the dataset
imagenet_path = os.path.join(DATASET_PATH, "TinyImageNet")
assert os.path.isdir(imagenet_path), (
    f'Could not find the ImageNet dataset at the expected path: "{imagenet_path}". '
    "Please make sure the dataset is downloaded and the path is correct."
)

## Step 1: dataset and libraries


In [3]:
# Load the dataset
dataset = torchvision.datasets.ImageFolder(
    root=imagenet_path, transform=plain_transforms
)
data_loader = data.DataLoader(
    dataset, batch_size=32, shuffle=False, drop_last=False, num_workers=8
)

In [4]:
# Load label names
label_list_path = os.path.join(imagenet_path, "label_list.json")
assert os.path.isfile(
    label_list_path
), f'Label list file not found at "{label_list_path}".'

with open(label_list_path, "r") as f:
    label_names = json.load(f)

## Step 2: model functions

### Utility

In [5]:
def load_model(model_func, trainable=False):
    """Load a pretrained model given its name."""
    model = model_func()
    model = model.to(device) # Ensure the model is on the correct device
    model.eval()

    # Control gradient computation
    for p in model.parameters():
        p.requires_grad = trainable

    return model


def eval_model(dataset_loader, model, img_func=None):
    """Evaluate the model on the given dataset loader."""
    tp, tp_5, counter = 0.0, 0.0, 0.0
    for imgs, labels in tqdm(dataset_loader, desc="Validating...", leave=False):
        imgs = imgs.to(device)
        labels = labels.to(device)
        if img_func is not None:
            imgs = img_func(imgs, labels)
        with torch.no_grad():
            preds = model(imgs)
        tp += (preds.argmax(dim=-1) == labels).sum()
        tp_5 += (preds.topk(5, dim=-1)[1] == labels[..., None]).any(dim=-1).sum()
        counter += preds.shape[0]
    acc = tp.float().item() / counter
    top5 = tp_5.float().item() / counter
    print(f"\tTop-1 error: {(100.0 * (1 - acc)):4.2f}%")
    print(f"\tTop-5 error: {(100.0 * (1 - top5)):4.2f}%")
    return acc, top5

### FGSM Attack

In [6]:
# Define FGSM attack
def fgsm_attack(images, labels, model, epsilon):
    images.requires_grad = True
    outputs = model(images)
    loss = torch.nn.CrossEntropyLoss()(outputs, labels)
    model.zero_grad()
    loss.backward()
    perturbations = epsilon * images.grad.sign()
    adv_images = images + perturbations
    adv_images = torch.clamp(adv_images, 0, 1)  # Keep pixel values in range
    return adv_images

### PGD Attack

In [7]:
# PGD Attack Implementation (Stronger Alternative to FGSM)
def pgd_attack(images, labels, model, epsilon, alpha=0.01, num_iter=10):
    adv_images = images.clone().detach().to(device)
    adv_images.requires_grad = True
    for _ in range(num_iter):
        outputs = model(adv_images)
        loss = torch.nn.CrossEntropyLoss()(outputs, labels)
        model.zero_grad()
        loss.backward()
        grad = adv_images.grad.data
        adv_images = adv_images + alpha * grad.sign()
        perturbation = torch.clamp(adv_images - images, min=-epsilon, max=epsilon)
        adv_images = torch.clamp(images + perturbation, min=0, max=1).detach()
        adv_images.requires_grad = True
    return adv_images


### Carlini-Wagner (CW) Attack

In [8]:
import torch
import torch.nn.functional as F

def cw_attack(images, labels, model, c=1e-4, kappa=0, num_iter=10, lr=0.01):
    """
    Carlini-Wagner (CW) L2 attack.
    
    Args:
        images: Input images (batch).
        labels: True labels for the input images.
        model: Target model.
        c: Regularization constant for loss term.
        kappa: Confidence value (controls the attack strength).
        num_iter: Number of iterations for optimization.
        lr: Learning rate for optimization.
        
    Returns:
        Adversarial examples.
    """
    images = images.to(device)
    labels = labels.to(device)
    
    # Initialize variables
    adv_images = images.clone().detach()
    adv_images.requires_grad = True  # To compute gradients

    # Perturbation variable
    delta = torch.zeros_like(images, requires_grad=True, device=device)

    # Optimizer for the perturbation
    optimizer = torch.optim.Adam([delta], lr=lr)

    for _ in range(num_iter):
        # Generate adversarial examples
        adv_examples = torch.clamp(images + delta, min=0, max=1)

        # Model predictions on adversarial examples
        outputs = model(adv_examples)

        # Compute the loss
        target_onehot = F.one_hot(labels, num_classes=outputs.size(-1)).float()
        real = (target_onehot * outputs).sum(dim=1)
        other = ((1.0 - target_onehot) * outputs - target_onehot * 1e4).max(dim=1).values

        # CW loss
        cw_loss = torch.clamp(real - other + kappa, min=0).mean()
        l2_loss = (delta ** 2).sum(dim=[1, 2, 3]).mean()
        loss = l2_loss + c * cw_loss

        # Update delta (perturbation)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # Return adversarial examples
    adv_images = torch.clamp(images + delta, min=0, max=1).detach()
    return adv_images

### Adversarial training

In [9]:
# Define adversarial training
def adversarial_training(model_func, train_loader, epsilon, EPOCHS=10):
    model = model_func()
    model = model.to(device)  # Ensure model is on the correct device
    model.train()
    optimizer = torch.optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
    loss_fn = torch.nn.CrossEntropyLoss()

    for epoch in range(EPOCHS):
        for imgs, labels in tqdm(train_loader, desc=f"Adversarial Training (epoch={epoch+1}/{EPOCHS})", leave=False):
            imgs, labels = imgs.to(device), labels.to(device)
            # Generate adversarial examples
            adv_imgs = fgsm_attack(imgs, labels, model, epsilon)
            # Combine clean and adversarial examples
            combined_imgs = torch.cat([imgs, adv_imgs])
            combined_labels = torch.cat([labels, labels])
            # Train on combined examples
            optimizer.zero_grad()
            preds = model(combined_imgs)
            loss = loss_fn(preds, combined_labels)
            loss.backward()
            optimizer.step()
    return model

## Step 3: Evaluate all models

In [10]:
from torchvision import models
from torchvision.models import ResNet18_Weights, ResNet50_Weights, ResNet152_Weights, VGG16_Weights, VGG19_Weights, RegNet_Y_128GF_Weights, ViT_H_14_Weights, ViT_L_16_Weights

import csv
import pandas as pd

list_of_models = {
    "ResNet18": lambda: models.resnet18(weights=ResNet18_Weights.DEFAULT),
    # "ResNet50": lambda: models.resnet50(weights=ResNet50_Weights.DEFAULT),
    # # "ResNet152": lambda: models.resnet152(weights=ResNet152_Weights.DEFAULT),
    # # "VGG16": lambda: models.vgg16(weights=VGG16_Weights.DEFAULT),
    # "VGG19": lambda: models.vgg19(weights=VGG19_Weights.DEFAULT),
}

evaluation_metrics = []


# Evaluate all models
epsilon = 0.03
for model_name, model_func in list_of_models.items():
    trained_model_path = f"./exports/models/{model_name}.pth"
    
    if not os.path.exists(trained_model_path):
        # Train and save the adversarial model if not already saved
        adv_model = adversarial_training(model_func, data_loader, epsilon, EPOCHS=10)
        torch.save(adv_model.state_dict(), trained_model_path)
        print(f"Adversarial model saved at: {trained_model_path}")
    else:
        # Load the adversarial model
        adv_model = load_model(model_func, trainable=True)
        adv_model.load_state_dict(torch.load(trained_model_path, weights_only=True))
        adv_model.eval()
        print(f"Adversarial model loaded from: {trained_model_path}")
    
    metrics = [model_name]
    
    
    print(f"\nEvaluating {model_name} (No Attack):")
    model = load_model(model_func)
    top1, top5 = eval_model(data_loader, model)
    metrics.extend([top1, top5])

    print(f"\nEvaluating {model_name} (With FGSM Attack):")
    model = load_model(model_func)
    top1, top5 = eval_model(data_loader, model, img_func=lambda x, y: fgsm_attack(x, y, model, epsilon))
    metrics.extend([top1, top5])
    
    print(f"\nEvaluating {model_name} (With PGD Attack):")
    model = load_model(model_func)
    top1, top5 = eval_model(data_loader, model, img_func=lambda x, y: pgd_attack(x, y, model, epsilon))
    metrics.extend([top1, top5])
    
    print(f"\nEvaluating {model_name} (With CW Attack):")
    model = load_model(model_func)
    top1, top5 = eval_model(data_loader, model, img_func=lambda x, y: cw_attack(x, y, model))
    metrics.extend([top1, top5])

    print(f"\nEvaluating {model_name} (With Defense against FGSM - Adversarial Training):")
    model = adv_model
    top1, top5 = eval_model(data_loader, model, img_func=lambda x, y: fgsm_attack(x, y, model, epsilon))
    metrics.extend([top1, top5])
    
    print(f"\nEvaluating {model_name} (With Defense against PGD - Adversarial Training):")
    model = adv_model
    top1, top5 = eval_model(data_loader, model, img_func=lambda x, y: pgd_attack(x, y, model, epsilon))
    metrics.extend([top1, top5])
    
    print(f"\nEvaluating {model_name} (With Defense against CW - Adversarial Training):")
    model = adv_model
    top1, top5 = eval_model(data_loader, model, img_func=lambda x, y: cw_attack(x, y, model))
    metrics.extend([top1, top5])
    
    # Append the metrics for this model to the evaluation list
    evaluation_metrics.append(metrics)
    
    print("\n------------------------------------------------------------------------------------\n")

columns = [
	"Model",
    "Top-1 (No Attack)", "Top-5 (No Attack)",
    "Top-1 (FGSM Attack)", "Top-5 (FGSM Attack)",
    "Top-1 (PGD Attack)", "Top-5 (PGD Attack)",
    "Top-1 (CW Attack)", "Top-5 (CW Attack)",
    "Top-1 (Defense FGSM)", "Top-5 (Defense FGSM)",
    "Top-1 (Defense PGD)", "Top-5 (Defense PGD)",
    "Top-1 (Defense CW)", "Top-5 (Defense CW)"
]

# Save the evaluation metrics to a CSV file
output_file = "./exports/data/evaluation_metrics.csv"
os.makedirs(os.path.dirname(output_file), exist_ok=True)

df = pd.DataFrame(evaluation_metrics, columns=columns)
df.to_csv(output_file, index=False)

print(f"Saved evaluation metrics to: {output_file}")

Adversarial model loaded from: ./exports/models/ResNet18.pth

Evaluating ResNet18 (No Attack):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 24.00%
	Top-5 error: 6.76%

Evaluating ResNet18 (With FGSM Attack):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 84.86%
	Top-5 error: 66.16%

Evaluating ResNet18 (With PGD Attack):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 99.82%
	Top-5 error: 97.74%

Evaluating ResNet18 (With CW Attack):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 88.96%
	Top-5 error: 56.12%

Evaluating ResNet18 (With Defense against FGSM - Adversarial Training):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 8.14%
	Top-5 error: 2.14%

Evaluating ResNet18 (With Defense against PGD - Adversarial Training):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 43.68%
	Top-5 error: 20.32%

Evaluating ResNet18 (With Defense against CW - Adversarial Training):


Validating...:   0%|          | 0/157 [00:00<?, ?it/s]

	Top-1 error: 4.56%
	Top-5 error: 0.80%

------------------------------------------------------------------------------------

Saved evaluation metrics to: ./exports/data/evaluation_metrics.csv
