In [None]:
!pip install piq sentence-transformers

In [None]:
import torch

def softsign(x):
    """Element-wise softsign: x / (1 + |x|), a smooth map into (-1, 1)."""
    denominator = 1 + torch.abs(x)
    return torch.div(x, denominator)

def sign(x):
    """Element-wise sign of ``x`` (-1, 0 or +1)."""
    return torch.sign(x)

def tanh(x):
    """Element-wise hyperbolic tangent of ``x``."""
    return torch.tanh(x)

def sigmoid(x):
    """Sigmoid rescaled from (0, 1) to (-1, 1): 2 * sigmoid(x) - 1."""
    squashed = torch.sigmoid(x)
    return 2 * squashed - 1

def clipped_linear(x):
    """Identity on [-1, 1]; values outside that interval are clipped to its ends."""
    return x.clamp(min=-1, max=1)



# Registry mapping an activation name to the function applied to the update
# direction. Note that the "linear" entry is the clipped variant.
ACTIVATION = dict(
    sign=sign,
    softsign=softsign,
    tanh=tanh,
    sigmoid=sigmoid,
    linear=clipped_linear,
)

In [None]:
import tqdm
import torch
import json, os
from piq.ssim import ssim 
from piq.fsim import fsim
from piq.psnr import psnr
from piq.vif import vif_p
from piq.ms_ssim import multi_scale_ssim
from piq.iw_ssim import information_weighted_ssim
from piq.mdsi import mdsi
from prettytable import PrettyTable
import numpy as np
from sentence_transformers import SentenceTransformer, util

# Sentence-embedding model used by compute_cosine_similarity() to compare captions.
# It is a module-level global: later cells must not rebind the name `model`.
model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')

def compute_metrics(img, adv_img, verbose=False):
    '''
    Compare an adversarial image against its clean counterpart.

    Every piq metric is called as ``metric(adv_img, img, data_range=1.0)``;
    MSE and MAE are computed directly from ``img - adv_img``.

    img: clean image tensor.
    adv_img: adversarial image tensor.
    verbose: when true, print each metric as ``NAME: value``.

    return: ssim, psnr, fsim, iw-ssim, ms-ssim, mdsi, vifp, mse, mae
    '''
    piq_metrics = (
        ("SSIM", ssim),
        ("PSNR", psnr),
        ("FSIM", fsim),
        ("IW-SSIM", information_weighted_ssim),
        ("MS-SSIM", multi_scale_ssim),
        ("MDSI", mdsi),
        ("VIFp", vif_p),
    )
    labelled = [
        (label, metric_fn(adv_img, img, data_range=1.0).item())
        for label, metric_fn in piq_metrics
    ]

    residual = img - adv_img
    labelled.append(("MSE", float(torch.mean(residual ** 2).cpu())))
    labelled.append(("MAE", float(torch.mean(torch.abs(residual)).cpu())))

    if verbose:
        for label, value in labelled:
            print(f'{label}: {value}')
    return [value for _, value in labelled]

def display_result(m, att, targeted, metrics, metrics_average):
    """Print a two-column ITEM/VALUE table summarising one attack configuration.

    m, att: joined as ``f"{m}_{att}"`` in the METHOD row.
    targeted: value shown in the TARGETED row.
    metrics: metric names, one table row each.
    metrics_average: values aligned index-by-index with ``metrics``.
    """
    table = PrettyTable(["ITEM", "VALUE"])
    rows = [['METHOD', f"{m}_{att}"], ["TARGETED", targeted]]
    rows.extend([metrics[i], metrics_average[i]] for i in range(len(metrics)))
    for row in rows:
        table.add_row(row)
    print(table)

from PIL import Image
def save_adv(adv, path):
    """Write the array ``adv`` to ``path`` as an RGB image.

    The file format is whatever PIL infers from the extension of ``path``.
    """
    # tiff.imwrite(f"{path.replace('.png', '.tiff')}", adv)
    Image.fromarray(adv, mode="RGB").save(path)


def compute_cosine_similarity(s1, s2):
    """Return the cosine similarity between the sentence embeddings of two strings.

    Both strings are normalised the same way before encoding: lower-cased,
    stripped, and with every space removed. Punctuation is retained (this is
    the notebook's default variant).

    The function used to be defined twice in this cell with identical bodies;
    it is now defined once so there is no silent shadowing.

    s1, s2: the two strings to compare.
    return: cosine similarity as a Python float.
    """
    s1 = s1.lower().strip().replace(" ", "")
    s2 = s2.lower().strip().replace(" ", "")
    embedding1 = model.encode(s1, convert_to_tensor=True)
    embedding2 = model.encode(s2, convert_to_tensor=True)

    # Compute cosine similarity
    cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
    return cosine_similarity.item()

import string
# def compute_cosine_similarity(s1, s2, model_name):
#     s1 = s1.lower().strip().translate(str.maketrans("", "", string.punctuation))
#     s2 = s2.lower().strip().translate(str.maketrans("", "", string.punctuation))    
#     embedding1 = model.encode(s1, convert_to_tensor=True)
#     embedding2 = model.encode(s2, convert_to_tensor=True)

#     # Compute cosine similarity
#     cosine_similarity = util.pytorch_cos_sim(embedding1, embedding2)
#     return cosine_similarity.item()

In [None]:
import pandas as pd
import os
from PIL import Image
from torch.utils.data import Dataset

class VLMDataset(Dataset):
    """Pairs each image listed in one CSV with a target caption from another CSV.

    img_file: CSV path; must contain an ``ImageId`` column (file name without ".png").
    text_file: CSV path (read as iso-8859-1); must contain a ``Target`` column.
    image_dir: directory holding ``<ImageId>.png`` files.
    num: keep at most this many leading rows from each CSV.
    processor: callable used as ``processor(images=...)`` and ``processor(text=...)``;
        required before any item is fetched.
    device: device the processed tensors are moved to (default "cuda", the
        value that was previously hard-coded).
    """

    def __init__(self, img_file, text_file, image_dir, num=100, processor=None, device="cuda"):
        self.img_file = pd.read_csv(img_file)[:num]
        self.text_file = pd.read_csv(text_file, encoding="iso-8859-1")[:num]
        self.image_dir = image_dir
        self.processor = processor
        self.device = device

    def __len__(self):
        # Rows are paired by position, so only indices present in BOTH tables
        # are valid; using the text table alone over-reports when the image
        # CSV is shorter and leads to a lookup failure in __getitem__.
        return min(len(self.img_file), len(self.text_file))

    def __getitem__(self, idx):
        """Return ``(image_path, pixel_values, target_text, target_input_ids)`` for row ``idx``."""
        if self.processor is None:
            # Fail with a clear message instead of "'NoneType' object is not callable".
            raise TypeError("VLMDataset requires a processor to load samples; got processor=None")
        image_path = os.path.join(self.image_dir, self.img_file.loc[idx, 'ImageId']) + ".png"
        # convert() returns a fully loaded copy, so the file handle can be closed right away.
        with Image.open(image_path) as raw_image:
            image = raw_image.convert('RGB')
        image = self.processor(images=image, return_tensors="pt").pixel_values.to(self.device)
        target_text = self.text_file.loc[idx, "Target"]
        processed_target_text = self.processor(text=target_text, return_tensors="pt").input_ids.to(self.device)
        return image_path, image, target_text, processed_target_text

In [None]:
import torch
from abc import ABC, abstractmethod
import math

class Attack(ABC):
    """Base class for iterative gradient-based targeted attacks on a captioning model.

    Subclasses provide ``init_components`` (allocate optimiser state) and
    ``update`` (turn ``self.grad`` into a step). ``forward`` runs the loop:
    compute the gradient of ``-outputs.loss`` w.r.t. the image, take a step,
    project back into the eps-ball and the valid pixel range, then caption the
    result and stop early once it matches the target.

    alpha: step size; multiplied per channel by ``(1 / 255) / std`` when ``std``
        is given.
    eps: perturbation bound, scaled the same way as ``alpha``.
    mean, std: per-channel normalisation statistics (3 values each) or None.
    n_iter: maximum number of iterations.
    device: device on which all state tensors live.
    activation: key into ``ACTIVATION``; the function applied to the update
        direction by the subclasses.
    """

    def __init__(self, 
                alpha, 
                eps, 
                mean=None, std=None,
                n_iter=10,
                device="cuda",
                activation:str="sign",
                ) -> None:
        
        self.n_iter = n_iter
        self.device = device
        if mean is not None:
            self.mean = torch.tensor(mean).view(1, 3, 1, 1).to(self.device)
        else:
            # Float literals keep the statistics floating-point even when defaulted.
            self.mean = torch.tensor([0.0, 0.0, 0.0]).view(1, 3, 1, 1).to(self.device)
        if std is not None:
            self.std = torch.tensor(std).view(1, 3, 1, 1).to(self.device)
            adjustment = [(1 / 255) / s for s in std]
        else:
            self.std = torch.tensor([1.0, 1.0, 1.0]).view(1, 3, 1, 1).to(self.device)
            # NOTE(review): without std, alpha/eps are used as-is (no 1/255 factor),
            # unlike the branch above -- confirm this asymmetry is intended.
            adjustment = [1.0, 1.0, 1.0]

        self.ad_alpha = alpha * torch.tensor(adjustment, device=device).view(1, 3, 1, 1)
        self.ad_eps = eps * torch.tensor(adjustment, device=device).view(1, 3, 1, 1)

        self.perturbation = None
        if activation not in ACTIVATION:
            raise NotImplementedError(f"Please implement {activation} function in activation.py")
        self.activation_name = activation
        self.activation = ACTIVATION[activation]
        self.grad = None

    @abstractmethod
    def init_components(self, x):
        """Allocate per-run optimiser state shaped like ``x`` (called once per ``forward``)."""
        pass
    
    def earlystop(self, model, adv, processor):
        """Caption ``adv`` with ``model`` and return the decoded text of the first sequence."""
        generated_ids = model.generate(pixel_values=adv)
        generated_caption = processor.decode(generated_ids[0], skip_special_tokens=True)
        return generated_caption

    def nesterov(self, adv, images):
        """Return the point at which the gradient is evaluated (default: ``adv`` itself)."""
        return adv

    def forward(self, model, images, target_texts, processed_target_texts, processor, 
                verbose_steps=100):
        """Run the attack on one image.

        model: captioning model called with ``pixel_values``, ``input_ids`` and ``labels``.
        images: normalised input image tensor.
        target_texts: target caption string.
        processed_target_texts: token ids of the target caption.
        processor: used to decode the generated caption.
        verbose_steps: print progress every this many steps (0 or None disables it).

        return: ``(adv, generated_captions, loss, perturbation, steps)`` where
            ``loss`` is the last ``-outputs.loss`` as a float (NaN if no
            iteration ran) and ``steps`` is the number of iterations executed.
        """
        images = images.to(self.device)
        processed_target_texts = processed_target_texts.to(self.device)
        self.perturbation = torch.zeros_like(images).to(self.device)
        self.init_components(images)
        adv = images.detach().clone().to(self.device)

        # Same normalisation the evaluation script applies before its exact-match test.
        target_key = target_texts.lower().strip().replace(" ", "")
        steps_done = 0
        last_loss = float("nan")
        generated_captions = []
        for idx in range(self.n_iter):
            # Gradient is taken at the (possibly look-ahead) point; detaching makes it a
            # fresh leaf so `adv` itself never starts tracking gradients.
            adv_nes = self.nesterov(adv, images).detach()
            adv_nes.requires_grad_(True)
            outputs = model(pixel_values=adv_nes, input_ids=processed_target_texts, labels=processed_target_texts
                           )
            loss = -outputs.loss
            # Only the input gradient is needed; autograd.grad avoids filling .grad on
            # every model parameter at every step.
            self.grad = torch.autograd.grad(loss, adv_nes)[0]
            grad_scale = torch.mean(torch.abs(self.grad), dim=(1, 2, 3), keepdim=True)
            # Guard against an all-zero gradient, which would otherwise produce NaNs.
            self.grad = self.grad / grad_scale.clamp_min(torch.finfo(grad_scale.dtype).tiny)

            adv = self.update(adv, idx)
            adv = self.clip(adv, images).detach()
            steps_done = idx + 1  # count this step even if we stop right after it
            last_loss = loss.detach().item()

            generated_caption = self.earlystop(model, adv, processor)
            generated_captions.append(generated_caption)
            # compute_cosine_similarity returns a single float; the exact-match keys
            # are therefore built here rather than unpacked from its result.
            caption_key = generated_caption.lower().strip().replace(" ", "")
            cos_sim = compute_cosine_similarity(generated_caption, target_texts)
            succeeded = (caption_key == target_key) or cos_sim >= 0.99

            if succeeded or (verbose_steps and (idx + 1) % verbose_steps == 0):
                print(f'Step {idx + 1}, Loss: {loss.item()}')
                print(f'Similarity: {cos_sim}')
                print(f"Caption: {generated_caption}")
            if succeeded:
                break

        return adv.detach(), generated_captions, last_loss, self.perturbation.detach(), steps_done
    
    @abstractmethod
    def update(self, adv, idx):
        """Return the un-projected next iterate computed from ``adv`` and ``self.grad``."""
        pass

    def clip(self, adv, images):
        """Project ``adv`` into the eps-ball around ``images`` and the [0, 1] pixel range.

        Also stores the eps-clamped difference in ``self.perturbation``. That value is
        recorded before the pixel-range clamp, so it can differ from ``result - images``.
        """
        delta = torch.clamp(adv - images, min=-self.ad_eps, max=self.ad_eps)
        self.perturbation = delta
        adv_ = images + delta
        # De-normalise, clamp to valid pixel values, then normalise again.
        adv_ = adv_ * self.std + self.mean        
        adv_ = torch.clamp(adv_, 0, 1)
        adv_ = (adv_ - self.mean) / self.std
        return adv_

class Momentum(Attack):
    """Attack whose step direction is an accumulated (momentum) gradient.

    beta: factor applied to the previous accumulated gradient before the
        current gradient is added.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None,  n_iter=10, device="cuda", 
                beta=1.0,
                activation="sign", 
                ):
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std,
                         n_iter=n_iter, device=device, activation=activation)
        self.momentum = None
        self.beta = beta

    def init_components(self, x):
        """Reset the accumulated gradient to zeros shaped like ``x``."""
        self.momentum = torch.zeros_like(x, device=self.device)

    def update(self, adv, idx):
        """Accumulate the current gradient and step along the activated accumulator."""
        self.momentum = self.beta * self.momentum + self.grad
        step = self.ad_alpha * self.activation(self.momentum)
        return adv + step
    
class Nesterov(Momentum):
    """Momentum attack that evaluates the gradient at a look-ahead point."""

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda", 
                beta=1.0,
                activation="sign"
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std, n_iter=n_iter,
                         device=device, beta=beta, activation=activation)

    def nesterov(self, adv, images):
        """Return ``adv`` advanced along the current momentum, projected by ``clip``."""
        look_ahead = self.beta * self.ad_alpha * self.activation(self.momentum)
        return self.clip(adv + look_ahead, images)
    
class AdaGrad(Attack):
    """Attack that scales the gradient by the root of the running sum of squared gradients.

    delta: small constant added to the denominator.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda", 
                delta = 1e-8,
                activation="softsign"
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std,
                         n_iter=n_iter, device=device, activation=activation)
        self.squared_grad = None
        self.delta = delta

    def init_components(self, x):
        """Reset the squared-gradient accumulator to zeros shaped like ``x``."""
        self.squared_grad = torch.zeros_like(x, device=self.device)

    def update(self, adv, idx):
        """Add the squared gradient to the accumulator and take a scaled step."""
        self.squared_grad = self.squared_grad + self.grad ** 2
        denominator = torch.sqrt(self.squared_grad) + self.delta
        scaled_grad = self.grad / denominator
        return adv + self.ad_alpha * self.activation(scaled_grad)
    
class AdaDelta(AdaGrad):
    """Attack using running averages of squared gradients and of squared steps.

    beta: decay of both running averages.
    delta: constant added inside both square roots.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta=0.9, delta=1e-6,
                activation="softsign", 
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std, n_iter=n_iter,
                         device=device, delta=delta, activation=activation)
        self.squared_x = None
        self.beta = beta

    def init_components(self, x):
        """Reset both running averages to zeros shaped like ``x``."""
        self.squared_grad = torch.zeros_like(x, device=self.device)
        self.squared_x = torch.zeros_like(x, device=self.device)

    def update(self, adv, idx):
        """Update both running averages and step by the activated rescaled gradient."""
        keep, fresh = self.beta, 1 - self.beta
        self.squared_grad = keep * self.squared_grad + fresh * self.grad ** 2
        ratio = (self.squared_x + self.delta) / (self.squared_grad + self.delta)
        delta_x = torch.sqrt(ratio) * self.grad
        self.squared_x = keep * self.squared_x + fresh * delta_x ** 2
        # NOTE(review): unlike the sibling attacks, the step is not multiplied by
        # self.ad_alpha -- confirm this is intentional.
        return adv + self.activation(delta_x)
    
class RMSprop(AdaGrad):
    """AdaGrad variant whose squared-gradient accumulator is an exponential moving average.

    beta: decay of the moving average.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta=0.99, delta=1e-8, 
                activation="softsign"
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std, n_iter=n_iter,
                         device=device, delta=delta, activation=activation)
        self.beta = beta

    def update(self, adv, idx):
        """Update the moving average of squared gradients and take a scaled step."""
        self.squared_grad = self.beta * self.squared_grad + (1 - self.beta) * self.grad ** 2
        denominator = torch.sqrt(self.squared_grad) + self.delta
        scaled_grad = self.grad / denominator
        return adv + self.ad_alpha * self.activation(scaled_grad)
    
class Adam(Attack):
    """Attack stepping along a bias-corrected first moment divided by the root of the second.

    beta_1, beta_2: decay rates of the first and second moment estimates.
    delta: constant added to the denominator.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta_1=0.9, beta_2=0.999, delta=1e-8,
                activation="softsign"
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std,
                         n_iter=n_iter, device=device, activation=activation)
        self.beta_1, self.beta_2, self.delta = beta_1, beta_2, delta

    def init_components(self, x):
        """Reset both moment estimates to zeros shaped like ``x``."""
        self.momentum_1 = torch.zeros_like(x, device=self.device)
        self.momentum_2 = torch.zeros_like(x, device=self.device)

    def update(self, adv, idx):
        """Update both moments, bias-correct them for step ``idx + 1`` and take a step."""
        t = idx + 1
        self.momentum_1 = self.beta_1 * self.momentum_1 + (1 - self.beta_1) * self.grad
        self.momentum_2 = self.beta_2 * self.momentum_2 + (1 - self.beta_2) * self.grad ** 2
        first_hat = self.momentum_1 / (1 - self.beta_1 ** t)
        second_hat = self.momentum_2 / (1 - self.beta_2 ** t)
        direction = first_hat / (torch.sqrt(second_hat) + self.delta)
        return adv + self.ad_alpha * self.activation(direction)

class AdaBelief(Adam):
    """Adam variant whose second moment tracks the squared deviation of the gradient
    from the first moment (plus ``delta``) instead of the squared gradient."""

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta_1=0.9, beta_2=0.999, delta=1e-8,
                activation="softsign",
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std, n_iter=n_iter, device=device,
                         beta_1=beta_1, beta_2=beta_2, delta=delta, activation=activation)
        self.beta_1, self.beta_2, self.delta = beta_1, beta_2, delta

    def update(self, adv, idx):
        """Update both moments, bias-correct them for step ``idx + 1`` and take a step."""
        t = idx + 1
        self.momentum_1 = self.beta_1 * self.momentum_1 + (1 - self.beta_1) * self.grad
        self.momentum_2 = self.beta_2 * self.momentum_2 + (1 - self.beta_2) * (self.grad - self.momentum_1) ** 2 + self.delta
        first_hat = self.momentum_1 / (1 - self.beta_1 ** t)
        second_hat = self.momentum_2 / (1 - self.beta_2 ** t)
        direction = first_hat / (torch.sqrt(second_hat) + self.delta)
        return adv + self.ad_alpha * self.activation(direction)

class NAdam(Adam):
    """Adam variant whose numerator blends the bias-corrected first moment with the
    bias-corrected current gradient."""

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta_1=0.9, beta_2=0.999, delta=1e-8,
                activation="softsign",
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std, n_iter=n_iter, device=device,
                         beta_1=beta_1, beta_2=beta_2, delta=delta, activation=activation)
        self.beta_1, self.beta_2, self.delta = beta_1, beta_2, delta

    def update(self, adv, idx):
        """Update both moments and step along the blended, bias-corrected direction."""
        t = idx + 1
        self.momentum_1 = self.beta_1 * self.momentum_1 + (1 - self.beta_1) * self.grad
        self.momentum_2 = self.beta_2 * self.momentum_2 + (1 - self.beta_2) * self.grad ** 2
        first_hat = self.momentum_1 / (1 - self.beta_1 ** t)
        second_hat = self.momentum_2 / (1 - self.beta_2 ** t)
        numerator = self.beta_1 * first_hat + (1 - self.beta_1) * self.grad / (1 - self.beta_1 ** t)
        direction = numerator / (torch.sqrt(second_hat) + self.delta)
        return adv + self.ad_alpha * self.activation(direction)
    
class Adan(Attack):
    """Attack combining moving averages of the gradient, of the gradient
    difference, and of a squared look-ahead gradient.

    The betas here are the weights given to the NEW sample: each average is
    ``(1 - beta) * old + beta * new``, so its decay factor is ``1 - beta``.

    beta_1: weight of the current gradient in ``m_k``; expected in (0, 1].
    beta_2: weight of the gradient difference in ``v_k``; expected in (0, 1].
    beta_3: weight of the squared look-ahead gradient in ``n_k``; expected in (0, 1].
    delta: constant added to the denominator.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta_1=0.02, beta_2=0.08, beta_3=0.01, delta=1e-8, 
                activation="softsign",
                ) -> None:
        super().__init__(alpha, eps, mean, std, n_iter, device, activation)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.beta_3 = beta_3
        self.delta = delta
        self.m_k, self.v_k, self.n_k = None, None, None
        self.g_previous = None
    
    def init_components(self, x):
        """Reset the three moving averages and the previous gradient to zeros."""
        self.m_k = torch.zeros_like(x).to(self.device)
        self.v_k = torch.zeros_like(x).to(self.device)
        self.n_k = torch.zeros_like(x).to(self.device)
        self.g_previous = torch.zeros_like(x).to(self.device)

    def update(self, adv, idx):
        """Update the moving averages, bias-correct them and take one step."""
        step = idx + 1
        # The averages decay by (1 - beta) per step, so a zero-initialised average
        # has accumulated total weight 1 - (1 - beta)**step. The previous code used
        # 1 - beta**step, which is ~1 for the small default betas and therefore
        # left the early iterates almost uncorrected.
        bias_correction1 = 1.0 - math.pow(1.0 - self.beta_1, step)
        bias_correction2 = 1.0 - math.pow(1.0 - self.beta_2, step)
        bias_correction3_sq = math.sqrt(1.0 - math.pow(1.0 - self.beta_3, step))

        # NOTE(review): g_previous starts at zero, so on the first step the
        # "difference" equals the gradient itself -- confirm this is intended.
        grad_diff = self.grad - self.g_previous
        self.m_k = (1 - self.beta_1) * self.m_k + self.beta_1 * self.grad
        self.v_k = (1 - self.beta_2) * self.v_k + self.beta_2 * grad_diff
        self.n_k = (1 - self.beta_3) * self.n_k + self.beta_3 * (self.grad + (1 - self.beta_2) * grad_diff) ** 2
        self.g_previous = self.grad.clone()

        # sqrt() allocates a new tensor, so the in-place ops below do not touch n_k.
        de_norm = self.n_k.sqrt().div_(bias_correction3_sq).add_(self.delta)
        g1 = self.m_k / de_norm / bias_correction1
        g2 = self.v_k * (1 - self.beta_2) / de_norm / bias_correction2

        return adv + self.ad_alpha * self.activation(g1 + g2)
    
class Adai(Attack):
    """Attack with a per-element adaptive momentum coefficient.

    beta_1: base coefficient multiplied into the normalised second moment when
        the per-element momentum factor is computed.
    beta_2: decay of the squared-gradient moving average.
    dampening: exponent parameter; appears in ``1 / (3 - 2 * dampening)``, in
        ``(1 - beta1) ** dampening`` and in ``beta_1 ** (1 - dampening)``.
    delta: the per-element momentum factor is clamped to ``[0, 1 - delta]``.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
            beta_1=0.1, beta_2=0.99, dampening=1.0, delta=1e-3, 
            activation="softsign",
            ) -> None:
        super().__init__(alpha, eps, mean, std, n_iter, device, activation)
        self.beta_1 = beta_1
        self.beta_2 = beta_2
        self.dampening = dampening
        self.delta = delta
        # State tensors are allocated per run in init_components().
        self.exp_avg, self.exp_avg_sq, self.beta1_prod = None, None, None
        self.param_size = None

    def init_components(self, images):
        """Reset the state: zero averages, a running product of ones, and the element count."""
        self.exp_avg = torch.zeros_like(images)
        self.exp_avg_sq = torch.zeros_like(images)
        self.beta1_prod = torch.ones_like(images)
        self.param_size = images.numel()

    def update(self, adv, idx):
        """Update the state in place from ``self.grad`` and return the stepped image."""
        exp_avg_sq_hat_sum = 0.0
        # Moving average of squared gradients (updated in place).
        self.exp_avg_sq.mul_(self.beta_2).addcmul_(self.grad, self.grad, value=1.0 - self.beta_2)
        bias_correction2 = 1 - self.beta_2 ** (idx + 1)
        exp_avg_sq_hat_sum += self.exp_avg_sq.sum() / bias_correction2
        # Mean of the bias-corrected second moment over all elements.
        exp_avg_sq_hat_mean = exp_avg_sq_hat_sum / self.param_size

        exp_avg_sq_hat = self.exp_avg_sq / bias_correction2

        # Per-element momentum factor; the division creates a new tensor, so the
        # in-place pow_/mul_/clamp_ below do not modify exp_avg_sq_hat.
        beta1 = (
            1.0
            - (exp_avg_sq_hat / exp_avg_sq_hat_mean).pow_(1.0 / (3.0 - 2.0 * self.dampening)).mul_(self.beta_1)
        ).clamp_(0.0, 1.0 - self.delta)
        beta3 = (1.0 - beta1).pow_(self.dampening)

        # Running product of the per-element factors, used for bias correction below.
        self.beta1_prod.mul_(beta1)

        self.exp_avg.mul_(beta1).addcmul_(beta3, self.grad)
        # div() is out-of-place, so self.exp_avg itself keeps the uncorrected average.
        exp_avg_hat = self.exp_avg.div(1.0 - self.beta1_prod).mul_(math.pow(self.beta_1, 1. - self.dampening))

        return adv + self.ad_alpha * self.activation(exp_avg_hat)

class Yogi(Attack):
    """Attack whose second moment moves additively, by ``(1 - beta_2) * grad**2``
    in the direction given by ``sign(exp_avg_sq - grad**2)``.

    beta_1: decay of the first-moment moving average.
    beta_2: controls the size of the second-moment update and its bias correction.
    initial_accumulator: value both state tensors are filled with at the start of a run.
    delta: constant added to the denominator.
    """

    def __init__(self, alpha=1, eps=5, mean=None, std=None, n_iter=10, device="cuda",
                beta_1=0.9, beta_2=0.999, initial_accumulator=1e-6, delta=1e-3, 
                activation="softsign",
                ) -> None:
        super().__init__(alpha=alpha, eps=eps, mean=mean, std=std,
                         n_iter=n_iter, device=device, activation=activation)
        self.exp_avg, self.exp_avg_sq = None, None
        self.initial_accumulator = initial_accumulator
        self.beta_1, self.beta_2, self.delta = beta_1, beta_2, delta

    def init_components(self, images):
        """Fill both state tensors with ``initial_accumulator``."""
        fill = self.initial_accumulator
        self.exp_avg = torch.full_like(images, fill_value=fill)
        self.exp_avg_sq = torch.full_like(images, fill_value=fill)

    def update(self, adv, idx):
        """Update both moments in place and step along their bias-corrected ratio."""
        grad_sq = self.grad * self.grad
        self.exp_avg.mul_(self.beta_1).add_(self.grad, alpha=1.0 - self.beta_1)
        # The subtraction yields a new tensor, so sign_() leaves exp_avg_sq untouched.
        direction = (self.exp_avg_sq - grad_sq).sign_()
        self.exp_avg_sq.addcmul_(direction, grad_sq, value=-(1.0 - self.beta_2))
        bias_correction2_sq = math.sqrt(1.0 - math.pow(self.beta_2, idx + 1))
        de_nom = self.exp_avg_sq.sqrt().div_(bias_correction2_sq).add_(self.delta)
        ratio = self.exp_avg / de_nom
        return adv + self.ad_alpha * self.activation(ratio)
 


In [None]:
from transformers import BlipProcessor, BlipForConditionalGeneration, set_seed
import torch, tqdm
from torchvision import transforms
from torch.utils.data import DataLoader
import pandas as pd, os
from transformers import BlipProcessor, BlipForConditionalGeneration
from transformers import GitProcessor, GitForCausalLM
from transformers import AutoProcessor, AutoModelForCausalLM 
import torch
from torchvision import transforms
from datetime import datetime

# ---- Run configuration -------------------------------------------------------
# Timestamp of this run; only used to build a unique output directory name.
TIME = datetime.now().strftime("%Y/%m/%d-%H:%M:%S")
# Maximum number of attack iterations per image (passed to the attacks as n_iter).
N_ITER = 500
# Perturbation bound passed to the attacks as eps.
EPS = 3
# Output root: <cwd>/<timestamp with "/" and ":" replaced by "_">_<N_ITER>_<EPS>
SAVE_DIR = os.getcwd() + f'/{TIME.replace("/", "_").replace(":", "_")}_{N_ITER}_{EPS}'
# Local (Windows) alternatives to the Kaggle paths below, kept for reference:
# INPUT_DIR = r"C:\Users\User\Desktop\Thesis\dataset\images"
# INPUT_META = r"C:\Users\User\Desktop\Thesis\common.csv"
# TARGET_TEXT = r"C:\Users\User\Desktop\Thesis_\multimodal\BLIP_normal.csv"
# Image directory, image-id CSV and target-caption CSV (Kaggle dataset mounts).
INPUT_DIR = "/kaggle/input/nips-2017-adversarial-learning-development-set/images"
INPUT_META = "/kaggle/input/thesis-common/common.csv"
TARGET_TEXT = "/kaggle/input/image-captioning-attack-dataset/BLIP_sensitive.csv"

# Fix the random seed before any model is loaded or run.
set_seed(226)
# Model configurations with their corresponding Hugging Face repositories.
# Each key is later matched by substring ("GIT", "BLIP", "FuseCap") to choose the
# model class; "processor" is the class whose from_pretrained() builds the processor.
# Every model listed here is used both for crafting and for evaluation.
model_configs = {

    "GIT-Large": {
        "model_name": "microsoft/git-large",
        "processor": GitProcessor
    },
    "BLIP-Large": {
        "model_name": "Salesforce/blip-image-captioning-large",
        "processor": BlipProcessor
    },

    "BLIP-Base": {
        "model_name": "Salesforce/blip-image-captioning-base",
        "processor": BlipProcessor
    },

    "GIT-Base": {
        "model_name": "microsoft/git-base",
        "processor": GitProcessor
    },
    "FuseCap": {
        "model_name": "noamrot/FuseCap",
        "processor": AutoProcessor
    },

}
# Initialize dictionaries to hold models and processors
models, processors = {}, {}

# Load every configured model (on the GPU, in eval mode) and its processor.
for model_key, config in model_configs.items():
    print(f"Loading {model_key}...")
    repo = config["model_name"]

    # Pick the model class from the key; BLIP and FuseCap share the same class.
    if "GIT" in model_key:
        model_cls = GitForCausalLM
    elif "BLIP" in model_key or "FuseCap" in model_key:
        model_cls = BlipForConditionalGeneration
    else:
        model_cls = None  # unknown key: only the processor is loaded
    if model_cls is not None:
        models[model_key] = model_cls.from_pretrained(repo).to("cuda").eval()

    processors[model_key] = config["processor"].from_pretrained(repo)
    print(f"{model_key} loaded successfully.\n")
    

# Per model: the normalisation statistics and the transform that undoes them.
inv_transforms = {}

for model_key, processor in processors.items():
    if hasattr(processor, "image_processor"):
        mean = processor.image_processor.image_mean
        std = processor.image_processor.image_std
    else:
        # Fallback statistics when the processor exposes no image_processor.
        mean = [0.5, 0.5, 0.5]
        std = [0.5, 0.5, 0.5]

    # Normalize computes (x - mean) / std, so mean' = -m / s and std' = 1 / s
    # give x * s + m, i.e. the inverse of the forward normalisation.
    undo_normalize = transforms.Normalize(
        mean=[-m / s for m, s in zip(mean, std)],
        std=[1 / s for s in std],
    )
    inv_transforms[model_key] = dict(
        inv_transform=transforms.Compose([undo_normalize]),
        mean=mean,
        std=std,
    )
    



# Main experiment: for every crafting model and every attack, craft adversarial
# images for the dataset, record quality metrics and success, then caption each
# saved adversarial image with every loaded model.
for crafting_model_key, crafting_model in models.items():

    crafting_processor = processors[crafting_model_key]
    # Only the first 20 rows of the CSVs are used.
    dataset = VLMDataset(INPUT_META, TARGET_TEXT, INPUT_DIR, 20, processor=crafting_processor)
    dataloader = DataLoader(dataset, shuffle=False, batch_size=1)
    crafting_inv_transform = inv_transforms[crafting_model_key]["inv_transform"]
    mean = inv_transforms[crafting_model_key]["mean"]
    std = inv_transforms[crafting_model_key]["std"]
    # Activation used by every attack below ("linear" is the clipped-linear entry).
    act = "linear"
    attack = {
        "Momentum":Momentum(alpha=1, eps=EPS, mean=mean, std=std, n_iter=N_ITER, activation=act),
        "Nesterov":Nesterov(alpha=1, eps=EPS, mean=mean, std=std, n_iter=N_ITER, activation=act),
        "Adam":Adam(alpha=1, eps=EPS, mean=mean, std=std, n_iter=N_ITER, activation=act),
        "NAdam":NAdam(alpha=1, eps=EPS, mean=mean, std=std, n_iter=N_ITER, activation=act),
        "AdaBelief":AdaBelief(alpha=1, eps=EPS, mean=mean, std=std, n_iter=N_ITER, activation=act),
    }
    for attack_name, attack_method in attack.items():
        # Per-image results for this (crafting model, attack) pair; every list
        # grows by one entry per image so the dict can be turned into a DataFrame.
        results = {
            "CRAFTING_MODEL": [],
            "IMAGE_PATH": [], "CRAFTED_CAPTION": [], "TARGETED_TEXT": [],
            "SSIM": [], "PSNR": [], "FSIM": [], "IW-SSIM": [], "MS-SSIM": [],
            "MDSI": [], "VIF_P": [], "MSE": [], "MAE": [], "SIMILARITY": [],
            "STEP": [], "LOSS": [], "NOISE": [], "SUCCESS": [],
        }
        # One entry per (image, evaluation model) pair.
        evaluated_results = {
            "EVALUATION_MODEL":[],
            "CRAFT_GENERATED_CAPTIONS":[],
            "EVAL_GENERATED_CAPTIONS":[],
            "TARGETED_CAPTIONS":[],
            "CRAFTED_SIMILARITY":[],
            "EVAL_SIMILARITY":[]
        }
        # Keys 4..12 of `results` are SSIM..MAE, in the order compute_metrics returns them.
        metrics = list(results.keys())[4:13]

        # Directory to save adversarial examples and results
        output_dir = fr"{SAVE_DIR}/{crafting_model_key}/{attack_name}"
        os.makedirs(output_dir, exist_ok=True)

        progress = tqdm.tqdm(dataloader, desc=f"{crafting_model_key}/{attack_name}")
        for image_paths, images, target_texts, processed_target_texts in progress:
            # Unpack the data: batch_size is 1, so take the only element of each field.
            # NOTE(review): the dataset returns the processor's pixel_values as-is, so
            # `image` presumably still has a leading batch dim, i.e. (1, 3, H, W) --
            # consistent with `adv[0]` and `adv.unsqueeze(0)` below; TODO confirm.
            image = images[0]
            processed_target_text = processed_target_texts[0]
            target_text = target_texts[0]

            # Generate adversarial example using the crafting model
            # (the trailing 10 is verbose_steps: progress is printed every 10 steps).
            adv, generated_captions, loss, perturb, steps = attack_method.forward(
                crafting_model, image, target_text, processed_target_text, crafting_processor, 10
            )
            adv = crafting_inv_transform(adv[0]).clamp(0, 1)  # Denormalize adversarial image
            adv_img_t = adv.cpu().numpy().transpose(1, 2, 0)  # Convert for saving
            ori = crafting_inv_transform(image).to("cuda").clamp(0, 1)
            # Save the adversarial image (quantised to uint8, under the original file name)
            adv_img_t = np.clip(adv_img_t * 255, 0, 255).astype(np.uint8) 
            save_adv(adv_img_t, os.path.join(output_dir, os.path.basename(image_paths[0])))
            computed_metrics = compute_metrics(ori, adv.unsqueeze(0), True)

            # Normalise both captions: lower-case and drop spaces (punctuation is kept).
            g = generated_captions[-1].lower().replace(" ", "").strip()
            t = target_text.lower().replace(" ", "").strip()
            cos_sim = compute_cosine_similarity(g, t)   
            
            # Success = exact match after normalisation, or near-identical embeddings.
            if g == t or cos_sim >= 0.99:
                results["SUCCESS"].append(1)
            else:
                results["SUCCESS"].append(0)

            for idx, metric_name in enumerate(metrics):
                results[metric_name].append(computed_metrics[idx])

            results["CRAFTING_MODEL"].append(crafting_model_key)
            results["STEP"].append(steps)
            results["LOSS"].append(loss)
            # L2 norm of the perturbation tensor returned by the attack.
            results["NOISE"].append(torch.norm(perturb, 2).item())
            results["IMAGE_PATH"].append(image_paths[0])
            # All captions generated during the attack, one per line.
            results["CRAFTED_CAPTION"].append("\n".join(generated_captions))
            results["TARGETED_TEXT"].append(target_text.lower())
            results["SIMILARITY"].append(cos_sim)

            # Save the results to a CSV (rewritten after every image as a checkpoint)
            output_file = os.path.join(output_dir, f"{crafting_model_key}_{attack_name}_results.csv")
            pd.DataFrame(results).to_csv(output_file, index=False)
            print(f"Checkpoint saved for {crafting_model_key} crafting with {attack_name} attack in {output_file}")
            
            # Evaluate the adversarial sample across all models
            # (the crafting model itself is included in `models`).
            for evaluation_model_key, evaluation_model in models.items():
                evaluation_processor = processors[evaluation_model_key]
                # Rebuild the tensor from the uint8 array that was saved, so the
                # evaluation sees the quantised image rather than the float one.
                adv = torch.from_numpy(adv_img_t).permute(2, 0, 1)
                # Preprocess adversarial image for the evaluation model
                inputs = evaluation_processor(images=adv.unsqueeze(0), return_tensors="pt").to("cuda")
                # Generate caption with the evaluation model
                outputs = evaluation_model.generate(**inputs)
                generated_caption = evaluation_processor.decode(outputs[0], skip_special_tokens=True)
                similarity = compute_cosine_similarity(generated_caption.lower().strip().replace(" ", ""), t)

                evaluated_results["EVALUATION_MODEL"].append(evaluation_model_key)
                evaluated_results["CRAFT_GENERATED_CAPTIONS"].append(generated_captions[-1])
                evaluated_results["EVAL_GENERATED_CAPTIONS"].append(generated_caption)
                evaluated_results["TARGETED_CAPTIONS"].append(target_text)
                evaluated_results["CRAFTED_SIMILARITY"].append(cos_sim)
                evaluated_results["EVAL_SIMILARITY"].append(similarity)

                # Print progress (running count of successful attacks in the bar label)
                progress.set_description(f"{crafting_model_key}/{attack_name}/{sum(results['SUCCESS'])}")
                # The evaluation CSV is rewritten after every evaluation model.
                output_file = os.path.join(output_dir, f"{crafting_model_key}_{attack_name}_eval_results.csv")
                pd.DataFrame(evaluated_results).to_csv(output_file, index=False)