In [1]:
import os
import torch
import torch.nn.functional as F
from torch.autograd import Variable as V
from torchvision import transforms
from PIL import Image
from tqdm import tqdm
from datasets import load_dataset
from torch.utils.data import IterableDataset, DataLoader
from transformers import CLIPProcessor, CLIPModel
import numpy as np
from styleaug import StyleAugmentor

In [None]:
from datasets import load_dataset

# Open the validation split in streaming mode; `streamed_dataset` is reused by
# the attack and evaluation cells further down.
streamed_dataset = load_dataset("jxie/coco_captions", split="validation", streaming=True)

# Pull records through the stream until 1000 have been seen (or the stream
# ends); `count` holds how many were consumed.
count = 0
for count, example in enumerate(streamed_dataset, start=1):
    if count >= 1000:
        break

Resolving data files:   0%|          | 0/182 [00:00<?, ?it/s]

In [None]:

def clip_by_tensor(t, t_min, t_max):
    """Clamp ``t`` element-wise into the range ``[t_min, t_max]``.

    :param t: Tensor to clamp. Non-floating tensors are converted to float32,
        which is also what the previous mask-and-multiply version returned.
    :param t_min: Lower bound; a Python number or a tensor broadcastable to ``t``.
    :param t_max: Upper bound; a Python number or a tensor broadcastable to ``t``.
    :return: Tensor with every element raised to at least ``t_min`` and then
        capped at ``t_max`` (the upper bound wins if the bounds cross).
    """
    if not torch.is_floating_point(t):
        t = t.float()
    # Bring both bounds to t's dtype/device so scalar and tensor bounds are
    # handled by the same code path.
    lower = torch.as_tensor(t_min, dtype=t.dtype, device=t.device)
    upper = torch.as_tensor(t_max, dtype=t.dtype, device=t.device)
    # Select values instead of multiplying by 0/1 masks: a mask product turns
    # +/-inf into NaN (0 * inf), whereas max/min clamps it to the bound.
    return torch.minimum(torch.maximum(t, lower), upper)

def save_image(images, names, output_dir):
    """Write each image array to ``<output_dir>/<name>.png``.

    :param images: Iterable of array-likes holding pixel values on a 0-255
        scale (the caller in this notebook passes H x W x 3 float arrays).
    :param names: Iterable of file stems, paired with ``images`` by position.
    :param output_dir: Destination directory; created if it does not exist.
    """
    os.makedirs(output_dir, exist_ok=True)
    for img, name in zip(images, names):
        # Round to the nearest integer and clip before the uint8 cast: a bare
        # astype('uint8') truncates (254.99998 -> 254) and wraps values that
        # fall outside 0..255.
        pixels = np.clip(np.rint(np.asarray(img)), 0, 255).astype('uint8')
        Image.fromarray(pixels).save(os.path.join(output_dir, f"{name}.png"))

def gkern(kernlen=7, nsig=3):
    """Build a normalised 2-D Gaussian kernel as a float32 tensor.

    :param kernlen: Number of taps along each side of the square kernel.
    :param nsig: The 1-D profile is sampled on ``kernlen`` evenly spaced
        points covering ``[-nsig, nsig]``.
    :return: Tensor of shape ``[1, 1, kernlen, kernlen]`` whose entries sum to 1.
    """
    import scipy.stats as st
    grid = np.linspace(-nsig, nsig, kernlen)
    profile = st.norm.pdf(grid)
    # Separable kernel: outer product of the 1-D profile with itself.
    kernel_2d = profile[:, None] * profile[None, :]
    kernel_2d = kernel_2d / kernel_2d.sum()
    # Two leading singleton dimensions are added in front of the 2-D kernel.
    return torch.FloatTensor(kernel_2d)[None, None]

T_kernel = gkern(7, 3)

In [None]:
def preprocess_clip_images(images):
    """
    Resize a batch to 224x224 and standardise it per channel for CLIP.

    :param images: Tensor of shape [batch_size, 3, H, W], values in [0, 1]
    :return: Tensor of shape [batch_size, 3, 224, 224], with the fixed
        per-channel mean subtracted and divided by the per-channel std
    """
    resized = transforms.Resize((224, 224))(images)

    # Per-channel statistics, reshaped so they broadcast over batch and space.
    target_device = resized.device
    channel_mean = torch.tensor([0.48145466, 0.4578275, 0.40821073]).to(target_device).view(1, 3, 1, 1)
    channel_std = torch.tensor([0.26862954, 0.26130258, 0.27577711]).to(target_device).view(1, 3, 1, 1)

    return (resized - channel_mean) / channel_std

def STM(images, captions, model, processor, min_val, max_val, device,
        num_iter=10, eps=16.0 / 255.0, momentum=1.0, num_aug=20,
        beta=2.0, gamma=0.5, augmentor=None):
    """Craft adversarial images against a CLIP-style image/text model.

    Each iteration averages the input gradient of an in-batch image-to-text
    cross-entropy loss over ``num_aug`` style-augmented, noise-jittered copies
    of the current images, folds it into a momentum buffer, and takes a
    signed step that increases the loss. The result is clamped to
    ``[min_val, max_val]`` after every step.

    :param images: Float tensor batch; caption ``i`` is treated as the correct
        match for image ``i``. The caller in this notebook passes
        [batch, 3, 224, 224] tensors in [0, 1].
    :param captions: One caption per image, in the same order.
    :param model: Model exposing ``get_text_features``, ``get_image_features``
        and ``logit_scale``.
    :param processor: Tokenizer/processor used to encode ``captions``.
    :param min_val: Lower clamp bound handed to ``clip_by_tensor``.
    :param max_val: Upper clamp bound handed to ``clip_by_tensor``.
    :param device: Device on which the attack runs.
    :param num_iter: Number of attack iterations.
    :param eps: Total perturbation budget; the step size is ``eps / num_iter``.
    :param momentum: Decay factor applied to the accumulated gradient.
    :param num_aug: Augmented copies averaged per iteration.
    :param beta: Uniform jitter is drawn from ``[-eps * beta, eps * beta]``.
    :param gamma: Mixing weight of the current image against its stylised copy.
    :param augmentor: Optional pre-built style augmentor to reuse across
        calls; a fresh ``StyleAugmentor()`` is created when omitted.
    :return: Detached tensor of adversarial images, same shape as ``images``.
    """
    x = images.clone().to(device)
    grad = torch.zeros_like(x)
    alpha = eps / num_iter
    if augmentor is None:
        augmentor = StyleAugmentor()
    # Resize stylised copies back to the input's own spatial size so they can
    # be blended with x (identical to the former fixed 224x224 for 224 inputs).
    resize_to_input = transforms.Resize(size=tuple(x.shape[-2:]))

    text_inputs = processor(text=captions, return_tensors="pt", padding=True, truncation=True).to(device)
    with torch.no_grad():
        text_features = model.get_text_features(**text_inputs)
        text_features = text_features / text_features.norm(dim=-1, keepdim=True)  # Normalize
        # Loop-invariant and not differentiated through, so compute it once.
        logit_scale = model.logit_scale.exp()
    labels = torch.arange(x.size(0), device=device)

    for _ in range(num_iter):
        grad_sum = torch.zeros_like(x)
        for _ in range(num_aug):
            # Building the perturbed sample needs no autograd graph: only the
            # gradient with respect to x_new itself is used.
            with torch.no_grad():
                x_aug = resize_to_input(augmentor(x).to(device))
                jitter = torch.empty_like(x).uniform_(-eps * beta, eps * beta)
                x_new = gamma * x + (1 - gamma) * x_aug + jitter
            x_new.requires_grad_(True)

            image_features = model.get_image_features(pixel_values=preprocess_clip_images(x_new))
            image_features = image_features / image_features.norm(dim=-1, keepdim=True)

            logits_per_image = torch.matmul(image_features, text_features.t()) * logit_scale
            loss = F.cross_entropy(logits_per_image, labels)

            grad_sum += torch.autograd.grad(loss, x_new, retain_graph=False, create_graph=False)[0]

        noise = grad_sum / num_aug
        # Normalise by the per-sample mean absolute gradient. The floor keeps
        # an all-zero gradient (e.g. a batch of one, where the in-batch loss is
        # identically 0) from producing 0/0 = NaN and poisoning `grad`.
        noise = noise / torch.abs(noise).mean([1, 2, 3], keepdim=True).clamp_min(1e-12)
        grad = momentum * grad + noise

        x = x + alpha * torch.sign(grad)
        x = clip_by_tensor(x, min_val, max_val)
    return x.detach()

In [5]:
class CocoStreamDataset(IterableDataset):
    """Iterable wrapper that yields image/caption/id records from a stream.

    :param hf_streamed_dataset: Iterable of mappings providing the keys
        ``"image"`` (an object with ``convert("RGB")``), ``"caption"`` and
        ``"cocoid"``.
    :param max_samples: Maximum number of records to yield; ``None`` means no
        limit, and zero or a negative value yields nothing.
    """

    def __init__(self, hf_streamed_dataset, max_samples=None):
        self.dataset = hf_streamed_dataset
        self.max_samples = max_samples

    def __iter__(self):
        limit = self.max_samples
        # Checking the limit only after a yield would still emit one record
        # for max_samples <= 0, so handle that case up front.
        if limit is not None and limit <= 0:
            return
        for count, example in enumerate(self.dataset, start=1):
            yield {
                "image": example["image"].convert("RGB"),
                "caption": example["caption"],
                "cocoid": example["cocoid"],
            }
            # Stop right after the last wanted record so no extra item is
            # pulled from the underlying stream.
            if limit is not None and count >= limit:
                break

def custom_collate_fn(batch):
    """Collate a list of sample dicts into one dict of per-field lists.

    Values are gathered into plain Python lists, in batch order, under the
    same three keys; nothing is stacked or converted.

    :param batch: Sequence of dicts with keys "image", "caption" and "cocoid".
    :return: Dict with the same three keys, each mapping to a list.
    """
    fields = ("image", "caption", "cocoid")
    return {field: [sample[field] for sample in batch] for field in fields}

In [None]:
# Checkpoint shared by the model and its processor.
CLIP_CHECKPOINT = "openai/clip-vit-base-patch32"

# Prefer the GPU when one is visible, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

model = CLIPModel.from_pretrained(CLIP_CHECKPOINT)
model = model.to(device)
processor = CLIPProcessor.from_pretrained(CLIP_CHECKPOINT)

# Switch to inference mode; as the last expression this also displays the model.
model.eval()

Using a slow image processor as `use_fast` is unset and a slow processor was saved with this model. `use_fast=True` will be the default behavior in v4.50, even if the model was saved with a slow processor. This will result in minor differences in outputs. You'll still be able to use a slow processor with `use_fast=False`.


CLIPModel(
  (text_model): CLIPTextTransformer(
    (embeddings): CLIPTextEmbeddings(
      (token_embedding): Embedding(49408, 512)
      (position_embedding): Embedding(77, 512)
    )
    (encoder): CLIPEncoder(
      (layers): ModuleList(
        (0-11): 12 x CLIPEncoderLayer(
          (self_attn): CLIPSdpaAttention(
            (k_proj): Linear(in_features=512, out_features=512, bias=True)
            (v_proj): Linear(in_features=512, out_features=512, bias=True)
            (q_proj): Linear(in_features=512, out_features=512, bias=True)
            (out_proj): Linear(in_features=512, out_features=512, bias=True)
          )
          (layer_norm1): LayerNorm((512,), eps=1e-05, elementwise_affine=True)
          (mlp): CLIPMLP(
            (activation_fn): QuickGELUActivation()
            (fc1): Linear(in_features=512, out_features=2048, bias=True)
            (fc2): Linear(in_features=2048, out_features=512, bias=True)
          )
          (layer_norm2): LayerNorm((512,), eps=1e

In [None]:
T_kernel = T_kernel.to(device)

# Destination for the generated adversarial PNGs.
output_dir = "clip_stm_outputs"
os.makedirs(output_dir, exist_ok=True)

# PIL image -> 224x224 float tensor in [0, 1].
image_transform = transforms.Compose([
    transforms.Resize((224, 224)),
    transforms.ToTensor(),
])

dataset = CocoStreamDataset(streamed_dataset, max_samples=500)
dataloader = DataLoader(dataset, batch_size=32, collate_fn=custom_collate_fn)

# L-inf radius of the allowed perturbation; same value as the eps used in STM.
linf_budget = 16.0 / 255.0

for batch in tqdm(dataloader):
    images, captions, cocoids = batch["image"], batch["caption"], batch["cocoid"]

    images_tensor = torch.stack([image_transform(img) for img in images]).to(device)

    # Per-pixel bounds: the eps-ball around each image, intersected with [0, 1].
    images_min = clip_by_tensor(images_tensor - linf_budget, 0.0, 1.0)
    images_max = clip_by_tensor(images_tensor + linf_budget, 0.0, 1.0)

    adv_images = STM(images_tensor, captions, model, processor, images_min, images_max, device)

    # NCHW in [0, 1] -> NHWC on a 0-255 scale, then one PNG per cocoid.
    adv_img_np = np.transpose(adv_images.cpu().numpy(), (0, 2, 3, 1)) * 255
    save_image(adv_img_np, cocoids, output_dir)


16it [14:20, 53.81s/it]


In [None]:
def load_adversarial_image(cocoid, adv_dir):
    """Load the saved adversarial image for ``cocoid`` as an RGB PIL image.

    :param cocoid: Identifier used as the PNG file stem.
    :param adv_dir: Directory holding the ``<cocoid>.png`` files.
    :return: The image converted to RGB.
    :raises FileNotFoundError: If ``<adv_dir>/<cocoid>.png`` does not exist.
    """
    adv_path = os.path.join(adv_dir, f"{cocoid}.png")
    # Guard clause: fail early with a descriptive message when the file is missing.
    if not os.path.exists(adv_path):
        raise FileNotFoundError(f"Adversarial image {adv_path} not found")
    return Image.open(adv_path).convert("RGB")

def evaluate_adversarial_images(adv_dir="clip_stm_outputs", max_samples=500, batch_size=32, margin=0.1):
    """Compare image-caption similarity before and after the attack.

    Streams the same samples used for the attack, loads each saved adversarial
    image by its cocoid, and prints the mean matched-pair cosine similarity for
    the original and adversarial images, the drop between them, and the
    fraction of samples whose similarity fell by more than ``margin``.

    :param adv_dir: Directory containing ``<cocoid>.png`` adversarial images.
    :param max_samples: Number of streamed samples to evaluate.
    :param batch_size: Samples per forward pass.
    :param margin: Minimum similarity drop for a sample to count as a
        successful attack.
    :return: None; results are printed.
    """
    device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

    model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32").to(device)
    processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
    model.eval()

    dataset = CocoStreamDataset(streamed_dataset, max_samples=max_samples)
    dataloader = DataLoader(dataset, batch_size=batch_size, collate_fn=custom_collate_fn)

    similarities_orig = []
    similarities_adv = []
    total = 0

    for batch in tqdm(dataloader):
        orig_images = batch["image"]
        captions = batch["caption"]
        cocoids = batch["cocoid"]

        # Best effort: a batch with any missing adversarial file is reported
        # and skipped as a whole, so both similarity lists stay aligned.
        try:
            adv_images = [load_adversarial_image(cocoid, adv_dir) for cocoid in cocoids]
        except FileNotFoundError as e:
            print(e)
            continue

        # NOTE(review): originals are preprocessed only by the CLIP processor,
        # while the saved adversarial images come from a direct 224x224 resize
        # in the attack cell. If the processor crops rather than squashes, the
        # two views are not pixel-aligned -- confirm this baseline is intended.
        inputs_orig = processor(text=captions, images=orig_images, return_tensors="pt", padding=True, truncation=True).to(device)
        inputs_adv = processor(text=captions, images=adv_images, return_tensors="pt", padding=True, truncation=True).to(device)

        with torch.no_grad():
            # Dividing the logits by the learned scale recovers the cosine
            # similarity; the diagonal holds the matched image/caption pairs.
            logit_scale = model.logit_scale.exp()

            logits_per_image_orig = model(**inputs_orig).logits_per_image
            similarity_orig = logits_per_image_orig.diag() / logit_scale

            logits_per_image_adv = model(**inputs_adv).logits_per_image
            similarity_adv = logits_per_image_adv.diag() / logit_scale

        similarities_orig.extend(similarity_orig.cpu().numpy())
        similarities_adv.extend(similarity_adv.cpu().numpy())
        total += logits_per_image_orig.size(0)

    # Without this guard the means below would be NaN (with runtime warnings)
    # whenever every batch was skipped or the stream was empty.
    if total == 0:
        print("No adversarial images could be evaluated; nothing to report.")
        return

    similarities_orig = np.asarray(similarities_orig)
    similarities_adv = np.asarray(similarities_adv)

    avg_similarity_orig = np.mean(similarities_orig)
    avg_similarity_adv = np.mean(similarities_adv)

    # A sample counts as a success when its similarity dropped by more than `margin`.
    asr = np.mean(similarities_adv < similarities_orig - margin)

    print(f"Evaluation Results (over {total} images):")
    print(f"Average Cosine Similarity (Original): {avg_similarity_orig:.4f}")
    print(f"Average Cosine Similarity (Adversarial): {avg_similarity_adv:.4f}")
    print(f"Similarity Drop: {avg_similarity_orig - avg_similarity_adv:.4f}")
    print(f"Attack Success Rate (ASR, margin={margin}): {asr:.4f}")


In [9]:
evaluate_adversarial_images()

16it [00:13,  1.21it/s]

Evaluation Results (over 500 images):
Average Cosine Similarity (Original): 0.3099
Average Cosine Similarity (Adversarial): 0.2047
Similarity Drop: 0.1052
Attack Success Rate (ASR, margin=0.1): 0.5260





## Novelty: STM Attack Evaluation for Multimodal Models

**Evaluation Results (over 500 images):**
- **Average Cosine Similarity (Original)**: 0.3099
- **Average Cosine Similarity (Adversarial)**: 0.2047
- **Similarity Drop**: 0.1052
- **Attack Success Rate (ASR, margin=0.1)**: 0.5260

---