In [2]:
import torch
from PIL import Image
from transformers import AutoModel, CLIPImageProcessor
from torch import nn

models = [
    AutoModel.from_pretrained(
    'OpenGVLab/InternViT-300M-448px-V2_5',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval(),
    AutoModel.from_pretrained(
    'OpenGVLab/InternViT-6B-448px-V2_5',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval(),
    AutoModel.from_pretrained(
    'OpenGVLab/InternViT-6B-448px-V1-5',
    torch_dtype=torch.bfloat16,
    low_cpu_mem_usage=True,
    trust_remote_code=True).cuda().eval()
]

image_processors = [
    CLIPImageProcessor.from_pretrained('OpenGVLab/InternViT-300M-448px-V2_5'), 
    CLIPImageProcessor.from_pretrained('OpenGVLab/InternViT-6B-448px-V2_5'), 
    CLIPImageProcessor.from_pretrained('OpenGVLab/InternViT-6B-448px-V1-5'), 
]



[2025-03-17 12:16:11,651] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)




Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
tgt_path = "../data/mini_MathVista_grid/target/bar.png"
# adv_path = "../data/poisons/mini_MathVista_grid+i2i_InternEnsembleAttack/bar/31.png"
base_path = "../data/mini_MathVista_grid/base_512/31.jpg"
adv_path = base_path

tgt_image = Image.open(tgt_path).convert('RGB')
adv_image = Image.open(adv_path).convert('RGB')

adv_pixel_values = [image_processor(images=adv_image, return_tensors='pt').pixel_values for image_processor in image_processors]
tgt_pixel_values = [image_processor(images=tgt_image, return_tensors='pt').pixel_values for image_processor in image_processors]

adv_pixel_values = [adv_pixel_value.to(torch.bfloat16).cuda() for adv_pixel_value in adv_pixel_values]
tgt_pixel_values = [tgt_pixel_value.to(torch.bfloat16).cuda() for tgt_pixel_value in tgt_pixel_values]

with torch.no_grad():
    adv_outputs = [model(adv_pixel_value) for model, adv_pixel_value in zip(models, adv_pixel_values)]
    tgt_outputs = [model(tgt_pixel_value) for model, tgt_pixel_value in zip(models, tgt_pixel_values)]

In [6]:
cos = nn.CosineSimilarity(dim=0, eps=1e-6)
similarity = [cos(adv_output.last_hidden_state.view(-1), tgt_output.last_hidden_state.view(-1)) for adv_output, tgt_output in zip(adv_outputs, tgt_outputs)]
similarity # .9922 (adv)

# [tensor(0.7188, device='cuda:0', dtype=torch.bfloat16),
#  tensor(0.9102, device='cuda:0', dtype=torch.bfloat16),
#  tensor(0.6602, device='cuda:0', dtype=torch.bfloat16)] adv

[tensor(0.0859, device='cuda:0', dtype=torch.bfloat16),
 tensor(0.4766, device='cuda:0', dtype=torch.bfloat16),
 tensor(0.4082, device='cuda:0', dtype=torch.bfloat16)]

base: 
[tensor(0.2246, device='cuda:0', dtype=torch.bfloat16, grad_fn=<SumBackward1>),
 tensor(0.5430, device='cuda:0', dtype=torch.bfloat16, grad_fn=<SumBackward1>)]

adv: 
[tensor(0.2178, device='cuda:0', dtype=torch.bfloat16),
 tensor(0.5586, device='cuda:0', dtype=torch.bfloat16)]

In [41]:
torch.mean(similarity)

tensor(0.1396, device='cuda:0', dtype=torch.bfloat16, grad_fn=<MeanBackward0>)

In [7]:
[torch.sum(adv_output.last_hidden_state/torch.norm(adv_output.last_hidden_state)*tgt_output.last_hidden_state/torch.norm(tgt_output.last_hidden_state)) for adv_output, tgt_output in zip(adv_outputs, tgt_outputs)] # 0.2178 (adv)

[tensor(0.2178, device='cuda:0', dtype=torch.bfloat16),
 tensor(0.5586, device='cuda:0', dtype=torch.bfloat16),
 tensor(0.4707, device='cuda:0', dtype=torch.bfloat16)]

In [21]:
adv_outputs.last_hidden_state.size()

torch.Size([1, 1025, 1024])

In [1]:
class MyInternEnsemble():
    def __init__(self, tau=2):
        warnings.filterwarnings("ignore")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.models = [
            AutoModel.from_pretrained(
            'OpenGVLab/InternViT-300M-448px-V2_5',
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True).to(self.device).eval(),

            AutoModel.from_pretrained(
            'OpenGVLab/InternViT-6B-448px-V2_5',
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True).to(self.device).eval(),

            AutoModel.from_pretrained(
            'OpenGVLab/InternViT-6B-448px-V1-5',
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            trust_remote_code=True).to(self.device).eval()
        ]

        self.preprocess = torchvision.transforms.Compose(
            [
                torchvision.transforms.Resize(448, interpolation=torchvision.transforms.InterpolationMode.BICUBIC, antialias=True),
                torchvision.transforms.Lambda(lambda img: torch.clamp(img, 0.0, 255.0) / 255.0),
                torchvision.transforms.CenterCrop(448),
                torchvision.transforms.Normalize((0.48145466, 0.4578275, 0.40821073), (0.26862954, 0.26130258, 0.27577711)), # CLIP imgs mean and std.
            ]
        )

        self.costs = torch.ones(2, len(self.models)).to(self.device)
        self.tau = tau
        self.critical = nn.CosineSimilarity(dim=0, eps=1e-6).to(self.device)


        
    def encode_image(self, image, use_grad=True):
        image_features_list = []
        image_tgt = self.preprocess(image)
        assert len(image_tgt.size()) == 3

        context = torch.enable_grad() if use_grad else torch.no_grad()
        with context:
            for model in self.models:
                image_features = model(image_tgt.unsqueeze(0)).last_hidden_state  # [bs, 512]
                image_features = image_features / image_features.norm()
                image_features_list.append(image_features)

        return image_features_list
    
    def get_gradients(self, adv_image_features_list, tgt_image_features_list, adv_tensor):
        model_losses=torch.zeros(len(self.models))
        loss = torch.zeros(1).to(self.device)
        model_losses = torch.stack([self.critical(adv_embed.view(-1), tgt_embed.view(-1)) for adv_embed, tgt_embed in zip(adv_image_features_list, tgt_image_features_list)])

        exp_cost_ratio = torch.exp(self.tau*(self.costs[1] / self.costs[0]+1e-16))
        weights = torch.sum(exp_cost_ratio, dim=0) / (len(self.models)*exp_cost_ratio)
        loss = torch.sum(weights * model_losses)

        self.costs[1] = self.costs[0]
        self.costs[0] = model_losses.clone().detach()


        gradient = torch.autograd.grad(loss, adv_tensor)[0]
        return gradient, torch.mean(model_losses)

In [2]:
import torch
from PIL import Image
from transformers import AutoModel
from torch import nn


import torchvision
from PIL import Image
import torch
import numpy as np

import warnings

ensemble = MyInternEnsemble()


[2025-03-17 07:09:41,147] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

Loading checkpoint shards:   0%|          | 0/3 [00:00<?, ?it/s]

In [5]:
def to_tensor(pic):
    mode_to_nptype = {"I": np.int32, "I;16": np.int16, "F": np.float32}
    img = torch.from_numpy(np.array(pic, mode_to_nptype.get(pic.mode, np.uint8), copy=True))
    img = img.view(pic.size[1], pic.size[0], len(pic.getbands()))
    img = img.permute((2, 0, 1)).contiguous()
    return img.to(dtype=torch.get_default_dtype())

transform_fn = torchvision.transforms.Compose(
    [
        # torchvision.transforms.Resize(224, interpolation=torchvision.transforms.InterpolationMode.BICUBIC),
        # torchvision.transforms.CenterCrop(224),
        torchvision.transforms.Lambda(lambda img: img.convert("RGB")),
        torchvision.transforms.Lambda(lambda img: to_tensor(img)),
        torchvision.transforms.Lambda(lambda img: torch.clamp(img, 0.0, 255.0)),
    ]
)

tgt_path = "../data/mini_MathVista_grid/target/bar.png"
adv_path = "../data/poisons/mini_MathVista_grid+i2i_EnsembleAttack/bar/1.png"
# base_path = "../data/mini_MathVista_grid/base_512/1.jpg"
# adv_path = base_path

tgt_image = Image.open(tgt_path).convert('RGB')
adv_image = Image.open(adv_path).convert('RGB')



adv_pixel_values = transform_fn(adv_image).to(torch.bfloat16).cuda().requires_grad_()
tgt_pixel_values = transform_fn(tgt_image).to(torch.bfloat16).cuda()

adv_image_features_list = ensemble.encode_image(adv_pixel_values)
tgt_image_features_list = ensemble.encode_image(tgt_pixel_values, use_grad=False)

ensemble.get_gradients(adv_image_features_list, tgt_image_features_list, adv_pixel_values)

(tensor([[[ 5.2929e-05,  5.0545e-05,  5.8174e-05,  ..., -3.9816e-05,
           -2.7657e-05,  2.4438e-05],
          [-4.8876e-05,  2.2531e-05,  5.6505e-05,  ..., -9.0599e-06,
           -4.6968e-05, -1.3411e-05],
          [-4.2200e-05, -6.9141e-06,  1.6451e-05,  ...,  2.8491e-05,
           -2.8849e-05, -8.3447e-05],
          ...,
          [-7.5817e-05, -4.1723e-05, -5.3883e-05,  ...,  9.0003e-06,
            2.1744e-04,  1.6308e-04],
          [-8.1539e-05, -4.4346e-05, -3.6716e-05,  ...,  2.6822e-05,
            2.1458e-04,  2.0599e-04],
          [-8.0585e-05, -3.7193e-05, -1.8358e-05,  ...,  1.4782e-05,
            4.2915e-06, -2.8729e-05]],
 
         [[-4.6492e-05, -7.5698e-06,  6.1691e-06,  ..., -2.6822e-05,
            3.7104e-06, -3.4094e-05],
          [-3.8385e-05, -2.2650e-06, -5.5730e-06,  ..., -2.0027e-05,
            3.0398e-05, -5.1498e-05],
          [-3.6240e-05, -1.2338e-05,  1.0133e-05,  ...,  3.2663e-05,
            1.1861e-05, -2.4676e-05],
          ...,
    

In [4]:
torch.cuda.empty_cache()