In [1]:
import torch
from transformers import CLIPModel, CLIPProcessor, CLIPTokenizer, CLIPImageProcessor
from PIL import Image
import torchvision
from torch.nn import DataParallel
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
import warnings

# dtype passed to torch.autocast when running the CLIP image encoder
# (see encode_image_clip below).
d_type = torch.float16

def get_image_encoder_clip():
    """Load the CLIP ViT-L/14 model and its tokenizer.

    Side effects:
        * Calls ``warnings.filterwarnings("ignore")``, which silences all
          Python warnings for the rest of the process.
        * Moves the model to the default CUDA device.

    Returns:
        tuple: ``(vision_tower, processor, None, img_size)`` where

        * ``vision_tower`` is the full ``CLIPModel`` in eval mode on CUDA,
        * ``processor`` is a ``CLIPTokenizer`` (text only -- images are
          preprocessed with torchvision transforms by the callers),
        * the third element is always ``None``,
        * ``img_size`` is ``224``.
    """
    vision_tower_name = "openai/clip-vit-large-patch14"
    warnings.filterwarnings("ignore")

    vision_tower = CLIPModel.from_pretrained(vision_tower_name).eval().cuda()
    # vision_tower.half()
    # Parameters keep requires_grad=True even though the model is in eval mode.
    vision_tower.requires_grad_(True)

    # The model is deliberately left unwrapped: the previous multi-GPU branch
    # was a no-op self-assignment, and callers invoke
    # ``get_image_features`` directly on the returned model, which a
    # DataParallel wrapper would not expose. Report the real situation
    # instead of claiming that several GPUs are in use.
    gpu_count = torch.cuda.device_count()
    if gpu_count > 1:
        print(f"Detected {gpu_count} GPUs; CLIP runs on the default CUDA device only")

    img_size = 224  # Standard image size for CLIP

    processor = CLIPTokenizer.from_pretrained(vision_tower_name)

    return vision_tower, processor, None, img_size

def encode_image_clip(image_encoder, X_adv, img_size, bs, diff_aug, orig_sizes):
    """Embed a batch of images with CLIP's image tower.

    Args:
        image_encoder: Model exposing ``get_image_features(pixel_values=...)``.
        X_adv: Indexable batch of image tensors. Each ``X_adv[j]`` is sliced
            as ``[:, :h, :w]`` -- assumes (C, H, W) float tensors in [0, 1];
            TODO confirm against the caller.
        img_size: Side length used for the resize and the center crop. It
            must match the resolution the encoder expects. ``None`` falls
            back to 224 (the value returned by ``get_image_encoder_clip``).
        bs: Number of leading entries of ``X_adv`` to encode.
        diff_aug: Optional callable applied to each cropped image before
            CLIP preprocessing; skipped when falsy.
        orig_sizes: Per-image ``(width, height)`` pairs; each image is
            cropped to this size first (presumably to remove padding --
            verify against the caller).

    Returns:
        The tensor returned by ``image_encoder.get_image_features`` for the
        stacked batch, computed under CUDA autocast with ``d_type``.
    """
    # Previously ``img_size`` was accepted but ignored in favour of a
    # hard-coded 224; honour the argument and keep 224 as the default.
    side = 224 if img_size is None else img_size

    transform = T.Compose([
        T.Resize(side, interpolation=InterpolationMode.BICUBIC),  # Resize image to CLIP's input size
        T.CenterCrop(side),
        T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])  # CLIP normalization
    ])

    images = []
    for j in range(bs):
        orig_w, orig_h = orig_sizes[j]
        img = X_adv[j][:, :orig_h, :orig_w]

        # Augmentation (if any) runs before the tensor is moved to the GPU,
        # exactly as before.
        img = diff_aug(img).cuda() if diff_aug else img.cuda()

        images.append(transform(img))

    # Every element is already on CUDA, so stacking yields a CUDA batch.
    images = torch.stack(images)

    with torch.autocast(device_type='cuda', dtype=d_type):
        image_embeds = image_encoder.get_image_features(pixel_values=images)

    return image_embeds


def i2t_similarity_clip(image_encoder, processor, image_tensor, text):
    """Return CLIP's image-to-text similarity logits for one image and one caption.

    NOTE: this notebook defines ``i2t_similarity_clip`` again in a later
    cell; whichever definition is executed last is the one in effect.

    Args:
        image_encoder: CLIP model called with ``input_ids``,
            ``attention_mask`` and ``pixel_values``.
        processor: Text tokenizer (or processor) whose call accepts
            ``text=`` and returns ``input_ids`` and ``attention_mask``.
        image_tensor: Image tensor; a leading size-1 dimension is squeezed
            away before preprocessing. Assumes float values in [0, 1] --
            TODO confirm against the caller.
        text: Caption string to score against the image.

    Returns:
        ``outputs.logits_per_image`` from the model.
    """
    transform = T.Compose([
        T.Resize(224, interpolation=InterpolationMode.BICUBIC),  # Resize image to CLIP's input size
        T.CenterCrop(224),
        T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])  # CLIP normalization
    ])

    # Drop a leading batch dimension of size 1, preprocess, then restore it.
    image = transform(image_tensor.squeeze(0)).unsqueeze(0)

    # Pass the caption by keyword: the first positional parameter is not
    # guaranteed to be the text for every tokenizer/processor class and
    # library version, whereas ``text=`` is accepted by both.
    text_inputs = processor(text=[text], return_tensors='pt', padding=True, truncation=True)

    outputs = image_encoder(input_ids=text_inputs['input_ids'].cuda(),
                            attention_mask=text_inputs['attention_mask'].cuda(),
                            pixel_values=image.cuda())

    similarity = outputs.logits_per_image
    return similarity


[2025-01-02 13:27:37,845] [INFO] [real_accelerator.py:110:get_accelerator] Setting ds_accelerator to cuda (auto detect)


KeyboardInterrupt: 

In [None]:
from PIL import Image
import requests
from torchvision import transforms

from transformers import CLIPProcessor, CLIPModel

# Model and processor are loaded from the same checkpoint name.
clip_checkpoint = "openai/clip-vit-large-patch14"

# Full CLIP model, switched to eval mode and moved to the default CUDA device.
model = CLIPModel.from_pretrained(clip_checkpoint)
model = model.eval().cuda()

# Matching processor for the same checkpoint.
processor = CLIPProcessor.from_pretrained(clip_checkpoint)

`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["id2label"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["bos_token_id"]` will be overriden.
`text_config_dict` is provided which will be used to initialize `CLIPTextConfig`. The value `text_config["eos_token_id"]` will be overriden.


In [None]:
# Example image locations.
# NOTE(review): poison_path is an absolute, machine-specific path.
poison_path = "/usr/xtmp/mxy/VLM-Poisoning/data/poisons/mini_MathVista_grid/abst/0.png"
base_path = "data/task_data/mini_MathVista_grid/base_train/1.jpg"
image_poison, image_base = (Image.open(path) for path in (poison_path, base_path))

# Candidate captions; only the first one is scored in this cell.
caption_candidates = [
    "a cartoon illustration of a living room with a chair and table",
    "A diagram of a spring with a label stating that the spring force does negative work, decreasing speed and kinetic energy.",
]
texts = caption_candidates[:1]

# Tokenize the caption(s) and preprocess the poison image in one call,
# then move every resulting tensor to the GPU.
batch = processor(text=texts, images=[image_poison], return_tensors="pt", padding=True)
inputs = {name: tensor.cuda() for name, tensor in batch.items()}

outputs = model(**inputs, output_attentions=True)

# Image-text similarity score; logits_per_image.softmax(dim=1) would turn
# these logits into label probabilities.
logits_per_image = outputs.logits_per_image
logits_per_image


tensor([[8.9219]], device='cuda:0', grad_fn=<TBackward0>)

In [None]:
def i2t_similarity_clip(image_encoder, processor, image_tensor, text):
    """Return CLIP's image-to-text similarity logits for one image and one caption.

    Args:
        image_encoder: CLIP model; called with the tokenized text fields
            plus ``pixel_values``.
        processor: ``CLIPProcessor`` or ``CLIPTokenizer``; only its text
            path is used (the image is preprocessed with torchvision here).
        image_tensor: Image tensor; a leading size-1 dimension is squeezed
            away before preprocessing. Assumes float values in [0, 1] as
            produced by ``ToTensor`` -- TODO confirm for other callers.
        text: Caption string to score against the image.

    Returns:
        ``outputs.logits_per_image`` from the model, on the GPU.
    """
    # Use the ``T`` alias throughout so the function depends on a single
    # torchvision import alias instead of both ``T`` and ``transforms``.
    transform = T.Compose([
        T.Resize(224, interpolation=InterpolationMode.BICUBIC),  # Resize image to CLIP's input size
        T.CenterCrop(224),
        T.Normalize(mean=[0.48145466, 0.4578275, 0.40821073], std=[0.26862954, 0.26130258, 0.27577711])  # CLIP normalization
    ])

    # Drop a leading batch dimension of size 1, preprocess, restore it, and
    # move the result to the GPU.
    image = transform(image_tensor.squeeze(0)).unsqueeze(0).cuda()

    # Pass the caption by keyword: the positional order of ``text`` and
    # ``images`` in a processor call differs between processor classes and
    # library versions, while ``text=`` works for processors and tokenizers.
    text_inputs = processor(text=[text], return_tensors='pt', padding=True, truncation=True)
    text_inputs = {key: value.cuda() for key, value in text_inputs.items()}

    outputs = image_encoder(**text_inputs, pixel_values=image)

    similarity = outputs.logits_per_image
    return similarity

In [None]:
# Score the same poison image through the tensor-based helper.
# NOTE(review): the recorded result here (8.5963) differs from the
# CLIPProcessor path in the earlier cell (8.9219) -- presumably because the
# torchvision tensor resize and the processor's own resize differ; confirm.
poison_tensor = transforms.ToTensor()(image_poison)
i2t_similarity_clip(model, processor, poison_tensor, texts[0])



tensor([[8.5963]], device='cuda:0', grad_fn=<TBackward0>)

In [None]:
# from training_models import get_internlm_model, get_response_internlm

# model, tokenizer = get_internlm_model()

In [None]:
# from PIL import Image

# poison_path = "/usr/xtmp/mxy/VLM-Poisoning/data/poisons/mini_MathVista_grid/abst/0.png"
# base_path = "data/task_data/mini_MathVista_grid/base_train/1.jpg"
# image_poison = Image.open(poison_path)
# image_base = Image.open(base_path)

In [None]:
# get_response_internlm(image_poison, "describe this image", tokenizer, model)

In [None]:
# get_response_internlm(image_base, "briefly describe this image", tokenizer, model)