In [1]:
import torch
from transformers import CLIPModel, CLIPTokenizer, CLIPVisionModel, CLIPVisionModelWithProjection, CLIPTextModelWithProjection, AutoTokenizer
from PIL import Image
import torchvision
from torch.nn import DataParallel
import torchvision.transforms as T
from torchvision.transforms import InterpolationMode
import warnings

class MyClip():
    """Wrapper around the CLIP vision and text towers (with projection heads).

    Loads ``openai/clip-vit-base-patch32`` and exposes ``encode_image`` /
    ``encode_text``, which return the projected embeddings produced by the
    respective ``*WithProjection`` models.
    """

    # Per-channel RGB statistics used by the OpenAI CLIP image preprocessing.
    _CLIP_MEAN = (0.48145466, 0.4578275, 0.40821073)
    _CLIP_STD = (0.26862954, 0.26130258, 0.27577711)

    def __init__(self):
        # NOTE: this silences warnings process-wide, not only for this class.
        warnings.filterwarnings("ignore")
        model_name = "openai/clip-vit-base-patch32"
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.clip_vision_model = CLIPVisionModelWithProjection.from_pretrained(model_name).to(self.device)
        self.clip_text_model = CLIPTextModelWithProjection.from_pretrained(model_name).to(self.device)
        self.tokenizer = AutoTokenizer.from_pretrained(model_name)

        # Square input resolution expected by the vision tower.
        self.input_res = self.clip_vision_model.vision_model.config.image_size

    def clip_preprocess(self, image):
        """Turn ``image`` into a normalised ``(B, 3, input_res, input_res)`` batch on ``self.device``.

        Args:
            image: either a ``torch.Tensor`` of shape ``(3, H, W)`` or
                ``(B, 3, H, W)`` (assumed to hold RGB values in ``[0, 1]`` --
                TODO confirm against callers), or a PIL image / list of PIL images.

        Returns:
            A float32 tensor resized (bicubic, shorter side to ``input_res``),
            centre-cropped and normalised with the CLIP channel statistics.

        Raises:
            ValueError: if a tensor input is not 3- or 4-dimensional.
        """
        res = self.input_res

        if isinstance(image, torch.Tensor):
            if image.dim() == 3:
                pixels = image.unsqueeze(0)
            elif image.dim() == 4:
                pixels = image
            else:
                raise ValueError(
                    f"expected a (3, H, W) or (B, 3, H, W) tensor, got shape {tuple(image.shape)}"
                )
            pixels = pixels.to(device=self.device, dtype=torch.float32)

            # Pure-torch resize/crop keeps the operation differentiable w.r.t. the input.
            height, width = pixels.shape[-2:]
            if min(height, width) != res:
                scale = res / min(height, width)
                height = max(res, int(round(height * scale)))
                width = max(res, int(round(width * scale)))
                pixels = torch.nn.functional.interpolate(
                    pixels, size=(height, width), mode="bicubic", align_corners=False
                )
            top = (height - res) // 2
            left = (width - res) // 2
            pixels = pixels[:, :, top:top + res, left:left + res]

            mean = torch.tensor(self._CLIP_MEAN, device=pixels.device, dtype=pixels.dtype).view(1, -1, 1, 1)
            std = torch.tensor(self._CLIP_STD, device=pixels.device, dtype=pixels.dtype).view(1, -1, 1, 1)
            return (pixels - mean) / std

        # PIL path: same recipe expressed with torchvision transforms.
        images = image if isinstance(image, (list, tuple)) else [image]
        transform = T.Compose([
            T.Resize(res, interpolation=InterpolationMode.BICUBIC),
            T.CenterCrop(res),
            T.ToTensor(),
            T.Normalize(mean=self._CLIP_MEAN, std=self._CLIP_STD),
        ])
        pixels = torch.stack([transform(img.convert("RGB")) for img in images])
        return pixels.to(self.device)

    def encode_image(self, image):
        """Return the projected CLIP image embeddings for ``image``.

        The input is first passed through ``clip_preprocess`` (previously this
        method referenced an undefined ``clip_preprocess`` name and failed with
        ``NameError``), which also places the pixels on the model's device.
        """
        pixel_values = self.clip_preprocess(image)
        outputs = self.clip_vision_model(pixel_values=pixel_values)
        return outputs.image_embeds

    def encode_text(self, text):
        """Return the projected CLIP text embeddings for a string or list of strings."""
        inputs = self.tokenizer(text, padding=True, return_tensors="pt")
        inputs = inputs.to(self.device)
        outputs = self.clip_text_model(**inputs)
        return outputs.text_embeds

In [2]:
import torch
from transformers import AutoModel, AutoTokenizer
import warnings

import numpy as np
import torch
import torchvision.transforms as T
from decord import VideoReader, cpu
from PIL import Image
from torchvision.transforms.functional import InterpolationMode
# Per-channel (R, G, B) mean and standard deviation that build_transform
# hands to T.Normalize.
IMAGENET_MEAN = (0.485, 0.456, 0.406)
IMAGENET_STD = (0.229, 0.224, 0.225)
def build_transform(input_size):
    """Build the per-tile preprocessing pipeline.

    Args:
        input_size: side length of the square each image is resized to.

    Returns:
        A ``T.Compose`` that converts non-RGB images to RGB, resizes to
        ``(input_size, input_size)`` with bicubic interpolation, converts to a
        tensor and normalises with ``IMAGENET_MEAN`` / ``IMAGENET_STD``.
    """
    def _ensure_rgb(img):
        # Leave RGB images untouched; convert everything else.
        return img if img.mode == 'RGB' else img.convert('RGB')

    steps = [
        T.Lambda(_ensure_rgb),
        T.Resize((input_size, input_size), interpolation=InterpolationMode.BICUBIC),
        T.ToTensor(),
        T.Normalize(mean=IMAGENET_MEAN, std=IMAGENET_STD),
    ]
    return T.Compose(steps)
def find_closest_aspect_ratio(aspect_ratio, target_ratios, width, height, image_size):
    """Pick the grid from ``target_ratios`` whose width/height ratio is nearest ``aspect_ratio``.

    Args:
        aspect_ratio: width / height of the source image.
        target_ratios: iterable of ``(columns, rows)`` pairs, visited in order.
        width: source image width.
        height: source image height.
        image_size: side length of a single tile.

    Returns:
        The selected ``(columns, rows)`` pair; ``(1, 1)`` when no candidate is
        given. On an exact tie with the current best distance, the later
        candidate wins only if the source area exceeds half the area that
        candidate's grid would cover.
    """
    source_area = width * height
    chosen = (1, 1)
    smallest_gap = float('inf')

    for candidate in target_ratios:
        cols, rows = candidate[0], candidate[1]
        gap = abs(aspect_ratio - cols / rows)

        if gap < smallest_gap:
            smallest_gap, chosen = gap, candidate
            continue

        # Tie-break: prefer the later grid only for sufficiently large images.
        if gap == smallest_gap and source_area > 0.5 * image_size * image_size * cols * rows:
            chosen = candidate

    return chosen
def dynamic_preprocess(image, min_num=1, max_num=12, image_size=448, use_thumbnail=False):
    """Resize ``image`` to the best-fitting tile grid and cut it into square tiles.

    Args:
        image: PIL image to split.
        min_num: minimum number of tiles a candidate grid may contain.
        max_num: maximum number of tiles a candidate grid may contain.
        image_size: side length of each square tile.
        use_thumbnail: when true and more than one tile was produced, append
            the whole image resized to ``(image_size, image_size)``.

    Returns:
        List of PIL images: the tiles in row-major order, optionally followed
        by the thumbnail.
    """
    orig_width, orig_height = image.size
    aspect_ratio = orig_width / orig_height

    # All (columns, rows) grids whose tile count lies within [min_num, max_num],
    # ordered by tile count.
    candidate_grids = {
        (i, j)
        for n in range(min_num, max_num + 1)
        for i in range(1, n + 1)
        for j in range(1, n + 1)
        if min_num <= i * j <= max_num
    }
    candidate_grids = sorted(candidate_grids, key=lambda grid: grid[0] * grid[1])

    # Grid whose aspect ratio is closest to the source image's.
    grid = find_closest_aspect_ratio(
        aspect_ratio, candidate_grids, orig_width, orig_height, image_size)

    target_width = image_size * grid[0]
    target_height = image_size * grid[1]
    blocks = grid[0] * grid[1]

    resized_img = image.resize((target_width, target_height))
    tiles_per_row = target_width // image_size

    def _tile_box(index):
        # (left, upper, right, lower) of the index-th tile, row-major.
        col = index % tiles_per_row
        row = index // tiles_per_row
        return (
            col * image_size,
            row * image_size,
            (col + 1) * image_size,
            (row + 1) * image_size,
        )

    processed_images = [resized_img.crop(_tile_box(index)) for index in range(blocks)]
    assert len(processed_images) == blocks

    if use_thumbnail and len(processed_images) != 1:
        processed_images.append(image.resize((image_size, image_size)))
    return processed_images
def load_image(image_file, input_size=448, max_num=12):
    """Load an image file and return its tiles as one stacked tensor.

    Args:
        image_file: path or file object accepted by ``Image.open``.
        input_size: tile side length, forwarded to ``build_transform`` and
            ``dynamic_preprocess``.
        max_num: maximum number of tiles, forwarded to ``dynamic_preprocess``.

    Returns:
        ``torch.stack`` of the transformed tiles (thumbnail included when more
        than one tile is produced, since ``use_thumbnail=True``).
    """
    rgb_image = Image.open(image_file).convert('RGB')
    tile_transform = build_transform(input_size=input_size)
    tiles = dynamic_preprocess(
        rgb_image, image_size=input_size, use_thumbnail=True, max_num=max_num)
    return torch.stack([tile_transform(tile) for tile in tiles])

In [3]:
class MyInternVL():
    def __init__(self, model_name="OpenGVLab/InternVL2-8B-MPO", num_beams=3, patience=1000000, sleep_time=0, d_type=torch.bfloat16):
        warnings.filterwarnings("ignore")
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

        self.model = AutoModel.from_pretrained(
            model_name,
            torch_dtype=torch.bfloat16,
            low_cpu_mem_usage=True,
            use_flash_attn=True,
            trust_remote_code=True).eval().to(self.device)
        
        self.tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True, use_fast=False)
        self.model.tokenizer = self.tokenizer
        self.d_type = d_type

    def get_response(self, image_path, user_prompt):
        pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).to(self.device)
        generation_config = dict(max_new_tokens=1024, do_sample=False)
        query = user_prompt
        with torch.no_grad():
            with torch.autocast(device_type='cuda', dtype=self.d_type):
            # with self.accelerator.autocast():
                response = self.model.chat(self.tokenizer, pixel_values, query, generation_config)
        prediction = response.strip()
        return prediction


In [4]:
# Instantiate the wrapper with its defaults; this loads the model weights and
# tokenizer (see the checkpoint-shard progress output below).
encoder_model = MyInternVL()

FlashAttention2 is not installed.


Loading checkpoint shards:   0%|          | 0/4 [00:00<?, ?it/s]

Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [20]:
# Text-only check: call the model's chat API with no image (second argument is None).
# NOTE(review): `inputs` and `image_flags` are built here but never passed to the
# chat call below, and the prompt is the literal "who goes there" rather than
# `text` -- confirm whether that is intended.
text = "The diagram illustrates a spring system where the spring force performs negative work on a mass, indicated by the decreasing speed and kinetic energy as the mass approaches the spring."
inputs = encoder_model.tokenizer(text, return_tensors='pt').to(encoder_model.device)
image_flags = torch.zeros((1, 1), dtype=torch.bfloat16).to(encoder_model.device)

# Greedy decoding, capped at 1024 new tokens.
generation_config = dict(max_new_tokens=1024, do_sample=False)
with torch.no_grad():
    outputs = encoder_model.model.chat(encoder_model.tokenizer, None, "who goes there", generation_config)

In [21]:
# Display the reply string returned by the text-only chat call.
outputs

"I'm here to help! How can I assist you today?"

In [5]:
# image_path = '/usr/xtmp/mxy/VLM-Poisoning/data/mini_MathVista_grid/base/2.jpg'

# pixel_values = load_image(image_path, max_num=12).to(torch.bfloat16).to(encoder_model.device)
# outputs2 = encoder_model.model.vision_model(pixel_values)

In [6]:
# outputs.last_hidden_state.shape = torch.Size([9, 1025, 1024])

In [7]:
# outputs2.last_hidden_state.shape = torch.Size([13, 1025, 1024])