In [1]:
from PIL import Image
from transformers import AutoModel, AutoProcessor
import torch

  from .autonotebook import tqdm as notebook_tqdm


In [2]:
from transformers import AutoModel, AutoProcessor

model = AutoModel.from_pretrained("unum-cloud/uform-gen2-qwen-500m", trust_remote_code=True)
processor = AutoProcessor.from_pretrained("unum-cloud/uform-gen2-qwen-500m", trust_remote_code=True)

prompt = "Describe the image accurately"
image = Image.open("red_panda.jpg")

inputs = processor(text=[prompt], images=[image], return_tensors="pt")

with torch.inference_mode():
     output = model.generate(
        **inputs,
        do_sample=False,
        use_cache=True,
        max_new_tokens=256,
        eos_token_id=151645,
        pad_token_id=processor.tokenizer.pad_token_id
    )
prompt_len = inputs["input_ids"].shape[1]
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]

Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.29it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.


In [3]:
prompt = "Describe the image shortly"
image = Image.open("0.png")

inputs = processor(text=[prompt], images=[image], return_tensors="pt")

with torch.inference_mode():
     output = model.generate(
        **inputs,
        do_sample=False,
        use_cache=True,
        max_new_tokens=256,
        eos_token_id=151645,
        pad_token_id=processor.tokenizer.pad_token_id
    )
prompt_len = inputs["input_ids"].shape[1]
decoded_text = processor.batch_decode(output[:, prompt_len:])[0]
decoded_text

"The image features a cartoon-style illustration of a person's face, colored in a vibrant red. The person's face is adorned with a red hat and a pair of white glasses. The person's eyes are large and round, with black pupils. The mouth is slightly open, and the nose is pointed upwards. The person's hair is short and straight, with a red color. The face is labeled with various body parts, including the head, neck, shoulders, arms, legs, and feet. The labels are clearly visible and easily readable. The image is a simple yet effective representation of a person's face, with each body part clearly labeled and labeled.<|im_end|>"

In [9]:

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
feature_extractor = ViTImageProcessor.from_pretrained("nlpconnect/vit-gpt2-image-captioning")
tokenizer = AutoTokenizer.from_pretrained("nlpconnect/vit-gpt2-image-captioning")

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)



max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds


predict_step(['61.png']) # ['a woman in a hospital bed with a woman in a hospital bed']


['a collage of photos of a plant and a bird']

In [3]:
from lmdeploy import pipeline, TurbomindEngineConfig, ChatTemplateConfig
from lmdeploy.vl import load_image

model = 'OpenGVLab/Mini-InternVL-Chat-2B-V1-5'
image = load_image('https://raw.githubusercontent.com/open-mmlab/mmdeploy/main/tests/data/tiger.jpeg')
chat_template_config = ChatTemplateConfig('internvl-internlm2')
backend_config = TurbomindEngineConfig(session_len=8192, offload_folder='./test')  # specify the path to offload folder
pipe = pipeline(model, chat_template_config=chat_template_config, backend_config=backend_config)
response = pipe(('describe this image', image))
print(response.text)


Fetching 22 files:   0%|          | 0/22 [00:00<?, ?it/s]



ValueError: At least one of the model submodule will be offloaded to disk, please pass along an `offload_folder`.

In [5]:
import uform
from PIL import Image

# If you want to use the PyTorch model
model, processor = uform.get_model('unum-cloud/uform-gen2-qwen-500m') # Just English
#model, processor = uform.get_model('unum-cloud/uform-vl-multilingual-v2') # 21 Languages

# If you want to use the light-weight portable ONNX model
# Available combinations: cpu & fp32, gpu & fp32, gpu & fp16
# Check out Unum's Hugging Face space for more details: https://huggingface.co/unum-cloud
#model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-small', 'cpu', 'fp32')
model, processor = uform.get_model_onnx('unum-cloud/uform-vl-english-large', 'gpu', 'fp16')

text = 'a small red panda in a zoo'
image = Image.open('red_panda.jpg')

image_data = processor.preprocess_image(image)
text_data = processor.preprocess_text(text)

image_features, image_embedding = model.encode_image(image_data, return_features=True)
text_features, text_embedding = model.encode_text(text_data, return_features=True)

Fetching 13 files: 100%|██████████| 13/13 [00:01<00:00, 12.31it/s]


FileNotFoundError: [Errno 2] No such file or directory: 'C:\\Users\\Patrick\\.cache\\huggingface\\hub\\models--unum-cloud--uform-gen2-qwen-500m\\snapshots\\78dc2e4d600def7698d5fb3733bea4e22dd2f3f9/torch_weight.pt'

In [2]:
import os
import time
import json
import torch
from PIL import Image
from transformers import AutoModel, AutoProcessor, StoppingCriteria, StoppingCriteriaList

# Define paths
DATASET_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\OwnDataSet"
RESULTS_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\Results"

class StopOnTokens(StoppingCriteria):
    def __call__(self, input_ids: torch.LongTensor, scores: torch.FloatTensor, **kwargs) -> bool:
        stop_ids = [151645]
        for stop_id in stop_ids:
            if input_ids[0][-1] == stop_id:
                return True
        return False

def extract_assistant_response(full_response):
    # Split the response by roles
    parts = full_response.split('assistant\n')
    if len(parts) > 1:
        # Return only the last part (assistant's response)
        return parts[-1].strip()
    return full_response  # Return full response if splitting fails

def process_images(model, processor, dataset_path, device):
    results = []
    for filename in os.listdir(dataset_path):
        if filename.endswith((".png", ".jpg", ".jpeg")):
            img_path = os.path.join(dataset_path, filename)
            
            start_time = time.time()
            
            # Load and preprocess image
            image = Image.open(img_path).convert('RGB')
            pixel_values = processor.feature_extractor(image).unsqueeze(0).to(device)
            
            # Generate description
            prompt = "Describe the image shortly"
            messages = [
                {"role": "system", "content": "You are a helpful assistant."},
                {"role": "user", "content": f" <image>{prompt}"}
            ]
            model_inputs = processor.tokenizer.apply_chat_template(
                messages,
                add_generation_prompt=True,
                return_tensors="pt"
            ).to(device)

            attention_mask = torch.ones(
                1, model_inputs.shape[1] + processor.num_image_latents - 1
            ).to(device)
            
            inputs = {
                "input_ids": model_inputs,
                "images": pixel_values,
                "attention_mask": attention_mask
            }
            
            stop = StopOnTokens()
            with torch.no_grad():
                outputs = model.generate(
                    **inputs,
                    max_new_tokens=35,
                    stopping_criteria=StoppingCriteriaList([stop])
                )
            
            full_response = processor.tokenizer.decode(outputs[0], skip_special_tokens=True)
            response = extract_assistant_response(full_response)
            
            end_time = time.time()
            
            # Calculate metrics
            processing_time = end_time - start_time
            input_tokens = inputs['input_ids'].numel()
            output_tokens = outputs.numel()
            
            results.append({
                "filename": filename,
                "processing_time": processing_time,
                "input_tokens": input_tokens,
                "output_tokens": output_tokens,
                "alternative_text": response
            })
            
            print(f"Processed {filename}")
    
    return results

def save_results(results, output_path, device):
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, f"uform-gen2-qwen-500m_{device}_analysis_results.json")
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_file}")

def run_benchmark(device):
    print(f"Running benchmark on {device}")
    
    # Load model and processor
    model_path = 'unum-cloud/uform-gen2-qwen-500m'
    model = AutoModel.from_pretrained(model_path, trust_remote_code=True).to(device)
    processor = AutoProcessor.from_pretrained(model_path, trust_remote_code=True)

    # Process images
    results = process_images(model, processor, DATASET_PATH, device)

    # Print summary
    total_time = sum(r["processing_time"] for r in results)
    total_input_tokens = sum(r["input_tokens"] for r in results)
    total_output_tokens = sum(r["output_tokens"] for r in results)
    num_images = len(results)
    
    print(f"Processed {num_images} images")
    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average time per image: {total_time/num_images:.2f} seconds")
    print(f"Total input tokens: {total_input_tokens}")
    print(f"Total output tokens: {total_output_tokens}")
    print(f"Average input tokens per image: {total_input_tokens/num_images:.2f}")
    print(f"Average output tokens per image: {total_output_tokens/num_images:.2f}")

    # Save results
    save_results(results, RESULTS_PATH, device)

def main():
    # Run benchmark on GPU
    if torch.cuda.is_available():
        run_benchmark("cuda")
    else:
        print("CUDA is not available. Skipping GPU benchmark.")

    # Run benchmark on CPU
    run_benchmark("cpu")

if __name__ == "__main__":
    main()

Running benchmark on cuda


Loading checkpoint shards: 100%|██████████| 2/2 [00:01<00:00,  1.19it/s]
Special tokens have been added in the vocabulary, make sure the associated word embeddings are fine-tuned or trained.
