In [1]:
import os
import time
import json
import torch
import transformers
from transformers import AutoModelForCausalLM, AutoTokenizer
from PIL import Image
import warnings

# Define paths
DATASET_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\OwnDataSet"
RESULTS_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\Results"

# Disable some warnings
transformers.logging.set_verbosity_error()
transformers.logging.disable_progress_bar()
warnings.filterwarnings('ignore')

def load_image(image_file):
    return Image.open(image_file).convert('RGB')

def process_images(model, tokenizer, dataset_path):
    results = []
    for filename in os.listdir(dataset_path):
        if filename.endswith((".png", ".jpg", ".jpeg")):
            img_path = os.path.join(dataset_path, filename)
            
            start_time = time.time()
            
            # Load image
            image = load_image(img_path)
            
            # Generate description
            prompt = 'Please describe the image shortly Maximum 150 characters.'
            messages = [
                {"role": "user", "content": f'<image>\n{prompt}'}
            ]
            text = tokenizer.apply_chat_template(
                messages,
                tokenize=False,
                add_generation_prompt=True
            )
            
            text_chunks = [tokenizer(chunk).input_ids for chunk in text.split('<image>')]
            input_ids = torch.tensor(text_chunks[0] + [-200] + text_chunks[1], dtype=torch.long).unsqueeze(0)
            
            image_tensor = model.process_images([image], model.config).to(dtype=model.dtype)
            
            output_ids = model.generate(
                input_ids,
                images=image_tensor,
                max_new_tokens=150,
                use_cache=True)[0]
            
            response = tokenizer.decode(output_ids[input_ids.shape[1]:], skip_special_tokens=True).strip()
            
            end_time = time.time()
            
            # Calculate metrics
            processing_time = end_time - start_time
            output_tokens = len(tokenizer.encode(response))
            
            results.append({
                "filename": filename,
                "processing_time": processing_time,
                "output_tokens": output_tokens,
                "alternative_text": response
            })
            
            print(f"Processed {filename}")
    
    return results

def save_results(results, output_path):
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, "nanoLLaVA_analysis_results.json")
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_file}")

# Setup for CPU or GPU
use_cpu = True  # Set this to False if you want to use GPU
device = 'cpu' if use_cpu else 'cuda'
torch.set_default_device(device)

# Load model and tokenizer
model = AutoModelForCausalLM.from_pretrained(
    'qnguyen3/nanoLLaVA',
    torch_dtype=torch.float16,
    device_map='auto',
    trust_remote_code=True)
tokenizer = AutoTokenizer.from_pretrained(
    'qnguyen3/nanoLLaVA',
    trust_remote_code=True)

# Process images
results = process_images(model, tokenizer, DATASET_PATH)

# Print summary
total_time = sum(r["processing_time"] for r in results)
total_output_tokens = sum(r["output_tokens"] for r in results)
num_images = len(results)

print(f"Processed {num_images} images")
print(f"Total processing time: {total_time:.2f} seconds")
print(f"Average time per image: {total_time/num_images:.2f} seconds")
print(f"Total output tokens: {total_output_tokens}")
print(f"Average output tokens per image: {total_output_tokens/num_images:.2f}")

# Save results
save_results(results, RESULTS_PATH)

  from .autonotebook import tqdm as notebook_tqdm


KeyboardInterrupt: 

In [4]:
import requests
from PIL import Image
from transformers import Blip2Processor, Blip2ForConditionalGeneration
import torch

print("Starting the process...")

print("Loading processor...")
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")
print("Processor loaded.")

print("Loading model...")
model = Blip2ForConditionalGeneration.from_pretrained("Salesforce/blip2-opt-2.7b")
print("Model loaded.")

print("Downloading image...")
img_url = 'https://storage.googleapis.com/sfr-vision-language-research/BLIP/demo.jpg' 
raw_image = Image.open(requests.get(img_url, stream=True).raw).convert('RGB')
print("Image downloaded and converted.")

question = "What is on the image?"
print("Processing inputs...")
inputs = processor(raw_image, question, return_tensors="pt")
print("Inputs processed.")

print("Generating output...")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=50)
print("Output generated.")

result = processor.decode(out[0], skip_special_tokens=True).strip()
print("Result:", result)

Starting the process...
Loading processor...
Processor loaded.
Loading model...


Loading checkpoint shards: 100%|██████████| 2/2 [00:21<00:00, 10.92s/it]


Model loaded.
Downloading image...
Image downloaded and converted.
Processing inputs...
Inputs processed.
Generating output...
Output generated.
Result: 


In [5]:
processor

Blip2Processor:
- image_processor: BlipImageProcessor {
  "do_convert_rgb": true,
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "image_mean": [
    0.48145466,
    0.4578275,
    0.40821073
  ],
  "image_processor_type": "BlipImageProcessor",
  "image_std": [
    0.26862954,
    0.26130258,
    0.27577711
  ],
  "processor_class": "Blip2Processor",
  "resample": 3,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}

- tokenizer: GPT2TokenizerFast(name_or_path='Salesforce/blip2-opt-2.7b', vocab_size=50265, model_max_length=1000000000000000019884624838656, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '</s>', 'eos_token': '</s>', 'unk_token': '</s>', 'pad_token': '<pad>'}, clean_up_tokenization_spaces=True),  added_tokens_decoder={
	1: AddedToken("<pad>", rstrip=False, lstrip=False, single_word=False, normalized=True, special=True),
	2: AddedToken("</s>", rstrip=False, lstrip=Fa

In [6]:
print("Generating output...")
with torch.no_grad():
    out = model.generate(**inputs, max_new_tokens=50)
print("Output generated.")
print("Raw output:", out)
print("Output shape:", out.shape)

Generating output...
Output generated.
Raw output: tensor([[50118]])
Output shape: torch.Size([1, 1])


In [7]:
result = processor.decode(out[0], skip_special_tokens=True).strip()
print("Result:", result)

# Manual decoding
vocab = processor.tokenizer.get_vocab()
inv_vocab = {v: k for k, v in vocab.items()}
manual_result = ' '.join([inv_vocab.get(token.item(), '[UNK]') for token in out[0]])
print("Manual decoding result:", manual_result)

Result: 
Manual decoding result: Ċ


In [8]:
print("Input IDs shape:", inputs['input_ids'].shape)
print("Attention mask shape:", inputs['attention_mask'].shape)
print("Sample of input IDs:", inputs['input_ids'][0][:10])  # Print first 10 tokens

Input IDs shape: torch.Size([1, 7])
Attention mask shape: torch.Size([1, 7])
Sample of input IDs: tensor([   2, 2264,   16,   15,    5, 2274,  116])


In [9]:
# Simple generation test
test_input = processor("What is the capital of France?", return_tensors="pt")
test_output = model.generate(**test_input, max_new_tokens=50)
test_result = processor.decode(test_output[0], skip_special_tokens=True).strip()
print("Test result:", test_result)

ValueError: Invalid image type. Expected either PIL.Image.Image, numpy.ndarray, torch.Tensor, tf.Tensor or jax.ndarray, but got <class 'str'>.

In [10]:
generated_ids = model.generate(
    pixel_values=inputs['pixel_values'],
    input_ids=inputs['input_ids'],
    attention_mask=inputs['attention_mask'],
    max_new_tokens=50,
    num_beams=5,
    no_repeat_ngram_size=2,
    early_stopping=True
)
generated_text = processor.batch_decode(generated_ids, skip_special_tokens=True)[0].strip()
print("Generated text:", generated_text)

Generated text: a woman sitting on the beach with her dog


In [13]:
import os
import time
import json
import torch
from transformers import Blip2Processor, Blip2ForConditionalGeneration
from PIL import Image
import warnings

# Define paths
DATASET_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\OwnDataSet"
RESULTS_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\Results"

# Disable some warnings
warnings.filterwarnings('ignore')

def load_image(image_file):
    return Image.open(image_file).convert('RGB')

def process_images(model, processor, dataset_path):
    results = []
    for filename in os.listdir(dataset_path):
        if filename.endswith((".png", ".jpg", ".jpeg")):
            img_path = os.path.join(dataset_path, filename)
            
            start_time = time.time()
            
            # Load and process image
            image = load_image(img_path)
            
            # Generate description
            prompt = 'Please describe the image shortly Maximum 150 characters.'
            inputs = processor(image, prompt, return_tensors="pt")
            inputs = {k: v.to(model.device) for k, v in inputs.items()}
            
            with torch.no_grad():
                output_ids = model.generate(
                    **inputs,
                    max_new_tokens=150,
                    num_beams=5,
                    no_repeat_ngram_size=2,
                    early_stopping=True
                )
            
            response = processor.decode(output_ids[0], skip_special_tokens=True).strip()
            
            end_time = time.time()
            
            # Calculate metrics
            processing_time = end_time - start_time
            output_tokens = len(processor.tokenizer.encode(response))
            
            results.append({
                "filename": filename,
                "processing_time": processing_time,
                "output_tokens": output_tokens,
                "alternative_text": response
            })
            
            print(f"Processed {filename}")
    
    return results

def save_results(results, output_path):
    os.makedirs(output_path, exist_ok=True)
    output_file = os.path.join(output_path, "blip2_analysis_results.json")
    with open(output_file, "w") as f:
        json.dump(results, f, indent=2)
    print(f"Results saved to {output_file}")

# Setup for CPU or GPU
use_cpu = True  # Set this to False if you want to use GPU

# Load model and processor
print("Loading model and processor...")
processor = Blip2Processor.from_pretrained("Salesforce/blip2-opt-2.7b")

if use_cpu:
    model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        torch_dtype=torch.float32,
        device_map="cpu"
    )
else:
    model = Blip2ForConditionalGeneration.from_pretrained(
        "Salesforce/blip2-opt-2.7b",
        torch_dtype=torch.float16,
        device_map="auto"
    )

print("Model and processor loaded.")

# Process images
print("Processing images...")
results = process_images(model, processor, DATASET_PATH)

# Print summary
total_time = sum(r["processing_time"] for r in results)
total_output_tokens = sum(r["output_tokens"] for r in results)
num_images = len(results)
print(f"Processed {num_images} images")
print(f"Total processing time: {total_time:.2f} seconds")
print(f"Average time per image: {total_time/num_images:.2f} seconds")
print(f"Total output tokens: {total_output_tokens}")
print(f"Average output tokens per image: {total_output_tokens/num_images:.2f}")

# Save results
save_results(results, RESULTS_PATH)

Loading model and processor...


Loading checkpoint shards: 100%|██████████| 2/2 [00:00<00:00,  3.86it/s]


Model and processor loaded.
Processing images...
Processed 01.jpg
Processed 02.jpg
Processed 03.jpg
Processed 04.jpg
Processed 05.jpg
Processed 06.png
Processed 07.png
Processed 08.png
Processed 09.png
Processed 10.png
Processed 11.png
Processed 12.png
Processed 13.png
Processed 14.jpg
Processed 15.png
Processed 15 images
Total processing time: 273.73 seconds
Average time per image: 18.25 seconds
Total output tokens: 15
Average output tokens per image: 1.00
Results saved to C:\Users\Patrick\Documents\thesis\Dataset\Results\blip2_analysis_results.json
