In [2]:
import os
import time
import json
import torch
from PIL import Image
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer

# Define paths
# Both locations are hard-coded absolute Windows paths; adjust them before
# running this script on another machine.
# DATASET_PATH: directory that run_benchmark() hands to process_images() as
# the folder to scan for input images.
DATASET_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\OwnDataSet"
# RESULTS_PATH: directory that run_benchmark() hands to save_results() as the
# destination for the JSON report.
RESULTS_PATH = r"C:\Users\Patrick\Documents\thesis\Dataset\Results"

def load_model_and_processors(device):
    """Load the ViT-GPT2 captioning model and its pre/post-processors.

    All three objects are loaded from the
    ``nlpconnect/vit-gpt2-image-captioning`` checkpoint. Only the model is
    moved to ``device``; the feature extractor and tokenizer are returned
    as loaded.

    Args:
        device: Target device forwarded to the model's ``.to()`` call
            (this script uses ``"cuda"`` and ``"cpu"``).

    Returns:
        A ``(model, feature_extractor, tokenizer)`` tuple.
    """
    checkpoint = "nlpconnect/vit-gpt2-image-captioning"

    captioner = VisionEncoderDecoderModel.from_pretrained(checkpoint)
    captioner = captioner.to(device)
    extractor = ViTFeatureExtractor.from_pretrained(checkpoint)
    text_tokenizer = AutoTokenizer.from_pretrained(checkpoint)

    return captioner, extractor, text_tokenizer

def process_images(model, feature_extractor, tokenizer, dataset_path, device):
    """Caption every image in ``dataset_path`` and record per-image metrics.

    Files are selected by extension (``.png``, ``.jpg``, ``.jpeg``,
    compared case-insensitively) and processed in sorted filename order.
    For each image the timed region covers loading, preprocessing,
    generation and decoding.

    Args:
        model: Captioning model exposing ``generate()``.
        feature_extractor: Callable that turns a PIL image into an object
            with a ``pixel_values`` tensor.
        tokenizer: Tokenizer exposing ``decode()``.
        dataset_path: Directory to scan (not recursive).
        device: Device the pixel tensor is moved to before generation.

    Returns:
        A list with one dict per processed image, containing the keys
        ``filename``, ``processing_time`` (seconds), ``input_tokens``,
        ``output_tokens`` and ``alternative_text``. Empty when the
        directory holds no matching files.
    """
    image_suffixes = (".png", ".jpg", ".jpeg")
    results = []

    # os.listdir() gives no ordering guarantee; sorting keeps the run order
    # and the order of the saved results reproducible.
    for filename in sorted(os.listdir(dataset_path)):
        # Case-insensitive match so files such as "IMG_01.JPG" are not
        # silently left out of the benchmark.
        if not filename.lower().endswith(image_suffixes):
            continue

        img_path = os.path.join(dataset_path, filename)

        # perf_counter() is the high-resolution monotonic clock intended
        # for measuring short durations; time.time() can be coarse and is
        # affected by system clock adjustments.
        start_time = time.perf_counter()

        # Load and preprocess image. The context manager closes the
        # underlying file as soon as the RGB copy has been made.
        with Image.open(img_path) as raw_image:
            image = raw_image.convert("RGB")
        pixel_values = feature_extractor(
            images=image, return_tensors="pt"
        ).pixel_values.to(device)

        # Generate description
        with torch.no_grad():
            output = model.generate(
                pixel_values, max_length=35, num_return_sequences=1
            )

        response = tokenizer.decode(output[0], skip_special_tokens=True)

        end_time = time.perf_counter()

        # Calculate metrics
        processing_time = end_time - start_time
        # NOTE: "input_tokens" is the total element count of the pixel
        # tensor, not a count of model input tokens.
        input_tokens = pixel_values.numel()
        # NOTE: "output_tokens" is the element count of the generated id
        # tensor; decode() above drops special tokens, so this number may
        # be larger than the number of tokens visible in the caption.
        output_tokens = output.numel()

        results.append({
            "filename": filename,
            "processing_time": processing_time,
            "input_tokens": input_tokens,
            "output_tokens": output_tokens,
            "alternative_text": response,
        })

        print(f"Processed {filename}")

    return results

def save_results(results, output_path, device):
    """Write ``results`` as indented JSON into ``output_path``.

    The directory is created if it does not exist. The file is named
    ``vit-gpt2_<device>_analysis_results.json`` and is overwritten when
    already present. The final location is printed.

    Args:
        results: JSON-serializable object to store.
        output_path: Destination directory.
        device: Device label embedded in the file name.
    """
    os.makedirs(output_path, exist_ok=True)

    report_name = f"vit-gpt2_{device}_analysis_results.json"
    output_file = os.path.join(output_path, report_name)

    with open(output_file, mode="w") as handle:
        json.dump(results, handle, indent=2)

    print(f"Results saved to {output_file}")

def run_benchmark(device):
    """Run the captioning benchmark on ``device`` and report the outcome.

    Loads the model, captions every image under ``DATASET_PATH``, prints
    totals and per-image averages, and stores the per-image results under
    ``RESULTS_PATH``.

    If no images were processed, a notice is printed and the function
    returns without computing averages (which would divide by zero) and
    without writing a results file.

    Args:
        device: Device label to benchmark on (this script uses ``"cuda"``
            and ``"cpu"``).
    """
    print(f"Running benchmark on {device}")

    # Load model and processors
    model, feature_extractor, tokenizer = load_model_and_processors(device)

    # Process images
    results = process_images(model, feature_extractor, tokenizer, DATASET_PATH, device)

    num_images = len(results)
    print(f"Processed {num_images} images")

    # Guard against an empty dataset: the averages below divide by
    # num_images.
    if num_images == 0:
        print(f"No images found in {DATASET_PATH}; nothing to summarize or save.")
        return

    # Print summary
    total_time = sum(r["processing_time"] for r in results)
    total_input_tokens = sum(r["input_tokens"] for r in results)
    total_output_tokens = sum(r["output_tokens"] for r in results)

    print(f"Total processing time: {total_time:.2f} seconds")
    print(f"Average time per image: {total_time/num_images:.2f} seconds")
    print(f"Total input tokens: {total_input_tokens}")
    print(f"Total output tokens: {total_output_tokens}")
    print(f"Average input tokens per image: {total_input_tokens/num_images:.2f}")
    print(f"Average output tokens per image: {total_output_tokens/num_images:.2f}")

    # Save results
    save_results(results, RESULTS_PATH, device)

def main():
    """Benchmark on the GPU when CUDA is usable, then always on the CPU."""
    gpu_ready = torch.cuda.is_available()

    # GPU pass first; announce the skip when CUDA is not available.
    if not gpu_ready:
        print("CUDA is not available. Skipping GPU benchmark.")
    else:
        run_benchmark("cuda")

    # The CPU pass runs unconditionally.
    run_benchmark("cpu")

# Start the benchmarks only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == "__main__":
    main()

Running benchmark on cuda




Processed 01.jpg
Processed 02.jpg
Processed 03.jpg
Processed 04.jpg
Processed 05.jpg
Processed 06.png
Processed 07.png
Processed 08.png
Processed 09.png
Processed 10.png
Processed 11.png
Processed 12.png
Processed 13.png
Processed 14.jpg
Processed 15.png
Processed 15 images
Total processing time: 3.59 seconds
Average time per image: 0.24 seconds
Total input tokens: 2257920
Total output tokens: 189
Average input tokens per image: 150528.00
Average output tokens per image: 12.60
Results saved to C:\Users\Patrick\Documents\thesis\Dataset\Results\vit-gpt2_cuda_analysis_results.json
Running benchmark on cpu
Processed 01.jpg
Processed 02.jpg
Processed 03.jpg
Processed 04.jpg
Processed 05.jpg
Processed 06.png
Processed 07.png
Processed 08.png
Processed 09.png
Processed 10.png
Processed 11.png
Processed 12.png
Processed 13.png
Processed 14.jpg
Processed 15.png
Processed 15 images
Total processing time: 15.80 seconds
Average time per image: 1.05 seconds
Total input tokens: 2257920
Total output 