In [None]:
import torch
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image
from transformers.utils import hub

# Intended to lengthen the Hub HTTP timeout to 60.
# NOTE(review): this only assigns an attribute on `transformers.utils.hub`;
# confirm the installed library actually reads it, otherwise it has no effect.
hub.HUGGINGFACE_HUB_HTTP_TIMEOUT = 60

# Single checkpoint identifier shared by the model, image processor and tokenizer.
_CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

# Use the GPU when torch reports CUDA as available, otherwise run on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Load the captioning model together with its matching preprocessing pieces.
model = VisionEncoderDecoderModel.from_pretrained(_CHECKPOINT)
processor = ViTImageProcessor.from_pretrained(_CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(_CHECKPOINT)

# Move the weights to the selected device and switch to evaluation mode.
model.to(device)
model.eval()

# Caption generation
def generate_caption(image_path, max_length=30, num_beams=4):
    """Generate a text caption for the image at ``image_path``.

    Args:
        image_path: Location of the image file, passed to ``Image.open``.
        max_length: Forwarded to ``model.generate`` as ``max_length``.
        num_beams: Forwarded to ``model.generate`` as ``num_beams``.

    Returns:
        The decoded caption for the first generated sequence, with special
        tokens removed and leading/trailing whitespace stripped.
    """
    # Open through a context manager so the underlying file is closed as soon
    # as the RGB copy has been produced, instead of relying on garbage
    # collection to release the handle.
    with Image.open(image_path) as source:
        image = source.convert("RGB")

    pixel_values = processor(images=image, return_tensors="pt").pixel_values.to(device)
    output_ids = model.generate(
        pixel_values,
        max_length=max_length,
        num_beams=num_beams,
        early_stopping=True,
    )
    caption = tokenizer.decode(output_ids[0], skip_special_tokens=True)
    # The decoded text can carry stray surrounding whitespace; return it clean.
    return caption.strip()

# Generate caption
if __name__ == "__main__":
    # Example input; point this at your own image file.
    sample_image = "/content/cat-1.jpeg"
    caption = generate_caption(sample_image)
    print("Caption:", caption)

Fetching 1 files:   0%|          | 0/1 [00:00<?, ?it/s]

Caption: a cat sitting on the floor next to a rug 
