In [None]:
import os
import time
import csv
import torch
import pandas as pd
from PIL import Image
from tqdm import tqdm
from glob import glob
from pathlib import Path

from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer

In [None]:
# The encoder-decoder model, the image processor and the tokenizer are all
# loaded from the same Hugging Face checkpoint id.
CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"
model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Generation settings; gen_kwargs is forwarded verbatim to model.generate().
max_length = 25
min_length = 20
num_beams = 4
gen_kwargs = dict(max_length=max_length, min_length=min_length, num_beams=num_beams)

In [None]:
# Run on the GPU when torch can see one, otherwise fall back to the CPU.
_device_name = 'cuda' if torch.cuda.is_available() else 'cpu'
device = torch.device(_device_name)
# Module.to() moves the parameters in place and returns the same module.
model = model.to(device)

def predict(input_dir):
  """Caption every ``*.jpg`` image found directly inside ``input_dir``.

  Images are processed one at a time, in sorted path order, so the returned
  list has a reproducible order that does not depend on the file system.

  Args:
    input_dir: Directory (str or path-like) holding the images. Only files
      matching ``*.jpg`` at the top level of the directory are captioned.

  Returns:
    A list with one entry per image. Each entry is itself a list of stripped
    caption strings, i.e. what ``tokenizer.batch_decode`` returned for that
    image's generated ids. An empty directory yields ``[]``.
  """
  # Materialise and sort the matches: glob makes no ordering promise, and a
  # sized list lets tqdm display a total instead of a bare counter.
  image_paths = sorted(Path(input_dir).glob("*.jpg"))

  captions = []
  for image_path in tqdm(image_paths):
    # The context manager releases the file handle. convert() returns an
    # independent, fully loaded copy, and forcing RGB keeps grayscale /
    # palette / RGBA files from reaching the processor with a channel count
    # other than three.
    with Image.open(image_path) as raw_image:
      image = raw_image.convert("RGB")

    # Pre-process the image for the ViT encoder
    pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    captions.append([pred.strip() for pred in preds])
  return captions

In [None]:
# Folder holding the images to caption
img_path = "E:/.../test_img/"

# One entry per *.jpg found in that folder
result = predict(input_dir=img_path)

In [None]:
# CSV that is read and then overwritten in place with an added 'Caption' column
csv_file_path = 'E:/.../img_name.csv'
df = pd.read_csv(csv_file_path)

# Captions are attached by row position, so the counts have to agree.
# NOTE(review): this also assumes row i of the CSV describes the i-th image
# returned by predict() -- confirm against how the CSV was produced.
if len(result) != len(df):
  raise ValueError(
      f"Got {len(result)} captions for {len(df)} rows in {csv_file_path}; "
      "refusing to write a misaligned 'Caption' column."
  )

# predict() returns a list of caption strings per image; store plain text in
# the CSV rather than the repr of that list (e.g. "['a cat ...']").
df['Caption'] = [
    caption if isinstance(caption, str) else " ".join(caption)
    for caption in result
]
df.to_csv(csv_file_path, index=False)