In [7]:
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
import torch
from PIL import Image

model = VisionEncoderDecoderModel.from_pretrained("/data1/ViTgpt2/image-captioning-output")
feature_extractor = ViTImageProcessor.from_pretrained("/data1/ViTgpt2/image-captioning-output")
tokenizer = AutoTokenizer.from_pretrained("/data1/ViTgpt2/image-captioning-output")


device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
model.to(device)

max_length = 16
num_beams = 4
gen_kwargs = {"max_length": max_length, "num_beams": num_beams}
def predict_step(image_paths):
  images = []
  for image_path in image_paths:
    i_image = Image.open(image_path)
    if i_image.mode != "RGB":
      i_image = i_image.convert(mode="RGB")

    images.append(i_image)

  pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
  pixel_values = pixel_values.to(device)

  output_ids = model.generate(pixel_values, **gen_kwargs)

  preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
  preds = [pred.strip() for pred in preds]
  return preds


print(predict_step(['/home/vcl3d/coco_dataset_VOX/test2015/COCO_test2015_000000000202.jpg']))

['1 The person1 is to the right of the tie']


In [8]:
model.eval()

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0-11): 12 x ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_featur

In [9]:
from optimum.onnxruntime import ORTModelForImageClassification
from optimum.onnxruntime import ORTModelForVision2Seq
from transformers import AutoFeatureExtractor
from pathlib import Path


model_id="/data1/ViTgpt2/image-captioning-output"
onnx_path = Path("onnx")

# load vanilla transformers and convert to onnx
model = ORTModelForVision2Seq.from_pretrained(model_id, from_transformers=False)
preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
tokenizer = AutoTokenizer.from_pretrained(model_id)
# save onnx checkpoint and tokenizer
model.save_pretrained(onnx_path)
preprocessor.save_pretrained(onnx_path)


The argument `from_transformers` is deprecated, and will be removed in optimum 2.0.  Use `export` instead
Framework not specified. Using pt to export to ONNX.
Using framework PyTorch: 2.0.1+cu118


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118


verbose: False, log level: Level.ERROR



Using framework PyTorch: 2.0.1+cu118
Asked a sequence length of 16, but a sequence length of 1 will be used with use_past == True for `decoder_input_ids`.


verbose: False, log level: Level.ERROR



['onnx/preprocessor_config.json']

In [10]:
from transformers import pipeline

vanilla_clf = pipeline("image-to-text", model=model, feature_extractor=preprocessor, tokenizer=tokenizer)
print(vanilla_clf("https://datasets-server.huggingface.co/assets/visual_genome/--/attributes_v1.0.0/train/0/image/image.jpg"))

[{'generated_text': '2 The stop sign is to the left of the train'}]


In [29]:
# from optimum.onnxruntime import ORTModelForImageClassification
# from optimum.onnxruntime import ORTModelForVision2Seq
# from transformers import AutoFeatureExtractor, VisionEncoderDecoderConfig
# from pathlib import Path

# from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel, GPT2Config
# model_id="/data1/ViTgpt2/image-captioning-output"
# onnx_path = Path("onnx")
# # Initializing a ViT & BERT style configuration
# config_encoder = ViTConfig()
# #config_decoder = BertConfig()
# config_decoder = GPT2Config()

# config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

# # Initializing a ViTBert model (with random weights) from a ViT & bert-base-uncased style configurations
# model = VisionEncoderDecoderModel(config=config)


# config_encoder = model.config.encoder
# config_decoder = model.config.decoder

# config_decoder.is_decoder = True
# config_decoder.add_cross_attention = True

# model.save_pretrained(model_id)

# encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained(model_id)
# model = VisionEncoderDecoderModel.from_pretrained(model_id)
# # load vanilla transformers and convert to onnx
# #model = ORTModelForVision2Seq.from_pretrained(model_id, from_transformers=True)
# preprocessor = AutoFeatureExtractor.from_pretrained(model_id)
# tokenizer = AutoTokenizer.from_pretrained(model_id)
# # save onnx checkpoint and tokenizer
# model.save_pretrained(onnx_path)
# preprocessor.save_pretrained(onnx_path)

['onnx/preprocessor_config.json']

In [14]:
import onnxruntime as ort
import onnx
decoder_model_path = '/data1/ViTgpt2/onnx/decoder_model.onnx'
decoder_onnx_model = onnx.load(decoder_model_path)
encoder_model_path = '/data1/ViTgpt2/onnx/encoder_model.onnx'
encoder_onnx_model = onnx.load(encoder_model_path)
decoder_session = ort.InferenceSession(decoder_model_path)
encoder_session = ort.InferenceSession(encoder_model_path)
input_names = [input.name for input in encoder_onnx_model.graph.input]
print("Input Names:", input_names)

Input Names: ['pixel_values']


In [19]:
output_names = [input.name for input in decoder_onnx_model.graph.input]
print("Output Names:", output_names)

Output Names: ['input_ids', 'encoder_hidden_states']


In [20]:
output_names

'input_ids'

In [24]:
encoder_session.get_providers()

['CPUExecutionProvider']

In [None]:
input_names
#inputs = {'input_ids': input_names[0], 'encoder_hidden_states': input_names[1]}

In [None]:
key_value_pairs = {}

# Iterate through the list of strings
for string in input_names:
    # Split the string into key and value based on the last dot (.)
    parts = string.split('.')
    if len(parts) == 2:
        key, value = parts
    else:
        key = string
        value = string
    
    # Add the key-value pair to the dictionary
    key_value_pairs[key] = value

# Now, key_value_pairs contains the key-value pairs
print(key_value_pairs)

In [None]:
key_value_pairs.values()

In [None]:
import numpy as np
from PIL import Image

# Load and preprocess an image
input_image = Image.open('/home/vcl3d/coco_dataset_VOX_mini/train2014/COCO_train2014_000000000009.jpg')
input_image = input_image.resize((224, 224))  # Resize to match model input size
input_image = np.array(input_image)  # Convert to numpy array
input_image = (input_image / 255.0).astype(np.float32)  # Normalize pixel values
#input_image = np.transpose(input_image, (2, 0, 1))  # Change data layout if needed

In [None]:
from transformers import AutoTokenizer
from onnxruntime import InferenceSession

tokenizer = AutoTokenizer.from_pretrained(model_id)
encoder_session = InferenceSession('/data1/ViTgpt2/onnx/encoder_model.onnx')
decoder_session = InferenceSession('/data1/ViTgpt2/onnx/decoder_model.onnx')
#ONNX Runtime expects NumPy arrays as input
#outputs = session.run(output_names, input_names)


In [21]:
image = Image.open('/home/vcl3d/coco_dataset_VOX_mini/train2014/COCO_train2014_000000000009.jpg')
pixel_values = feature_extractor(images=image, return_tensors="pt").pixel_values
gen_tokens = model.generate(pixel_values)

TypeError: transformers.generation.utils.GenerationMixin.generate() argument after ** must be a mapping, not Tensor

In [22]:
from transformers import AutoImageProcessor, ViTImageProcessor, VisionEncoderDecoderConfig
processor = ViTImageProcessor.from_pretrained("Centaur31/myVitGpt2")
inputs = processor(image, return_tensors="pt")
gen_tokens = model.generate(**inputs)
gen_tokens



tensor([[50256,    17,   383,  9396,    16,   318,  2029,   262,  9396,    18,
         50256]])

In [23]:
outputs = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)
outputs

['2 The bowl1 is above the bowl3']

In [None]:
from transformers import BertConfig, ViTConfig, VisionEncoderDecoderConfig, VisionEncoderDecoderModel, GPT2Config

# Initializing a ViT & BERT style configuration
config_encoder = ViTConfig()
config_decoder = BertConfig()

config = VisionEncoderDecoderConfig.from_encoder_decoder_configs(config_encoder, config_decoder)

# Initializing a ViTBert model (with random weights) from a ViT & bert-base-uncased style configurations
model = VisionEncoderDecoderModel(config=config)

# Accessing the model configuration
config_encoder = model.config.encoder
config_decoder = model.config.decoder
# set decoder config to causal lm
config_decoder.is_decoder = True
config_decoder.add_cross_attention = True

# Saving the model, including its configuration
model.save_pretrained("/data1/ViTgpt2/image-captioning-output")

# loading model and config from pretrained folder
encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("/data1/ViTgpt2/image-captioning-output")
model = VisionEncoderDecoderModel.from_pretrained("/data1/ViTgpt2/image-captioning-output", config=encoder_decoder_config)

In [None]:
from transformers import AutoImageProcessor, AutoTokenizer, ViTImageProcessor
from optimum.onnxruntime import ORTModelForVision2Seq
from PIL import Image
import requests


processor = ViTImageProcessor.from_pretrained("Centaur31/myVitGpt2")
encoder_decoder_config = VisionEncoderDecoderConfig.from_pretrained("/data1/ViTgpt2/image-captioning-output")
tokenizer = AutoTokenizer.from_pretrained("Centaur31/myVitGpt2")
model = VisionEncoderDecoderModel.from_pretrained("/data1/ViTgpt2/image-captioning-output", config=encoder_decoder_config)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(image, return_tensors="pt")

gen_tokens = model.generate(**inputs)
outputs = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

In [None]:
from transformers import AutoImageProcessor, AutoTokenizer
from optimum.onnxruntime import ORTModelForVision2Seq
from optimum.onnxruntime import ORTModelForSeq2SeqLM
from PIL import Image
import requests


processor = ViTImageProcessor.from_pretrained("Centaur31/myVitGpt2")
tokenizer = AutoTokenizer.from_pretrained("Centaur31/myVitGpt2")
model = ORTModelForSeq2SeqLM.from_pretrained("Centaur31/myVitGpt2", export=True)

url = "http://images.cocodataset.org/val2017/000000039769.jpg"
image = Image.open(requests.get(url, stream=True).raw)
inputs = processor(image, return_tensors="pt")

gen_tokens = model.generate(**inputs)
outputs = tokenizer.batch_decode(gen_tokens, skip_special_tokens=True)

In [None]:
encoder_decoder_config

In [None]:
from transformers import GPT2Config, GPT2Model

# Initializing a GPT2 configuration
configuration = GPT2Config()

# Initializing a model (with random weights) from the configuration
model = GPT2Model(configuration)

# Accessing the model configuration
configuration = model.config