# Demo 01: Application of Image-Text Generation AI
Phase 01 - Just load the pretrained model and test it (no training or fine-tuning happens in this notebook)
Model: `nlpconnect/vit-gpt2-image-captioning` (loaded from the Hugging Face Hub)

In [3]:
# install packages
! pip install transformers

Collecting transformers
  Downloading transformers-4.29.2-py3-none-any.whl (7.1 MB)
Collecting tokenizers!=0.11.3,<0.14,>=0.11.1
  Downloading tokenizers-0.13.3-cp39-cp39-win_amd64.whl (3.5 MB)
Collecting huggingface-hub<1.0,>=0.14.1
  Downloading huggingface_hub-0.14.1-py3-none-any.whl (224 kB)
Installing collected packages: tokenizers, huggingface-hub, transformers
Successfully installed huggingface-hub-0.14.1 tokenizers-0.13.3 transformers-4.29.2


In [1]:
# import packages
import torch
import transformers
from transformers import VisionEncoderDecoderModel, ViTImageProcessor, AutoTokenizer
from PIL import Image

In [2]:
# Prepare everything the captioning pipeline needs.
# The model, the image processor and the tokenizer all come from the same
# checkpoint, so its identifier is written once.
CHECKPOINT = "nlpconnect/vit-gpt2-image-captioning"

model = VisionEncoderDecoderModel.from_pretrained(CHECKPOINT)
feature_extractor = ViTImageProcessor.from_pretrained(CHECKPOINT)
tokenizer = AutoTokenizer.from_pretrained(CHECKPOINT)

# Use the GPU when CUDA is available, otherwise stay on the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# Kept as the last expression so the notebook displays the module tree.
model.to(device)

VisionEncoderDecoderModel(
  (encoder): ViTModel(
    (embeddings): ViTEmbeddings(
      (patch_embeddings): ViTPatchEmbeddings(
        (projection): Conv2d(3, 768, kernel_size=(16, 16), stride=(16, 16))
      )
      (dropout): Dropout(p=0.0, inplace=False)
    )
    (encoder): ViTEncoder(
      (layer): ModuleList(
        (0): ViTLayer(
          (attention): ViTAttention(
            (attention): ViTSelfAttention(
              (query): Linear(in_features=768, out_features=768, bias=True)
              (key): Linear(in_features=768, out_features=768, bias=True)
              (value): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
            (output): ViTSelfOutput(
              (dense): Linear(in_features=768, out_features=768, bias=True)
              (dropout): Dropout(p=0.0, inplace=False)
            )
          )
          (intermediate): ViTIntermediate(
            (dense): Linear(in_features=768, 

In [3]:
# Proof of concept: write the model to disk so a later cell can reload it.
# Passing the module object itself serializes the whole model object
# rather than only a state_dict of its weights.
torch.save(obj=model, f="vit-gpt-model.pth")

In [7]:
# Reload the fully pickled model written by the save cell.
#
# map_location=device places the tensors on the device selected in the setup
# cell. Without it, a checkpoint saved from a CUDA session cannot be loaded on
# a CPU-only machine, and the model could end up on a different device than
# the pixel values that predict_step moves to `device`.
#
# SECURITY: torch.load unpickles arbitrary Python objects -- only load files
# you created yourself.
# NOTE: starting with PyTorch 2.6 torch.load defaults to weights_only=True,
# which rejects fully pickled modules; on those versions pass
# weights_only=False for this (trusted, locally created) file.
modelX = torch.load("vit-gpt-model.pth", map_location=device)

In [5]:
# Show the configuration of the loaded image processor.
print(feature_extractor)

ViTImageProcessor {
  "do_normalize": true,
  "do_rescale": true,
  "do_resize": true,
  "feature_extractor_type": "ViTFeatureExtractor",
  "image_mean": [
    0.5,
    0.5,
    0.5
  ],
  "image_processor_type": "ViTImageProcessor",
  "image_std": [
    0.5,
    0.5,
    0.5
  ],
  "resample": 2,
  "rescale_factor": 0.00392156862745098,
  "size": {
    "height": 224,
    "width": 224
  }
}



In [15]:
# Load the feature-extractor (image processor) settings into a plain dict.
import json
import os

PREPROCESSOR_DIR = "vit-gpt-model"
PREPROCESSOR_CONFIG = "vit-gpt-model/preprocessor_config.json"

# No visible cell in this notebook writes the config file, so on a fresh
# "Restart & Run All" it may not exist yet. Create it from the in-memory
# image processor in that case. An existing file is read as-is and may have
# been produced by a different library version than the one installed here.
if not os.path.isfile(PREPROCESSOR_CONFIG):
    feature_extractor.save_pretrained(PREPROCESSOR_DIR)

# Explicit encoding: JSON is UTF-8, while open() otherwise uses the platform default.
with open(PREPROCESSOR_CONFIG, encoding="utf-8") as f:
    feature_extractorX = json.load(f)

# print the contents of this json file
print(feature_extractorX)

{'do_normalize': True, 'do_rescale': True, 'do_resize': True, 'image_mean': [0.5, 0.5, 0.5], 'image_processor_type': 'ViTFeatureExtractor', 'image_std': [0.5, 0.5, 0.5], 'resample': 2, 'rescale_factor': 0.00392156862745098, 'size': {'height': 224, 'width': 224}}


In [6]:
# Show the tokenizer's settings (vocabulary size, special tokens, padding side, ...).
print(tokenizer)

GPT2TokenizerFast(name_or_path='nlpconnect/vit-gpt2-image-captioning', vocab_size=50257, model_max_length=1024, is_fast=True, padding_side='right', truncation_side='right', special_tokens={'bos_token': '<|endoftext|>', 'eos_token': '<|endoftext|>', 'unk_token': '<|endoftext|>', 'pad_token': '<|endoftext|>'}, clean_up_tokenization_spaces=True)


In [1]:
# Generation settings; predict_step forwards them to `generate` as keyword
# arguments via `gen_kwargs`.
max_length = 16  # forwarded as generate's `max_length`
num_beams = 4    # forwarded as generate's `num_beams`
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)

def predict_step(image_paths):
    """Generate one caption for each image file.

    Relies on the notebook-level names ``feature_extractor``, ``modelX``,
    ``tokenizer``, ``device`` and ``gen_kwargs`` defined in other cells.

    Args:
        image_paths: Iterable of image locations accepted by ``Image.open``.

    Returns:
        A list of captions (leading/trailing whitespace stripped), one per
        input image and in the same order. An empty input yields ``[]``
        without invoking the model.
    """
    # Materialize once so generators work and emptiness can be checked.
    image_paths = list(image_paths)
    if not image_paths:
        return []

    images = []
    for image_path in image_paths:
        # The context manager closes the underlying file handle; convert()
        # returns a fully loaded image (a copy when the mode is already RGB),
        # so the result stays usable after the file is closed.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    # Inputs must live on the same device as the model.
    pixel_values = pixel_values.to(device)

    output_ids = modelX.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]

In [19]:
# Caption a single sample image.
# NOTE(review): `Image` is already imported in the imports cell near the top of
# the notebook; this re-import is redundant.
from PIL import Image

predict_step(["images/airport.jpg"])

['airplanes parked on the tarmac at an airport']

In [20]:
import os


def list_image_paths(image_dir="images"):
    """List the .jpg/.png files directly inside ``image_dir``.

    Args:
        image_dir: Directory to scan; a relative path is resolved against the
            current working directory.

    Returns:
        A list of ``"<image_dir>/<filename>"`` strings, sorted by filename so
        the order does not depend on the file system. The extension check is
        case-insensitive. Returns ``[]`` when the directory does not exist.
    """
    if not os.path.isdir(image_dir):
        return []
    return [
        f"{image_dir}/{name}"
        for name in sorted(os.listdir(image_dir))
        if name.lower().endswith((".jpg", ".png"))
    ]


# Absolute form of the same directory; kept in case a later cell refers to it.
static_dir = os.path.join(os.getcwd(), "images/")

# Relative paths ("images/<file>") of every image to caption.
image_urls = list_image_paths("images")

In [13]:
# Show the image paths collected from the images/ directory.
print(image_urls)

['images/airport.jpg', 'images/jiraffes.jpg', 'images/messi.jpg', 'images/surfing.jpg']


In [21]:
# Caption every image path collected in `image_urls` in a single batch.
# NOTE(review): `Image` is already imported in the imports cell near the top of
# the notebook; this re-import is redundant.
from PIL import Image

predict_step(image_urls)

['airplanes parked on the tarmac at an airport',
 'a herd of giraffe standing on top of a lush green field',
 'a man kicking a soccer ball on a field',
 'a man riding a wave on top of a surfboard']