## Image Caption Generator 

We are going to use a Transformers model to generate a caption from an image.

### Installation



1.   Transformers
2.   PyTorch
3.   Pillow (PIL), which provides `Image`

To install a package, run `pip install package_name`.

In Colab, PyTorch comes preinstalled, and so does PIL (which provides `Image`). You will only need to install **transformers** from Hugging Face.




In [None]:
pip install transformers

In [2]:
from transformers import VisionEncoderDecoderModel, ViTFeatureExtractor, AutoTokenizer
import torch
from PIL import Image

In [4]:
# Run on the GPU when CUDA is available, otherwise fall back to the CPU.
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

# The captioning model, its image preprocessor and its tokenizer are all
# loaded from the same pretrained checkpoint; the model is moved to `device`
# right after loading.
model = VisionEncoderDecoderModel.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
).to(device)
feature_extractor = ViTFeatureExtractor.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)
tokenizer = AutoTokenizer.from_pretrained(
    "nlpconnect/vit-gpt2-image-captioning"
)



# Generation settings forwarded to `model.generate` for every caption request.
max_length, num_beams = 16, 4
gen_kwargs = dict(max_length=max_length, num_beams=num_beams)
def predict_step(image_paths):
    """Generate one caption per image file.

    Args:
        image_paths: Iterable of image file paths to open with ``Image.open``.

    Returns:
        A list of caption strings with surrounding whitespace stripped, one
        per input path and in the same order. An empty input yields ``[]``
        without invoking the model.
    """
    images = []
    for image_path in image_paths:
        # Image.open keeps the underlying file open until the pixel data is
        # read. convert() loads the data and returns a separate image, so the
        # file can be closed as soon as the `with` block exits.
        with Image.open(image_path) as i_image:
            images.append(i_image.convert(mode="RGB"))

    # Nothing to caption: skip preprocessing and generation entirely.
    if not images:
        return []

    pixel_values = feature_extractor(images=images, return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)

    output_ids = model.generate(pixel_values, **gen_kwargs)

    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return [pred.strip() for pred in preds]
  
# Caption the sample image; the returned list of captions is shown as the cell output.
predict_step(['Sample.jpg'])

Config of the encoder: <class 'transformers.models.vit.modeling_vit.ViTModel'> is overwritten by shared encoder config: ViTConfig {
  "architectures": [
    "ViTModel"
  ],
  "attention_probs_dropout_prob": 0.0,
  "encoder_stride": 16,
  "hidden_act": "gelu",
  "hidden_dropout_prob": 0.0,
  "hidden_size": 768,
  "image_size": 224,
  "initializer_range": 0.02,
  "intermediate_size": 3072,
  "layer_norm_eps": 1e-12,
  "model_type": "vit",
  "num_attention_heads": 12,
  "num_channels": 3,
  "num_hidden_layers": 12,
  "patch_size": 16,
  "pooler_act": "tanh",
  "pooler_output_size": 768,
  "qkv_bias": true,
  "torch_dtype": "float32",
  "transformers_version": "4.51.3"
}

Config of the decoder: <class 'transformers.models.gpt2.modeling_gpt2.GPT2LMHeadModel'> is overwritten by shared decoder config: GPT2Config {
  "activation_function": "gelu_new",
  "add_cross_attention": true,
  "architectures": [
    "GPT2LMHeadModel"
  ],
  "attn_pdrop": 0.1,
  "bos_token_id": 50256,
  "decoder_start_to

['a white swan swimming on top of a body of water']

Xet Storage is enabled for this repo, but the 'hf_xet' package is not installed. Falling back to regular HTTP download. For better performance, install the package with: `pip install huggingface_hub[hf_xet]` or `pip install hf_xet`


# Gradio User Interface for Image and Video Captioning

This section adds a Gradio interface to your project, allowing users to upload images or videos and receive captions generated by the model.

In [None]:
# Install Gradio if not already installed
!pip install gradio

In [None]:
import gradio as gr
import cv2
import numpy as np

def predict_image(image):
    """Generate a caption for a single image array.

    Args:
        image: Image as a numpy array (the ``gr.Image`` input in this file is
            declared with ``type="numpy"``), or ``None`` when no image was
            supplied.

    Returns:
        The generated caption with surrounding whitespace stripped, or the
        message ``"No image provided."`` when ``image`` is ``None``.
    """
    # The UI can invoke this callback before anything has been uploaded;
    # report that in the output box instead of raising inside the handler.
    if image is None:
        return "No image provided."

    # Convert numpy array to PIL Image
    pil_image = Image.fromarray(image)
    if pil_image.mode != "RGB":
        pil_image = pil_image.convert(mode="RGB")

    pixel_values = feature_extractor(images=[pil_image], return_tensors="pt").pixel_values
    pixel_values = pixel_values.to(device)
    output_ids = model.generate(pixel_values, **gen_kwargs)
    preds = tokenizer.batch_decode(output_ids, skip_special_tokens=True)
    return preds[0].strip()

def predict_video(video):
    """Caption a video by captioning a single representative frame.

    The middle frame (by reported frame count) is used; if it cannot be read,
    the first frame is tried instead.

    Args:
        video: Path of the video file to open with ``cv2.VideoCapture``, or
            ``None`` when no video was supplied.

    Returns:
        The caption produced by ``predict_image`` for the chosen frame,
        ``"No video provided."`` when ``video`` is ``None``, or
        ``"Could not read video frame."`` when no frame could be decoded.
    """
    if video is None:
        return "No video provided."

    cap = cv2.VideoCapture(video)
    try:
        frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
        # Clamp to 0 so a non-positive reported count never yields a
        # negative seek position.
        mid_frame = max(frame_count // 2, 0)
        cap.set(cv2.CAP_PROP_POS_FRAMES, mid_frame)
        ret, frame = cap.read()
        if not ret:
            # The reported frame count may not match what can actually be
            # decoded, so retry from the start before giving up.
            cap.set(cv2.CAP_PROP_POS_FRAMES, 0)
            ret, frame = cap.read()
    finally:
        # Always release the capture, even if OpenCV raised above.
        cap.release()

    if not ret:
        return "Could not read video frame."

    # OpenCV decodes frames as BGR; predict_image builds a PIL image from the
    # array, so reorder the channels to RGB first.
    frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
    return predict_image(frame_rgb)

# Look and feel of the page: default theme with a blue primary hue, plus CSS
# overrides for the width of the main container.
_theme = gr.themes.Default(primary_hue="blue")
_css = (
    ".gradio-container {max-width: 1600px !important;}"
    " .gradio-container {width: 100% !important;}"
)

with gr.Blocks(theme=_theme, css=_css) as demo:
    gr.Markdown("# Image & Video Caption Generator")
    gr.Markdown("Upload an image or a video to generate a caption using a HuggingFace transformer model.")

    # Image tab: the uploaded image is passed to predict_image as a numpy array.
    with gr.Tab("Image"):
        image_input = gr.Image(type="numpy", label="Upload Image")
        image_output = gr.Textbox(label="Generated Caption")
        image_btn = gr.Button("Generate Caption")
        image_btn.click(predict_image, image_input, image_output)

    # Video tab: the uploaded video is passed to predict_video.
    with gr.Tab("Video"):
        video_input = gr.Video(label="Upload Video")
        video_output = gr.Textbox(label="Generated Caption")
        video_btn = gr.Button("Generate Caption")
        video_btn.click(predict_video, video_input, video_output)

# Start the web server for the interface.
demo.launch()

Running on local URL:  http://127.0.0.1:7860

To create a public link, set `share=True` in `launch()`.

To create a public link, set `share=True` in `launch()`.




Traceback (most recent call last):
  File "c:\Users\kalim\miniconda3\envs\ML\lib\site-packages\gradio\queueing.py", line 536, in process_events
    response = await route_utils.call_process_api(
  File "c:\Users\kalim\miniconda3\envs\ML\lib\site-packages\gradio\route_utils.py", line 322, in call_process_api
    output = await app.get_blocks().process_api(
  File "c:\Users\kalim\miniconda3\envs\ML\lib\site-packages\gradio\blocks.py", line 1935, in process_api
    result = await self.call_function(
  File "c:\Users\kalim\miniconda3\envs\ML\lib\site-packages\gradio\blocks.py", line 1520, in call_function
    prediction = await anyio.to_thread.run_sync(  # type: ignore
  File "c:\Users\kalim\miniconda3\envs\ML\lib\site-packages\anyio\to_thread.py", line 56, in run_sync
    return await get_async_backend().run_sync_in_worker_thread(
  File "c:\Users\kalim\miniconda3\envs\ML\lib\site-packages\anyio\_backends\_asyncio.py", line 2364, in run_sync_in_worker_thread
    return await future
  File