<a href="https://colab.research.google.com/github/Harshitha73/ImageToText-MultiModals/blob/main/Image_To_Text.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
!pip install --quiet transformers
!pip install --quiet einops

In [None]:
!pip install --quiet bitsandbytes
!pip install --quiet git+https://github.com/huggingface/transformers.git # Install latest version of transformers
!pip install --quiet accelerate

In [None]:
import base64
import html
import io
import json

import cv2
from google.colab import files
from IPython.display import HTML, display
from PIL import Image
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    pipeline,
)


# Load the Moondream model and its tokenizer. Both are fetched at the same
# pinned revision; the model is loaded with trust_remote_code=True, so the
# pinned revision also fixes which repository code gets executed.
moondream_model_id = "vikhyatk/moondream2"
revision = "2024-04-02"
moondream_model = AutoModelForCausalLM.from_pretrained(
    moondream_model_id, trust_remote_code=True, revision=revision
)
moondream_tokenizer = AutoTokenizer.from_pretrained(moondream_model_id, revision=revision)

# Pipeline for "image-to-text" using the Llava model.
# 8-bit quantization (provided by the "bitsandbytes" library) is requested
# through an explicit BitsAndBytesConfig: passing `load_in_8bit=True` as a bare
# model kwarg is deprecated in transformers in favour of `quantization_config`.
llava_quantization_config = BitsAndBytesConfig(load_in_8bit=True)
pipe = pipeline(
    "image-to-text",
    model="llava-hf/llava-1.5-7b-hf",
    model_kwargs={
        "device_map": "auto",
        "quantization_config": llava_quantization_config,
    },
)

def _extract_frame(video_path, frame_number, output_path):
    """Write the ``frame_number``-th frame (1-based) of a video to an image file.

    Args:
        video_path: Path of the video file to read.
        frame_number: 1-based index of the frame to save.
        output_path: Path of the image file the frame is written to.

    Returns:
        True if the frame was read and written; False if the video could not
        be read that far (unopenable file, fewer than ``frame_number`` frames)
        or the image could not be written.
    """
    cap = cv2.VideoCapture(video_path)
    try:
        frame = None
        for _ in range(frame_number):
            ret, frame = cap.read()
            if not ret:
                return False
        return frame is not None and bool(cv2.imwrite(output_path, frame))
    finally:
        # Release the capture even if reading or writing raises.
        cap.release()


def _strip_prompt_echo(generated_text, prompt):
    """Return ``generated_text`` without a leading copy of ``prompt``.

    The ``<image>`` placeholder is removed from ``prompt`` before matching.
    If the text does not start with the prompt, nothing is cut off; only
    surrounding whitespace and stray ``}`` / ``[`` characters are trimmed
    (the latter is kept from the original cleanup step).
    """
    # NOTE(review): assumes the pipeline echoes the prompt (minus the image
    # placeholder) at the start of `generated_text` -- confirm on real output.
    prompt_text = prompt.replace("<image>", "").strip()
    answer = generated_text.strip()
    if prompt_text and answer.startswith(prompt_text):
        answer = answer[len(prompt_text):]
    return answer.strip('}[').strip()


def _build_popup_html(img_src, moondream_text, llava_text):
    """Build the popup markup: the frame followed by both descriptions.

    The descriptions are HTML-escaped because they are model output and may
    contain characters such as ``<`` or ``&``.
    """
    image_display = f'<img src="{img_src}" style="width:50%; height:10%; margin-right:10px;">'
    moondream_display_description = (
        f'<div><h3>Moondream description:</h3><p>{html.escape(moondream_text)}</p></div>'
    )
    llava_display_description = (
        f'<div><h3>Llava description:</h3><p>{html.escape(llava_text)}</p></div>'
    )
    return (
        '<div style="width:100%; overflow:auto;">'
        f'{image_display}{moondream_display_description}{llava_display_description}'
        '</div>'
    )


print("Please upload the video file:")
uploaded = files.upload()

if not uploaded:
    print("No files uploaded. Please try again.")
else:
    # Get the uploaded video file (the first one, assuming only one file is uploaded)
    video_path = next(iter(uploaded))

    # Save the 20th frame of the video as an image
    frame_number = 20
    image_path = 'frame_20.png'

    if not _extract_frame(video_path, frame_number, image_path):
        # Without this check a short or unreadable video would either crash on
        # Image.open or silently describe a stale frame_20.png from an earlier run.
        print(f"Could not read frame {frame_number} from '{video_path}'. "
              "Please upload a valid video with enough frames.")
    else:
        # Load the saved image
        image = Image.open(image_path)

        # Apply the Moondream model to describe the image
        moondream_enc_image = moondream_model.encode_image(image)
        moondream_description = moondream_model.answer_question(
            moondream_enc_image, "Describe the image", moondream_tokenizer
        )

        # Apply the Llava pipeline to describe the same image
        llava_prompt = "<image> Describe the image"
        llava_description = pipe(image, prompt=llava_prompt, generate_kwargs={"max_new_tokens": 200})
        generated_text = llava_description[0]['generated_text']

        # Drop the echoed prompt only when it is really there; the previous
        # fixed-prefix search never matched and cut 34 characters off the answer.
        clean_llava_description = _strip_prompt_echo(generated_text, llava_prompt)

        # Convert the image to a data URL
        buffer = io.BytesIO()
        image.save(buffer, format="PNG")
        img_str = "data:image/png;base64," + base64.b64encode(buffer.getvalue()).decode()

        # Construct the HTML for the 20th frame and the descriptions generated by both models
        popup_content = _build_popup_html(img_str, str(moondream_description), clean_llava_description)

        # Embed the markup as a JSON string literal so quotes, backticks or
        # newlines in it cannot break the script; "<" is written as \u003c so
        # the text can never close the surrounding <script> element.
        popup_js = json.dumps(popup_content).replace("<", "\\u003c")

        # Click on the "Open Popup" button to display the output
        display(HTML(
            '<script>function openPopup(){'
            'var myWindow = window.open("", "Popup", "width=800,height=400"); '
            f'myWindow.document.body.innerHTML = {popup_js};'
            '}</script><button onclick="openPopup()">Open Popup</button>'
        ))