In [14]:
import os
import json
import pprint
from gradio_client import Client, handle_file
from dotenv import load_dotenv

# Let python-dotenv populate the process environment before the lookup below.
load_dotenv()

# Hugging Face access token; this is None when HUGGINGFACE_TOKEN is not set.
hf_token = os.environ.get("HUGGINGFACE_TOKEN")

In [2]:
# Connect to the hosted Molmo-7B-D Space, authenticating with hf_token.
client = Client(
    "akhaliq/Molmo-7B-D-0924",
    hf_token=hf_token,
)

def encode_image(image_path):
    """Wrap an image path with ``handle_file`` for use in ``client.predict``.

    Args:
        image_path: Location of the image; passed unchanged to
            ``gradio_client.handle_file``.

    Returns:
        The object produced by ``handle_file(image_path)``. The query cells
        in this notebook pass it as the ``image`` argument of
        ``client.predict``.
    """
    file_payload = handle_file(image_path)
    return file_payload

Loaded as API: https://akhaliq-molmo-7b-d-0924.hf.space ✔


##### 1. Image Captioning (Descriptive Ability)


In [15]:
# Captioning probe on Picture1: ask what the farmer is doing.
image_path = 'images/Picture1.jpg'
encoded_image = encode_image(image_path)
text_prompt = "What is the farmer doing in the image?"

# Send the image and the question to the Space's /chatbot endpoint.
result1 = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result1)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.']]


In [9]:
# Captioning probe on Picture1: ask for a detailed description of the background.
image_path = 'images/Picture1.jpg'
encoded_image = encode_image(image_path)
text_prompt = "Describe the background of the image in detail."

# Send the image and the question to the Space's /chatbot endpoint.
result2 = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result2)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 2. Visual Question Answering (VQA)


In [10]:
# VQA probe on Picture2: ask for the colour of the child's raincoat.
image_path = 'images/Picture2.jpg'
encoded_image = encode_image(image_path)
text_prompt = "What color is the child's raincoat?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

In [11]:
# VQA probe on Picture2: ask whether the child is alone or an animal is nearby.
image_path = 'images/Picture2.jpg'
encoded_image = encode_image(image_path)
text_prompt = "Is the child playing alone or is there an animal nearby?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 3. Object Detection and Recognition


In [12]:
# Object-recognition probe on Picture3: count the distinct objects on the desk.
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture3.jpg'
encoded_image = encode_image(image_path)

text_prompt = "How many distinct objects can you identify on the desk?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

In [13]:
# Object-recognition probe on Picture3: look for electronic devices.
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture3.jpg'
encoded_image = encode_image(image_path)

text_prompt = "Can you detect any electronic devices in the image?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 4. Scene Understanding & Context Awareness


In [14]:
# Scene-understanding probe on Picture4: should the car stop or keep moving?
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture4.jpg'
encoded_image = encode_image(image_path)

text_prompt = "Should the car stop or continue moving in this image?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

In [15]:
# Scene-understanding probe on Picture4: meaning of the red traffic light.
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture4.jpg'
encoded_image = encode_image(image_path)

text_prompt = "What does the red traffic light indicate for vehicles?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 5. Text-Image Matching (Multimodal Consistency)


In [16]:
# Text-image matching probe on Picture5: pick the better of two captions.
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture5.jpg'
encoded_image = encode_image(image_path)

# Single-quoted so the embedded double quotes need no escaping; the prompt
# text sent to the model is unchanged.
text_prompt = 'Which caption better describes the image: (A) "A woman enjoying a peaceful reading session outdoors" or (B) "A woman playing soccer in the park"?'

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

In [17]:
# Text-image matching probe on Picture5: which elements suggest a sunny day?
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture5.jpg'
encoded_image = encode_image(image_path)

text_prompt = "What elements in this image indicate that it is a sunny day?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 6. Optical Character Recognition (OCR) & Text-in-Image Understanding


In [18]:
# OCR probe on Picture6: read the newspaper's main headline.
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture6.jpg'
encoded_image = encode_image(image_path)

text_prompt = "What is the main headline of the newspaper?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

In [19]:
# OCR probe on Picture6: does the text describe a historical or fictional event?
# Use a forward slash in the path: a backslash here would form the invalid
# escape sequence '\P' and is not a directory separator outside Windows.
image_path = 'images/Picture6.jpg'
encoded_image = encode_image(image_path)

text_prompt = "Does the text suggest a historical or fictional event?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(
    image=encoded_image,
    text=text_prompt,
    api_name="/chatbot",
)

# NOTE(review): the saved output starts with earlier exchanges, so the endpoint
# appears to return the running chat history. Confirm, and consider printing
# only result[-1] so this cell's answer is visible.
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 7. Commonsense & Logical Reasoning


In [20]:
# Commonsense probe on Picture7: is a dog wearing glasses and reading normal?
image_path = 'images/Picture7.jpg'
encoded_image = encode_image(image_path)
text_prompt = "Is it normal for a dog to wear glasses and read a newspaper? Why or why not?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

In [21]:
# Commonsense probe on Picture7: ask what is unusual about the scene.
image_path = 'images/Picture7.jpg'
encoded_image = encode_image(image_path)
text_prompt = "What is unusual about this scene?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result)

[['What is the farmer doing in the image?',
  'The farmer in the image is harvesting wheat using a New Holland tractor. '
  "He's driving the orange tractor through a vast wheat field, actively "
  'engaged in the process of cutting and gathering the wheat stalks. The '
  'tractor is equipped with a hay rake, which is visible at the front, helping '
  'to gather the cut wheat into neat rows behind the vehicle. This method of '
  'harvesting is known as raking, where the cut wheat is collected and '
  'gathered into rows, making it easier to gather later. The farmer appears '
  'focused on his work, maneuvering the tractor through the field to '
  'efficiently harvest the crop.'],
 ['Describe the background of the image in detail.',
  'The background of the image showcases a serene rural landscape. A line of '
  'trees stretches across the horizon, creating a natural border between the '
  'sky and the field. The trees appear to be a mix of deciduous and evergreen '
  'varieties, adding

##### 8. Zero-Shot & Few-Shot Learning


In [24]:
# Second token and connection. This rebinds `client`, so cells run after this
# one that call `client.predict` use the new connection.
hf_token2 = os.environ.get("HUGGINGFACE_TOKEN2")
client = Client(
    "akhaliq/Molmo-7B-D-0924",
    hf_token=hf_token2,
)

Loaded as API: https://akhaliq-molmo-7b-d-0924.hf.space ✔


In [25]:
# Zero-shot probe on Picture8: identify the object and contrast it with a real cat.
image_path = 'images/Picture8.jpg'
encoded_image = encode_image(image_path)
text_prompt = "What is this object, and how does it differ from a real cat?"

# Send the image and the question to the Space's /chatbot endpoint.
result = client.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result)

[['What is this object, and how does it differ from a real cat?',
  'This object is a robotic cat, which differs from a real cat in several '
  'ways:\n'
  '\n'
  '1. It has a metallic body with visible gears and mechanical parts, '
  'including a gear on its chest.\n'
  '2. The cat has four mechanical legs instead of the normal four legs of a '
  'real cat.\n'
  "3. It has a tail made of metal, unlike a real cat's fur-covered tail.\n"
  "4. The cat's body is covered in metal armor, giving it a futuristic or "
  'steampunk appearance.\n'
  '5. It has a mechanical device on its back, which could be a weapon or some '
  'kind of backpack.\n'
  "6. The cat's face is more stylized and cartoon-like compared to a real "
  "cat's more realistic features.\n"
  '\n'
  'While it maintains the general shape and some features of a cat, such as '
  'its head, ears, and overall form, the robotic cat is clearly an artificial '
  'creation with mechanical and metallic components that give it a distinc

In [27]:
# Third token and a separate connection, kept under its own name `client3`.
hf_token3 = os.environ.get("HUGGINGFACE_TOKEN3")
client3 = Client(
    "akhaliq/Molmo-7B-D-0924",
    hf_token=hf_token3,
)

Loaded as API: https://akhaliq-molmo-7b-d-0924.hf.space ✔


In [29]:
# Zero-/few-shot probe on Picture8: guess the robotic cat's possible functions.
image_path = 'images/Picture8.jpg'
encoded_image = encode_image(image_path)
text_prompt = "Based on the image, can you guess the possible functions of this robotic cat?"

# This query goes through the separate `client3` connection.
result = client3.predict(image=encoded_image, text=text_prompt, api_name="/chatbot")
pprint.pprint(result)

[['Based on the image, can you guess the possible functions of this robotic '
  'cat?',
  "While it's difficult to determine the exact functions without more "
  'information, we can speculate based on the image. This robotic cat appears '
  'to be designed for a combination of surveillance and assistance roles. The '
  'camera on its back suggests it could be used for monitoring its '
  'surroundings or transmitting visual data. The mechanical arms might allow '
  'it to manipulate objects or provide support to humans. The overall design, '
  'blending organic and technological elements, indicates it could be a '
  'prototype for a companion robot that can assist with daily tasks while also '
  'gathering information about its environment. The cat-like appearance might '
  'also serve to make it more approachable and less intimidating in human '
  'interactions.']]
