In [9]:
import os
from dotenv import load_dotenv
import json

# Load environment variables via python-dotenv before any os.getenv lookup.
# NOTE(review): presumably a local .env file defines OPENROUTER_API_KEY — confirm.
load_dotenv()

True

In [10]:
api_key: str | None = os.getenv("OPENROUTER_API_KEY")

In [11]:
import requests

# Chat-completions endpoint that the request cells POST to.
api_url = 'https://openrouter.ai/api/v1/chat/completions'

# Headers shared by every request: bearer-token auth and a JSON content type.
headers = {}
headers['Authorization'] = f'Bearer {api_key}'
headers['Content-Type'] = 'application/json'

In [12]:
import base64

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode, read in full, base64-encoded, and the
    encoded bytes are decoded as UTF-8 text.
    """
    with open(image_path, mode='rb') as handle:
        raw_bytes = handle.read()
    encoded_bytes = base64.b64encode(raw_bytes)
    return encoded_bytes.decode('utf-8')


##### 1. Image Captioning (Descriptive Ability)


In [13]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture1.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'What is the farmer doing in the image?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739032-cPXZOyrdYbiL7TKnoZqx",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739032,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "In the image the farmer is in the fields setting up for the harvest of wheat. It appears the farmer is in in the process of mowing the grain because there is no cut rope visible for reel that is behind the farmer on his tractor to wrap around the crop. You can tell by the length of the straw it is cut but not yet bundled. There appears to be a swather in the image in front of the farmer.  The swather is a type of mowing machine that is propelled by the power take-off on a tractor and is now being used in the harvest of grain such as wheat and, po

In [14]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture1.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Describe the background of the image in detail.'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739041-meSTrvTxBaZHAsuAxdcY",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739041,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The image depicts a man operating a tractor in a field of wheat or barley, with the sun setting in the background. The scene appears to be set during the daytime, likely during harvest season.\n\n*   **Man on Tractor:**\n    *   The man is wearing blue jeans and a light-colored shirt.\n    *   He is sitting on an orange tractor with the words \"Ghazl\" and \"New Holland\" written on it.\n    *   The man is facing forward, holding the steering wheel, and appears to be operating the tractor.\n*   **Tractor:**\n    *   The tractor is bright orange w

##### 2. Visual Question Answering (VQA)


In [15]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture2.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'What color is the child\'s raincoat?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))


{
    "id": "gen-1742739051-EZw28GW2jTOljK7G2dli",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739051,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "There does not appear to be a child in this image. There are two dogs and what appears to be a person wearing a yellow rain slicker or raincoat, but no child.",
                "refusal": null
            }
        }
    ],
    "usage": {
        "prompt_tokens": 20,
        "completion_tokens": 38,
        "total_tokens": 58
    }
}


In [16]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture2.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Is the child in the image playing alone or is there an animal nearby?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739056-zonCtDArZkyjDzQGO2Oy",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739056,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "In the depicted image, a child, clad in a yellow raincoat, appears to be engaging with two dogs that share their attire. The dogs' apparel matches the child's, with one wearing a yellow and the other a black raincoat. Given the similarity in attire, it is likely that a person is playing with both pets. The image captures the child in the middle, leaning forward towards the two outfits and appears to be throwing treats for them to catch. This intimate inter-action points to a symbiotic relationship between the two parties, enabling them to play al

##### 3. Object Detection and Recognition


In [17]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture3.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'How many distinct objects can you identify on the desk?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))


{
    "id": "gen-1742739064-4mx1geILXarVUZQdoBVL",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739064,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The following are the distinct objects that can be identified on the desk:\n\u2022 Laptop\n\u2022 Notebook\n\u2022 Plant\n\u2022 Coffee Cup\n\u2022 Hand and Arm\n\u2022 Calendar\n\u2022 Blind",
                "refusal": null
            }
        }
    ],
    "usage": {
        "prompt_tokens": 22,
        "completion_tokens": 38,
        "total_tokens": 60
    }
}


In [18]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture3.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Can you detect any electronic devices in the image?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# NOTE: the recorded output of this cell is a provider 429 (10 queries/minute
# limit), not a model answer; pause before re-running to get a real result.
# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "error": {
        "message": "Provider returned error",
        "code": 429,
        "metadata": {
            "raw": "{\n  \"id\": \"nnAtGiz-57nCBj-924e842b35fa92f5\",\n  \"error\": {\n    \"message\": \"You have reached the rate limit specific to this model meta-llama/Llama-Vision-Free. The maximum rate limit for this model is 10.0 queries per minute. This limit differs from the general rate limits published at Together AI rate limits documentation (https://docs.together.ai/docs/rate-limits). For inquiries about increasing your model-specific rate limit, please contact our sales team (https://www.together.ai/forms/contact-sales)\",\n    \"type\": \"model_rate_limit\",\n    \"param\": null,\n    \"code\": null\n  }\n}",
            "provider_name": "Together"
        }
    },
    "user_id": "user_2ucOZHts8ueIxK1u5PPW9NFd3g8"
}


##### 4. Scene Understanding & Context Awareness


In [19]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture4.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the question plus the image as a base64 data URL.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Should the car stop or continue moving in this image?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739072-KYPLDcisaiG0CvM4yR65",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739072,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The car should stop moving at the intersection, in front of the crosswalk. The rules of driving clearly state that a driver must yield to pedestrians in a crosswalk and the car in the photo should yield and wait for the pedestrians to clear to the sidewalk. The car is also stopping for a red light which means it was required to stop before the white line, where the crosswalk is. The walk signal indicates that pedestrians can now cross the intersection, and the car should give them the right-of-way. If the car were to continue moving, it could pot

In [20]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture4.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What does the red traffic light indicate for vehicles?'}
            ]
        }
    ]
}

# NOTE: the recorded output of this cell is a provider 429 (10 queries/minute
# limit), not a model answer; pause before re-running to get a real result.
# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "error": {
        "message": "Provider returned error",
        "code": 429,
        "metadata": {
            "raw": "{\n  \"id\": \"nnAtK15-57nCBj-924e846177e392f6\",\n  \"error\": {\n    \"message\": \"You have reached the rate limit specific to this model meta-llama/Llama-Vision-Free. The maximum rate limit for this model is 10.0 queries per minute. This limit differs from the general rate limits published at Together AI rate limits documentation (https://docs.together.ai/docs/rate-limits). For inquiries about increasing your model-specific rate limit, please contact our sales team (https://www.together.ai/forms/contact-sales)\",\n    \"type\": \"model_rate_limit\",\n    \"param\": null,\n    \"code\": null\n  }\n}",
            "provider_name": "Together"
        }
    },
    "user_id": "user_2ucOZHts8ueIxK1u5PPW9NFd3g8"
}


##### 5. Text-Image Matching (Multimodal Consistency)


In [21]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture5.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'Which caption better describes the image: (A) \"A woman enjoying a peaceful reading session outdoors\" or (B) \"A woman playing soccer in the park\"?'}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739080-1cYgzk1FVCNJDMPdAFLE",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739080,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The correct caption for the image is (A) \"A woman enjoying a peaceful reading session outdoors.\"",
                "refusal": null
            }
        }
    ],
    "usage": {
        "prompt_tokens": 44,
        "completion_tokens": 21,
        "total_tokens": 65
    }
}


In [22]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture5.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What elements in this image indicate that it is a sunny day?'}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739087-EW7fuqUYYUko9Zud7SoS",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739087,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The bright, even lighting; shadows cast by the trees in the background; and the happy expression on the woman's face all suggest that the weather is sunny and pleasant.\n\nAlthough this type of lighting could suggest many other settings and circumstances, I am strongly inclined to accept that the most likely cause of these phenomena is sun resting on the earth's surface.",
                "refusal": null
            }
        }
    ],
    "usage": {
        "prompt_tokens": 24,
        "completion_tokens": 71,
        "total_tokens": 95
    }
}


##### 6. Optical Character Recognition (OCR) & Text-in-Image Understanding


In [23]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture6.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What is the main headline of the newspaper?'}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739097-T9W2K3W9hZgYx2ComCbn",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739097,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The main headline of the newspaper is \"MEN WALK ON MOON\".",
                "refusal": null
            }
        }
    ],
    "usage": {
        "prompt_tokens": 20,
        "completion_tokens": 17,
        "total_tokens": 37
    }
}


In [24]:
# Forward slash works on every OS and avoids the unrecognised "\P" escape
# that a backslash path produces inside a normal string literal.
image_path = 'images/Picture6.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'Does the text suggest a historical or fictional event?'}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739104-v2Kcw3yPgX28XRQ06Mxw",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739104,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "The text on the newspaper suggests a historical event, specifically the first moon landing. This is because the headline reads \"MEN WALK ON MOON\" and includes additional details such as the astronauts collecting rocks and planting a flag, which are all consistent with the real-life event of the Apollo 11 mission in 1969. The use of the phrase \"A STRONAUTS LAND ON PLAIN\" also adds to the historical nature of the event, as it is a clear reference to the actual landing site of the Apollo 11 spacecraft on the moon's surface. Overall, the text sug

##### 7. Commonsense & Logical Reasoning


In [25]:
image_path = 'images/Picture7.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'Is it normal for a dog to wear glasses and read a newspaper? Why or why not?'}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739112-jL9JTK1vY8TAUVUhmuv1",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739112,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "In the provided image, the dog is wearing glasses that suit its breed and plopped down as if reading the newspaper, a common practice for dogs, according to the researchers. In the study, which spans nearly 15 years and over 100 participants, respondents often reported their dogs performing common behaviors, like sitting in chairs, lying on beds, or even wearing glasses and televisions. The researchers call one dog's reading glasses \"canonical\" after it appeared in six out of 13 sets of respondents' descriptions, featuring numerous small-text w

In [26]:
image_path = 'images/Picture7.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What is unusual about this scene?'}
            ]
        }
    ]
}

# NOTE: the recorded output of this cell is a provider 429 (10 queries/minute
# limit), not a model answer; pause before re-running to get a real result.
# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "error": {
        "message": "Provider returned error",
        "code": 429,
        "metadata": {
            "raw": "{\n  \"id\": \"nnAtWTY-4pPsy7-924e8559e1b392f9\",\n  \"error\": {\n    \"message\": \"You have reached the rate limit specific to this model meta-llama/Llama-Vision-Free. The maximum rate limit for this model is 10.0 queries per minute. This limit differs from the general rate limits published at Together AI rate limits documentation (https://docs.together.ai/docs/rate-limits). For inquiries about increasing your model-specific rate limit, please contact our sales team (https://www.together.ai/forms/contact-sales)\",\n    \"type\": \"model_rate_limit\",\n    \"param\": null,\n    \"code\": null\n  }\n}",
            "provider_name": "Together"
        }
    },
    "user_id": "user_2ucOZHts8ueIxK1u5PPW9NFd3g8"
}


##### 8. Zero-Shot & Few-Shot Learning


In [27]:
image_path = 'images/Picture8.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What is this object, and how does it differ from a real cat?'}
            ]
        }
    ]
}

# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "id": "gen-1742739119-HXomjr0t7Xwfr9uUyOv3",
    "provider": "Together",
    "model": "meta-llama/llama-3.2-11b-vision-instruct",
    "object": "chat.completion",
    "created": 1742739119,
    "choices": [
        {
            "logprobs": null,
            "finish_reason": "stop",
            "native_finish_reason": "stop",
            "index": 0,
            "message": {
                "role": "assistant",
                "content": "**Key Features of a Mechanical Cat**\n\n* The mechanical cat is a machine designed to mimic a cat's appearance and movements.\n* It lacks the warmth, fur, and other characteristics of a living animal.\n* The mechanical cat's physical body and abilities are artificial, distinct from any natural cat.",
                "refusal": null
            }
        }
    ],
    "usage": {
        "prompt_tokens": 26,
        "completion_tokens": 60,
        "total_tokens": 86
    }
}


In [28]:
image_path = 'images/Picture8.jpg'
image_base64 = encode_image_to_base64(image_path)

# One user message carrying the image as a base64 data URL followed by the question.
data = {
    "model": "meta-llama/llama-3.2-11b-vision-instruct:free",
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'Based on the image, can you guess the possible functions of this robotic cat?'}
            ]
        }
    ]
}

# NOTE: the recorded output of this cell is a provider 429 (10 queries/minute
# limit), not a model answer; pause before re-running to get a real result.
# Bound the wait (seconds) so a stalled connection cannot hang the kernel.
response = requests.post(api_url, headers=headers, json=data, timeout=120)
print(json.dumps(response.json(), indent=4))

{
    "error": {
        "message": "Provider returned error",
        "code": 429,
        "metadata": {
            "raw": "{\n  \"id\": \"nnAtZ8p-57nCBj-924e8585000392f5\",\n  \"error\": {\n    \"message\": \"You have reached the rate limit specific to this model meta-llama/Llama-Vision-Free. The maximum rate limit for this model is 10.0 queries per minute. This limit differs from the general rate limits published at Together AI rate limits documentation (https://docs.together.ai/docs/rate-limits). For inquiries about increasing your model-specific rate limit, please contact our sales team (https://www.together.ai/forms/contact-sales)\",\n    \"type\": \"model_rate_limit\",\n    \"param\": null,\n    \"code\": null\n  }\n}",
            "provider_name": "Together"
        }
    },
    "user_id": "user_2ucOZHts8ueIxK1u5PPW9NFd3g8"
}
