In [1]:
import os
from dotenv import load_dotenv
import json

load_dotenv()

True

In [2]:
# Mistral API key taken from the process environment; None when the variable is unset.
api_key = os.environ.get("MISTRAL_API_KEY")

In [3]:
import requests

# Chat-completions endpoint that every request cell posts to.
api_url = "https://api.mistral.ai/v1/chat/completions"

# Headers shared by all requests: bearer authentication plus a JSON content type.
headers = dict([
    ("Authorization", "Bearer {}".format(api_key)),
    ("Content-Type", "application/json"),
])

In [4]:
import base64

def encode_image_to_base64(image_path):
    """Return the contents of the file at ``image_path`` as a base64 string.

    The file is opened in binary mode and read in full; the base64-encoded
    bytes are then decoded as UTF-8 text.
    """
    with open(image_path, mode='rb') as handle:
        raw_bytes = handle.read()
    encoded = base64.b64encode(raw_bytes)
    return str(encoded, 'utf-8')

##### 1. Image Captioning (Descriptive Ability)

In [5]:
# Captioning probe: ask what the farmer in Picture1 is doing.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture1.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'What is the farmer doing in the image?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "8cd7bd2a66dc4e05a70ebbf8276a78a7",
    "object": "chat.completion",
    "created": 1742636733,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The farmer is operating a tractor in a wheat field, likely during the harvesting process."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 3392,
        "total_tokens": 3409,
        "completion_tokens": 17
    }
}


In [6]:
# Captioning probe: ask for a detailed description of the background of Picture1.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture1.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Describe the background of the image in detail.'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}}
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "cf7de8799bef4abca237854ac8286d1e",
    "object": "chat.completion",
    "created": 1742636748,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "In the image, a man is seen operating an orange tractor in a field of golden wheat. The tractor, equipped with a red plow, is in the process of harvesting the wheat. The man, dressed in a blue shirt and a white hat, is seated on the tractor, which is positioned in the center of the image. The background of the image is filled with trees under a clear blue sky. The image captures the essence of agricultural life, with the man, the tractor, and the wheat field as the main subjects."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 3393,
        "total_tokens": 3500,
        "completion_tokens": 107
    }
}


##### 2. Visual Question Answering (VQA)

In [7]:
# VQA probe: ask for the colour of the child's raincoat in Picture2.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture2.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'What color is the child\'s raincoat?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))


{
    "id": "be417cf2cb454b5394b0c5536362d150",
    "object": "chat.completion",
    "created": 1742636759,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The child's raincoat is yellow."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2808,
        "total_tokens": 2817,
        "completion_tokens": 9
    }
}


In [8]:
# VQA probe: ask whether the child in Picture2 is alone or near an animal.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture2.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Is the child in the image playing alone or is there an animal nearby?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "bf02cc607d164a169dc743442e1ae390",
    "object": "chat.completion",
    "created": 1742636764,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The child is not playing alone; they are accompanied by two small dogs wearing raincoats."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2813,
        "total_tokens": 2832,
        "completion_tokens": 19
    }
}


##### 3. Object Detection and Recognition

In [10]:
# Object-recognition probe: ask how many distinct objects are on the desk in Picture3.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture3.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'How many distinct objects can you identify on the desk?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))


{
    "id": "b8a08b497b1841778f2042112423b471",
    "object": "chat.completion",
    "created": 1742636784,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "I can identify six distinct objects on the desk: two laptops, a potted plant, a cup with a lid, a spiral-bound notebook, and a pen."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 3150,
        "total_tokens": 3184,
        "completion_tokens": 34
    }
}


In [11]:
# Object-recognition probe: ask whether Picture3 contains any electronic devices.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture3.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Can you detect any electronic devices in the image?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "0c4a54b2f18144e0aa5ddfef051d26b0",
    "object": "chat.completion",
    "created": 1742636789,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "Yes, there are two laptops in the image."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 3149,
        "total_tokens": 3160,
        "completion_tokens": 11
    }
}


##### 4. Scene Understanding & Context Awareness

In [12]:
# Scene-understanding probe: ask whether the car in Picture4 should stop or keep moving.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture4.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message carrying the question text and the image as a base64 data URL.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'text', 'text': 'Should the car stop or continue moving in this image?'},
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "f547bb7048f24fa58f8ae1feb8b44a54",
    "object": "chat.completion",
    "created": 1742636796,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The car should stop, as indicated by the red traffic light in the image."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 3406,
        "total_tokens": 3422,
        "completion_tokens": 16
    }
}


In [13]:
# Scene-understanding probe: ask what the red traffic light in Picture4 means for vehicles.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture4.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message: the image (as a base64 data URL) comes first, then the question.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What does the red traffic light indicate for vehicles?'}
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "382a5767098b4317b2089f9faf859973",
    "object": "chat.completion",
    "created": 1742636827,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The red traffic light indicates that vehicles must stop, as seen with the white car halted at the crosswalk."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 3405,
        "total_tokens": 3427,
        "completion_tokens": 22
    }
}


##### 5. Text-Image Matching (Multimodal Consistency)

In [14]:
# Text-image matching probe: ask which of two candidate captions fits Picture5.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture5.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message: the image (as a base64 data URL) comes first, then the question.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'Which caption better describes the image: (A) \"A woman enjoying a peaceful reading session outdoors\" or (B) \"A woman playing soccer in the park\"?'}
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "d9dd58e093aa4b4092a82a9e2862da7b",
    "object": "chat.completion",
    "created": 1742636839,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The caption that better describes the image is (A) \"A woman enjoying a peaceful reading session outdoors.\" This is evident from the visual content, which shows a woman sitting on a blanket in a park, engrossed in reading a book titled \"En agosto nos vemos\" by Gabriel Garc\u00eda M\u00e1rquez."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2831,
        "total_tokens": 2892,
        "completion_tokens": 61
    }
}


In [16]:
# Text-image matching probe: ask which elements of Picture5 indicate a sunny day.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture5.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message: the image (as a base64 data URL) comes first, then the question.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What elements in this image indicate that it is a sunny day?'}
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "dc892531196c4e259c599a47eaee338b",
    "object": "chat.completion",
    "created": 1742636861,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The image shows bright lighting and clear shadows, particularly under the trees and on the grass, indicating that it is a sunny day."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2811,
        "total_tokens": 2837,
        "completion_tokens": 26
    }
}


##### 6. Optical Character Recognition (OCR) & Text-in-Image Understanding

In [17]:
# OCR probe: ask for the main headline of the newspaper in Picture6.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture6.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message: the image (as a base64 data URL) comes first, then the question.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'What is the main headline of the newspaper?'}
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "6326639089404bd795e4537ac7ac05e8",
    "object": "chat.completion",
    "created": 1742636878,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The main headline of the newspaper is 'MEN WALK ON MOON'."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2828,
        "total_tokens": 2845,
        "completion_tokens": 17
    }
}


In [18]:
# OCR probe: ask whether the text in Picture6 suggests a historical or fictional event.
# A forward slash is used because a backslash before 'P' is an invalid escape
# sequence and is only treated as a path separator on Windows.
image_path = 'images/Picture6.jpg'
image_base64 = encode_image_to_base64(image_path)

# Single user message: the image (as a base64 data URL) comes first, then the question.
data = {
    'model': 'pixtral-12b-2409',
    'messages': [
        {
            'role': 'user',
            'content': [
                {'type': 'image_url', 'image_url': {'url': f'data:image/jpeg;base64,{image_base64}'}},
                {'type': 'text', 'text': 'Does the text suggest a historical or fictional event?'}
            ]
        }
    ]
}

# Post the payload and pretty-print the raw JSON response.
response = requests.post(api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "4652512ae3ae410bb6d397b54dc6e957",
    "object": "chat.completion",
    "created": 1742636885,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The text suggests a historical event as it references the Apollo 11 moon landing, which is a well-documented historical achievement."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2829,
        "total_tokens": 2855,
        "completion_tokens": 26
    }
}


##### 7. Commonsense & Logical Reasoning

In [19]:
# Commonsense probe: ask whether a dog wearing glasses and reading a newspaper is normal.
image_path = "images/Picture7.jpg"
image_base64 = encode_image_to_base64(image_path)

# One user turn: the base64 data-URL image first, the question second.
data = dict(
    model="pixtral-12b-2409",
    messages=[
        dict(
            role="user",
            content=[
                dict(type="image_url", image_url=dict(url=f"data:image/jpeg;base64,{image_base64}")),
                dict(type="text", text="Is it normal for a dog to wear glasses and read a newspaper? Why or why not?"),
            ],
        ),
    ],
)

# Submit the payload, then dump the JSON reply with 4-space indentation.
response = requests.post(url=api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "27dd3b0944974b37b106e4e8c9496008",
    "object": "chat.completion",
    "created": 1742636890,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "No, it is not normal for a dog to wear glasses and read a newspaper. Dogs do not have the physical capability or the cognitive ability to wear glasses or read human text. The image of a dog wearing glasses and reading a newspaper is a playful and humorous representation, likely created for entertainment purposes. It is important to understand that this image is not a realistic depiction of a dog's abilities or behavior."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 4182,
        "total_tokens": 4263,
        "completion_tokens": 81
    }
}


In [20]:
# Commonsense probe: ask what is unusual about the scene in Picture7.
image_path = "images/Picture7.jpg"
image_base64 = encode_image_to_base64(image_path)

# One user turn: the base64 data-URL image first, the question second.
data = dict(
    model="pixtral-12b-2409",
    messages=[
        dict(
            role="user",
            content=[
                dict(type="image_url", image_url=dict(url=f"data:image/jpeg;base64,{image_base64}")),
                dict(type="text", text="What is unusual about this scene?"),
            ],
        ),
    ],
)

# Submit the payload, then dump the JSON reply with 4-space indentation.
response = requests.post(url=api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "b34c6716409e459381ff541c3267fc31",
    "object": "chat.completion",
    "created": 1742636898,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The unusual aspect of this scene is that a dog and a cat are depicted reading newspapers together, which is not a typical behavior for animals. Dogs and cats do not have the ability to read or understand written text, so the image of them reading newspapers is a humorous and whimsical representation. Additionally, the dog is wearing glasses, which further adds to the playful and unrealistic nature of the scene. This image is likely created for entertainment purposes, highlighting the contrast between human activities and animal behaviors."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 4170,
        "total_tokens"

##### 8. Zero-Shot & Few-Shot Learning

In [21]:
# Zero-shot probe: ask what the object in Picture8 is and how it differs from a real cat.
image_path = "images/Picture8.jpg"
image_base64 = encode_image_to_base64(image_path)

# One user turn: the base64 data-URL image first, the question second.
data = dict(
    model="pixtral-12b-2409",
    messages=[
        dict(
            role="user",
            content=[
                dict(type="image_url", image_url=dict(url=f"data:image/jpeg;base64,{image_base64}")),
                dict(type="text", text="What is this object, and how does it differ from a real cat?"),
            ],
        ),
    ],
)

# Submit the payload, then dump the JSON reply with 4-space indentation.
response = requests.post(url=api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "86f434e2513c47ab90e27be35d3da1cf",
    "object": "chat.completion",
    "created": 1742636906,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "This object is a digital illustration of a cat wearing a complex mechanical exoskeleton, resembling a robot. It differs from a real cat in that it has metallic limbs with visible gears and machinery, giving it an artificial, robotic appearance, whereas a real cat would not have such mechanical enhancements."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2358,
        "total_tokens": 2417,
        "completion_tokens": 59
    }
}


In [22]:
# Zero-shot probe: ask the model to guess the possible functions of the robotic cat in Picture8.
image_path = "images/Picture8.jpg"
image_base64 = encode_image_to_base64(image_path)

# One user turn: the base64 data-URL image first, the question second.
data = dict(
    model="pixtral-12b-2409",
    messages=[
        dict(
            role="user",
            content=[
                dict(type="image_url", image_url=dict(url=f"data:image/jpeg;base64,{image_base64}")),
                dict(type="text", text="Based on the image, can you guess the possible functions of this robotic cat?"),
            ],
        ),
    ],
)

# Submit the payload, then dump the JSON reply with 4-space indentation.
response = requests.post(url=api_url, headers=headers, json=data)
print(json.dumps(response.json(), indent=4))

{
    "id": "50b8fe6289254074b3e3a5306582cca0",
    "object": "chat.completion",
    "created": 1742636913,
    "model": "pixtral-12b-2409",
    "choices": [
        {
            "index": 0,
            "message": {
                "role": "assistant",
                "tool_calls": null,
                "content": "The robotic cat appears to have multiple functions, including mobility with its mechanical legs, likely enhanced vision or sensory capabilities with its glowing eyes, and possibly communication or interaction tools indicated by the various mechanical components and antennas on its back. It seems designed for exploration or surveillance in urban environments."
            },
            "finish_reason": "stop"
        }
    ],
    "usage": {
        "prompt_tokens": 2359,
        "total_tokens": 2415,
        "completion_tokens": 56
    }
}
