In [2]:
import requests
import os
from PIL import Image
import io

In [9]:


# Replace with your actual Hugging Face API token
HF_TOKEN = os.environ.get("HF_TOKEN") # It's best to use environment variables for tokens

# The model ID for Magma
# Note: Magma is a multimodal model. Its API might expect specific input formats
# (e.g., image + text, or a specific prompt structure).
# You'll need to refer to the specific model card on Hugging Face Hub for exact input/output.
MODEL_ID = "microsoft/Magma-8B"
API_URL = f"https://api-inference.huggingface.co/models/{MODEL_ID}"

headers = {
    "Authorization": f"Bearer {HF_TOKEN}",
    "Content-Type": "application/json" # For text inputs
    # For image inputs, you might need "Content-Type": "image/jpeg" or similar
}

def query_magma_text(text_input):
    payload = {
        "inputs": text_input
        # Magma often expects a specific prompt format, e.g., for VQA or UI navigation
        # You might need to structure 'inputs' as:
        # "inputs": {"text": "<image_start><image><image_end>\nWhat is in this image?"}
        # or for UI navigation, it might involve image data directly.
    }
    response = requests.post(API_URL, headers=headers, json=payload)
    response.raise_for_status() # Raise an exception for bad status codes
    return response.json()

def query_magma_image_and_text(image_path, text_input):
    # For multimodal models like Magma, you often send binary image data
    # along with text. The exact format depends on the model's API.
    # This is a simplified example; actual Magma API might be more complex.

    # If Magma's API expects image and text in separate parts of the request,
    # you might need to use `files` parameter in requests.post or a different structure.
    # Check the Magma model card on Hugging Face Hub for its specific Inference API usage.

    with open(image_path, "rb") as f:
        image_data = f.read()

    # Some models use a multipart/form-data for image and text
    # Or you might send image as base64 in JSON, but binary is common for HF Inference API
    # The exact structure depends on the model's inference endpoint.
    # For Magma specifically, you might need to combine them within the 'inputs' structure,
    # potentially by embedding the image as a string or a special token.

    # A common pattern for multimodal inference API:
    headers = {
        "Authorization": f"Bearer {HF_TOKEN}",
        # "Content-Type": "application/json" if image is base64 encoded within JSON
        # Or "Content-Type": "multipart/form-data" if sending as files
    }

    # This is an *example* and might not be exactly how Magma's Inference API works for image+text
    # You would need to consult the specific API documentation for Microsoft Magma-8B
    # on Hugging Face Hub.
    # A common pattern involves sending the image bytes directly and the text in headers or a separate field.
    # Example using the `huggingface_hub` InferenceClient:
    from huggingface_hub import InferenceClient

    client = InferenceClient(token=HF_TOKEN)
    # Assuming a structure like this for Magma, based on its common usage patterns
    # This is a general example for multimodal. Magma might have specific processor calls.
    # The prompt format for Magma often involves <image_start><image><image_end> tokens.
    # You'd send the image bytes and the text prompt where the image token is placeholder.
    # The Inference API would then combine them.

    # This specific usage for Magma might involve a custom handler on the endpoint.
    # A common simple pattern for some HF image + text models:
    # response = requests.post(API_URL, headers=headers, data=image_data) # Send image first
    # You might also send JSON for text data in a separate request or merged.

    # For Magma, the Hugging Face Space shows a Gradio demo:
    # https://huggingface.co/spaces/microsoft/Magma-UI
    # This suggests a more integrated approach, typically managed by `transformers` or a custom handler
    # on the Inference Endpoint, not just raw text/image over generic API.
    # If you want to use the *Inference API* with Magma, you'd need to confirm what inputs it expects.
    # Many multimodal models on HF expose an API where you send the image as binary data
    # and the text as a JSON payload, or combine them with a multipart request.

    # Let's assume for simplicity a model that takes image bytes and text in a JSON:
    # This is speculative for Magma's direct Inference API.
    # You'll likely need to check the exact endpoint behavior.
    # A safer bet is to use the `huggingface_hub` library's `InferenceClient`.
    try:
        # This is a generic example for a visual question answering task.
        # Magma's specific API might require a different client method or payload.
        # Refer to https://huggingface.co/docs/huggingface_hub/package_reference/inference_client#huggingface_hub.InferenceClient.image_to_text
        # and the Magma model card on Hugging Face for specifics.
        result = client.image_to_text(image=image_path, prompt=text_input, model=MODEL_ID)
        return result
    except Exception as e:
        print(f"Error calling Inference API for multimodal Magma: {e}")
        # The Magma model often works with a specific processor and model loading
        # in a Python environment, rather than a generic text_to_image or image_to_text API call.
        # It's an agentic model.

        # For Magma specifically, the API might be structured more like this (if it has a public inference API):
        # endpoint = "https://<your-magma-inference-endpoint>"
        # headers = {"Authorization": f"Bearer {HF_TOKEN}"}
        # files = {"image": open(image_path, "rb")}
        # data = {"text": text_input}
        # response = requests.post(endpoint, headers=headers, files=files, data=data)
        # return response.json()
        return {"error": "Could not process Magma request. Check model card for API specifics."}


# Example of how you might call it from your UI backend
# text_result = query_magma_text("What is the capital of France?")
# print(text_result)

# image_text_result = query_magma_image_and_text("path/to/your/image.jpg", "What is visible in this image?")
# print(image_text_result)

In [11]:
text_result = query_magma_text("What is the capital of France?")


HTTPError: 404 Client Error: Not Found for url: https://api-inference.huggingface.co/models/microsoft/Magma-8B