Skip to content

fix some ctypes in mtmd_cpp.py and adapt Llava15ChatHandler for vision#10

Closed
flamingrickpat wants to merge 1 commit into JamePeng:main from flamingrickpat:main
Closed

fix some ctypes in mtmd_cpp.py and adapt Llava15ChatHandler for vision#10
flamingrickpat wants to merge 1 commit into JamePeng:main from flamingrickpat:main

Conversation

@flamingrickpat
Copy link
Copy Markdown

  • tested with gemma3 12b q4

I only tested it with this:

from llama_cpp import Llama
from llama_cpp.llama_chat_format import Llava15ChatHandler

# Path to the Gemma 3 12B instruct model in GGUF format (q4_0 quant, per filename).
MODEL_PATH = r"./gemma-3-12b-it-q4_0.gguf"
# Path to the multimodal projector (mmproj) weights used for image input.
MMPROJ_PATH = r"./mmproj-model-f16-12B.gguf"

class Gemma3Handler(Llava15ChatHandler):
    """Vision chat handler for Gemma 3 built on Llava15ChatHandler.

    Supplies a Gemma-style Jinja chat template (``<start_of_turn>`` /
    ``<end_of_turn>`` markers, ``<start_of_image>`` before image URLs) and
    resets the model's token state on every call so the handler can be
    reused across multiple completions.
    """

    DEFAULT_SYSTEM_MESSAGE = "You are a helpful assistant."
    CHAT_FORMAT = (
        "{% for message in messages %}"
        "{% if message['role'] == 'user' %}"
        "<start_of_turn>user\n"
        "{% else %}"
        "<start_of_turn>model\n"
        "{% endif %}"
        "{% if 1 == 1 %}"
        "{% if message['content'] is string %}"
        "{{ message['content'] }}"
        "{% else %}"
        "{% for content in message['content'] %}"
        "{% if content['type'] == 'text' %}"
        "{{ content['text'] }}"
        "{% elif content['type'] == 'image_url' %}"
        "{% if content.image_url is string %}"
        "{{ content.image_url }}"
        "{% else %}"
        "<start_of_image>{{ content.image_url.url }}"
        "{% endif %}"
        "{% endif %}"
        "{% endfor %}"
        "{% endif %}"
        "<end_of_turn>\n"
        "{% endif %}"
        "{% endfor %}"
        "<start_of_turn>model\n"
    )

    def __call__(self, **kwargs):
        """Reset model and handler state, then delegate to the parent handler."""
        model = kwargs['llama']

        # Wipe tokens left over from any previous generation so repeated
        # calls start from a clean context.
        model.reset()
        model.n_tokens = 0

        if hasattr(model, 'input_ids'):
            model.input_ids.fill(0)

        # Drop the cached image embedding so each call re-encodes its images.
        if hasattr(self, '_last_image_embed'):
            self._last_image_embed = None
            self._last_image_hash = None

        if self.verbose:
            n_images = len(self.get_image_urls(kwargs.get('messages', [])))
            print(f"Minimal - Cleared state, processing {n_images} images")

        # Parent class performs the actual prompt rendering and generation.
        return super().__call__(**kwargs)

# Build the model with the vision-capable chat handler attached.
_handler = Gemma3Handler(clip_model_path=MMPROJ_PATH)

llm = Llama(
    model_path=MODEL_PATH,
    chat_handler=_handler,
    n_gpu_layers=-1,
    n_ctx=2048,
)

# One user turn containing both a text question and an image URL.
_messages = [
    {"role": "system", "content": "You are an assistant who perfectly describes images."},
    {
        "role": "user",
        "content": [
            {"type": "text", "text": "What's in this image?"},
            {
                "type": "image_url",
                "image_url": {"url": "https://upload.wikimedia.org/wikipedia/commons/0/0d/20250510-Hannah_Hampton_%28cropped_-_portrait_-_2%29.jpg"},
            },
        ],
    },
]

res = llm.create_chat_completion(messages=_messages)
print(res)

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment

Labels

None yet

Projects

None yet

Development

Successfully merging this pull request may close these issues.

1 participant