JamePeng · flamingrickpat · Jul 30, 2025
diff --git a/llama_cpp/llama_chat_format.py b/llama_cpp/llama_chat_format.py
@@ -2835,15 +2835,12 @@ def _create_bitmap_from_bytes(self, image_bytes: bytes):
 
         with suppress_stdout_stderr(disable=self.verbose):
             # Create bitmap from buffer using helper function
-            bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(
-                self.mtmd_ctx,
-                (ctypes.c_uint8 * len(image_bytes)).from_buffer(bytearray(image_bytes)),
-                len(image_bytes)
-            )
+            n = len(image_bytes)
+            buf = (ctypes.c_ubyte * n).from_buffer_copy(image_bytes)  # makes a copy
 
-            if bitmap is None:
+            bitmap = self._mtmd_cpp.mtmd_helper_bitmap_init_from_buf(self.mtmd_ctx, buf, n)
+            if not bitmap:
                 raise ValueError("Failed to create bitmap from image bytes")
-
             return bitmap
 
     def __call__(
@@ -2965,7 +2962,6 @@ def __call__(
 
                 # Reset llama context
                 llama.reset()
-                llama._ctx.kv_cache_clear()
 
                 # Process each chunk
                 n_past = llama_cpp.llama_pos(0)
@@ -2978,7 +2974,7 @@ def __call__(
 
                     chunk_type = self._mtmd_cpp.mtmd_input_chunk_get_type(chunk)
 
-                    if chunk_type == self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_TEXT:
+                    if chunk_type == self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_TEXT:
                         # Handle text chunk
                         n_tokens_out = ctypes.c_size_t()
                         tokens_ptr = self._mtmd_cpp.mtmd_input_chunk_get_tokens_text(
@@ -2995,7 +2991,7 @@ def __call__(
                                 )
                             llama.eval(tokens)
 
-                    elif chunk_type in [self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.MTMD_INPUT_CHUNK_TYPE_AUDIO]:
+                    elif chunk_type in [self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_IMAGE, self._mtmd_cpp.mtmd_input_chunk_type.MTMD_INPUT_CHUNK_TYPE_AUDIO]:
                         # Handle image/audio chunk using helper
                         chunk_n_tokens = self._mtmd_cpp.mtmd_input_chunk_get_n_tokens(chunk)
 

diff --git a/llama_cpp/mtmd_cpp.py b/llama_cpp/mtmd_cpp.py
@@ -11,6 +11,8 @@
     c_uint32,
     c_float,
     c_void_p,
+    c_size_t,
+    c_ubyte,
     POINTER,
     _Pointer,  # type: ignore
     Structure,
@@ -136,14 +138,13 @@ class mtmd_context_params(Structure):
 def mtmd_default_marker() -> c_char_p:
     ...
 
-
 # MTMD_API struct mtmd_context_params mtmd_context_params_default(void);
 @ctypes_function_mtmd(
     "mtmd_context_params_default",
     [],
-    mtmd_context_params_p_ctypes,
+    mtmd_context_params,  # struct is returned by value, per the C prototype above
 )
-def mtmd_context_params_default() -> mtmd_context_params_p:
+def mtmd_context_params_default() -> mtmd_context_params:
     ...
 
 
@@ -370,13 +371,13 @@ def mtmd_input_chunk_get_type(chunk: mtmd_input_chunk_p) -> c_int32:
 @ctypes_function_mtmd(
     "mtmd_input_chunk_get_tokens_text", [
         mtmd_input_chunk_p_ctypes,
-        POINTER(c_uint),
-    ], c_int32)
+        POINTER(c_size_t),
+    ], POINTER(c_int32))
 def mtmd_input_chunk_get_tokens_text(
     chunk: mtmd_input_chunk_p,
-    n_tokens_output: c_uint,
+    n_tokens_output: "_Pointer[c_size_t]",  # pointer, matching the POINTER(c_size_t) argtype
     /,
-) -> c_int32:
+) -> "_Pointer[c_int32]":  # matches the POINTER(c_int32) restype declared above
     ...
 
 # MTMD_API const mtmd_image_tokens *  mtmd_input_chunk_get_tokens_image(const mtmd_input_chunk * chunk);
@@ -609,13 +610,11 @@ def mtmd_helper_bitmap_init_from_file(ctx: mtmd_context_p, fname: c_char_p) -> m
 # // this function is thread-safe
 # MTMD_API mtmd_bitmap * mtmd_helper_bitmap_init_from_buf(mtmd_context * ctx, const unsigned char * buf, size_t len);
 @ctypes_function_mtmd(
-    "mtmd_helper_bitmap_init_from_buf", [mtmd_context_p_ctypes, c_char_p, c_uint], mtmd_bitmap_p_ctypes)
-def mtmd_helper_bitmap_init_from_buf(
-    ctx: mtmd_context_p,
-    buf: c_char_p,
-    len: c_uint,
-    /,
-) -> mtmd_bitmap_p:
+    "mtmd_helper_bitmap_init_from_buf",
+    [mtmd_context_p_ctypes, POINTER(c_ubyte), c_size_t],
+    mtmd_bitmap_p_ctypes
+)
+def mtmd_helper_bitmap_init_from_buf(ctx, buf, length):
     """
     helper function to construct a mtmd_bitmap from a buffer containing a file
     supported formats: