1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -52,6 +52,7 @@ else()
# add_subdirectory(simple)
# add_subdirectory(speculative)
# add_subdirectory(tokenize)
add_subdirectory(omni-vlm)
add_subdirectory(nexa-omni-audio)
add_subdirectory(qwen2-audio)
endif()
50 changes: 50 additions & 0 deletions examples/omni-vlm/CMakeLists.txt
@@ -0,0 +1,50 @@
add_library(omni_vlm OBJECT
omni-vlm.cpp
omni-vlm.h
clip.cpp
clip.h
)

target_link_libraries(omni_vlm PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(omni_vlm PUBLIC .)
target_include_directories(omni_vlm PUBLIC ../..)
target_include_directories(omni_vlm PUBLIC ../../common)

target_compile_features(omni_vlm PRIVATE cxx_std_11)

add_library(omni_vlm_static STATIC $<TARGET_OBJECTS:omni_vlm>)
if (BUILD_SHARED_LIBS)
set_target_properties(omni_vlm PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(omni_vlm PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(omni_vlm_shared SHARED $<TARGET_OBJECTS:omni_vlm>)
target_link_libraries(omni_vlm_shared PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_shared LIBRARY)
endif()

if (NOT MSVC)
target_compile_options(omni_vlm PRIVATE -Wno-cast-qual) # stb_image.h
endif()

if(TARGET BUILD_INFO)
add_dependencies(omni_vlm BUILD_INFO)
endif()

set(TARGET omni-vlm-cli)
add_executable(${TARGET} omni-vlm-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common omni_vlm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

#=== for omni-vlm-wrapper
add_library(omni_vlm_wrapper_shared SHARED omni-vlm-wrapper.cpp $<TARGET_OBJECTS:omni_vlm>)
target_link_libraries(omni_vlm_wrapper_shared PRIVATE common ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_wrapper_shared LIBRARY)

# set(TARGET omni-vlm-wrapper-cli)
# add_executable(${TARGET} omni-vlm-wrapper-cli.cpp)
# set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-wrapper-cli)
# install(TARGETS ${TARGET} RUNTIME)
# target_link_libraries(${TARGET} PRIVATE omni_vlm_wrapper_shared ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
109 changes: 109 additions & 0 deletions examples/omni-vlm/README.md
@@ -0,0 +1,109 @@
# omni-vlm

Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants.

Once the API is finalized, more models will be supported and uploaded.

## Usage
Build with cmake in the `llama-cpp-experiments` folder:
```bash
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```
After building, run `./omni-vlm-cli` without arguments to see the usage. For example:

```bash
./omni-vlm-cli \
-m Nano-Llm-494M-F16.gguf \
--mmproj mmproj-omni-vlm-f16.gguf \
--image examples/omni-vlm/cat.png
```

See the next section for converting the original safetensors into GGUF files.

[comment]: # (TODO:
**note**: A lower temperature such as 0.1 is recommended for better quality; add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, use the `-ngl` flag as usual.
)

## Omni-vlm gguf conversion
1) First clone the omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
```

2) Install the required Python packages:

```sh
pip install -r examples/omni-vlm/requirements.txt
```

3) Run `omni_vlm_surgery.py`:
```console
python omni_vlm_surgery.py \
--clean-vision-tower \
--model <PATH TO nano-vlm-instruct>
```
- You will find an `omni_vlm.projector` and an `omni_vlm.clip` file in the `nano-vlm-instruct/` directory.

4) Create a symbolic link `pytorch_model.bin` pointing to `omni_vlm.clip`:
```bash
# in nano-vlm-instruct/ folder
ln -s omni_vlm.clip pytorch_model.bin
```
5) Go back to the `llama.cpp` project folder and create the vision gguf model.

First clone the `nano-vlm-processor` model directory (you may need authorization to access the NexaAIDev space):
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-processor
```

```console
python ./examples/omni-vlm/convert_image_encoder_to_gguf.py \
-m <PATH TO nano-vlm-instruct> \
--output-dir <PATH TO nano-vlm-instruct> \
-p <PATH TO nano-vlm-processor>
```
- You will get the pure vision (CLIP) part of the model as `<PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf`; an optional sanity check is sketched below.
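
If you want to confirm the file was written correctly, the `gguf` Python package that ships with llama.cpp (under `gguf-py`) can list its tensors. A minimal sketch, assuming that package is installed:
```python
from gguf import GGUFReader

# Open the freshly generated projector GGUF and print a few tensor names/shapes.
reader = GGUFReader("<PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf")
print(f"{len(reader.tensors)} tensors")
for tensor in reader.tensors[:5]:
    print(tensor.name, tensor.shape)
```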

6) Then convert the LLM portion to gguf format:
* Run the Python snippet below to extract the LLM portion from the original omni-vlm model.
```python
from safetensors import safe_open
from safetensors.torch import save_file

tensors = {}
with safe_open("<PATH TO nano-vlm-instruct>/model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        if k.startswith('language_model'):
            k2 = k.replace('language_model.', '')
            tensors[k2] = f.get_tensor(k)

save_file(tensors, "<PATH TO nano-vlm-processor>/model.safetensors")
```

```console
python convert_hf_to_gguf.py <PATH TO nano-vlm-processor>
```
Finally, you will get the LLM GGUF model: `<PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf`

7) Finally, run the C++ version of the omni-vlm demo:
```console
./build/bin/omni-vlm-cli \
-m <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--image examples/omni-vlm/cat.png
```
The result is printed to the screen:
> The image depicts a grey and white cat with its head pressed against the camera, appearing as if it is staring directly into the lens. The cat is surrounded by black and white stripes, adding a unique touch to its appearance. The black background creates a strong contrast and highlights the cat's features, making it a captivating scene.

8) Python interface:

After successfully building the `omni_vlm_wrapper_shared` dynamic library, run:
```console
python omni_vlm_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--prompt="Describe this image for me" \
--image-path cat.png
```
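
`omni_vlm_demo.py` drives the shared library for you, but you can also load it directly with `ctypes`. The sketch below is illustrative only: the library path and the exported symbols (`omnivlm_init`, `omnivlm_inference`, `omnivlm_free`) are assumptions, so check `omni-vlm-wrapper.cpp` and `omni_vlm_demo.py` for the actual names and signatures.
```python
import ctypes

# Hypothetical library location; adjust to wherever CMake placed the shared object
# (and to .dylib/.dll on macOS/Windows).
lib = ctypes.CDLL("./build/examples/omni-vlm/libomni_vlm_wrapper_shared.so")

# Assumed exported functions -- verify against omni-vlm-wrapper.cpp.
lib.omnivlm_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p]       # model gguf, mmproj gguf
lib.omnivlm_inference.argtypes = [ctypes.c_char_p, ctypes.c_char_p]  # prompt, image path
lib.omnivlm_inference.restype = ctypes.c_char_p
lib.omnivlm_free.argtypes = []

lib.omnivlm_init(b"Nano-Llm-494M-F16.gguf", b"mmproj-omni-vlm-f16.gguf")
print(lib.omnivlm_inference(b"Describe this image for me", b"cat.png").decode())
lib.omnivlm_free()
```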
Binary file added examples/omni-vlm/cat.png