1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -53,6 +53,7 @@ else()
# add_subdirectory(speculative)
# add_subdirectory(tokenize)
add_subdirectory(omni-vlm)
add_subdirectory(omni-vlm-v2)
add_subdirectory(nexa-omni-audio)
add_subdirectory(qwen2-audio)
endif()
50 changes: 50 additions & 0 deletions examples/omni-vlm-v2/CMakeLists.txt
@@ -0,0 +1,50 @@
add_library(omni_vlm_v2 OBJECT
omni-vlm-v2.cpp
omni-vlm-v2.h
clip-v2.cpp
clip-v2.h
)

target_link_libraries(omni_vlm_v2 PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(omni_vlm_v2 PUBLIC .)
target_include_directories(omni_vlm_v2 PUBLIC ../..)
target_include_directories(omni_vlm_v2 PUBLIC ../../common)

target_compile_features(omni_vlm_v2 PRIVATE cxx_std_11)

add_library(omni_vlm_v2_static STATIC $<TARGET_OBJECTS:omni_vlm_v2>)
if (BUILD_SHARED_LIBS)
set_target_properties(omni_vlm_v2 PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(omni_vlm_v2 PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(omni_vlm_v2_shared SHARED $<TARGET_OBJECTS:omni_vlm_v2>)
target_link_libraries(omni_vlm_v2_shared PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_v2_shared LIBRARY)
endif()

if (NOT MSVC)
target_compile_options(omni_vlm_v2 PRIVATE -Wno-cast-qual) # stb_image.h
endif()

if(TARGET BUILD_INFO)
add_dependencies(omni_vlm_v2 BUILD_INFO)
endif()

set(TARGET omni-vlm-v2-cli)
add_executable(${TARGET} omni-vlm-v2-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-v2-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common omni_vlm_v2 ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

#=== for omni-vlm-wrapper
add_library(omni_vlm_v2_wrapper_shared SHARED omni-vlm-v2-wrapper.cpp $<TARGET_OBJECTS:omni_vlm_v2>)
target_link_libraries(omni_vlm_v2_wrapper_shared PRIVATE common ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_v2_wrapper_shared LIBRARY)

# set(TARGET omni-vlm-wrapper-cli)
# add_executable(${TARGET} omni-vlm-wrapper-cli.cpp)
# set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-wrapper-cli)
# install(TARGETS ${TARGET} RUNTIME)
# target_link_libraries(${TARGET} PRIVATE omni_vlm_v2_wrapper_shared ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
110 changes: 110 additions & 0 deletions examples/omni-vlm-v2/README.md
@@ -0,0 +1,110 @@
# omni-vlm

Currently this implementation supports [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr) variants.

Once the API is finalized, more models will be supported and uploaded.

## Usage
Build with cmake in the `llama.cpp` folder:
```bash
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```
After building, run `./omni-vlm-v2-cli` to see the usage. For example:

```bash
./omni-vlm-v2-cli \
-m Nano-Llm-v2-494M-F16.gguf \
--mmproj mmproj-omni-vlm-v2-f16.gguf \
--image examples/omni-vlm-v2/latex.png \
--prompt "Describe this image for me"
```

See the next section for converting the original safetensors weights to GGUF files.

[comment]: # (TODO:
**note**: A lower temperature like 0.1 is recommended for better quality; add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, use the `-ngl` flag as usual.
)
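
A lower temperature and GPU offloading can be combined with the command above. A minimal sketch, assuming `omni-vlm-v2-cli` accepts the standard llama.cpp `--temp` and `-ngl` options (the `-ngl` value is illustrative and requires a GPU-enabled build):

```bash
# sketch: --temp and -ngl are the usual llama.cpp sampling / GPU-offload flags
./omni-vlm-v2-cli \
-m Nano-Llm-v2-494M-F16.gguf \
--mmproj mmproj-omni-vlm-v2-f16.gguf \
--image examples/omni-vlm-v2/latex.png \
--prompt "Describe this image for me" \
--temp 0.1 \
-ngl 99
```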

## Omni-vlm gguf conversion
1) First, clone the omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/vlm-81-ocr
```

2) Install the required Python packages:

```sh
pip install -r examples/omni-vlm/requirements.txt
```

3) Run `omni_vlm_surgery.py`:
```console
python omni_vlm_surgery.py \
--clean-vision-tower \
--model <PATH TO vlm-81-ocr>
```
- You will find an `omni_vlm.projector` and an `omni_vlm.clip` file in the `vlm-81-ocr/` directory.

4) Create a soft link named `pytorch_model.bin` that points to `omni_vlm.clip`:
```bash
# in vlm-81-ocr/ folder
ln -s omni_vlm.clip pytorch_model.bin
```
5) Go back to the `llama.cpp` project folder and create the vision GGUF model:

Clone the `nano-vlm-processor` model directory (you may need to request authorization to access the NexaAIDev space):
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-processor
```

```console
python ./examples/omni-vlm/convert_image_encoder_to_gguf.py \
-m <PATH TO vlm-81-ocr> \
--output-dir <PATH TO vlm-81-ocr> \
-p <PATH TO nano-vlm-processor>
```
- You will get the CLIP vision-model part as `<PATH TO vlm-81-ocr>/mmproj-omni-vlm-f16.gguf`.

6) Then convert the LLM portion to GGUF format:
* Run the Python snippet below to extract the LLM portion from the original omni-vlm model.
```python
from safetensors import safe_open
from safetensors.torch import save_file

# Collect only the language-model tensors, dropping the "language_model." prefix
tensors = {}
with safe_open("<PATH TO vlm-81-ocr>/model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        if k.startswith('language_model'):
            k2 = k.replace('language_model.', '')
            tensors[k2] = f.get_tensor(k)

# Write the extracted LLM weights into the processor directory
save_file(tensors, "<PATH TO nano-vlm-processor>/model.safetensors")
```

```console
python convert_hf_to_gguf.py <PATH TO nano-vlm-processor>
```
Finally, you will get the LLM GGUF model: `<PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf`

7) Now run the C++ version of the omni-vlm demo:
```console
./build/bin/omni-vlm-v2-cli \
-m <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO vlm-81-ocr>/mmproj-omni-vlm-f16.gguf \
--image examples/omni-vlm/cat.png
```
The result is printed on the screen:
> The image depicts a grey and white cat with its head pressed against the camera, appearing as if it is staring directly into the lens. The cat is surrounded by black and white stripes, adding a unique touch to its appearance. The black background creates a strong contrast and highlights the cat's features, making it a captivating scene.

8) Python interface:

After successfully building the `omni_vlm_v2_wrapper_shared` dynamic library, run:
```console
python omni_vlm_v2_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO vlm-81-ocr>/mmproj-omni-vlm-f16.gguf \
--prompt="Describe this image for me" \
--image-path latex.png
```
```
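
If you prefer to call the shared library directly rather than through the demo script, a minimal ctypes sketch could look like the following. The library path and the exported names (`omnivlm_init`, `omnivlm_inference`, `omnivlm_free`) are assumptions for illustration only; check `omni-vlm-v2-wrapper.cpp` for the actual exported interface.

```python
import ctypes

# Assumptions: the build places the library under build/examples/omni-vlm-v2/ and the
# wrapper exports C functions named omnivlm_init / omnivlm_inference / omnivlm_free.
# Verify the real names and signatures in omni-vlm-v2-wrapper.cpp before relying on this.
lib = ctypes.CDLL("./build/examples/omni-vlm-v2/libomni_vlm_v2_wrapper_shared.so")

lib.omnivlm_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p]       # model path, mmproj path
lib.omnivlm_inference.argtypes = [ctypes.c_char_p, ctypes.c_char_p]  # prompt, image path
lib.omnivlm_inference.restype = ctypes.c_char_p
lib.omnivlm_free.argtypes = []

lib.omnivlm_init(b"Nano-Llm-v2-494M-F16.gguf", b"mmproj-omni-vlm-f16.gguf")
print(lib.omnivlm_inference(b"Describe this image for me", b"latex.png").decode())
lib.omnivlm_free()
```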