1 change: 1 addition & 0 deletions examples/CMakeLists.txt
@@ -52,6 +52,7 @@ else()
# add_subdirectory(simple)
# add_subdirectory(speculative)
# add_subdirectory(tokenize)
add_subdirectory(omni-vlm)
add_subdirectory(nexa-omni-audio)
add_subdirectory(qwen2-audio)
endif()
50 changes: 50 additions & 0 deletions examples/omni-vlm/CMakeLists.txt
@@ -0,0 +1,50 @@
add_library(omni_vlm OBJECT
omni-vlm.cpp
omni-vlm.h
clip.cpp
clip.h
)

target_link_libraries(omni_vlm PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})

target_include_directories(omni_vlm PUBLIC .)
target_include_directories(omni_vlm PUBLIC ../..)
target_include_directories(omni_vlm PUBLIC ../../common)

target_compile_features(omni_vlm PRIVATE cxx_std_11)

add_library(omni_vlm_static STATIC $<TARGET_OBJECTS:omni_vlm>)
if (BUILD_SHARED_LIBS)
set_target_properties(omni_vlm PROPERTIES POSITION_INDEPENDENT_CODE ON)
target_compile_definitions(omni_vlm PRIVATE LLAMA_SHARED LLAMA_BUILD)
add_library(omni_vlm_shared SHARED $<TARGET_OBJECTS:omni_vlm>)
target_link_libraries(omni_vlm_shared PRIVATE ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_shared LIBRARY)
endif()

if (NOT MSVC)
target_compile_options(omni_vlm PRIVATE -Wno-cast-qual) # stb_image.h
endif()

if(TARGET BUILD_INFO)
add_dependencies(omni_vlm BUILD_INFO)
endif()

set(TARGET omni-vlm-cli)
add_executable(${TARGET} omni-vlm-cli.cpp)
set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-cli)
install(TARGETS ${TARGET} RUNTIME)
target_link_libraries(${TARGET} PRIVATE common omni_vlm ${CMAKE_THREAD_LIBS_INIT})
target_compile_features(${TARGET} PRIVATE cxx_std_11)

#=== for omni-vlm-wrapper
add_library(omni_vlm_wrapper_shared SHARED omni-vlm-wrapper.cpp $<TARGET_OBJECTS:omni_vlm>)
target_link_libraries(omni_vlm_wrapper_shared PRIVATE common ggml_llama llama ${CMAKE_THREAD_LIBS_INIT})
install(TARGETS omni_vlm_wrapper_shared LIBRARY)

# set(TARGET omni-vlm-wrapper-cli)
# add_executable(${TARGET} omni-vlm-wrapper-cli.cpp)
# set_target_properties(${TARGET} PROPERTIES OUTPUT_NAME omni-vlm-wrapper-cli)
# install(TARGETS ${TARGET} RUNTIME)
# target_link_libraries(${TARGET} PRIVATE omni_vlm_wrapper_shared ${CMAKE_THREAD_LIBS_INIT})
# target_compile_features(${TARGET} PRIVATE cxx_std_11)
109 changes: 109 additions & 0 deletions examples/omni-vlm/README.md
@@ -0,0 +1,109 @@
# omni-vlm

Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants.

Once the API is finalized, more models will be supported and uploaded.

## Usage
Build with cmake in the `llama-cpp-experiments` folder:
```bash
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```
After building, run `./omni-vlm-cli` without arguments to see the usage. For example:

```bash
./omni-vlm-cli \
-m Nano-Llm-494M-F16.gguf \
--mmproj mmproj-omni-vlm-f16.gguf \
--image examples/omni-vlm/cat.png
```

See the next section for converting the original safetensors into GGUF files.

[comment]: # (TODO:
**note**: A lower temperature such as 0.1 is recommended for better quality; add `--temp 0.1` to the command to do so.
**note**: For GPU offloading, use the `-ngl` flag as usual.
)

## Omni-vlm gguf conversion
1) First clone the omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
```

2) Install the required Python packages:

```sh
pip install -r examples/omni-vlm/requirements.txt
```

3) Run `omni_vlm_surgery.py`:
```console
python omni_vlm_surgery.py \
--clean-vision-tower \
--model <PATH TO nano-vlm-instruct>
```
- You will find an `omni_vlm.projector` and an `omni_vlm.clip` file in the `nano-vlm-instruct/` directory.

4) Create a symbolic link `pytorch_model.bin` pointing to `omni_vlm.clip`:
```bash
# in nano-vlm-instruct/ folder
ln -s omni_vlm.clip pytorch_model.bin
```
5) Go back to the `llama.cpp` project folder and create the vision gguf model.

First clone the `nano-vlm-processor` model directory (you may need authorization to access the NexaAIDev space):
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-processor
```

```console
python ./examples/omni-vlm/convert_image_encoder_to_gguf.py \
-m <PATH TO nano-vlm-instruct> \
--output-dir <PATH TO nano-vlm-instruct> \
-p <PATH TO nano-vlm-processor>
```
- You will get the pure vision (CLIP) part of the model as `<PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf`; an optional sanity check is sketched below.
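
If you want to confirm the file was written correctly, the `gguf` Python package that ships with llama.cpp (under `gguf-py`) can list its tensors. A minimal sketch, assuming that package is installed:
```python
from gguf import GGUFReader

# Open the freshly generated projector GGUF and print a few tensor names/shapes.
reader = GGUFReader("<PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf")
print(f"{len(reader.tensors)} tensors")
for tensor in reader.tensors[:5]:
    print(tensor.name, tensor.shape)
```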

6) Then convert the LLM portion to gguf format:
* Run the Python snippet below to extract the LLM portion from the original omni-vlm model.
```python
from safetensors import safe_open
from safetensors.torch import save_file

tensors = {}
with safe_open("<PATH TO nano-vlm-instruct>/model.safetensors", framework="pt", device=0) as f:
    for k in f.keys():
        if k.startswith('language_model'):
            k2 = k.replace('language_model.', '')
            tensors[k2] = f.get_tensor(k)

save_file(tensors, "<PATH TO nano-vlm-processor>/model.safetensors")
```

```console
python convert_hf_to_gguf.py <PATH TO nano-vlm-processor>
```
Finally, you will get the LLM GGUF model: `<PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf`

7) Finally, run the C++ version of the omni-vlm demo:
```console
./build/bin/omni-vlm-cli \
-m <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--image examples/omni-vlm/cat.png
```
The result is printed to the screen:
> The image depicts a grey and white cat with its head pressed against the camera, appearing as if it is staring directly into the lens. The cat is surrounded by black and white stripes, adding a unique touch to its appearance. The black background creates a strong contrast and highlights the cat's features, making it a captivating scene.

8) Python interface:

After successfully building the `omni_vlm_wrapper_shared` dynamic library, run:
```console
python omni_vlm_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--prompt="Describe this image for me" \
--image-path cat.png
```
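
`omni_vlm_demo.py` drives the shared library for you, but you can also load it directly with `ctypes`. The sketch below is illustrative only: the library path and the exported symbols (`omnivlm_init`, `omnivlm_inference`, `omnivlm_free`) are assumptions, so check `omni-vlm-wrapper.cpp` and `omni_vlm_demo.py` for the actual names and signatures.
```python
import ctypes

# Hypothetical library location; adjust to wherever CMake placed the shared object
# (and to .dylib/.dll on macOS/Windows).
lib = ctypes.CDLL("./build/examples/omni-vlm/libomni_vlm_wrapper_shared.so")

# Assumed exported functions -- verify against omni-vlm-wrapper.cpp.
lib.omnivlm_init.argtypes = [ctypes.c_char_p, ctypes.c_char_p]       # model gguf, mmproj gguf
lib.omnivlm_inference.argtypes = [ctypes.c_char_p, ctypes.c_char_p]  # prompt, image path
lib.omnivlm_inference.restype = ctypes.c_char_p
lib.omnivlm_free.argtypes = []

lib.omnivlm_init(b"Nano-Llm-494M-F16.gguf", b"mmproj-omni-vlm-f16.gguf")
print(lib.omnivlm_inference(b"Describe this image for me", b"cat.png").decode())
lib.omnivlm_free()
```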
Binary file added examples/omni-vlm/cat.png