Merged

Changes from all commits
23 commits
22da7bc
add returned string (pure c const char* type) for omni-vlm inference api
Nov 6, 2024
5574bda
Merge pull request #10 from NexaAI/weili/master-release
zhiyuan8 Nov 6, 2024
b24a409
add returned string (const char*) for qwen2 audio
Nov 6, 2024
6a4cf0b
Merge pull request #11 from NexaAI/weili/master-release
zhiyuan8 Nov 6, 2024
5edadff
add returned string type (const char*) for nexa-omni-audio
Nov 7, 2024
20b9f02
Merge pull request #12 from NexaAI/weili/master-release
zhiyuan8 Nov 7, 2024
3dfac78
add returned string type (const char*) for nexa-omni-audio
Nov 7, 2024
df5841b
Merge pull request #13 from NexaAI/weili/master-release
zhiyuan8 Nov 7, 2024
86c2233
add submodule llava for android
Nov 8, 2024
400fc2a
add one more model
Nov 8, 2024
b17684e
add include llava.h
Nov 8, 2024
16c2247
remove redundant omni-vlm-v2/ folder, all omni-vlm examples will be a…
Nov 8, 2024
3d9c63a
remove omni-vlm-v2/
Nov 8, 2024
eb6d546
update README.md
Nov 8, 2024
8c41728
Merge pull request #15 from NexaAI/weili/master-release
zhiyuan8 Nov 8, 2024
d5df536
Merge pull request #14 from NexaAI/teliu/android/dev
zhiyuan8 Nov 8, 2024
ecfe0b4
changed download models and nlen
zhycheng614 Nov 8, 2024
667a6d9
Merge pull request #16 from NexaAI/perry/android-dev
zhycheng614 Nov 8, 2024
d04e354
fix OCR template error.
Nov 9, 2024
21bc833
Merge pull request #17 from NexaAI/weili/master-release
zhiyuan8 Nov 9, 2024
6f0e8c3
Create CODEOWNERS
zhiyuan8 Nov 10, 2024
7cf07df
reset model in every inference step to avoid nonsense output.
Nov 11, 2024
362bdf3
Merge pull request #18 from NexaAI/weili/master-release
zhiyuan8 Nov 11, 2024
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -0,0 +1 @@
@zhiyuan8 @alexchen4ai
6 changes: 6 additions & 0 deletions common/common.cpp
@@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS

if (arg == "--omni-vlm-version") {
CHECK_ARG
params.omni_vlm_version = argv[i];
return true;
}
return false;
}

@@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"layer range to apply the control vector(s) to, start and end inclusive" });
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"});
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
2 changes: 2 additions & 0 deletions common/common.h
@@ -265,6 +265,8 @@ struct gpt_params {
bool spm_infill = false; // suffix/prefix/middle pattern for infill

std::string lora_outfile = "ggml-lora-merged-f16.gguf";

std::string omni_vlm_version = "vlm-81-ocr";
};

void gpt_params_parse_from_env(gpt_params & params);
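The new `omni_vlm_version` field defaults to `vlm-81-ocr` and is filled from `--omni-vlm-version`. As a hedged sketch (not part of this diff), downstream code could validate the value against the three versions listed in the help text; the helper name below is hypothetical:

```cpp
// Hypothetical validation helper (assumption, not in this PR): accept only the
// versions advertised by --omni-vlm-version in gpt_params_print_usage().
#include <algorithm>
#include <array>
#include <string>

static bool omni_vlm_version_is_valid(const std::string & v) {
    static const std::array<const char *, 3> known = {
        "vlm-81-ocr", "vlm-81-instruct", "nano-vlm-instruct"
    };
    return std::any_of(known.begin(), known.end(),
                       [&](const char * k) { return v == k; });
}
```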
Original file line number Diff line number Diff line change
@@ -72,19 +72,14 @@ class MainActivity(

val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
"Llama3.2-1B-Instruct (Q4_0, 735 MB)",
Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"),
File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
"octopus",
Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"),
File(extFilesDir, "octopus-q4_0.gguf")
),
)

4 changes: 3 additions & 1 deletion examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -33,6 +33,7 @@ project("llama-android")

#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
add_subdirectory(../../../../../../examples/llava build-llava)

# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
@@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
llama
common
android
log)
log
llava)
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
#include <unistd.h>
#include "llama.h"
#include "common.h"
#include "llava.h"

// Write C++ code here.
//
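With llava now built as a subdirectory, linked into `llama-android`, and its header included, the JNI layer can work with image embeddings natively. A minimal sketch, assuming a hypothetical `load_image_embed` binding on the Kotlin side; the `llava_image_embed_*` functions come from `examples/llava/llava.h`:

```cpp
// Hedged sketch (not part of this diff): a JNI entry point that turns an image file
// into a llava embedding. The Kotlin-side declaration and the JNI symbol name are
// hypothetical; clip_ctx is forward-declared in llava.h.
#include <jni.h>
#include "llava.h"

extern "C" JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1image_1embed(
        JNIEnv * env, jobject /*thiz*/, jlong clip_ctx_ptr, jstring path, jint n_threads) {
    const char * c_path = env->GetStringUTFChars(path, nullptr);
    llava_image_embed * embed = llava_image_embed_make_with_filename(
            reinterpret_cast<clip_ctx *>(clip_ctx_ptr), n_threads, c_path);
    env->ReleaseStringUTFChars(path, c_path);
    return reinterpret_cast<jlong>(embed);  // release later with llava_image_embed_free()
}
```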
Original file line number Diff line number Diff line change
@@ -36,7 +36,7 @@ class LLamaAndroid {
}
}.asCoroutineDispatcher()

private val nlen: Int = 64
private val nlen: Int = 256

private external fun log_to_android()
private external fun load_model(filename: String): Long
25 changes: 18 additions & 7 deletions examples/nexa-omni-audio/omni.cpp
@@ -23,6 +23,8 @@
// Constants
//

void* internal_chars = nullptr;

static const char *AUDIO_TOKEN = "<|AUDIO|>";

//
@@ -570,7 +572,7 @@ static omni_params get_omni_params_from_context_params(omni_context_params &para
all_params.gpt.n_gpu_layers = params.n_gpu_layers;
all_params.gpt.model = params.model;
all_params.gpt.prompt = params.prompt;

// Initialize whisper params
all_params.whisper.model = params.mmproj;
all_params.whisper.fname_inp = {params.file};
@@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params &params)

void omni_free(struct omni_context *ctx_omni)
{
if(internal_chars != nullptr)
{
free(internal_chars);
}
if (ctx_omni->ctx_whisper)
{
whisper_free(ctx_omni->ctx_whisper);
@@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &para
return embed_proj;
}

void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
int n_past = 0;

@@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
for (int i = 0; i < max_tgt_len; i++)
{
const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0)
break;
if (strstr(tmp, "###"))
break; // Yi-VL behavior
printf("%s", tmp);
// printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>"))
@@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
break; // mistral llava-1.6

fflush(stdout);
response += tmp;
}

llama_sampling_free(ctx_sampling);
printf("\n");
if(internal_chars != nullptr) { free(internal_chars); }
internal_chars = malloc(sizeof(char)*(response.size()+1));
strncpy((char*)(internal_chars), response.c_str(), response.size());
((char*)(internal_chars))[response.size()] = '\0';
return (const char*)(internal_chars);
}

void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
omni_params all_params = get_omni_params_from_context_params(params);

ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
4 changes: 2 additions & 2 deletions examples/nexa-omni-audio/omni.h
@@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &param

OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);

OMNI_AUDIO_API void omni_process_full(
OMNI_AUDIO_API const char* omni_process_full(
struct omni_context *ctx_omni,
omni_context_params &params
);

#ifdef __cplusplus
}
#endif
#endif
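Taken together with the `omni.cpp` change above, callers can now read the generated text directly from the return value; the string lives in the module-level `internal_chars` buffer, so it stays valid only until the next inference call or `omni_free()`. A minimal caller sketch, with placeholder paths and an assumed field layout for `omni_context_params`:

```cpp
// Hedged caller sketch (not part of this PR). Paths are placeholders; writable
// buffers are used in case the struct fields are plain char*.
#include <cstdio>
#include "omni.h"

int main() {
    char model[]  = "llm-f16.gguf";
    char mmproj[] = "whisper-encoder-f16.gguf";
    char wav[]    = "sample.wav";
    char prompt[] = "Transcribe this audio clip.";

    omni_context_params params{};   // value-initialize; only the fields used in omni.cpp are set
    params.model        = model;
    params.mmproj       = mmproj;
    params.file         = wav;
    params.prompt       = prompt;
    params.n_gpu_layers = 0;

    omni_context * ctx = omni_init_context(params);
    const char * text  = omni_process_full(ctx, params);  // now returns the generated string
    if (text) printf("%s\n", text);  // valid until the next call or omni_free()
    omni_free(ctx);                  // also releases the internal string buffer
    return 0;
}
```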
30 changes: 19 additions & 11 deletions examples/omni-vlm/README.md
@@ -1,22 +1,30 @@
# omni-vlm

Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants,
Currently this implementation supports:

After API is confirmed, more models will be supported / uploaded.
* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main))
* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main))
* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main))

After API is stable, more models will be supported.

## Usage
Build with cmake in the `llama-cpp-experiments` folder:
```bash

Build with cmake in the `llama.cpp` folder:

```console
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```

After building, run: `./omni-vlm-cli` to see the usage. For example:

```bash
```console
./omni-vlm-cli \
-m Nano-Llm-494M-F16.gguf \
--mmproj mmproj-omni-vlm-f16.gguf \
--image example/omni-vlm/cat.png
-m <llm-F16.gguf> \
--mmproj <mmproj-F16.gguf> \
--image example/omni-vlm/cat.png \
--omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```

See next section to convert gguf files from original safetensors.
@@ -27,14 +35,15 @@ See next section to convert gguf files from original safetensors.
)

## Omni-vlm gguf conversion

1) First clone omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
```

2) Install the required Python packages:

```sh
```console
pip install -r examples/omni-vlm/requirements.txt
```

@@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run:
python omni_vlm_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--prompt="Describe this image for me" \
--image-path cat.png
--omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```