Merged

Changes from all commits
23 commits
22da7bc
add returned string (pure c const char* type) for omni-vlm inference api
Nov 6, 2024
5574bda
Merge pull request #10 from NexaAI/weili/master-release
zhiyuan8 Nov 6, 2024
b24a409
add returned string (const char*) for qwen2 audio
Nov 6, 2024
6a4cf0b
Merge pull request #11 from NexaAI/weili/master-release
zhiyuan8 Nov 6, 2024
5edadff
add returned string type (const char*) for nexa-omni-audio
Nov 7, 2024
20b9f02
Merge pull request #12 from NexaAI/weili/master-release
zhiyuan8 Nov 7, 2024
3dfac78
add returned string type (const char*) for nexa-omni-audio
Nov 7, 2024
df5841b
Merge pull request #13 from NexaAI/weili/master-release
zhiyuan8 Nov 7, 2024
86c2233
add submodule llava for android
Nov 8, 2024
400fc2a
add one more model
Nov 8, 2024
b17684e
add include llava.h
Nov 8, 2024
16c2247
remove redundant omni-vlm-v2/ folder, all omni-vlm examples will be a…
Nov 8, 2024
3d9c63a
remove omni-vlm-v2/
Nov 8, 2024
eb6d546
update README.md
Nov 8, 2024
8c41728
Merge pull request #15 from NexaAI/weili/master-release
zhiyuan8 Nov 8, 2024
d5df536
Merge pull request #14 from NexaAI/teliu/android/dev
zhiyuan8 Nov 8, 2024
ecfe0b4
changed download models and nlen
zhycheng614 Nov 8, 2024
667a6d9
Merge pull request #16 from NexaAI/perry/android-dev
zhycheng614 Nov 8, 2024
d04e354
fix OCR template error.
Nov 9, 2024
21bc833
Merge pull request #17 from NexaAI/weili/master-release
zhiyuan8 Nov 9, 2024
6f0e8c3
Create CODEOWNERS
zhiyuan8 Nov 10, 2024
7cf07df
reset model in every inference step to avoid nonsense output.
Nov 11, 2024
362bdf3
Merge pull request #18 from NexaAI/weili/master-release
zhiyuan8 Nov 11, 2024
1 change: 1 addition & 0 deletions .github/CODEOWNERS
@@ -0,0 +1 @@
@zhiyuan8 @alexchen4ai
6 changes: 6 additions & 0 deletions common/common.cpp
@@ -1442,6 +1442,11 @@ bool gpt_params_find_arg(int argc, char ** argv, const std::string & arg, gpt_pa
// End of Parse args for logging parameters
#endif // LOG_DISABLE_LOGS

if (arg == "--omni-vlm-version") {
CHECK_ARG
params.omni_vlm_version = argv[i];
return true;
}
return false;
}

@@ -1688,6 +1693,7 @@ void gpt_params_print_usage(int /*argc*/, char ** argv, const gpt_params & param
"layer range to apply the control vector(s) to, start and end inclusive" });
options.push_back({ "*", "-m, --model FNAME", "model path (default: models/$filename with filename from --hf-file\n"
"or --model-url if set, otherwise %s)", DEFAULT_MODEL_PATH });
options.push_back({ "*", " --omni-vlm-version VERSION_STRING", "omni vlm string version(one of 'vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n" "(default: 'vlm-81-ocr')"});
options.push_back({ "*", "-md, --model-draft FNAME", "draft model for speculative decoding (default: unused)" });
options.push_back({ "*", "-mu, --model-url MODEL_URL", "model download url (default: unused)" });
options.push_back({ "*", "-hfr, --hf-repo REPO", "Hugging Face model repository (default: unused)" });
2 changes: 2 additions & 0 deletions common/common.h
@@ -265,6 +265,8 @@ struct gpt_params {
bool spm_infill = false; // suffix/prefix/middle pattern for infill

std::string lora_outfile = "ggml-lora-merged-f16.gguf";

std::string omni_vlm_version = "vlm-81-ocr";
};

void gpt_params_parse_from_env(gpt_params & params);
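The new `omni_vlm_version` field defaults to `vlm-81-ocr` and is filled from `--omni-vlm-version`. As a hedged sketch (not part of this diff), downstream code could validate the value against the three versions listed in the help text; the helper name below is hypothetical:

```cpp
// Hypothetical validation helper (assumption, not in this PR): accept only the
// versions advertised by --omni-vlm-version in gpt_params_print_usage().
#include <algorithm>
#include <array>
#include <string>

static bool omni_vlm_version_is_valid(const std::string & v) {
    static const std::array<const char *, 3> known = {
        "vlm-81-ocr", "vlm-81-instruct", "nano-vlm-instruct"
    };
    return std::any_of(known.begin(), known.end(),
                       [&](const char * k) { return v == k; });
}
```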
Original file line number Diff line number Diff line change
@@ -72,19 +72,14 @@ class MainActivity(

val models = listOf(
Downloadable(
"Phi-2 7B (Q4_0, 1.6 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/phi-2/ggml-model-q4_0.gguf?download=true"),
File(extFilesDir, "phi-2-q4_0.gguf"),
"Llama3.2-1B-Instruct (Q4_0, 735 MB)",
Uri.parse("https://public-storage.nexa4ai.com/Llama3.2-1B-Instruct/q4_0.gguf"),
File(extFilesDir, "Llama3.2-1B-Instruct-q4_0.gguf"),
),
Downloadable(
"TinyLlama 1.1B (f16, 2.2 GiB)",
Uri.parse("https://huggingface.co/ggml-org/models/resolve/main/tinyllama-1.1b/ggml-model-f16.gguf?download=true"),
File(extFilesDir, "tinyllama-1.1-f16.gguf"),
),
Downloadable(
"Phi 2 DPO (Q3_K_M, 1.48 GiB)",
Uri.parse("https://huggingface.co/TheBloke/phi-2-dpo-GGUF/resolve/main/phi-2-dpo.Q3_K_M.gguf?download=true"),
File(extFilesDir, "phi-2-dpo.Q3_K_M.gguf")
"octopus",
Uri.parse("https://public-storage.nexa4ai.com/Octopus-v2/q4_0.gguf"),
File(extFilesDir, "octopus-q4_0.gguf")
),
)

4 changes: 3 additions & 1 deletion examples/llama.android/llama/src/main/cpp/CMakeLists.txt
@@ -33,6 +33,7 @@ project("llama-android")

#load local llama.cpp
add_subdirectory(../../../../../../ build-llama)
add_subdirectory(../../../../../../examples/llava build-llava)

# In order to load a library into your app from Java/Kotlin, you must call
# System.loadLibrary() and pass the name of the library defined here;
@@ -50,4 +51,5 @@ target_link_libraries(${CMAKE_PROJECT_NAME}
llama
common
android
log)
log
llava)
Original file line number Diff line number Diff line change
@@ -6,6 +6,7 @@
#include <unistd.h>
#include "llama.h"
#include "common.h"
#include "llava.h"

// Write C++ code here.
//
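With llava now built as a subdirectory, linked into `llama-android`, and its header included, the JNI layer can work with image embeddings natively. A minimal sketch, assuming a hypothetical `load_image_embed` binding on the Kotlin side; the `llava_image_embed_*` functions come from `examples/llava/llava.h`:

```cpp
// Hedged sketch (not part of this diff): a JNI entry point that turns an image file
// into a llava embedding. The Kotlin-side declaration and the JNI symbol name are
// hypothetical; clip_ctx is forward-declared in llava.h.
#include <jni.h>
#include "llava.h"

extern "C" JNIEXPORT jlong JNICALL
Java_android_llama_cpp_LLamaAndroid_load_1image_1embed(
        JNIEnv * env, jobject /*thiz*/, jlong clip_ctx_ptr, jstring path, jint n_threads) {
    const char * c_path = env->GetStringUTFChars(path, nullptr);
    llava_image_embed * embed = llava_image_embed_make_with_filename(
            reinterpret_cast<clip_ctx *>(clip_ctx_ptr), n_threads, c_path);
    env->ReleaseStringUTFChars(path, c_path);
    return reinterpret_cast<jlong>(embed);  // release later with llava_image_embed_free()
}
```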
Original file line number Diff line number Diff line change
@@ -36,7 +36,7 @@ class LLamaAndroid {
}
}.asCoroutineDispatcher()

private val nlen: Int = 64
private val nlen: Int = 256

private external fun log_to_android()
private external fun load_model(filename: String): Long
25 changes: 18 additions & 7 deletions examples/nexa-omni-audio/omni.cpp
@@ -23,6 +23,8 @@
// Constants
//

void* internal_chars = nullptr;

static const char *AUDIO_TOKEN = "<|AUDIO|>";

//
@@ -570,7 +572,7 @@ static omni_params get_omni_params_from_context_params(omni_context_params &para
all_params.gpt.n_gpu_layers = params.n_gpu_layers;
all_params.gpt.model = params.model;
all_params.gpt.prompt = params.prompt;

// Initialize whisper params
all_params.whisper.model = params.mmproj;
all_params.whisper.fname_inp = {params.file};
@@ -703,6 +705,10 @@ struct omni_context *omni_init_context(omni_context_params &params)

void omni_free(struct omni_context *ctx_omni)
{
if(internal_chars != nullptr)
{
free(internal_chars);
}
if (ctx_omni->ctx_whisper)
{
whisper_free(ctx_omni->ctx_whisper);
@@ -792,7 +798,7 @@ ggml_tensor *omni_process_audio(struct omni_context *ctx_omni, omni_params &para
return embed_proj;
}

void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
const char* omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed, omni_params &params, const std::string &prompt)
{
int n_past = 0;

@@ -833,12 +839,11 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
for (int i = 0; i < max_tgt_len; i++)
{
const char * tmp = sample(ctx_sampling, ctx_omni->ctx_llama, &n_past);
response += tmp;
if (strcmp(tmp, "</s>") == 0)
break;
if (strstr(tmp, "###"))
break; // Yi-VL behavior
printf("%s", tmp);
// printf("%s", tmp);
if (strstr(response.c_str(), "<|im_end|>"))
break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
if (strstr(response.c_str(), "<|im_start|>"))
@@ -847,16 +852,22 @@ void omni_process_prompt(struct omni_context *ctx_omni, ggml_tensor *audio_embed
break; // mistral llava-1.6

fflush(stdout);
response += tmp;
}

llama_sampling_free(ctx_sampling);
printf("\n");
if(internal_chars != nullptr) { free(internal_chars); }
internal_chars = malloc(sizeof(char)*(response.size()+1));
strncpy((char*)(internal_chars), response.c_str(), response.size());
((char*)(internal_chars))[response.size()] = '\0';
return (const char*)(internal_chars);
}

void omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
const char* omni_process_full(struct omni_context *ctx_omni, omni_context_params &params)
{
omni_params all_params = get_omni_params_from_context_params(params);

ggml_tensor *audio_embed = omni_process_audio(ctx_omni, all_params);
omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
return omni_process_prompt(ctx_omni, audio_embed, all_params, all_params.gpt.prompt);
}
4 changes: 2 additions & 2 deletions examples/nexa-omni-audio/omni.h
@@ -54,11 +54,11 @@ OMNI_AUDIO_API struct omni_context *omni_init_context(omni_context_params &param

OMNI_AUDIO_API void omni_free(struct omni_context *ctx_omni);

OMNI_AUDIO_API void omni_process_full(
OMNI_AUDIO_API const char* omni_process_full(
struct omni_context *ctx_omni,
omni_context_params &params
);

#ifdef __cplusplus
}
#endif
#endif
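Taken together with the `omni.cpp` change above, callers can now read the generated text directly from the return value; the string lives in the module-level `internal_chars` buffer, so it stays valid only until the next inference call or `omni_free()`. A minimal caller sketch, with placeholder paths and an assumed field layout for `omni_context_params`:

```cpp
// Hedged caller sketch (not part of this PR). Paths are placeholders; writable
// buffers are used in case the struct fields are plain char*.
#include <cstdio>
#include "omni.h"

int main() {
    char model[]  = "llm-f16.gguf";
    char mmproj[] = "whisper-encoder-f16.gguf";
    char wav[]    = "sample.wav";
    char prompt[] = "Transcribe this audio clip.";

    omni_context_params params{};   // value-initialize; only the fields used in omni.cpp are set
    params.model        = model;
    params.mmproj       = mmproj;
    params.file         = wav;
    params.prompt       = prompt;
    params.n_gpu_layers = 0;

    omni_context * ctx = omni_init_context(params);
    const char * text  = omni_process_full(ctx, params);  // now returns the generated string
    if (text) printf("%s\n", text);  // valid until the next call or omni_free()
    omni_free(ctx);                  // also releases the internal string buffer
    return 0;
}
```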
30 changes: 19 additions & 11 deletions examples/omni-vlm/README.md
@@ -1,22 +1,30 @@
# omni-vlm

Currently this implementation supports [omni-vlm](https://huggingface.co/NexaAIDev/nano-vlm-instruct) variants,
Currently this implementation supports:

After API is confirmed, more models will be supported / uploaded.
* [nano-vlm-instruct](https://huggingface.co/NexaAIDev/nano-vlm-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/nano-vlm-instruct-gguf/tree/main))
* [vlm-81-ocr](https://huggingface.co/NexaAIDev/vlm-81-ocr/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-ocr-gguf/tree/main))
* [vlm-81-instruct](https://huggingface.co/NexaAIDev/vlm-81-instruct/tree/main) ([gguf](https://huggingface.co/NexaAIDev/vlm-81-instruct-gguf/tree/main))

After API is stable, more models will be supported.

## Usage
Build with cmake in the `llama-cpp-experiments` folder:
```bash

Build with cmake in the `llama.cpp` folder:

```console
cmake -S . -B build -DCMAKE_BUILD_TYPE=RelWithDebInfo
cmake --build build --verbose -j
```

After building, run: `./omni-vlm-cli` to see the usage. For example:

```bash
```console
./omni-vlm-cli \
-m Nano-Llm-494M-F16.gguf \
--mmproj mmproj-omni-vlm-f16.gguf \
--image example/omni-vlm/cat.png
-m <llm-F16.gguf> \
--mmproj <mmproj-F16.gguf> \
--image example/omni-vlm/cat.png \
--omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```

See next section to convert gguf files from original safetensors.
@@ -27,14 +35,15 @@ See next section to convert gguf files from original safetensors.
)

## Omni-vlm gguf conversion

1) First clone omni-vlm model:
```console
git clone https://huggingface.co/NexaAIDev/nano-vlm-instruct
```

2) Install the required Python packages:

```sh
```console
pip install -r examples/omni-vlm/requirements.txt
```

@@ -104,6 +113,5 @@ After successfully compiling omni_vlm_wrapper_shared dynamic library, run:
python omni_vlm_demo.py \
--model <PATH TO nano-vlm-processor>/Nano-Llm-494M-F16.gguf \
--mmproj <PATH TO nano-vlm-instruct>/mmproj-omni-vlm-f16.gguf \
--prompt="Describe this image for me" \
--image-path cat.png
--omni-vlm-version <vlm-81-ocr | vlm-81-instruct | nano-vlm-instruct>
```