37 changes: 23 additions & 14 deletions examples/omni-vlm/clip.cpp
@@ -701,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }
 
 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -796,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     clip_ctx * new_clip = new clip_ctx{};
+    if (std::string(omni_vlm_version) == "vlm-81-ocr") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+    } else if (std::string(omni_vlm_version) == "vlm-81-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+    } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+    } else {
+        throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version);
+    }
 
     // update projector type
     {
@@ -1209,17 +1218,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
-void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
-    if (params->omni_vlm_version == "vlm-81-ocr") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
-    } else if (params->omni_vlm_version == "vlm-81-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
-    } else if (params->omni_vlm_version == "nano-vlm-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
-    } else {
-        throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
-    }
-}
+// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+//     if (params->omni_vlm_version == "vlm-81-ocr") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+//     } else if (params->omni_vlm_version == "vlm-81-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+//     } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+//     } else {
+//         throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+//     }
+// }
 
 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
     ctx_clip->load_image_size = load_image_size;
@@ -2207,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     return true;
 }
 
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) {
     ggml_type type = GGML_TYPE_Q4_1;
 
     assert(itype < GGML_TYPE_COUNT);
     type = static_cast<ggml_type>(itype);
 
-    auto * ctx_clip = clip_model_load(fname_inp, 2);
+    auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2);
 
     const auto & ctx_src = ctx_clip->ctx_gguf;
     const auto & ctx_data = ctx_clip->ctx_data;
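For context, here is a minimal usage sketch of the updated `clip_model_load` API (not part of the diff; the GGUF path is a placeholder). The new second argument selects the omni-vlm variant, and an unrecognized string now throws `std::runtime_error` at load time.

```cpp
// Sketch only: load an omni-vlm CLIP/projector model with an explicit version
// string. "mmproj.gguf" is a placeholder path, not a file from this PR.
#include "clip.h"

#include <cstdio>
#include <stdexcept>

int main() {
    try {
        // accepted versions: "vlm-81-ocr", "vlm-81-instruct", "nano-vlm-instruct"
        clip_ctx * ctx = clip_model_load("mmproj.gguf", "nano-vlm-instruct", /*verbosity=*/1);
        // ... encode images with clip_image_encode / clip_image_batch_encode ...
        clip_free(ctx);
    } catch (const std::runtime_error & e) {
        fprintf(stderr, "clip_model_load failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
```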
8 changes: 4 additions & 4 deletions examples/omni-vlm/clip.h
@@ -39,11 +39,11 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load (const char * fname, const char * omni_vlm_version, int verbosity);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
 
-struct gpt_params;
-CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+// struct gpt_params;
+// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
@@ -86,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct
 CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 
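Since `clip_model_quantize` reloads the model via `clip_model_load`, it now needs the same version string. A hedged sketch of the updated call, with placeholder file names:

```cpp
// Sketch only: quantize an omni-vlm projector. File names are placeholders;
// itype 2 corresponds to GGML_TYPE_Q4_0 in current ggml.
#include "clip.h"

int main() {
    const int itype = 2;
    const bool ok = clip_model_quantize("mmproj-f16.gguf", "mmproj-q4_0.gguf",
                                        itype, "nano-vlm-instruct");
    return ok ? 0 : 1;
}
```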
4 changes: 2 additions & 2 deletions examples/omni-vlm/omni-vlm-cli.cpp
@@ -219,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
11 changes: 7 additions & 4 deletions examples/omni-vlm/omni-vlm-wrapper.cpp
@@ -65,8 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);
 
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@@ -280,7 +280,10 @@ const char* omnivlm_inference(const char *prompt, const char *imag_path) {
 
 void omnivlm_free() {
     if(internal_chars != nullptr) { free(internal_chars); }
-    ctx_omnivlm->model = NULL;
-    omnivlm_free(ctx_omnivlm);
+    if(ctx_omnivlm != nullptr) {
+        // this snippet should never be run!
+        ctx_omnivlm->model = nullptr;
+        omnivlm_free(ctx_omnivlm);
+    }
     llama_free_model(model);
 }
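The null check added to `omnivlm_free` above follows the usual free-once pattern: release a resource at most once and null the pointer so a repeated call is harmless. A small self-contained sketch of the same idea (illustrative only, not code from this PR):

```cpp
#include <cstdlib>

// Free a heap buffer at most once; nulling the pointer makes a second call a no-op.
static void release_once(char ** pbuf) {
    if (pbuf == nullptr || *pbuf == nullptr) {
        return;             // nothing to release, or already released
    }
    std::free(*pbuf);
    *pbuf = nullptr;        // guard against double free / use-after-free
}
```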