37 changes: 23 additions & 14 deletions examples/omni-vlm/clip.cpp
@@ -701,7 +701,7 @@ static ggml_cgraph * clip_image_build_graph(clip_ctx * ctx, const clip_image_f32
 }
 
 // read and create ggml_context containing the tensors and their data
-struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
+struct clip_ctx * clip_model_load(const char * fname, const char * omni_vlm_version, const int verbosity = 1) {
     struct ggml_context * meta = NULL;
 
     struct gguf_init_params params = {
@@ -796,6 +796,15 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     }
 
     clip_ctx * new_clip = new clip_ctx{};
+    if (std::string(omni_vlm_version) == "vlm-81-ocr") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+    } else if (std::string(omni_vlm_version) == "vlm-81-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+    } else if (std::string(omni_vlm_version) == "nano-vlm-instruct") {
+        new_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+    } else {
+        throw std::runtime_error(std::string("error vlm version info: ") + omni_vlm_version);
+    }
 
     // update projector type
     {
@@ -1209,17 +1218,17 @@ struct clip_ctx * clip_model_load(const char * fname, const int verbosity = 1) {
     return new_clip;
 }
 
-void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
-    if (params->omni_vlm_version == "vlm-81-ocr") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
-    } else if (params->omni_vlm_version == "vlm-81-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
-    } else if (params->omni_vlm_version == "nano-vlm-instruct") {
-        ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
-    } else {
-        throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
-    }
-}
+// void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params) {
+//     if (params->omni_vlm_version == "vlm-81-ocr") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_OCR;
+//     } else if (params->omni_vlm_version == "vlm-81-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::VLM_81_INSTRUCT;
+//     } else if (params->omni_vlm_version == "nano-vlm-instruct") {
+//         ctx_clip->omni_vlm_ver_type = omni_vlm_version_type::NANO_VLM_INSTRUCT;
+//     } else {
+//         throw std::runtime_error(std::string("error vlm version info: ") + params->omni_vlm_version);
+//     }
+// }
 
 void clip_add_load_image_size(struct clip_ctx * ctx_clip, struct clip_image_size * load_image_size) {
     ctx_clip->load_image_size = load_image_size;
@@ -2207,13 +2216,13 @@ bool clip_image_batch_encode(clip_ctx * ctx, const int n_threads, const clip_ima
     return true;
 }
 
-bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype) {
+bool clip_model_quantize(const char * fname_inp, const char * fname_out, const int itype, const char* omni_vlm_version) {
     ggml_type type = GGML_TYPE_Q4_1;
 
     assert(itype < GGML_TYPE_COUNT);
     type = static_cast<ggml_type>(itype);
 
-    auto * ctx_clip = clip_model_load(fname_inp, 2);
+    auto * ctx_clip = clip_model_load(fname_inp, omni_vlm_version, 2);
 
     const auto & ctx_src = ctx_clip->ctx_gguf;
     const auto & ctx_data = ctx_clip->ctx_data;
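For context, here is a minimal usage sketch of the updated `clip_model_load` API (not part of the diff; the GGUF path is a placeholder). The new second argument selects the omni-vlm variant, and an unrecognized string now throws `std::runtime_error` at load time.

```cpp
// Sketch only: load an omni-vlm CLIP/projector model with an explicit version
// string. "mmproj.gguf" is a placeholder path, not a file from this PR.
#include "clip.h"

#include <cstdio>
#include <stdexcept>

int main() {
    try {
        // accepted versions: "vlm-81-ocr", "vlm-81-instruct", "nano-vlm-instruct"
        clip_ctx * ctx = clip_model_load("mmproj.gguf", "nano-vlm-instruct", /*verbosity=*/1);
        // ... encode images with clip_image_encode / clip_image_batch_encode ...
        clip_free(ctx);
    } catch (const std::runtime_error & e) {
        fprintf(stderr, "clip_model_load failed: %s\n", e.what());
        return 1;
    }
    return 0;
}
```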
8 changes: 4 additions & 4 deletions examples/omni-vlm/clip.h
@@ -39,11 +39,11 @@ struct clip_image_f32_batch {
     size_t size;
 };
 
-CLIP_API struct clip_ctx * clip_model_load (const char * fname, int verbosity);
+CLIP_API struct clip_ctx * clip_model_load (const char * fname, const char * omni_vlm_version, int verbosity);
 CLIP_API struct clip_ctx * clip_model_load_cpu(const char * fname, int verbosity);
 
-struct gpt_params;
-CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
+// struct gpt_params;
+// CLIP_API void clip_set_omni_vlm_version(struct clip_ctx * ctx_clip, const struct gpt_params * params);
 
 CLIP_API void clip_free(struct clip_ctx * ctx);
 
@@ -86,7 +86,7 @@ CLIP_API struct ggml_tensor * clip_get_newline_tensor(const struct clip_ctx * ct
 CLIP_API bool clip_image_encode (struct clip_ctx * ctx, int n_threads, struct clip_image_f32 * img, float * vec);
 CLIP_API bool clip_image_batch_encode(struct clip_ctx * ctx, int n_threads, const struct clip_image_f32_batch * imgs, float * vec);
 
-CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype);
+CLIP_API bool clip_model_quantize(const char * fname_inp, const char * fname_out, int itype, const char * omni_vlm_version);
 
 CLIP_API int clip_is_minicpmv(const struct clip_ctx * ctx);
 
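Since `clip_model_quantize` reloads the model via `clip_model_load`, it now needs the same version string. A hedged sketch of the updated call, with placeholder file names:

```cpp
// Sketch only: quantize an omni-vlm projector. File names are placeholders;
// itype 2 corresponds to GGML_TYPE_Q4_0 in current ggml.
#include "clip.h"

int main() {
    const int itype = 2;
    const bool ok = clip_model_quantize("mmproj-f16.gguf", "mmproj-q4_0.gguf",
                                        itype, "nano-vlm-instruct");
    return ok ? 0 : 1;
}
```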
4 changes: 2 additions & 2 deletions examples/omni-vlm/omni-vlm-cli.cpp
@@ -219,8 +219,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
     ctx_params.n_ctx = params->n_ctx < 2048 ? 2048 : params->n_ctx; // we need a longer context size to process image embeddings
11 changes: 7 additions & 4 deletions examples/omni-vlm/omni-vlm-wrapper.cpp
@@ -65,8 +65,8 @@ static struct omnivlm_context * omnivlm_init_context(gpt_params * params, llama_
         prompt = "describe the image in detail.";
     }
 
-    auto ctx_clip = clip_model_load(clip_path, /*verbosity=*/ 0);
-    clip_set_omni_vlm_version(ctx_clip, params);
+    auto ctx_clip = clip_model_load(clip_path, params->omni_vlm_version.c_str(), /*verbosity=*/ 0);
+    // clip_set_omni_vlm_version(ctx_clip, params);
 
 
     llama_context_params ctx_params = llama_context_params_from_gpt_params(*params);
@@ -280,7 +280,10 @@ const char* omnivlm_inference(const char *prompt, const char *imag_path) {
 
 void omnivlm_free() {
     if(internal_chars != nullptr) { free(internal_chars); }
-    ctx_omnivlm->model = NULL;
-    omnivlm_free(ctx_omnivlm);
+    if(ctx_omnivlm != nullptr) {
+        // this snippet should never be run!
+        ctx_omnivlm->model = nullptr;
+        omnivlm_free(ctx_omnivlm);
+    }
     llama_free_model(model);
 }
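The null check added to `omnivlm_free` above follows the usual free-once pattern: release a resource at most once and null the pointer so a repeated call is harmless. A small self-contained sketch of the same idea (illustrative only, not code from this PR):

```cpp
#include <cstdlib>

// Free a heap buffer at most once; nulling the pointer makes a second call a no-op.
static void release_once(char ** pbuf) {
    if (pbuf == nullptr || *pbuf == nullptr) {
        return;             // nothing to release, or already released
    }
    std::free(*pbuf);
    *pbuf = nullptr;        // guard against double free / use-after-free
}
```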