13 changes: 6 additions & 7 deletions examples/omni-vlm/omni-vlm-cli.cpp
@@ -149,7 +149,7 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
LOG_TEE("%6d -> '%s'\n", tmp[i], llama_token_to_piece(ctx_omnivlm->ctx_llama, tmp[i]).c_str());
}
}
LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
// LOG_TEE("user_prompt: %s\n", user_prompt.c_str());
if (params->verbose_prompt) {
auto tmp = ::llama_tokenize(ctx_omnivlm->ctx_llama, user_prompt, true, true);
for (int i = 0; i < (int) tmp.size(); i++) {
@@ -165,6 +165,9 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima

LOG("\n");

params->sparams.temp = 0.0f;
params->sparams.top_k = 1;
params->sparams.top_p = 1.0f;
struct llama_sampling_context * ctx_sampling = llama_sampling_init(params->sparams);
if (!ctx_sampling) {
LOG_TEE("%s: failed to initialize sampling subsystem\n", __func__);
@@ -177,8 +180,8 @@ static void process_prompt(struct omnivlm_context * ctx_omnivlm, struct omni_ima
response += tmp;
if (strcmp(tmp, "<|im_end|>") == 0) break;
if (strcmp(tmp, "</s>") == 0) break;
// if (strstr(tmp, "###")) break; // Yi-VL behavior
printf("%s", tmp);
// LOG("%s", tmp);
// if (strstr(response.c_str(), "<|im_end|>")) break; // Yi-34B llava-1.6 - for some reason those decode not as the correct token (tokenizer works)
// if (strstr(response.c_str(), "<|im_start|>")) break; // Yi-34B llava-1.6
// if (strstr(response.c_str(), "USER:")) break; // mistral llava-1.6
@@ -265,7 +268,7 @@ int main(int argc, char ** argv) {
}

if (params.omni_vlm_version == "vlm-81-ocr") {
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|ocr_start|><|vision_start|><|image_pad|><|vision_end|><|ocr_end|><|im_end|>";
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n <|vision_start|><|image_pad|><|vision_end|><|im_end|>";
} else if (params.omni_vlm_version == "vlm-81-instruct" || params.omni_vlm_version == "nano-vlm-instruct") {
params.prompt = "<|im_start|>system\nYou are Nano-Omni-VLM, created by Nexa AI. You are a helpful assistant.<|im_end|>\n<|im_start|>user\n" + params.prompt + "\n<|vision_start|><|image_pad|><|vision_end|><|im_end|>";
} else {
@@ -282,10 +285,6 @@ int main(int argc, char ** argv) {

auto * ctx_omnivlm = omnivlm_init_context(&params, model);

// temporarily set to greedy decoding.
params.sparams.top_k = 1;
params.sparams.top_p = 1.0f;

for (auto & image : params.image) {
auto * image_embed = load_image(ctx_omnivlm, &params, image);
if (!image_embed) {
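The omni-vlm-cli.cpp changes consolidate the greedy-decoding setup: instead of patching top_k/top_p in main() after the context is built, process_prompt now pins temp, top_k, and top_p immediately before llama_sampling_init, so the sampler is constructed with the final values, and the stray user_prompt log line is commented out. A minimal standalone sketch (not from the PR) of why these three overrides amount to greedy decoding, namely an argmax over the logits:

// Minimal sketch (not from the PR): with temp = 0, top_k = 1, top_p = 1 the
// sampling chain keeps only the highest-logit candidate, i.e. an argmax.
#include <algorithm>
#include <cstdio>
#include <vector>

int main() {
    std::vector<float> logits = {1.2f, 3.7f, 0.4f, 3.1f}; // hypothetical 4-token vocab
    // top_k = 1 truncates the candidate list to the single best token, and
    // temp = 0 collapses the softmax onto that same token, so the pick is:
    int next_token = (int)(std::max_element(logits.begin(), logits.end()) - logits.begin());
    std::printf("next token id: %d\n", next_token); // prints 1
    return 0;
}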
4 changes: 0 additions & 4 deletions examples/omni-vlm/omni-vlm-wrapper.cpp
@@ -222,8 +222,6 @@ static void print_usage(int argc, char ** argv, const gpt_params & params) {

// inference interface definition
void omnivlm_init(const char* llm_model_path, const char* projector_model_path, const char* omni_vlm_version) {
std::cout << "debug0 " << llm_model_path << std::endl;
std::cout << "debug1 " << omni_vlm_version << std::endl;
const char* argv = "omni-wrapper-py";
char* nc_argv = const_cast<char*>(argv);
if (!gpt_params_parse(1, &nc_argv, params)) {
@@ -235,8 +233,6 @@ void omnivlm_init(const char* llm_model_path, const char* projector_model_path,
params.omni_vlm_version = omni_vlm_version;

std::string omni_vlm_ver = params.omni_vlm_version;
std::cout << "\t\t DEBUG omni_ver" << std::endl;
std::cout << params.omni_vlm_version << std::endl;
if(omni_vlm_ver != "vlm-81-ocr" && omni_vlm_ver != "vlm-81-instruct" && omni_vlm_ver != "nano-vlm-instruct") {
fprintf(stderr, "%s: error: you set wrong omni_vlm_string: %s\n", __func__, omni_vlm_version);
fprintf(stderr, "%s: Valid omni_vlm_version set is ('vlm-81-ocr', 'vlm-81-instruct', 'nano-vlm-instruct')\n", __func__);
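With the debug couts removed, omnivlm_init is left doing two things: parsing a synthetic one-element argv and validating the requested version string. A hypothetical call sketch (the paths are placeholders, not files from this repo):

// Hypothetical usage, assuming only what this diff shows: omnivlm_init
// accepts exactly "vlm-81-ocr", "vlm-81-instruct", or "nano-vlm-instruct";
// any other version string triggers the fprintf error path above.
omnivlm_init("/path/to/llm.gguf", "/path/to/projector.gguf", "vlm-81-instruct");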
105 changes: 0 additions & 105 deletions examples/omni-vlm/omni-vlm.cpp
@@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli

*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
// cout << "\t\t A NICE START" << endl;
// cout << "\t\t" << *n_img_pos << endl;
/*
if (clip_is_minicpmv(ctx_clip)) {
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
struct clip_image_size * load_image_size = clip_image_size_init();
for (size_t i = 0; i < img_res_v.size; i++) {
const int64_t t_img_enc_step_start_us = ggml_time_us();
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
int patch_size=14;
load_image_size->width = img_res_v.data[i].nx;
load_image_size->height = img_res_v.data[i].ny;
clip_add_load_image_size(ctx_clip, load_image_size);
bool encoded = false;
int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
if (has_minicpmv_projector == 2) {
encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
}
else if (has_minicpmv_projector == 3) {
encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
}
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
const int64_t t_img_enc_steop_batch_us = ggml_time_us();
LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

int n_img_pos_out = 0;
for (size_t i = 0; i < image_embd_v.size(); i++) {
std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
n_img_pos_out += clip_n_patches(ctx_clip);
}
*n_img_pos = n_img_pos_out;
for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();
load_image_size->width = img->nx;
load_image_size->height = img->ny;
clip_add_load_image_size(ctx_clip, load_image_size);
LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
}
else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
// flat / default llava-1.5 type embedding
*n_img_pos = clip_n_patches(ctx_clip);
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
delete[] img_res_v.data;
if (!encoded) {
LOG_ERR("Unable to encode image\n");

return false;
}
}
else {
// spatial_unpad llava-1.6 type embedding
// TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
std::vector<float *> image_embd_v;
image_embd_v.resize(img_res_v.size);
for (size_t i = 0; i < img_res_v.size; i++) {
image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
if (!encoded) {
LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
return false;
}
}
const int64_t t_img_enc_batch_us = ggml_time_us();
LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);

const int32_t * image_grid = clip_image_grid(ctx_clip);

std::vector<std::pair<int, int>> grid_pinpoints;
for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
}

// free all img_res_v - not needed anymore
delete[] img_res_v.data;
img_res_v.size = 0;
img_res_v.data = nullptr;

const int32_t image_size = clip_image_size(ctx_clip);

struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);

int n_img_pos_out;
clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
*n_img_pos = n_img_pos_out;

for (size_t i = 0; i < image_embd_v.size(); i++) {
free(image_embd_v[i]);
}
image_embd_v.clear();

// debug image/segment/normalization content:
// clip_image_u8 * tmp = clip_image_u8_init();
// clip_image_convert_f32_to_u8(*image_feature, *tmp);
// clip_image_save_to_bmp(*tmp, "image_feature.bmp");
}
*/

LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);

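The 105 deleted lines were a commented-out copy of llava.cpp's minicpmv and spatial_unpad branches that encode_image_with_clip never executed; removing them leaves only the flat single-image path. Condensed from the surviving context lines (the error check past the end of the hunk is assumed):

// Condensed sketch of the remaining path in encode_image_with_clip.
*n_img_pos = clip_n_patches(ctx_clip); // one embedding slot per image patch
bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
if (!encoded) {
    return false; // assumed: the full function logs an error before returning
}
LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);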