@@ -258,111 +258,6 @@ static bool encode_image_with_clip(clip_ctx * ctx_clip, int n_threads, const cli
 
     *n_img_pos = clip_n_patches(ctx_clip);
     bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd);
-    // cout << "\t\t A NICE START" << endl;
-    // cout << "\t\t" << *n_img_pos << endl;
-    /*
-    if (clip_is_minicpmv(ctx_clip)) {
-        std::vector<float *> image_embd_v;
-        image_embd_v.resize(img_res_v.size);
-        struct clip_image_size * load_image_size = clip_image_size_init();
-        for (size_t i = 0; i < img_res_v.size; i++) {
-            const int64_t t_img_enc_step_start_us = ggml_time_us();
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip));
-            int patch_size=14;
-            load_image_size->width = img_res_v.data[i].nx;
-            load_image_size->height = img_res_v.data[i].ny;
-            clip_add_load_image_size(ctx_clip, load_image_size);
-            bool encoded = false;
-            int has_minicpmv_projector = clip_is_minicpmv(ctx_clip);
-            if (has_minicpmv_projector == 2) {
-                encoded = clip_image_encode(ctx_clip, n_threads, only_v2_5_reshape_by_patch(&img_res_v.data[i], patch_size), image_embd_v[i]);
-            }
-            else if (has_minicpmv_projector == 3) {
-                encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]);
-            }
-            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-                return false;
-            }
-            const int64_t t_img_enc_steop_batch_us = ggml_time_us();
-            LOG_INF("%s: step %d of %d encoded in %8.2f ms\n", __func__, (int)i+1, (int)img_res_v.size, (t_img_enc_steop_batch_us - t_img_enc_step_start_us) / 1000.0);
-        }
-        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: all %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
-        int n_img_pos_out = 0;
-        for (size_t i = 0; i < image_embd_v.size(); i++) {
-            std::memcpy(image_embd + n_img_pos_out * clip_n_mmproj_embd(ctx_clip), image_embd_v[i], clip_embd_nbytes(ctx_clip));
-            n_img_pos_out += clip_n_patches(ctx_clip);
-        }
-        *n_img_pos = n_img_pos_out;
-        for (size_t i = 0; i < image_embd_v.size(); i++) {
-            free(image_embd_v[i]);
-        }
-        image_embd_v.clear();
-        load_image_size->width = img->nx;
-        load_image_size->height = img->ny;
-        clip_add_load_image_size(ctx_clip, load_image_size);
-        LOG_INF("%s: load_image_size %d %d\n", __func__, load_image_size->width, load_image_size->height);
-    }
-    else if (strcmp(mm_patch_merge_type, "spatial_unpad") != 0) {
-        // flat / default llava-1.5 type embedding
-        *n_img_pos = clip_n_patches(ctx_clip);
-        bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[0], image_embd); // image_embd shape is 576 x 4096
-        delete[] img_res_v.data;
-        if (!encoded) {
-            LOG_ERR("Unable to encode image\n");
-
-            return false;
-        }
-    }
-    else {
-        // spatial_unpad llava-1.6 type embedding
-        // TODO: CLIP needs batching support - in HF the llm projection is separate after encoding, which might be a solution to quickly get batching working
-        std::vector<float *> image_embd_v;
-        image_embd_v.resize(img_res_v.size);
-        for (size_t i = 0; i < img_res_v.size; i++) {
-            image_embd_v[i] = (float *)malloc(clip_embd_nbytes(ctx_clip)); // 576 patches * 4096 embeddings * 4 bytes = 9437184
-            const bool encoded = clip_image_encode(ctx_clip, n_threads, &img_res_v.data[i], image_embd_v[i]); // image data is in 3x336x336 format and will be converted to 336x336x3 inside
-            if (!encoded) {
-                LOG_ERR("Unable to encode image - spatial_unpad - subimage %d of %d\n", (int) i+1, (int) img_res_v.size);
-                return false;
-            }
-        }
-        const int64_t t_img_enc_batch_us = ggml_time_us();
-        LOG_INF("%s: %d segments encoded in %8.2f ms\n", __func__, (int)img_res_v.size, (t_img_enc_batch_us - t_img_enc_start_us) / 1000.0);
-
-        const int32_t * image_grid = clip_image_grid(ctx_clip);
-
-        std::vector<std::pair<int, int>> grid_pinpoints;
-        for (int i = 0; i < 32 && image_grid[i] != 0; i += 2) {
-            grid_pinpoints.push_back({image_grid[i], image_grid[i+1]});
-        }
-
-        // free all img_res_v - not needed anymore
-        delete[] img_res_v.data;
-        img_res_v.size = 0;
-        img_res_v.data = nullptr;
-
-        const int32_t image_size = clip_image_size(ctx_clip);
-
-        struct clip_image_grid_shape grid_shape = get_anyres_image_grid_shape({img->nx,img->ny}, grid_pinpoints, image_size);
-
-        int n_img_pos_out;
-        clip_llava_handle_patches(ctx_clip, image_embd_v, grid_shape, image_embd, &n_img_pos_out);
-        *n_img_pos = n_img_pos_out;
-
-        for (size_t i = 0; i < image_embd_v.size(); i++) {
-            free(image_embd_v[i]);
-        }
-        image_embd_v.clear();
-
-        // debug image/segment/normalization content:
-        // clip_image_u8 * tmp = clip_image_u8_init();
-        // clip_image_convert_f32_to_u8(*image_feature, *tmp);
-        // clip_image_save_to_bmp(*tmp, "image_feature.bmp");
-    }
-    */
 
     LOG("%s: image embedding created: %d tokens\n", __func__, *n_img_pos);
 