@@ -886,16 +886,29 @@ static int update_second_degree_neighbors(MHNSW_Context *ctx, TABLE *graph,
886
886
}
887
887
888
888
static int search_layer (MHNSW_Context *ctx, TABLE *graph, const FVector *target,
889
- Neighborhood *start_nodes, uint ef, size_t layer ,
890
- Neighborhood *result, bool skip_deleted )
889
+ Neighborhood *start_nodes, uint result_size ,
890
+ size_t layer, Neighborhood *result, bool construction )
891
891
{
892
892
DBUG_ASSERT (start_nodes->num > 0 );
893
893
result->num = 0 ;
894
894
895
895
MEM_ROOT * const root= graph->in_use ->mem_root ;
896
+ Queue<Visited> candidates, best;
897
+ bool skip_deleted;
898
+ uint ef= result_size;
896
899
897
- Queue<Visited> candidates;
898
- Queue<Visited> best;
900
+ if (construction)
901
+ {
902
+ skip_deleted= false ;
903
+ if (ef > 1 )
904
+ ef= std::max (ef_construction, ef);
905
+ }
906
+ else
907
+ {
908
+ skip_deleted= layer == 0 ;
909
+ if (ef > 1 || layer == 0 )
910
+ ef= std::max (graph->in_use ->variables .mhnsw_min_limit , ef);
911
+ }
899
912
900
913
// WARNING! heuristic here
901
914
const double est_heuristic= 8 * std::sqrt (ctx->max_neighbors (layer));
@@ -905,23 +918,21 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector *target,
905
918
candidates.init (10000 , false , Visited::cmp);
906
919
best.init (ef, true , Visited::cmp);
907
920
921
+ DBUG_ASSERT (start_nodes->num <= result_size);
908
922
for (size_t i=0 ; i < start_nodes->num ; i++)
909
923
{
910
924
Visited *v= visited.create (start_nodes->links [i]);
911
925
candidates.push (v);
912
926
if (skip_deleted && v->node ->deleted )
913
927
continue ;
914
- if (best.elements () < ef)
915
- best.push (v);
916
- else if (v->distance_to_target < best.top ()->distance_to_target )
917
- best.replace_top (v);
928
+ best.push (v);
918
929
}
919
930
920
931
float furthest_best= FLT_MAX;
921
932
while (candidates.elements ())
922
933
{
923
934
const Visited &cur= *candidates.pop ();
924
- if (cur.distance_to_target > furthest_best && best.elements () == ef )
935
+ if (cur.distance_to_target > furthest_best && best.is_full () )
925
936
break ; // All possible candidates are worse than what we have
926
937
927
938
visited.flush ();
@@ -941,7 +952,7 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector *target,
941
952
if (int err= links[i]->load (graph))
942
953
return err;
943
954
Visited *v= visited.create (links[i]);
944
- if (best.elements () < ef )
955
+ if (! best.is_full () )
945
956
{
946
957
candidates.push (v);
947
958
if (skip_deleted && v->node ->deleted )
@@ -966,6 +977,9 @@ static int search_layer(MHNSW_Context *ctx, TABLE *graph, const FVector *target,
966
977
set_if_bigger (ctx->ef_power , ef_power); // not atomic, but it's ok
967
978
}
968
979
980
+ while (best.elements () > result_size)
981
+ best.pop ();
982
+
969
983
result->num = best.elements ();
970
984
for (FVectorNode **links= result->links + result->num ; best.elements ();)
971
985
*--links= best.pop ()->node ;
@@ -1033,9 +1047,10 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
1033
1047
root_make_savepoint (thd->mem_root , &memroot_sv);
1034
1048
SCOPE_EXIT ([memroot_sv](){ root_free_to_savepoint (&memroot_sv); });
1035
1049
1050
+ const size_t max_found= ctx->max_neighbors (0 );
1036
1051
Neighborhood candidates, start_nodes;
1037
- candidates.init (thd->alloc <FVectorNode*>(ef_construction + 7 ), ef_construction );
1038
- start_nodes.init (thd->alloc <FVectorNode*>(ef_construction + 7 ), ef_construction );
1052
+ candidates.init (thd->alloc <FVectorNode*>(max_found + 7 ), max_found );
1053
+ start_nodes.init (thd->alloc <FVectorNode*>(max_found + 7 ), max_found );
1039
1054
start_nodes.links [start_nodes.num ++]= ctx->start ;
1040
1055
1041
1056
const double NORMALIZATION_FACTOR= 1 / std::log (ctx->M );
@@ -1063,7 +1078,7 @@ int mhnsw_insert(TABLE *table, KEY *keyinfo)
1063
1078
{
1064
1079
uint max_neighbors= ctx->max_neighbors (cur_layer);
1065
1080
if (int err= search_layer (ctx, graph, target->vec , &start_nodes,
1066
- ef_construction , cur_layer, &candidates, false ))
1081
+ max_neighbors , cur_layer, &candidates, true ))
1067
1082
return err;
1068
1083
1069
1084
if (int err= select_neighbors (ctx, graph, cur_layer, *target, candidates,
@@ -1106,11 +1121,9 @@ int mhnsw_read_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
1106
1121
if (err)
1107
1122
return err;
1108
1123
1109
- size_t ef= thd->variables .mhnsw_min_limit ;
1110
-
1111
1124
Neighborhood candidates, start_nodes;
1112
- candidates.init (thd->alloc <FVectorNode*>(ef + 7 ), ef );
1113
- start_nodes.init (thd->alloc <FVectorNode*>(ef + 7 ), ef );
1125
+ candidates.init (thd->alloc <FVectorNode*>(limit + 7 ), limit );
1126
+ start_nodes.init (thd->alloc <FVectorNode*>(limit + 7 ), limit );
1114
1127
1115
1128
// one could put all max_layer nodes in start_nodes
1116
1129
// but it has no effect on the recall or speed
@@ -1146,8 +1159,8 @@ int mhnsw_read_first(TABLE *table, KEY *keyinfo, Item *dist, ulonglong limit)
1146
1159
std::swap (start_nodes, candidates);
1147
1160
}
1148
1161
1149
- if (int err= search_layer (ctx, graph, target, &start_nodes, ef, 0 ,
1150
- &candidates, true ))
1162
+ if (int err= search_layer (ctx, graph, target, &start_nodes,
1163
+ static_cast <uint>(limit), 0 , &candidates, false ))
1151
1164
return err;
1152
1165
1153
1166
if (limit > candidates.num )
0 commit comments