Skip to content

Commit 88839e7

Browse files
cvicentiu, HugoWenTD
authored and committed
Initial HNSW implementation
This commit includes the work done in collaboration with Hugo Wen from Amazon: MDEV-33408 Alter HNSW graph storage and fix memory leak. This commit changes the way HNSW graph information is stored in the second table. Instead of storing connections as separate records, it now stores neighbors for each node, leading to significant performance improvements and storage savings. Compared with the previous approach, the insert speed is 5 times faster, search speed improves by 23%, and storage usage is reduced by 73%, based on ann-benchmark tests with random-xs-20-euclidean and random-s-100-euclidean datasets. Additionally, in the previous code, vector objects were not released after use, resulting in excessive memory consumption (over 20GB for building the index with 90,000 records), preventing tests with large datasets. Vectors are now released appropriately during the insert and search functions. Note that some vectors still need to be cleaned up after search query completion; this needs to be addressed in a future commit. All new code of the whole pull request, including one or several files that are either new files or modified ones, are contributed under the BSD-new license. I am contributing on behalf of my employer Amazon Web Services, Inc. It also includes the commit: Introduce session variables to manage HNSW index parameters. Three variables: hnsw_max_connection_per_layer, hnsw_ef_constructor, hnsw_ef_search. The ann-benchmark tool is also updated to support these variables in commit HugoWenTD/ann-benchmarks@e09784e for branch https://github.com/HugoWenTD/ann-benchmarks/tree/mariadb-configurable All new code of the whole pull request, including one or several files that are either new files or modified ones, are contributed under the BSD-new license. I am contributing on behalf of my employer Amazon Web Services, Inc. Co-authored-by: Hugo Wen <wenhug@amazon.com>
1 parent 26e5654 commit 88839e7

File tree

11 files changed

+830
-128
lines changed

11 files changed

+830
-128
lines changed

mysql-test/main/mysqld--help.result

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -412,6 +412,11 @@ The following specify which files/extra groups are read (specified before remain
412412
height-balanced, DOUBLE_PREC_HB - double precision
413413
height-balanced, JSON_HB - height-balanced, stored as
414414
JSON
415+
--hnsw-ef-constructor
416+
hnsw_ef_constructor
417+
--hnsw-ef-search hnsw_ef_search
418+
--hnsw-max-connection-per-layer
419+
hnsw_max_connection_per_layer
415420
--host-cache-size=# How many host names should be cached to avoid resolving
416421
(Automatically configured unless set explicitly)
417422
--idle-readonly-transaction-timeout=#
@@ -1732,6 +1737,9 @@ gtid-strict-mode FALSE
17321737
help TRUE
17331738
histogram-size 254
17341739
histogram-type JSON_HB
1740+
hnsw-ef-constructor 10
1741+
hnsw-ef-search 10
1742+
hnsw-max-connection-per-layer 50
17351743
host-cache-size 279
17361744
idle-readonly-transaction-timeout 0
17371745
idle-transaction-timeout 0

mysql-test/suite/sys_vars/r/sysvars_server_notembedded.result

Lines changed: 30 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1432,6 +1432,36 @@ NUMERIC_BLOCK_SIZE NULL
14321432
ENUM_VALUE_LIST SINGLE_PREC_HB,DOUBLE_PREC_HB,JSON_HB
14331433
READ_ONLY NO
14341434
COMMAND_LINE_ARGUMENT REQUIRED
1435+
VARIABLE_NAME HNSW_EF_CONSTRUCTOR
1436+
VARIABLE_SCOPE SESSION
1437+
VARIABLE_TYPE INT UNSIGNED
1438+
VARIABLE_COMMENT hnsw_ef_constructor
1439+
NUMERIC_MIN_VALUE 0
1440+
NUMERIC_MAX_VALUE 4294967295
1441+
NUMERIC_BLOCK_SIZE 1
1442+
ENUM_VALUE_LIST NULL
1443+
READ_ONLY NO
1444+
COMMAND_LINE_ARGUMENT NONE
1445+
VARIABLE_NAME HNSW_EF_SEARCH
1446+
VARIABLE_SCOPE SESSION
1447+
VARIABLE_TYPE INT UNSIGNED
1448+
VARIABLE_COMMENT hnsw_ef_search
1449+
NUMERIC_MIN_VALUE 0
1450+
NUMERIC_MAX_VALUE 4294967295
1451+
NUMERIC_BLOCK_SIZE 1
1452+
ENUM_VALUE_LIST NULL
1453+
READ_ONLY NO
1454+
COMMAND_LINE_ARGUMENT NONE
1455+
VARIABLE_NAME HNSW_MAX_CONNECTION_PER_LAYER
1456+
VARIABLE_SCOPE SESSION
1457+
VARIABLE_TYPE INT UNSIGNED
1458+
VARIABLE_COMMENT hnsw_max_connection_per_layer
1459+
NUMERIC_MIN_VALUE 0
1460+
NUMERIC_MAX_VALUE 4294967295
1461+
NUMERIC_BLOCK_SIZE 1
1462+
ENUM_VALUE_LIST NULL
1463+
READ_ONLY NO
1464+
COMMAND_LINE_ARGUMENT NONE
14351465
VARIABLE_NAME HOSTNAME
14361466
VARIABLE_SCOPE GLOBAL
14371467
VARIABLE_TYPE VARCHAR

sql/item.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -6634,7 +6634,6 @@ class Item_int_with_ref :public Item_int
66346634
#include "item_subselect.h"
66356635
#include "item_xmlfunc.h"
66366636
#include "item_jsonfunc.h"
6637-
#include "item_vectorfunc.h"
66386637
#include "item_create.h"
66396638
#include "item_vers.h"
66406639
#endif

sql/item_create.cc

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -36,6 +36,7 @@
3636
#include "sp.h"
3737
#include "sql_time.h"
3838
#include "sql_type_geom.h"
39+
#include "item_vectorfunc.h"
3940
#include <mysql/plugin_function.h>
4041

4142

sql/item_vectorfunc.cc

Lines changed: 13 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323

2424
#include <my_global.h>
2525
#include "item.h"
26+
#include "item_vectorfunc.h"
2627

2728
key_map Item_func_vec_distance::part_of_sortkey() const
2829
{
@@ -48,8 +49,18 @@ double Item_func_vec_distance::val_real()
4849
return 0;
4950
float *v1= (float*)r1->ptr();
5051
float *v2= (float*)r2->ptr();
52+
return euclidean_vec_distance(v1, v2, (r1->length()) / sizeof(float));
53+
}
54+
55+
double euclidean_vec_distance(float *v1, float *v2, size_t v_len)
56+
{
57+
float *p1= v1;
58+
float *p2= v2;
5159
double d= 0;
52-
for (uint i=0; i < r1->length() / sizeof(float); i++)
53-
d+= (v1[i] - v2[i])*(v1[i] - v2[i]);
60+
for (size_t i= 0; i < v_len; p1++, p2++, i++)
61+
{
62+
float dist= *p1 - *p2;
63+
d+= dist * dist;
64+
}
5465
return sqrt(d);
5566
}

sql/item_vectorfunc.h

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@
1717
Foundation, Inc., 51 Franklin Street, Fifth Floor, Boston, MA 02110-1335 USA */
1818

1919
/* This file defines all vector functions */
20+
#include <my_global.h>
21+
#include "item.h"
2022
#include "lex_string.h"
2123
#include "item_func.h"
2224

@@ -34,6 +36,7 @@ class Item_func_vec_distance: public Item_real_func
3436
{
3537
return check_argument_types_or_binary(NULL, 0, arg_count);
3638
}
39+
3740
public:
3841
Item_func_vec_distance(THD *thd, Item *a, Item *b)
3942
:Item_real_func(thd, a, b) {}
@@ -51,6 +54,9 @@ class Item_func_vec_distance: public Item_real_func
5154
key_map part_of_sortkey() const override;
5255
Item *do_get_copy(THD *thd) const override
5356
{ return get_item_copy<Item_func_vec_distance>(thd, this); }
57+
virtual ~Item_func_vec_distance() {};
5458
};
5559

60+
61+
double euclidean_vec_distance(float *v1, float *v2, size_t v_len);
5662
#endif

sql/sql_base.cc

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9883,7 +9883,7 @@ int TABLE::hlindex_open(uint nr)
98839883
mysql_mutex_unlock(&s->LOCK_share);
98849884
TABLE *table= (TABLE*)alloc_root(&mem_root, sizeof(*table));
98859885
if (!table ||
9886-
open_table_from_share(in_use, s->hlindex, &empty_clex_str, db_stat, 0,
9886+
open_table_from_share(in_use, s->hlindex, &empty_clex_str, db_stat, EXTRA_RECORD,
98879887
in_use->open_options, table, 0))
98889888
return 1;
98899889
hlindex= table;
@@ -9938,7 +9938,7 @@ int TABLE::hlindex_read_first(uint nr, Item *item, ulonglong limit)
99389938

99399939
DBUG_ASSERT(hlindex->in_use == in_use);
99409940

9941-
return mhnsw_read_first(this, item, limit);
9941+
return mhnsw_read_first(this, key_info + s->keys, item, limit);
99429942
}
99439943

99449944
int TABLE::hlindex_read_next()

sql/sql_class.h

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -922,6 +922,11 @@ typedef struct system_variables
922922
my_bool binlog_alter_two_phase;
923923

924924
Charset_collation_map_st character_set_collations;
925+
926+
/* Temporary for HNSW tests */
927+
uint hnsw_max_connection_per_layer;
928+
uint hnsw_ef_constructor;
929+
uint hnsw_ef_search;
925930
} SV;
926931

927932
/**

sql/sys_vars.cc

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7447,3 +7447,23 @@ static Sys_var_ulonglong Sys_binlog_large_commit_threshold(
74477447
// Allow a smaller minimum value for debug builds to help with testing
74487448
VALID_RANGE(IF_DBUG(100, 10240) * 1024, ULLONG_MAX),
74497449
DEFAULT(128 * 1024 * 1024), BLOCK_SIZE(1));
7450+
7451+
/* Temporary for HNSW tests */
7452+
static Sys_var_uint Sys_hnsw_ef_search(
7453+
"hnsw_ef_search",
7454+
"hnsw_ef_search",
7455+
SESSION_VAR(hnsw_ef_search), CMD_LINE(NO_ARG),
7456+
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
7457+
BLOCK_SIZE(1));
7458+
static Sys_var_uint Sys_hnsw_ef_constructor(
7459+
"hnsw_ef_constructor",
7460+
"hnsw_ef_constructor",
7461+
SESSION_VAR(hnsw_ef_constructor), CMD_LINE(NO_ARG),
7462+
VALID_RANGE(0, UINT_MAX), DEFAULT(10),
7463+
BLOCK_SIZE(1));
7464+
static Sys_var_uint Sys_hnsw_max_connection_per_layer(
7465+
"hnsw_max_connection_per_layer",
7466+
"hnsw_max_connection_per_layer",
7467+
SESSION_VAR(hnsw_max_connection_per_layer), CMD_LINE(NO_ARG),
7468+
VALID_RANGE(0, UINT_MAX), DEFAULT(50),
7469+
BLOCK_SIZE(1));

0 commit comments

Comments
 (0)