diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj
index 9844c709..a0e884b3 100644
--- a/AnnService/CoreLibrary.vcxproj
+++ b/AnnService/CoreLibrary.vcxproj
@@ -157,9 +157,14 @@
+
+
+
+
+
diff --git a/AnnService/CoreLibrary.vcxproj.filters b/AnnService/CoreLibrary.vcxproj.filters
index 7d27224d..c411e8ce 100644
--- a/AnnService/CoreLibrary.vcxproj.filters
+++ b/AnnService/CoreLibrary.vcxproj.filters
@@ -118,6 +118,18 @@
Header Files\Core\Common
+
+ Header Files\Core\Common
+
+
+ Header Files\Core\Common
+
+
+ Header Files\Core\Common
+
+
+ Header Files\Core\Common
+
@@ -156,5 +168,11 @@
Source Files\Core\KDT
+
+ Source Files\Core\Common
+
+
+
+
\ No newline at end of file
diff --git a/AnnService/inc/Core/BKT/Index.h b/AnnService/inc/Core/BKT/Index.h
index c14aa815..2ead4acd 100644
--- a/AnnService/inc/Core/BKT/Index.h
+++ b/AnnService/inc/Core/BKT/Index.h
@@ -1,267 +1,109 @@
#ifndef _SPTAG_BKT_INDEX_H_
#define _SPTAG_BKT_INDEX_H_
-#include "../SearchQuery.h"
-#include "../VectorIndex.h"
#include "../Common.h"
+#include "../VectorIndex.h"
#include "../Common/CommonUtils.h"
#include "../Common/DistanceUtils.h"
#include "../Common/QueryResultSet.h"
-#include "../Common/Heap.h"
#include "../Common/Dataset.h"
#include "../Common/WorkSpace.h"
#include "../Common/WorkSpacePool.h"
-#include "../Common/FineGrainedLock.h"
-#include "../Common/DataUtils.h"
+#include "../Common/RelativeNeighborhoodGraph.h"
+#include "../Common/BKTree.h"
+#include "inc/Helper/SimpleIniReader.h"
+#include "inc/Helper/StringConvert.h"
#include
#include
-#include
#include
namespace SPTAG
{
-namespace Helper
-{
-class IniReader;
-}
-
-
-namespace BKT
-{
- // node type for storing BKT
- struct BKTNode
+ namespace Helper
{
- int centerid;
- int childStart;
- int childEnd;
-
- BKTNode(int cid = -1) : centerid(cid), childStart(-1), childEnd(-1) {}
- };
-
- template
- struct KmeansArgs {
- int _K;
- int _D;
- int _T;
- T* centers;
- int* counts;
- float* newCenters;
- int* newCounts;
- char* label;
- int* clusterIdx;
- float* clusterDist;
- T* newTCenters;
-
- KmeansArgs(int k, int dim, int datasize, int threadnum): _K(k), _D(dim), _T(threadnum) {
- centers = new T[k * dim];
- counts = new int[k];
- newCenters = new float[threadnum * k * dim];
- newCounts = new int[threadnum * k];
- label = new char[datasize];
- clusterIdx = new int[threadnum * k];
- clusterDist = new float[threadnum * k];
- newTCenters = new T[k * dim];
- }
-
- ~KmeansArgs() {
- delete[] centers;
- delete[] counts;
- delete[] newCenters;
- delete[] newCounts;
- delete[] label;
- delete[] clusterIdx;
- delete[] clusterDist;
- delete[] newTCenters;
- }
-
- inline void ClearCounts() {
- memset(newCounts, 0, sizeof(int) * _T * _K);
- }
-
- inline void ClearCenters() {
- memset(newCenters, 0, sizeof(float) * _T * _K * _D);
- }
-
- inline void ClearDists(float dist) {
- for (int i = 0; i < _T * _K; i++) {
- clusterIdx[i] = -1;
- clusterDist[i] = dist;
- }
- }
-
- void Shuffle(std::vector& indices, int first, int last) {
- int* pos = new int[_K];
- pos[0] = first;
- for (int k = 1; k < _K; k++) pos[k] = pos[k - 1] + newCounts[k - 1];
-
- for (int k = 0; k < _K; k++) {
- if (newCounts[k] == 0) continue;
- int i = pos[k];
- while (newCounts[k] > 0) {
- int swapid = pos[(int)(label[i])] + newCounts[(int)(label[i])] - 1;
- newCounts[(int)(label[i])]--;
- std::swap(indices[i], indices[swapid]);
- std::swap(label[i], label[swapid]);
- }
- while (indices[i] != clusterIdx[k]) i++;
- std::swap(indices[i], indices[pos[k] + counts[k] - 1]);
- }
- delete[] pos;
- }
- };
+ class IniReader;
+ }
- template
- class Index : public VectorIndex
+ namespace BKT
{
- private:
- // Initial data points
- int m_iDataSize;
- int m_iDataDimension;
- COMMON::Dataset m_pSamples;
+ template
+ class Index : public VectorIndex
+ {
+ private:
+ // data points
+ COMMON::Dataset m_pSamples;
- // BKT structures.
- int m_iBKTNumber;
- std::vector m_pBKTStart;
- std::vector m_pBKTRoots;
+ // BKT structures.
+ COMMON::BKTree m_pTrees;
- // Graph structure
- int m_iGraphSize;
- int m_iNeighborhoodSize;
- COMMON::Dataset m_pNeighborhoodGraph;
+ // Graph structure
+ COMMON::RelativeNeighborhoodGraph m_pGraph;
- // Variables for building BKTs and TPTs
- int m_iBKTKmeansK;
- int m_iBKTLeafSize;
- int m_iSamples;
- int m_iTptreeNumber;
- int m_iTPTLeafSize;
- int m_numTopDimensionTpTreeSplit;
+ std::string m_sBKTFilename;
+ std::string m_sGraphFilename;
+ std::string m_sDataPointsFilename;
- // Variables for building graph
- int m_iRefineIter;
- int m_iCEF;
- int m_iMaxCheckForRefineGraph;
- int m_iMaxCheck;
- std::unordered_map m_pSampleToCenter;
+ std::mutex m_dataLock; // protect data and graph
+ tbb::concurrent_unordered_set m_deletedID;
+ std::unique_ptr m_workSpacePool;
- // Load from files directly
- std::string m_sBKTFilename;
- std::string m_sGraphFilename;
- std::string m_sDataPointsFilename;
-
- // Load from memory mapped files
- char* m_pBKTMemoryFile;
- char* m_pGraphMemoryFile;
- char* m_pDataPointsMemoryFile;
-
- DistCalcMethod m_iDistCalcMethod;
- float(*m_fComputeDistance)(const T* pX, const T* pY, int length);
-
- int m_iCacheSize;
- int m_iDebugLoad;
-
- int g_iThresholdOfNumberOfContinuousNoBetterPropagation;
- int g_iNumberOfInitialDynamicPivots;
- int g_iNumberOfOtherDynamicPivots;
-
- int m_iNumberOfThreads;
- std::mutex m_dataAllocLock;
- COMMON::FineGrainedLock m_dataUpdateLock;
- tbb::concurrent_unordered_set m_deletedID;
- std::unique_ptr m_workSpacePool;
- public:
- Index() : m_iBKTNumber(1),
- m_iBKTKmeansK(32),
- m_iBKTLeafSize(8),
- m_iSamples(1000),
- m_iNeighborhoodSize(32),
- m_iTptreeNumber(32),
- m_iTPTLeafSize(2000),
- m_numTopDimensionTpTreeSplit(5),
- m_iRefineIter(0),
- m_iCEF(1000),
- m_iMaxCheckForRefineGraph(10000),
- m_iMaxCheck(2048),
- m_pBKTMemoryFile(NULL),
- m_pGraphMemoryFile(NULL),
- m_pDataPointsMemoryFile(NULL),
- m_sBKTFilename("tree.bin"),
- m_sGraphFilename("graph.bin"),
- m_sDataPointsFilename("vectors.bin"),
- m_iNumberOfThreads(1),
- m_iDistCalcMethod(DistCalcMethod::Cosine),
- m_fComputeDistance(COMMON::DistanceCalcSelector(DistCalcMethod::Cosine)),
- m_iCacheSize(-1),
- m_iDebugLoad(-1),
- g_iThresholdOfNumberOfContinuousNoBetterPropagation(3),
- g_iNumberOfInitialDynamicPivots(50),
- g_iNumberOfOtherDynamicPivots(4) {}
-
- ~Index() {
- m_pBKTRoots.clear();
- }
- int GetNumSamples() const { return m_pSamples.R(); }
- int GetFeatureDim() const { return m_pSamples.C(); }
- int GetNumThreads() const { return m_iNumberOfThreads; }
- int GetCurrMaxCheck() const { return m_iMaxCheck; }
-
- DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; }
- IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::BKT; }
- VectorValueType GetVectorValueType() const { return GetEnumValueType(); }
-
- ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension);
+ int m_iNumberOfThreads;
+ DistCalcMethod m_iDistCalcMethod;
+ float(*m_fComputeDistance)(const T* pX, const T* pY, int length);
- ErrorCode LoadIndex(const std::string& p_folderPath);
- ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs);
-
- ErrorCode SaveIndex(const std::string& p_folderPath);
-
- void SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const;
- ErrorCode SearchIndex(QueryResult &p_query) const;
-
- ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension);
- ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum);
- ErrorCode RefineIndex(const std::string& p_folderPath);
- ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2);
+ int m_iMaxCheck;
+ int m_iThresholdOfNumberOfContinuousNoBetterPropagation;
+ int m_iNumberOfInitialDynamicPivots;
+ int m_iNumberOfOtherDynamicPivots;
+ public:
+ Index() :
+ m_sBKTFilename("tree.bin"),
+ m_sGraphFilename("graph.bin"),
+ m_sDataPointsFilename("vectors.bin"),
+ m_iNumberOfThreads(1),
+ m_iDistCalcMethod(DistCalcMethod::Cosine),
+ m_fComputeDistance(COMMON::DistanceCalcSelector(DistCalcMethod::Cosine)),
+ m_iMaxCheck(2048),
+ m_iThresholdOfNumberOfContinuousNoBetterPropagation(3),
+ m_iNumberOfInitialDynamicPivots(50),
+ m_iNumberOfOtherDynamicPivots(4) {}
+
+ ~Index() {}
+
+ inline int GetNumSamples() const { return m_pSamples.R(); }
+ inline int GetFeatureDim() const { return m_pSamples.C(); }
- ErrorCode SetParameter(const char* p_param, const char* p_value);
- std::string GetParameter(const char* p_param) const;
-
- private:
- // Functions for loading models from files
- bool LoadDataPoints(std::string sDataPointsFileName);
- bool LoadBKT(std::string sBKTFilename);
- bool LoadGraph(std::string sGraphFilename);
-
- // Functions for loading models from memory mapped files
- bool LoadDataPoints(char* pDataPointsMemFile);
- bool LoadBKT(char* pBKTMemFile);
- bool LoadGraph(char* pGraphMemFile);
-
- bool SaveDataPoints(std::string sDataPointsFileName);
-
- // Functions for building balanced kmeans tree
- void BuildBKT(std::vector& indices, std::vector& newStart, std::vector& newRoot);
- bool SaveBKT(std::string sBKTFilename, std::vector& newStart, std::vector& newRoot) const;
- float KmeansAssign(std::vector& indices, const int first, const int last, KmeansArgs& args, bool updateCenters);
- int KmeansClustering(std::vector& indices, const int first, const int last, KmeansArgs& args);
-
- // Functions for building Graph
- void BuildRNG();
- bool SaveRNG(std::string sGraphFilename) const;
- void PartitionByTptree(std::vector &indices,
- const int first,
- const int last,
- std::vector> &leaves);
- void RefineRNG();
- void RefineRNGNode(const int node, COMMON::WorkSpace &space, bool updateNeighbors);
- void RebuildRNGNodeNeighbors(int* nodes, const BasicResult* queryResults, int numResults);
- float GraphAccuracyEstimation(int NSample, bool rng);
- };
-} // namespace BKT
+ inline int GetCurrMaxCheck() const { return m_iMaxCheck; }
+ inline int GetNumThreads() const { return m_iNumberOfThreads; }
+ inline DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; }
+ inline IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::BKT; }
+ inline VectorValueType GetVectorValueType() const { return GetEnumValueType(); }
+
+ inline float ComputeDistance(const void* pX, const void* pY) const { return m_fComputeDistance((const T*)pX, (const T*)pY, m_pSamples.C()); }
+ inline const void* GetSample(const int idx) const { return (void*)m_pSamples[idx]; }
+
+ ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension);
+
+ ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs);
+
+ ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout);
+ ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader);
+ ErrorCode SearchIndex(QueryResult &p_query) const;
+ ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension);
+ ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum);
+
+ ErrorCode SetParameter(const char* p_param, const char* p_value);
+ std::string GetParameter(const char* p_param) const;
+
+ private:
+ ErrorCode RefineIndex(const std::string& p_folderPath);
+ void SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const;
+ void SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const;
+ };
+ } // namespace BKT
} // namespace SPTAG
#endif // _SPTAG_BKT_INDEX_H_
diff --git a/AnnService/inc/Core/BKT/ParameterDefinitionList.h b/AnnService/inc/Core/BKT/ParameterDefinitionList.h
index 237b9da4..15c0cc01 100644
--- a/AnnService/inc/Core/BKT/ParameterDefinitionList.h
+++ b/AnnService/inc/Core/BKT/ParameterDefinitionList.h
@@ -5,25 +5,29 @@ DefineBKTParameter(m_sBKTFilename, std::string, std::string("tree.bin"), "TreeFi
DefineBKTParameter(m_sGraphFilename, std::string, std::string("graph.bin"), "GraphFilePath")
DefineBKTParameter(m_sDataPointsFilename, std::string, std::string("vectors.bin"), "VectorFilePath")
-DefineBKTParameter(m_iBKTNumber, int, 1L, "BKTNumber")
-DefineBKTParameter(m_iBKTKmeansK, int, 32L, "BKTKmeansK")
-DefineBKTParameter(m_iNeighborhoodSize, int, 32L, "NeighborhoodSize")
-DefineBKTParameter(m_iBKTLeafSize, int, 8L, "BKTLeafSize")
-DefineBKTParameter(m_iSamples, int, 1000L, "Samples")
-DefineBKTParameter(m_iTptreeNumber, int, 32L, "TpTreeNumber")
-DefineBKTParameter(m_iTPTLeafSize, int, 2000L, "TPTLeafSize")
-DefineBKTParameter(m_numTopDimensionTpTreeSplit, int, 5L, "NumTopDimensionTpTreeSplit")
-DefineBKTParameter(m_iCEF, int, 1000L, "CEF")
-DefineBKTParameter(m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph")
-DefineBKTParameter(m_iMaxCheck, int, 8192L, "MaxCheck")
-DefineBKTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads")
+DefineBKTParameter(m_pTrees.m_iTreeNumber, int, 1L, "BKTNumber")
+DefineBKTParameter(m_pTrees.m_iBKTKmeansK, int, 32L, "BKTKmeansK")
+DefineBKTParameter(m_pTrees.m_iBKTLeafSize, int, 8L, "BKTLeafSize")
+DefineBKTParameter(m_pTrees.m_iSamples, int, 1000L, "Samples")
+
+
+DefineBKTParameter(m_pGraph.m_iTPTNumber, int, 32L, "TpTreeNumber")
+DefineBKTParameter(m_pGraph.m_iTPTLeafSize, int, 2000L, "TPTLeafSize")
+DefineBKTParameter(m_pGraph.m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTpTreeSplit")
-DefineBKTParameter(g_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation")
-DefineBKTParameter(g_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots")
-DefineBKTParameter(g_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots")
+DefineBKTParameter(m_pGraph.m_iNeighborhoodSize, int, 32L, "NeighborhoodSize")
+DefineBKTParameter(m_pGraph.m_iNeighborhoodScale, int, 16L, "GraphNeighborhoodScale")
+DefineBKTParameter(m_pGraph.m_iCEFScale, int, 4L, "GraphCEFScale")
+DefineBKTParameter(m_pGraph.m_iRefineIter, int, 0L, "RefineIterations")
+DefineBKTParameter(m_pGraph.m_iCEF, int, 1000L, "CEF")
+DefineBKTParameter(m_pGraph.m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph")
+DefineBKTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads")
DefineBKTParameter(m_iDistCalcMethod, SPTAG::DistCalcMethod, SPTAG::DistCalcMethod::Cosine, "DistCalcMethod")
-DefineBKTParameter(m_iRefineIter, int, 0L, "RefineIterations")
-DefineBKTParameter(m_iDebugLoad, int, -1, "NumTrains")
-DefineBKTParameter(m_iCacheSize, int, -1, "CacheSize")
+
+DefineBKTParameter(m_iMaxCheck, int, 8192L, "MaxCheck")
+DefineBKTParameter(m_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation")
+DefineBKTParameter(m_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots")
+DefineBKTParameter(m_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots")
+
#endif
diff --git a/AnnService/inc/Core/Common/BKTree.h b/AnnService/inc/Core/Common/BKTree.h
new file mode 100644
index 00000000..70140621
--- /dev/null
+++ b/AnnService/inc/Core/Common/BKTree.h
@@ -0,0 +1,461 @@
+#ifndef _SPTAG_COMMON_BKTREE_H_
+#define _SPTAG_COMMON_BKTREE_H_
+
+#include
+#include
+#include
+#include
+
+#include "../VectorIndex.h"
+
+#include "CommonUtils.h"
+#include "QueryResultSet.h"
+#include "WorkSpace.h"
+
+#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
+
+namespace SPTAG
+{
+ namespace COMMON
+ {
+ // node type for storing BKT
+ struct BKTNode
+ {
+ int centerid;
+ int childStart;
+ int childEnd;
+
+ BKTNode(int cid = -1) : centerid(cid), childStart(-1), childEnd(-1) {}
+ };
+
+ template
+ struct KmeansArgs {
+ int _K;
+ int _D;
+ int _T;
+ T* centers;
+ int* counts;
+ float* newCenters;
+ int* newCounts;
+ char* label;
+ int* clusterIdx;
+ float* clusterDist;
+ T* newTCenters;
+
+ KmeansArgs(int k, int dim, int datasize, int threadnum) : _K(k), _D(dim), _T(threadnum) {
+ centers = new T[k * dim];
+ counts = new int[k];
+ newCenters = new float[threadnum * k * dim];
+ newCounts = new int[threadnum * k];
+ label = new char[datasize];
+ clusterIdx = new int[threadnum * k];
+ clusterDist = new float[threadnum * k];
+ newTCenters = new T[k * dim];
+ }
+
+ ~KmeansArgs() {
+ delete[] centers;
+ delete[] counts;
+ delete[] newCenters;
+ delete[] newCounts;
+ delete[] label;
+ delete[] clusterIdx;
+ delete[] clusterDist;
+ delete[] newTCenters;
+ }
+
+ inline void ClearCounts() {
+ memset(newCounts, 0, sizeof(int) * _T * _K);
+ }
+
+ inline void ClearCenters() {
+ memset(newCenters, 0, sizeof(float) * _T * _K * _D);
+ }
+
+ inline void ClearDists(float dist) {
+ for (int i = 0; i < _T * _K; i++) {
+ clusterIdx[i] = -1;
+ clusterDist[i] = dist;
+ }
+ }
+
+ void Shuffle(std::vector& indices, int first, int last) {
+ int* pos = new int[_K];
+ pos[0] = first;
+ for (int k = 1; k < _K; k++) pos[k] = pos[k - 1] + newCounts[k - 1];
+
+ for (int k = 0; k < _K; k++) {
+ if (newCounts[k] == 0) continue;
+ int i = pos[k];
+ while (newCounts[k] > 0) {
+ int swapid = pos[(int)(label[i])] + newCounts[(int)(label[i])] - 1;
+ newCounts[(int)(label[i])]--;
+ std::swap(indices[i], indices[swapid]);
+ std::swap(label[i], label[swapid]);
+ }
+ while (indices[i] != clusterIdx[k]) i++;
+ std::swap(indices[i], indices[pos[k] + counts[k] - 1]);
+ }
+ delete[] pos;
+ }
+ };
+
+ class BKTree
+ {
+ public:
+ BKTree(): m_iTreeNumber(1), m_iBKTKmeansK(32), m_iBKTLeafSize(8), m_iSamples(1000) {}
+
+ BKTree(BKTree& other): m_iTreeNumber(other.m_iTreeNumber),
+ m_iBKTKmeansK(other.m_iBKTKmeansK),
+ m_iBKTLeafSize(other.m_iBKTLeafSize),
+ m_iSamples(other.m_iSamples) {}
+ ~BKTree() {}
+
+ inline const BKTNode& operator[](int index) const { return m_pTreeRoots[index]; }
+ inline BKTNode& operator[](int index) { return m_pTreeRoots[index]; }
+
+ inline int size() const { return (int)m_pTreeRoots.size(); }
+
+ inline const std::unordered_map& GetSampleMap() const { return m_pSampleCenterMap; }
+
+ template
+ void BuildTrees(VectorIndex* index, std::vector* indices = nullptr)
+ {
+ struct BKTStackItem {
+ int index, first, last;
+ BKTStackItem(int index_, int first_, int last_) : index(index_), first(first_), last(last_) {}
+ };
+ std::stack ss;
+
+ std::vector localindices;
+ if (indices == nullptr) {
+ localindices.resize(index->GetNumSamples());
+ for (int i = 0; i < index->GetNumSamples(); i++) localindices[i] = i;
+ }
+ else {
+ localindices.assign(indices->begin(), indices->end());
+ }
+ KmeansArgs args(m_iBKTKmeansK, index->GetFeatureDim(), (int)localindices.size(), omp_get_num_threads());
+
+ m_pSampleCenterMap.clear();
+ for (char i = 0; i < m_iTreeNumber; i++)
+ {
+ std::random_shuffle(localindices.begin(), localindices.end());
+
+ m_pTreeStart.push_back((int)m_pTreeRoots.size());
+ m_pTreeRoots.push_back(BKTNode((int)localindices.size()));
+ std::cout << "Start to build BKTree " << i + 1 << std::endl;
+
+ ss.push(BKTStackItem(m_pTreeStart[i], 0, (int)localindices.size()));
+ while (!ss.empty()) {
+ BKTStackItem item = ss.top(); ss.pop();
+ int newBKTid = (int)m_pTreeRoots.size();
+ m_pTreeRoots[item.index].childStart = newBKTid;
+ if (item.last - item.first <= m_iBKTLeafSize) {
+ for (int j = item.first; j < item.last; j++) {
+ m_pTreeRoots.push_back(BKTNode(localindices[j]));
+ }
+ }
+ else { // clustering the data into BKTKmeansK clusters
+ int numClusters = KmeansClustering(index, localindices, item.first, item.last, args);
+ if (numClusters <= 1) {
+ int end = min(item.last + 1, (int)localindices.size());
+ std::sort(localindices.begin() + item.first, localindices.begin() + end);
+ m_pTreeRoots[item.index].centerid = localindices[item.first];
+ m_pTreeRoots[item.index].childStart = -m_pTreeRoots[item.index].childStart;
+ for (int j = item.first + 1; j < end; j++) {
+ m_pTreeRoots.push_back(BKTNode(localindices[j]));
+ m_pSampleCenterMap[localindices[j]] = m_pTreeRoots[item.index].centerid;
+ }
+ m_pSampleCenterMap[-1 - m_pTreeRoots[item.index].centerid] = item.index;
+ }
+ else {
+ for (int k = 0; k < m_iBKTKmeansK; k++) {
+ if (args.counts[k] == 0) continue;
+ m_pTreeRoots.push_back(BKTNode(localindices[item.first + args.counts[k] - 1]));
+ if (args.counts[k] > 1) ss.push(BKTStackItem(newBKTid++, item.first, item.first + args.counts[k] - 1));
+ item.first += args.counts[k];
+ }
+ }
+ }
+ m_pTreeRoots[item.index].childEnd = (int)m_pTreeRoots.size();
+ }
+ std::cout << i + 1 << " BKTree built, " << m_pTreeRoots.size() - m_pTreeStart[i] << " " << localindices.size() << std::endl;
+ }
+ }
+
+ bool SaveTrees(std::string sTreeFileName) const
+ {
+ std::cout << "Save BKT to " << sTreeFileName << std::endl;
+ FILE *fp = fopen(sTreeFileName.c_str(), "wb");
+ if (fp == NULL) return false;
+
+ fwrite(&m_iTreeNumber, sizeof(int), 1, fp);
+ fwrite(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp);
+ int treeNodeSize = (int)m_pTreeRoots.size();
+ fwrite(&treeNodeSize, sizeof(int), 1, fp);
+ fwrite(m_pTreeRoots.data(), sizeof(BKTNode), treeNodeSize, fp);
+ fclose(fp);
+ std::cout << "Save BKT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl;
+ return true;
+ }
+
+ bool LoadTrees(char* pBKTMemFile)
+ {
+ m_iTreeNumber = *((int*)pBKTMemFile);
+ pBKTMemFile += sizeof(int);
+ m_pTreeStart.resize(m_iTreeNumber);
+ memcpy(m_pTreeStart.data(), pBKTMemFile, sizeof(int) * m_iTreeNumber);
+ pBKTMemFile += sizeof(int)*m_iTreeNumber;
+
+ int treeNodeSize = *((int*)pBKTMemFile);
+ pBKTMemFile += sizeof(int);
+ m_pTreeRoots.resize(treeNodeSize);
+ memcpy(m_pTreeRoots.data(), pBKTMemFile, sizeof(BKTNode) * treeNodeSize);
+ return true;
+ }
+
+ bool LoadTrees(std::string sTreeFileName)
+ {
+ std::cout << "Load BKT From " << sTreeFileName << std::endl;
+ FILE *fp = fopen(sTreeFileName.c_str(), "rb");
+ if (fp == NULL) return false;
+
+ fread(&m_iTreeNumber, sizeof(int), 1, fp);
+ m_pTreeStart.resize(m_iTreeNumber);
+ fread(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp);
+
+ int treeNodeSize;
+ fread(&treeNodeSize, sizeof(int), 1, fp);
+ m_pTreeRoots.resize(treeNodeSize);
+ fread(m_pTreeRoots.data(), sizeof(BKTNode), treeNodeSize, fp);
+ fclose(fp);
+ std::cout << "Load BKT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl;
+ return true;
+ }
+
+ template
+ void InitSearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const
+ {
+ for (char i = 0; i < m_iTreeNumber; i++) {
+ const BKTNode& node = m_pTreeRoots[m_pTreeStart[i]];
+ if (node.childStart < 0) {
+ p_space.m_SPTQueue.insert(COMMON::HeapCell(m_pTreeStart[i], p_index->ComputeDistance((const void*)p_query.GetTarget(), p_index->GetSample(node.centerid))));
+ }
+ else {
+ for (int begin = node.childStart; begin < node.childEnd; begin++) {
+ int index = m_pTreeRoots[begin].centerid;
+ p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, p_index->ComputeDistance((const void*)p_query.GetTarget(), p_index->GetSample(index))));
+ }
+ }
+ }
+ }
+
+ template
+ void SearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query,
+ COMMON::WorkSpace &p_space, const int p_limits) const
+ {
+ do
+ {
+ COMMON::HeapCell bcell = p_space.m_SPTQueue.pop();
+ const BKTNode& tnode = m_pTreeRoots[bcell.node];
+ if (tnode.childStart < 0) {
+ if (!p_space.CheckAndSet(tnode.centerid)) {
+ p_space.m_iNumberOfCheckedLeaves++;
+ p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance));
+ }
+ if (p_space.m_iNumberOfCheckedLeaves >= p_limits) break;
+ }
+ else {
+ if (!p_space.CheckAndSet(tnode.centerid)) {
+ p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance));
+ }
+ for (int begin = tnode.childStart; begin < tnode.childEnd; begin++) {
+ int index = m_pTreeRoots[begin].centerid;
+ p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, p_index->ComputeDistance((const void*)p_query.GetTarget(), p_index->GetSample(index))));
+ }
+ }
+ } while (!p_space.m_SPTQueue.empty());
+ }
+
+ private:
+
+ template
+ float KmeansAssign(VectorIndex* p_index,
+ std::vector& indices,
+ const int first, const int last, KmeansArgs& args, const bool updateCenters) const {
+ float currDist = 0;
+ int threads = omp_get_num_threads();
+ float lambda = (updateCenters) ? COMMON::Utils::GetBase() * COMMON::Utils::GetBase() / (100.0f * (last - first)) : 0.0f;
+ int subsize = (last - first - 1) / threads + 1;
+
+#pragma omp parallel for
+ for (int tid = 0; tid < threads; tid++)
+ {
+ int istart = first + tid * subsize;
+ int iend = min(first + (tid + 1) * subsize, last);
+ int *inewCounts = args.newCounts + tid * m_iBKTKmeansK;
+ float *inewCenters = args.newCenters + tid * m_iBKTKmeansK * p_index->GetFeatureDim();
+ int * iclusterIdx = args.clusterIdx + tid * m_iBKTKmeansK;
+ float * iclusterDist = args.clusterDist + tid * m_iBKTKmeansK;
+ float idist = 0;
+ for (int i = istart; i < iend; i++) {
+ int clusterid = 0;
+ float smallestDist = MaxDist;
+ for (int k = 0; k < m_iBKTKmeansK; k++) {
+ float dist = p_index->ComputeDistance(p_index->GetSample(indices[i]), (const void*)(args.centers + k*p_index->GetFeatureDim())) + lambda*args.counts[k];
+ if (dist > -MaxDist && dist < smallestDist) {
+ clusterid = k; smallestDist = dist;
+ }
+ }
+ args.label[i] = clusterid;
+ inewCounts[clusterid]++;
+ idist += smallestDist;
+ if (updateCenters) {
+ const T* v = (const T*)p_index->GetSample(indices[i]);
+ float* center = inewCenters + clusterid*p_index->GetFeatureDim();
+ for (int j = 0; j < p_index->GetFeatureDim(); j++) center[j] += v[j];
+ if (smallestDist > iclusterDist[clusterid]) {
+ iclusterDist[clusterid] = smallestDist;
+ iclusterIdx[clusterid] = indices[i];
+ }
+ }
+ else {
+ if (smallestDist <= iclusterDist[clusterid]) {
+ iclusterDist[clusterid] = smallestDist;
+ iclusterIdx[clusterid] = indices[i];
+ }
+ }
+ }
+ COMMON::Utils::atomic_float_add(&currDist, idist);
+ }
+
+ for (int i = 1; i < threads; i++) {
+ for (int k = 0; k < m_iBKTKmeansK; k++)
+ args.newCounts[k] += args.newCounts[i*m_iBKTKmeansK + k];
+ }
+
+ if (updateCenters) {
+ for (int i = 1; i < threads; i++) {
+ float* currCenter = args.newCenters + i*m_iBKTKmeansK*p_index->GetFeatureDim();
+ for (int j = 0; j < m_iBKTKmeansK * p_index->GetFeatureDim(); j++) args.newCenters[j] += currCenter[j];
+ }
+
+ int maxcluster = 0;
+ for (int k = 1; k < m_iBKTKmeansK; k++) if (args.newCounts[maxcluster] < args.newCounts[k]) maxcluster = k;
+
+ int maxid = maxcluster;
+ for (int tid = 1; tid < threads; tid++) {
+ if (args.clusterDist[maxid] < args.clusterDist[tid * m_iBKTKmeansK + maxcluster]) maxid = tid * m_iBKTKmeansK + maxcluster;
+ }
+ if (args.clusterIdx[maxid] < 0 || args.clusterIdx[maxid] >= p_index->GetNumSamples())
+ std::cout << "first:" << first << " last:" << last << " maxcluster:" << maxcluster << "(" << args.newCounts[maxcluster] << ") Error maxid:" << maxid << " dist:" << args.clusterDist[maxid] << std::endl;
+ maxid = args.clusterIdx[maxid];
+
+ for (int k = 0; k < m_iBKTKmeansK; k++) {
+ T* TCenter = args.newTCenters + k * p_index->GetFeatureDim();
+ if (args.newCounts[k] == 0) {
+ //int nextid = Utils::rand_int(last, first);
+ //while (args.label[nextid] != maxcluster) nextid = Utils::rand_int(last, first);
+ int nextid = maxid;
+ std::memcpy(TCenter, p_index->GetSample(nextid), sizeof(T)*p_index->GetFeatureDim());
+ }
+ else {
+ float* currCenters = args.newCenters + k * p_index->GetFeatureDim();
+ for (int j = 0; j < p_index->GetFeatureDim(); j++) currCenters[j] /= args.newCounts[k];
+
+ if (p_index->GetDistCalcMethod() == DistCalcMethod::Cosine) {
+ COMMON::Utils::Normalize(currCenters, p_index->GetFeatureDim(), COMMON::Utils::GetBase());
+ }
+ for (int j = 0; j < p_index->GetFeatureDim(); j++) TCenter[j] = (T)(currCenters[j]);
+ }
+ }
+ }
+ else {
+ for (int i = 1; i < threads; i++) {
+ for (int k = 0; k < m_iBKTKmeansK; k++) {
+ if (args.clusterIdx[i*m_iBKTKmeansK + k] != -1 && args.clusterDist[i*m_iBKTKmeansK + k] <= args.clusterDist[k]) {
+ args.clusterDist[k] = args.clusterDist[i*m_iBKTKmeansK + k];
+ args.clusterIdx[k] = args.clusterIdx[i*m_iBKTKmeansK + k];
+ }
+ }
+ }
+ }
+ return currDist;
+ }
+
+ template
+ int KmeansClustering(VectorIndex* p_index,
+ std::vector& indices, const int first, const int last, KmeansArgs& args) const {
+ int iterLimit = 100;
+
+ int batchEnd = min(first + m_iSamples, last);
+ float currDiff, currDist, minClusterDist = MaxDist;
+ for (int numKmeans = 0; numKmeans < 3; numKmeans++) {
+ for (int k = 0; k < m_iBKTKmeansK; k++) {
+ int randid = COMMON::Utils::rand_int(last, first);
+ std::memcpy(args.centers + k*p_index->GetFeatureDim(), p_index->GetSample(indices[randid]), sizeof(T)*p_index->GetFeatureDim());
+ }
+ args.ClearCounts();
+ currDist = KmeansAssign(p_index, indices, first, batchEnd, args, false);
+ if (currDist < minClusterDist) {
+ minClusterDist = currDist;
+ memcpy(args.newTCenters, args.centers, sizeof(T)*m_iBKTKmeansK*p_index->GetFeatureDim());
+ memcpy(args.counts, args.newCounts, sizeof(int) * m_iBKTKmeansK);
+ }
+ }
+
+ minClusterDist = MaxDist;
+ int noImprovement = 0;
+ for (int iter = 0; iter < iterLimit; iter++) {
+ std::memcpy(args.centers, args.newTCenters, sizeof(T)*m_iBKTKmeansK*p_index->GetFeatureDim());
+ std::random_shuffle(indices.begin() + first, indices.begin() + last);
+
+ args.ClearCenters();
+ args.ClearCounts();
+ args.ClearDists(-MaxDist);
+ currDist = KmeansAssign(p_index, indices, first, batchEnd, args, true);
+ memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK);
+
+ currDiff = 0;
+ for (int k = 0; k < m_iBKTKmeansK; k++) {
+ currDiff += p_index->ComputeDistance((const void*)(args.centers + k*p_index->GetFeatureDim()), (const void*)(args.newTCenters + k*p_index->GetFeatureDim()));
+ }
+
+ if (currDist < minClusterDist) {
+ noImprovement = 0;
+ minClusterDist = currDist;
+ }
+ else {
+ noImprovement++;
+ }
+ if (currDiff < 1e-3 || noImprovement >= 5) break;
+ }
+
+ args.ClearCounts();
+ args.ClearDists(MaxDist);
+ currDist = KmeansAssign(p_index, indices, first, last, args, false);
+ memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK);
+
+ int numClusters = 0;
+ for (int i = 0; i < m_iBKTKmeansK; i++) if (args.counts[i] > 0) numClusters++;
+
+ if (numClusters <= 1) {
+ //if (last - first > 1) std::cout << "large cluster:" << last - first << " dist:" << currDist << std::endl;
+ return numClusters;
+ }
+ args.Shuffle(indices, first, last);
+ return numClusters;
+ }
+
+ private:
+ std::vector m_pTreeStart;
+ std::vector m_pTreeRoots;
+ std::unordered_map m_pSampleCenterMap;
+
+ public:
+ int m_iTreeNumber, m_iBKTKmeansK, m_iBKTLeafSize, m_iSamples;
+ };
+ }
+}
+#endif
diff --git a/AnnService/inc/Core/Common/Dataset.h b/AnnService/inc/Core/Common/Dataset.h
index 4753b088..fd7817f8 100644
--- a/AnnService/inc/Core/Common/Dataset.h
+++ b/AnnService/inc/Core/Common/Dataset.h
@@ -3,11 +3,19 @@
#include
+#if defined(_MSC_VER) || defined(__INTEL_COMPILER)
+#include
+#else
+#include
+#endif // defined(__GNUC__)
+
#define ALIGN 32
#define aligned_malloc(a, b) _mm_malloc(a, b)
#define aligned_free(a) _mm_free(a)
+#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
+
namespace SPTAG
{
namespace COMMON
@@ -21,86 +29,158 @@ namespace SPTAG
int cols;
bool ownData = false;
T* data = nullptr;
- std::vector* dataIncremental = nullptr;
+ std::vector dataIncremental;
public:
- Dataset() {}
- Dataset(int rows_, int cols_, T* data_ = nullptr)
+ Dataset(): rows(0), cols(1) {}
+ Dataset(int rows_, int cols_, T* data_ = nullptr, bool transferOnwership_ = true)
{
- Initialize(rows_, cols_, data_);
+ Initialize(rows_, cols_, data_, transferOnwership_);
}
~Dataset()
{
if (ownData) aligned_free(data);
- if (dataIncremental) {
- dataIncremental->clear();
- delete dataIncremental;
- }
}
- void Initialize(int rows_, int cols_, T* data_ = nullptr)
+ void Initialize(int rows_, int cols_, T* data_ = nullptr, bool transferOnwership_ = true)
{
rows = rows_;
cols = cols_;
data = data_;
- if (data == nullptr)
+ if (data_ == nullptr || !transferOnwership_)
{
ownData = true;
data = (T*)aligned_malloc(sizeof(T) * rows * cols, ALIGN);
+ if (data_ != nullptr) memcpy(data, data_, rows * cols * sizeof(T));
+ else std::memset(data, -1, rows * cols * sizeof(T));
}
- dataIncremental = new std::vector();
}
void SetR(int R_)
{
if (R_ >= rows)
- dataIncremental->resize((R_ - rows) * cols);
+ dataIncremental.resize((R_ - rows) * cols);
else
{
rows = R_;
- dataIncremental->clear();
+ dataIncremental.clear();
}
}
- int R() const { return (int)(rows + dataIncremental->size() / cols); }
- int C() const { return cols; }
+ inline int R() const { return (int)(rows + dataIncremental.size() / cols); }
+ inline int C() const { return cols; }
T* operator[](int index)
{
if (index >= rows) {
- return dataIncremental->data() + (size_t)(index - rows)*cols;
+ return dataIncremental.data() + (size_t)(index - rows)*cols;
}
return data + (size_t)index*cols;
}
+
const T* operator[](int index) const
{
if (index >= rows) {
- return dataIncremental->data() + (size_t)(index - rows)*cols;
+ return dataIncremental.data() + (size_t)(index - rows)*cols;
}
return data + (size_t)index*cols;
}
- T* GetData()
+ void AddBatch(const T* pData, int num)
{
- return data;
+ dataIncremental.insert(dataIncremental.end(), pData, pData + num*cols);
}
- void reset()
+ void AddBatch(int num)
{
- if (ownData) {
- aligned_free(data);
- ownData = false;
+ dataIncremental.insert(dataIncremental.end(), (size_t)num*cols, T(-1));
+ }
+
+ bool Save(std::string sDataPointsFileName)
+ {
+ std::cout << "Save Data To " << sDataPointsFileName << std::endl;
+ FILE * fp = fopen(sDataPointsFileName.c_str(), "wb");
+ if (fp == NULL) return false;
+
+ int CR = R();
+ fwrite(&CR, sizeof(int), 1, fp);
+ fwrite(&cols, sizeof(int), 1, fp);
+
+ T* ptr = data;
+ int toWrite = rows;
+ while (toWrite > 0)
+ {
+ size_t write = fwrite(ptr, sizeof(T) * cols, toWrite, fp);
+ ptr += write * cols;
+ toWrite -= (int)write;
}
- if (dataIncremental) {
- dataIncremental->clear();
- delete dataIncremental;
+ ptr = dataIncremental.data();
+ toWrite = CR - rows;
+ while (toWrite > 0)
+ {
+ size_t write = fwrite(ptr, sizeof(T) * cols, toWrite, fp);
+ ptr += write * cols;
+ toWrite -= (int)write;
}
+ fclose(fp);
+
+ std::cout << "Save Data (" << CR << ", " << cols << ") Finish!" << std::endl;
+ return true;
}
-
- void AddBatch(const T* pData, int num)
+
+ bool Load(std::string sDataPointsFileName)
{
- dataIncremental->insert(dataIncremental->end(), pData, pData + num*cols);
+ std::cout << "Load Data From " << sDataPointsFileName << std::endl;
+ FILE * fp = fopen(sDataPointsFileName.c_str(), "rb");
+ if (fp == NULL) return false;
+
+ int R, C;
+ fread(&R, sizeof(int), 1, fp);
+ fread(&C, sizeof(int), 1, fp);
+
+ Initialize(R, C);
+ T* ptr = data;
+ while (R > 0) {
+ size_t read = fread(ptr, sizeof(T) * C, R, fp);
+ ptr += read * C;
+ R -= (int)read;
+ }
+ fclose(fp);
+ std::cout << "Load Data (" << rows << ", " << cols << ") Finish!" << std::endl;
+ return true;
}
- void AddBatch(int num)
+ // Functions for loading models from memory mapped files
+ bool Load(char* pDataPointsMemFile)
{
- dataIncremental->insert(dataIncremental->end(), (size_t)num*cols, T(-1));
+ int R, C;
+ R = *((int*)pDataPointsMemFile);
+ pDataPointsMemFile += sizeof(int);
+
+ C = *((int*)pDataPointsMemFile);
+ pDataPointsMemFile += sizeof(int);
+
+ Initialize(R, C, (T*)pDataPointsMemFile);
+ return true;
+ }
+
+ bool Refine(const std::vector& indices, std::string sDataPointsFileName)
+ {
+ std::cout << "Save Refine Data To " << sDataPointsFileName << std::endl;
+ FILE * fp = fopen(sDataPointsFileName.c_str(), "wb");
+ if (fp == NULL) return false;
+
+ int R = (int)(indices.size());
+ fwrite(&R, sizeof(int), 1, fp);
+ fwrite(&cols, sizeof(int), 1, fp);
+
+ // write point one by one in case for cache miss
+ for (int i = 0; i < R; i++) {
+ if (indices[i] < rows)
+ fwrite(data + (size_t)indices[i] * cols, sizeof(T) * cols, 1, fp);
+ else
+ fwrite(dataIncremental.data() + (size_t)(indices[i] - rows) * cols, sizeof(T) * cols, 1, fp);
+ }
+ fclose(fp);
+
+ std::cout << "Save Refine Data (" << R << ", " << cols << ") Finish!" << std::endl;
+ return true;
}
};
}
diff --git a/AnnService/inc/Core/Common/KDTree.h b/AnnService/inc/Core/Common/KDTree.h
new file mode 100644
index 00000000..ab2e1779
--- /dev/null
+++ b/AnnService/inc/Core/Common/KDTree.h
@@ -0,0 +1,327 @@
+#ifndef _SPTAG_COMMON_KDTREE_H_
+#define _SPTAG_COMMON_KDTREE_H_
+
+#include
+#include
+#include
+
+#include "../VectorIndex.h"
+
+#include "CommonUtils.h"
+#include "QueryResultSet.h"
+#include "WorkSpace.h"
+
+#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
+
+namespace SPTAG
+{
+ namespace COMMON
+ {
+ // node type for storing KDT
+ struct KDTNode
+ {
+ int left;
+ int right;
+ short split_dim;
+ float split_value;
+ };
+
+ class KDTree
+ {
+ public:
+ KDTree() : m_iTreeNumber(2), m_numTopDimensionKDTSplit(5), m_iSamples(1000) {}
+
+ KDTree(KDTree& other) : m_iTreeNumber(other.m_iTreeNumber),
+ m_numTopDimensionKDTSplit(other.m_numTopDimensionKDTSplit),
+ m_iSamples(other.m_iSamples) {}
+ ~KDTree() {}
+
+ inline const KDTNode& operator[](int index) const { return m_pTreeRoots[index]; }
+ inline KDTNode& operator[](int index) { return m_pTreeRoots[index]; }
+
+ inline int size() const { return (int)m_pTreeRoots.size(); }
+
+ template
+ void BuildTrees(VectorIndex* p_index, std::vector* indices = nullptr)
+ {
+ std::vector localindices;
+ if (indices == nullptr) {
+ localindices.resize(p_index->GetNumSamples());
+ for (int i = 0; i < p_index->GetNumSamples(); i++) localindices[i] = i;
+ }
+ else {
+ localindices.assign(indices->begin(), indices->end());
+ }
+
+ m_pTreeRoots.resize(m_iTreeNumber * localindices.size());
+ m_pTreeStart.resize(m_iTreeNumber, 0);
+#pragma omp parallel for
+ for (int i = 0; i < m_iTreeNumber; i++)
+ {
+ Sleep(i * 100); std::srand(clock());
+
+ std::vector pindices(localindices.begin(), localindices.end());
+ std::random_shuffle(pindices.begin(), pindices.end());
+
+ m_pTreeStart[i] = i * (int)pindices.size();
+ std::cout << "Start to build KDTree " << i + 1 << std::endl;
+ int iTreeSize = m_pTreeStart[i];
+ DivideTree(p_index, pindices, 0, (int)pindices.size() - 1, m_pTreeStart[i], iTreeSize);
+ std::cout << i + 1 << " KDTree built, " << iTreeSize - m_pTreeStart[i] << " " << pindices.size() << std::endl;
+ }
+ }
+
+ bool SaveTrees(std::string sTreeFileName) const
+ {
+ std::cout << "Save KDT to " << sTreeFileName << std::endl;
+ FILE *fp = fopen(sTreeFileName.c_str(), "wb");
+ if (fp == NULL) return false;
+
+ fwrite(&m_iTreeNumber, sizeof(int), 1, fp);
+ fwrite(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp);
+ int treeNodeSize = (int)m_pTreeRoots.size();
+ fwrite(&treeNodeSize, sizeof(int), 1, fp);
+ fwrite(m_pTreeRoots.data(), sizeof(KDTNode), treeNodeSize, fp);
+ fclose(fp);
+ std::cout << "Save KDT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl;
+ return true;
+ }
+
+ bool LoadTrees(char* pKDTMemFile)
+ {
+ m_iTreeNumber = *((int*)pKDTMemFile);
+ pKDTMemFile += sizeof(int);
+ m_pTreeStart.resize(m_iTreeNumber);
+ memcpy(m_pTreeStart.data(), pKDTMemFile, sizeof(int) * m_iTreeNumber);
+ pKDTMemFile += sizeof(int)*m_iTreeNumber;
+
+ int treeNodeSize = *((int*)pKDTMemFile);
+ pKDTMemFile += sizeof(int);
+ m_pTreeRoots.resize(treeNodeSize);
+ memcpy(m_pTreeRoots.data(), pKDTMemFile, sizeof(KDTNode) * treeNodeSize);
+ return true;
+ }
+
+ bool LoadTrees(std::string sTreeFileName)
+ {
+ std::cout << "Load KDT From " << sTreeFileName << std::endl;
+ FILE *fp = fopen(sTreeFileName.c_str(), "rb");
+ if (fp == NULL) return false;
+
+ fread(&m_iTreeNumber, sizeof(int), 1, fp);
+ m_pTreeStart.resize(m_iTreeNumber);
+ fread(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp);
+
+ int treeNodeSize;
+ fread(&treeNodeSize, sizeof(int), 1, fp);
+ m_pTreeRoots.resize(treeNodeSize);
+ fread(m_pTreeRoots.data(), sizeof(KDTNode), treeNodeSize, fp);
+ fclose(fp);
+ std::cout << "Load KDT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl;
+ return true;
+ }
+
+ template
+ void InitSearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const int p_limits) const
+ {
+ for (char i = 0; i < m_iTreeNumber; i++) {
+ KDTSearch(p_index, p_query, p_space, m_pTreeStart[i], true, 0);
+ }
+
+ while (!p_space.m_SPTQueue.empty() && p_space.m_iNumberOfCheckedLeaves < p_limits)
+ {
+ auto& tcell = p_space.m_SPTQueue.pop();
+ if (p_query.worstDist() < tcell.distance) break;
+ KDTSearch(p_index, p_query, p_space, tcell.node, true, tcell.distance);
+ }
+ }
+
+ template
+ void SearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const int p_limits) const
+ {
+ while (!p_space.m_SPTQueue.empty() && p_space.m_iNumberOfCheckedLeaves < p_limits)
+ {
+ auto& tcell = p_space.m_SPTQueue.pop();
+ KDTSearch(p_index, p_query, p_space, tcell.node, false, tcell.distance);
+ }
+ }
+
+ private:
+
+ template
+ void KDTSearch(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query,
+ COMMON::WorkSpace& p_space, const int node, const bool isInit, const float distBound) const {
+ if (node < 0)
+ {
+ int index = -node - 1;
+ if (index >= p_index->GetNumSamples()) return;
+#ifdef PREFETCH
+ const char* data = (const char *)(p_index->GetSample(index));
+ _mm_prefetch(data, _MM_HINT_T0);
+ _mm_prefetch(data + 64, _MM_HINT_T0);
+#endif
+ if (p_space.CheckAndSet(index)) return;
+
+ ++p_space.m_iNumberOfTreeCheckedLeaves;
+ ++p_space.m_iNumberOfCheckedLeaves;
+ p_space.m_NGQueue.insert(COMMON::HeapCell(index, p_index->ComputeDistance((const void*)p_query.GetTarget(), (const void*)data)));
+ return;
+ }
+
+ auto& tnode = m_pTreeRoots[node];
+
+ float diff = (p_query.GetTarget())[tnode.split_dim] - tnode.split_value;
+ float distanceBound = distBound + diff * diff;
+ int otherChild, bestChild;
+ if (diff < 0)
+ {
+ bestChild = tnode.left;
+ otherChild = tnode.right;
+ }
+ else
+ {
+ otherChild = tnode.left;
+ bestChild = tnode.right;
+ }
+
+ if (!isInit || distanceBound < p_query.worstDist())
+ {
+ p_space.m_SPTQueue.insert(COMMON::HeapCell(otherChild, distanceBound));
+ }
+ KDTSearch(p_index, p_query, p_space, bestChild, isInit, distBound);
+ }
+
+
+ template
+ void DivideTree(VectorIndex* p_index, std::vector& indices, int first, int last,
+ int index, int &iTreeSize) {
+ ChooseDivision(p_index, m_pTreeRoots[index], indices, first, last);
+ int i = Subdivide(p_index, m_pTreeRoots[index], indices, first, last);
+ if (i - 1 <= first)
+ {
+ m_pTreeRoots[index].left = -indices[first] - 1;
+ }
+ else
+ {
+ iTreeSize++;
+ m_pTreeRoots[index].left = iTreeSize;
+ DivideTree(p_index, indices, first, i - 1, iTreeSize, iTreeSize);
+ }
+ if (last == i)
+ {
+ m_pTreeRoots[index].right = -indices[last] - 1;
+ }
+ else
+ {
+ iTreeSize++;
+ m_pTreeRoots[index].right = iTreeSize;
+ DivideTree(p_index, indices, i, last, iTreeSize, iTreeSize);
+ }
+ }
+
+ template
+ void ChooseDivision(VectorIndex* p_index, KDTNode& node, const std::vector& indices, const int first, const int last)
+ {
+ std::vector meanValues(p_index->GetFeatureDim(), 0);
+ std::vector varianceValues(p_index->GetFeatureDim(), 0);
+ int end = min(first + m_iSamples, last);
+ int count = end - first + 1;
+ // calculate the mean of each dimension
+ for (int j = first; j <= end; j++)
+ {
+ const T* v = (const T*)p_index->GetSample(indices[j]);
+ for (int k = 0; k < p_index->GetFeatureDim(); k++)
+ {
+ meanValues[k] += v[k];
+ }
+ }
+ for (int k = 0; k < p_index->GetFeatureDim(); k++)
+ {
+ meanValues[k] /= count;
+ }
+ // calculate the variance of each dimension
+ for (int j = first; j <= end; j++)
+ {
+ const T* v = (const T*)p_index->GetSample(indices[j]);
+ for (int k = 0; k < p_index->GetFeatureDim(); k++)
+ {
+ float dist = v[k] - meanValues[k];
+ varianceValues[k] += dist*dist;
+ }
+ }
+ // choose the split dimension as one of the dimension inside TOP_DIM maximum variance
+ node.split_dim = SelectDivisionDimension(varianceValues);
+ // determine the threshold
+ node.split_value = meanValues[node.split_dim];
+ }
+
+ int SelectDivisionDimension(const std::vector& varianceValues) const
+ {
+ // Record the top maximum variances
+ std::vector topind(m_numTopDimensionKDTSplit);
+ int num = 0;
+ // order the variances
+ for (int i = 0; i < varianceValues.size(); i++)
+ {
+ if (num < m_numTopDimensionKDTSplit || varianceValues[i] > varianceValues[topind[num - 1]])
+ {
+ if (num < m_numTopDimensionKDTSplit)
+ {
+ topind[num++] = i;
+ }
+ else
+ {
+ topind[num - 1] = i;
+ }
+ int j = num - 1;
+ // order the TOP_DIM variances
+ while (j > 0 && varianceValues[topind[j]] > varianceValues[topind[j - 1]])
+ {
+ std::swap(topind[j], topind[j - 1]);
+ j--;
+ }
+ }
+ }
+ // randomly choose a dimension from TOP_DIM
+ return topind[COMMON::Utils::rand_int(num)];
+ }
+
+ template
+ int Subdivide(VectorIndex* p_index, const KDTNode& node, std::vector& indices, const int first, const int last) const
+ {
+ int i = first;
+ int j = last;
+ // decide which child one point belongs
+ while (i <= j)
+ {
+ int ind = indices[i];
+ const T* v = (const T*)p_index->GetSample(ind);
+ float val = v[node.split_dim];
+ if (val < node.split_value)
+ {
+ i++;
+ }
+ else
+ {
+ std::swap(indices[i], indices[j]);
+ j--;
+ }
+ }
+ // if all the points in the node are equal,equally split the node into 2
+ if ((i == first) || (i == last + 1))
+ {
+ i = (first + last + 1) / 2;
+ }
+ return i;
+ }
+
+ private:
+ std::vector m_pTreeStart;
+ std::vector m_pTreeRoots;
+
+ public:
+ int m_iTreeNumber, m_numTopDimensionKDTSplit, m_iSamples;
+ };
+ }
+}
+#endif
diff --git a/AnnService/inc/Core/Common/NeighborhoodGraph.h b/AnnService/inc/Core/Common/NeighborhoodGraph.h
new file mode 100644
index 00000000..9e646b69
--- /dev/null
+++ b/AnnService/inc/Core/Common/NeighborhoodGraph.h
@@ -0,0 +1,408 @@
+#ifndef _SPTAG_COMMON_NG_H_
+#define _SPTAG_COMMON_NG_H_
+
+#include "../VectorIndex.h"
+
+#include "CommonUtils.h"
+#include "Dataset.h"
+#include "FineGrainedLock.h"
+#include "QueryResultSet.h"
+
+namespace SPTAG
+{
+ namespace COMMON
+ {
+ class NeighborhoodGraph
+ {
+ public:
+ NeighborhoodGraph(): m_iTPTNumber(32),
+ m_iTPTLeafSize(2000),
+ m_iSamples(1000),
+ m_numTopDimensionTPTSplit(5),
+ m_iNeighborhoodSize(32),
+ m_iNeighborhoodScale(16),
+ m_iCEFScale(4),
+ m_iRefineIter(0),
+ m_iCEF(1000),
+ m_iMaxCheckForRefineGraph(10000) {}
+
+ ~NeighborhoodGraph() {}
+
+ virtual void InsertNeighbors(VectorIndex* index, const int node, int insertNode, float insertDist) = 0;
+
+ virtual void RebuildNeighbors(VectorIndex* index, const int node, int* nodes, const BasicResult* queryResults, const int numResults) = 0;
+
+ virtual float GraphAccuracyEstimation(VectorIndex* index, const int samples, const std::unordered_map* idmap = nullptr) = 0;
+
+ template
+ void BuildGraph(VectorIndex* index, const std::unordered_map* idmap = nullptr)
+ {
+ std::cout << "build RNG graph!" << std::endl;
+
+ m_iGraphSize = index->GetNumSamples();
+ m_iNeighborhoodSize = m_iNeighborhoodSize * m_iNeighborhoodScale;
+ m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize);
+ m_dataUpdateLock.resize(m_iGraphSize);
+
+ if (m_iGraphSize < 1000) {
+ RefineGraph(index, idmap);
+ std::cout << "Build RNG Graph end!" << std::endl;
+ return;
+ }
+
+ {
+ COMMON::Dataset NeighborhoodDists(m_iGraphSize, m_iNeighborhoodSize);
+ std::vector> TptreeDataIndices(m_iTPTNumber, std::vector(m_iGraphSize));
+ std::vector>> TptreeLeafNodes(m_iTPTNumber, std::vector>());
+
+ for (int i = 0; i < m_iGraphSize; i++)
+ for (int j = 0; j < m_iNeighborhoodSize; j++)
+ (NeighborhoodDists)[i][j] = MaxDist;
+
+ std::cout << "Parallel TpTree Partition begin " << std::endl;
+#pragma omp parallel for schedule(dynamic)
+ for (int i = 0; i < m_iTPTNumber; i++)
+ {
+ Sleep(i * 100); std::srand(clock());
+ for (int j = 0; j < m_iGraphSize; j++) TptreeDataIndices[i][j] = j;
+ std::random_shuffle(TptreeDataIndices[i].begin(), TptreeDataIndices[i].end());
+ PartitionByTptree(index, TptreeDataIndices[i], 0, m_iGraphSize - 1, TptreeLeafNodes[i]);
+ std::cout << "Finish Getting Leaves for Tree " << i << std::endl;
+ }
+ std::cout << "Parallel TpTree Partition done" << std::endl;
+
+ for (int i = 0; i < m_iTPTNumber; i++)
+ {
+#pragma omp parallel for schedule(dynamic)
+ for (int j = 0; j < TptreeLeafNodes[i].size(); j++)
+ {
+ int start_index = TptreeLeafNodes[i][j].first;
+ int end_index = TptreeLeafNodes[i][j].second;
+ if (omp_get_thread_num() == 0) std::cout << "\rProcessing Tree " << i << ' ' << j * 100 / TptreeLeafNodes[i].size() << '%';
+ for (int x = start_index; x < end_index; x++)
+ {
+ for (int y = x + 1; y <= end_index; y++)
+ {
+ int p1 = TptreeDataIndices[i][x];
+ int p2 = TptreeDataIndices[i][y];
+ float dist = index->ComputeDistance(index->GetSample(p1), index->GetSample(p2));
+ if (idmap != nullptr) {
+ p1 = (idmap->find(p1) == idmap->end()) ? p1 : idmap->at(p1);
+ p2 = (idmap->find(p2) == idmap->end()) ? p2 : idmap->at(p2);
+ }
+ COMMON::Utils::AddNeighbor(p2, dist, (m_pNeighborhoodGraph)[p1], (NeighborhoodDists)[p1], m_iNeighborhoodSize);
+ COMMON::Utils::AddNeighbor(p1, dist, (m_pNeighborhoodGraph)[p2], (NeighborhoodDists)[p2], m_iNeighborhoodSize);
+ }
+ }
+ }
+ TptreeDataIndices[i].clear();
+ TptreeLeafNodes[i].clear();
+ std::cout << std::endl;
+ }
+ TptreeDataIndices.clear();
+ TptreeLeafNodes.clear();
+ }
+
+ if (m_iMaxCheckForRefineGraph > 0) {
+ RefineGraph(index, idmap);
+ }
+ }
+
+ template
+ void RefineGraph(VectorIndex* index, const std::unordered_map* idmap = nullptr)
+ {
+ m_iCEF *= m_iCEFScale;
+ m_iMaxCheckForRefineGraph *= m_iCEFScale;
+
+#pragma omp parallel for schedule(dynamic)
+ for (int i = 0; i < m_iGraphSize; i++)
+ {
+ RefineNode(index, i, false);
+ }
+ std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(index, 100, idmap) << std::endl;
+
+ m_iCEF /= m_iCEFScale;
+ m_iMaxCheckForRefineGraph /= m_iCEFScale;
+ m_iNeighborhoodSize /= m_iNeighborhoodScale;
+
+#pragma omp parallel for schedule(dynamic)
+ for (int i = 0; i < m_iGraphSize; i++)
+ {
+ RefineNode(index, i, false);
+ }
+ std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(index, 100, idmap) << std::endl;
+
+ if (idmap != nullptr) {
+ for (auto iter = idmap->begin(); iter != idmap->end(); iter++)
+ if (iter->first < 0)
+ {
+ m_pNeighborhoodGraph[-1 - iter->first][m_iNeighborhoodSize - 1] = -2 - iter->second;
+ }
+ }
+ }
+
+ template
+ ErrorCode RefineGraph(VectorIndex* index, std::vector& indices, std::vector& reverseIndices,
+ std::string graphFileName, const std::unordered_map* idmap = nullptr)
+ {
+ int R = (int)indices.size();
+
+#pragma omp parallel for schedule(dynamic)
+ for (int i = 0; i < R; i++)
+ {
+ RefineNode(index, indices[i], false);
+ int* nodes = m_pNeighborhoodGraph[indices[i]];
+ for (int j = 0; j < m_iNeighborhoodSize; j++)
+ {
+ if (nodes[j] < 0) nodes[j] = -1;
+ else nodes[j] = reverseIndices[nodes[j]];
+ }
+ if (idmap == nullptr || idmap->find(-1 - indices[i]) == idmap->end()) continue;
+ nodes[m_iNeighborhoodSize - 1] = -2 - idmap->at(-1 - indices[i]);
+ }
+
+ std::ofstream graphOut(graphFileName, std::ios::binary);
+ if (!graphOut.is_open()) return ErrorCode::FailedCreateFile;
+ graphOut.write((char*)&R, sizeof(int));
+ graphOut.write((char*)&m_iNeighborhoodSize, sizeof(int));
+ for (int i = 0; i < R; i++) {
+ graphOut.write((char*)m_pNeighborhoodGraph[indices[i]], sizeof(int) * m_iNeighborhoodSize);
+ }
+ graphOut.close();
+ return ErrorCode::Success;
+ }
+
+
+ template
+ void RefineNode(VectorIndex* index, const int node, bool updateNeighbors)
+ {
+ COMMON::QueryResultSet query((const T*)index->GetSample(node), m_iCEF + 1);
+ index->SearchIndex(query);
+ RebuildNeighbors(index, node, m_pNeighborhoodGraph[node], query.GetResults(), m_iCEF + 1);
+
+ if (updateNeighbors) {
+ // update neighbors
+ for (int j = 0; j <= m_iCEF; j++)
+ {
+ BasicResult* item = query.GetResult(j);
+ if (item->VID < 0) break;
+ if (item->VID == node) continue;
+
+ std::lock_guard lock(m_dataUpdateLock[item->VID]);
+ InsertNeighbors(index, item->VID, node, item->Dist);
+ }
+ }
+ }
+
+ template
+ void PartitionByTptree(VectorIndex* index, std::vector& indices, const int first, const int last,
+ std::vector> & leaves)
+ {
+ if (last - first <= m_iTPTLeafSize)
+ {
+ leaves.push_back(std::make_pair(first, last));
+ }
+ else
+ {
+ std::vector Mean(index->GetFeatureDim(), 0);
+
+ int iIteration = 100;
+ int end = min(first + m_iSamples, last);
+ int count = end - first + 1;
+ // calculate the mean of each dimension
+ for (int j = first; j <= end; j++)
+ {
+ const T* v = (const T*)index->GetSample(indices[j]);
+ for (int k = 0; k < index->GetFeatureDim(); k++)
+ {
+ Mean[k] += v[k];
+ }
+ }
+ for (int k = 0; k < index->GetFeatureDim(); k++)
+ {
+ Mean[k] /= count;
+ }
+ std::vector Variance;
+ Variance.reserve(index->GetFeatureDim());
+ for (int j = 0; j < index->GetFeatureDim(); j++)
+ {
+ Variance.push_back(BasicResult(j, 0));
+ }
+ // calculate the variance of each dimension
+ for (int j = first; j <= end; j++)
+ {
+ const T* v = (const T*)index->GetSample(indices[j]);
+ for (int k = 0; k < index->GetFeatureDim(); k++)
+ {
+ float dist = v[k] - Mean[k];
+ Variance[k].Dist += dist*dist;
+ }
+ }
+ std::sort(Variance.begin(), Variance.end(), COMMON::Compare);
+ std::vector indexs(m_numTopDimensionTPTSplit);
+ std::vector weight(m_numTopDimensionTPTSplit), bestweight(m_numTopDimensionTPTSplit);
+ float bestvariance = Variance[index->GetFeatureDim() - 1].Dist;
+ for (int i = 0; i < m_numTopDimensionTPTSplit; i++)
+ {
+ indexs[i] = Variance[index->GetFeatureDim() - 1 - i].VID;
+ bestweight[i] = 0;
+ }
+ bestweight[0] = 1;
+ float bestmean = Mean[indexs[0]];
+
+ std::vector Val(count);
+ for (int i = 0; i < iIteration; i++)
+ {
+ float sumweight = 0;
+ for (int j = 0; j < m_numTopDimensionTPTSplit; j++)
+ {
+ weight[j] = float(rand() % 10000) / 5000.0f - 1.0f;
+ sumweight += weight[j] * weight[j];
+ }
+ sumweight = sqrt(sumweight);
+ for (int j = 0; j < m_numTopDimensionTPTSplit; j++)
+ {
+ weight[j] /= sumweight;
+ }
+ float mean = 0;
+ for (int j = 0; j < count; j++)
+ {
+ Val[j] = 0;
+ const T* v = (const T*)index->GetSample(indices[first + j]);
+ for (int k = 0; k < m_numTopDimensionTPTSplit; k++)
+ {
+ Val[j] += weight[k] * v[indexs[k]];
+ }
+ mean += Val[j];
+ }
+ mean /= count;
+ float var = 0;
+ for (int j = 0; j < count; j++)
+ {
+ float dist = Val[j] - mean;
+ var += dist * dist;
+ }
+ if (var > bestvariance)
+ {
+ bestvariance = var;
+ bestmean = mean;
+ for (int j = 0; j < m_numTopDimensionTPTSplit; j++)
+ {
+ bestweight[j] = weight[j];
+ }
+ }
+ }
+ int i = first;
+ int j = last;
+ // decide which child one point belongs
+ while (i <= j)
+ {
+ float val = 0;
+ const T* v = (const T*)index->GetSample(indices[i]);
+ for (int k = 0; k < m_numTopDimensionTPTSplit; k++)
+ {
+ val += bestweight[k] * v[indexs[k]];
+ }
+ if (val < bestmean)
+ {
+ i++;
+ }
+ else
+ {
+ std::swap(indices[i], indices[j]);
+ j--;
+ }
+ }
+ // if all the points in the node are equal,equally split the node into 2
+ if ((i == first) || (i == last + 1))
+ {
+ i = (first + last + 1) / 2;
+ }
+
+ Mean.clear();
+ Variance.clear();
+ Val.clear();
+ indexs.clear();
+ weight.clear();
+ bestweight.clear();
+
+ PartitionByTptree(index, indices, first, i - 1, leaves);
+ PartitionByTptree(index, indices, i, last, leaves);
+ }
+ }
+
+ bool LoadGraph(std::string sGraphFilename)
+ {
+ std::cout << "Load Graph From " << sGraphFilename << std::endl;
+ FILE * fp = fopen(sGraphFilename.c_str(), "rb");
+ if (fp == NULL) return false;
+
+ fread(&m_iGraphSize, sizeof(int), 1, fp);
+ fread(&m_iNeighborhoodSize, sizeof(int), 1, fp);
+ m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize);
+ m_dataUpdateLock.resize(m_iGraphSize);
+
+ for (int i = 0; i < m_iGraphSize; i++)
+ {
+ fread((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp);
+ }
+ fclose(fp);
+ std::cout << "Load Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl;
+ return true;
+ }
+
+ bool SetGraph(char* pGraphMemFile)
+ {
+ m_iGraphSize = *((int*)pGraphMemFile);
+ pGraphMemFile += sizeof(int);
+
+ m_iNeighborhoodSize = *((int*)pGraphMemFile);
+ pGraphMemFile += sizeof(int);
+
+ m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize, (int*)pGraphMemFile);
+ m_dataUpdateLock.resize(m_iGraphSize);
+ return true;
+ }
+
+ bool SaveGraph(std::string sGraphFilename) const
+ {
+ std::cout << "Save Graph To " << sGraphFilename << std::endl;
+ FILE *fp = fopen(sGraphFilename.c_str(), "wb");
+ if (fp == NULL) return false;
+
+ fwrite(&m_iGraphSize, sizeof(int), 1, fp);
+ fwrite(&m_iNeighborhoodSize, sizeof(int), 1, fp);
+ for (int i = 0; i < m_iGraphSize; i++)
+ {
+ fwrite((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp);
+ }
+ fclose(fp);
+ std::cout << "Save Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl;
+ return true;
+ }
+
+ inline void AddBatch(int num) { m_pNeighborhoodGraph.AddBatch(num); m_iGraphSize += num; m_dataUpdateLock.resize(m_iGraphSize); }
+
+ inline int* operator[](int index) { return m_pNeighborhoodGraph[index]; }
+
+ inline const int* operator[](int index) const { return m_pNeighborhoodGraph[index]; }
+
+ inline void SetR(int rows) { m_pNeighborhoodGraph.SetR(rows); m_iGraphSize = rows; m_dataUpdateLock.resize(m_iGraphSize); }
+
+ inline int R() const { return m_iGraphSize; }
+
+ static std::shared_ptr CreateInstance(std::string type);
+
+ protected:
+ // Graph structure
+ int m_iGraphSize;
+ COMMON::Dataset m_pNeighborhoodGraph;
+ COMMON::FineGrainedLock m_dataUpdateLock; // protect one row of the graph
+
+ public:
+ int m_iTPTNumber, m_iTPTLeafSize, m_iSamples, m_numTopDimensionTPTSplit;
+ int m_iNeighborhoodSize, m_iNeighborhoodScale, m_iCEFScale, m_iRefineIter, m_iCEF, m_iMaxCheckForRefineGraph;
+ };
+ }
+}
+#endif
diff --git a/AnnService/inc/Core/Common/QueryResultSet.h b/AnnService/inc/Core/Common/QueryResultSet.h
index 33dcf5c7..f410b29d 100644
--- a/AnnService/inc/Core/Common/QueryResultSet.h
+++ b/AnnService/inc/Core/Common/QueryResultSet.h
@@ -38,7 +38,7 @@ class QueryResultSet : public QueryResult
m_target = p_target;
}
- inline const T* GetTarget()
+ inline const T* GetTarget() const
{
return reinterpret_cast(m_target);
}
diff --git a/AnnService/inc/Core/Common/RelativeNeighborhoodGraph.h b/AnnService/inc/Core/Common/RelativeNeighborhoodGraph.h
new file mode 100644
index 00000000..83d5ee4a
--- /dev/null
+++ b/AnnService/inc/Core/Common/RelativeNeighborhoodGraph.h
@@ -0,0 +1,120 @@
+#ifndef _SPTAG_COMMON_RNG_H_
+#define _SPTAG_COMMON_RNG_H_
+
+#include "NeighborhoodGraph.h"
+
+namespace SPTAG
+{
+ namespace COMMON
+ {
+ class RelativeNeighborhoodGraph: public NeighborhoodGraph
+ {
+ public:
+ void RebuildNeighbors(VectorIndex* index, const int node, int* nodes, const BasicResult* queryResults, const int numResults) {
+ int count = 0;
+ for (int j = 0; j < numResults && count < m_iNeighborhoodSize; j++) {
+ const BasicResult& item = queryResults[j];
+ if (item.VID < 0) break;
+ if (item.VID == node) continue;
+
+ bool good = true;
+ for (int k = 0; k < count; k++) {
+ if (index->ComputeDistance(index->GetSample(nodes[k]), index->GetSample(item.VID)) <= item.Dist) {
+ good = false;
+ break;
+ }
+ }
+ if (good) nodes[count++] = item.VID;
+ }
+ for (int j = count; j < m_iNeighborhoodSize; j++) nodes[j] = -1;
+ }
+
+ void InsertNeighbors(VectorIndex* index, const int node, int insertNode, float insertDist)
+ {
+ int* nodes = m_pNeighborhoodGraph[node];
+ for (int k = 0; k < m_iNeighborhoodSize; k++)
+ {
+ int tmpNode = nodes[k];
+ if (tmpNode < -1) continue;
+
+ if (tmpNode < 0)
+ {
+ bool good = true;
+ for (int t = 0; t < k; t++) {
+ if (index->ComputeDistance(index->GetSample(insertNode), index->GetSample(nodes[t])) < insertDist) {
+ good = false;
+ break;
+ }
+ }
+ if (good) {
+ nodes[k] = insertNode;
+ }
+ break;
+ }
+ float tmpDist = index->ComputeDistance(index->GetSample(node), index->GetSample(tmpNode));
+ if (insertDist < tmpDist || (insertDist == tmpDist && insertNode < tmpNode))
+ {
+ bool good = true;
+ for (int t = 0; t < k; t++) {
+ if (index->ComputeDistance(index->GetSample(insertNode), index->GetSample(nodes[t])) < insertDist) {
+ good = false;
+ break;
+ }
+ }
+ if (good) {
+ nodes[k] = insertNode;
+ insertNode = tmpNode;
+ insertDist = tmpDist;
+ }
+ else {
+ break;
+ }
+ }
+ }
+ }
+
+ float GraphAccuracyEstimation(VectorIndex* index, const int samples, const std::unordered_map* idmap = nullptr)
+ {
+ int* correct = new int[samples];
+
+#pragma omp parallel for schedule(dynamic)
+ for (int i = 0; i < samples; i++)
+ {
+ int x = COMMON::Utils::rand_int(m_iGraphSize);
+ //int x = i;
+ COMMON::QueryResultSet query(nullptr, m_iCEF);
+ for (int y = 0; y < m_iGraphSize; y++)
+ {
+ if ((idmap != nullptr && idmap->find(y) != idmap->end())) continue;
+ float dist = index->ComputeDistance(index->GetSample(x), index->GetSample(y));
+ query.AddPoint(y, dist);
+ }
+ query.SortResult();
+ int * exact_rng = new int[m_iNeighborhoodSize];
+ RebuildNeighbors(index, x, exact_rng, query.GetResults(), m_iCEF);
+
+ correct[i] = 0;
+ for (int j = 0; j < m_iNeighborhoodSize; j++) {
+ if (exact_rng[j] == -1) {
+ correct[i] += m_iNeighborhoodSize - j;
+ break;
+ }
+ for (int k = 0; k < m_iNeighborhoodSize; k++)
+ if ((m_pNeighborhoodGraph)[x][k] == exact_rng[j]) {
+ correct[i]++;
+ break;
+ }
+ }
+ delete[] exact_rng;
+ }
+ float acc = 0;
+ for (int i = 0; i < samples; i++) acc += float(correct[i]);
+ acc = acc / samples / m_iNeighborhoodSize;
+ delete[] correct;
+ return acc;
+ }
+
+ };
+ }
+}
+#endif
\ No newline at end of file
diff --git a/AnnService/inc/Core/Common/WorkSpace.h b/AnnService/inc/Core/Common/WorkSpace.h
index f17ff0bb..f2ce87a0 100644
--- a/AnnService/inc/Core/Common/WorkSpace.h
+++ b/AnnService/inc/Core/Common/WorkSpace.h
@@ -126,49 +126,6 @@ namespace SPTAG
}
};
- template
- class CountVector
- {
- size_t m_bytes;
- T* m_data;
- T m_count;
- T MAX;
-
- public:
- void Init(int size)
- {
- m_bytes = sizeof(T) * size;
- m_data = new T[size];
- m_count = 0;
- MAX = ((std::numeric_limits::max)());
- memset(m_data, 0, m_bytes);
- }
-
- CountVector() :m_data(nullptr) {}
- CountVector(int size) { Init(size); }
- ~CountVector() { if (m_data != nullptr) delete[] m_data; }
-
- inline void clear()
- {
- if (m_count == MAX)
- {
- memset(m_data, 0, m_bytes);
- m_count = 1;
- }
- else
- {
- m_count++;
- }
- }
-
- inline bool CheckAndSet(int idx)
- {
- if (m_data[idx] == m_count) return true;
- m_data[idx] = m_count;
- return false;
- }
- };
-
// Variables for each single NN search
struct WorkSpace
{
diff --git a/AnnService/inc/Core/KDT/Index.h b/AnnService/inc/Core/KDT/Index.h
index d21e76b0..5dd094e1 100644
--- a/AnnService/inc/Core/KDT/Index.h
+++ b/AnnService/inc/Core/KDT/Index.h
@@ -1,19 +1,19 @@
#ifndef _SPTAG_KDT_INDEX_H_
#define _SPTAG_KDT_INDEX_H_
-#include "../SearchQuery.h"
-#include "../VectorIndex.h"
#include "../Common.h"
+#include "../VectorIndex.h"
#include "../Common/CommonUtils.h"
#include "../Common/DistanceUtils.h"
#include "../Common/QueryResultSet.h"
-#include "../Common/Heap.h"
#include "../Common/Dataset.h"
#include "../Common/WorkSpace.h"
#include "../Common/WorkSpacePool.h"
-#include "../Common/FineGrainedLock.h"
-#include "../Common/DataUtils.h"
+#include "../Common/RelativeNeighborhoodGraph.h"
+#include "../Common/KDTree.h"
+#include "inc/Helper/StringConvert.h"
+#include "inc/Helper/SimpleIniReader.h"
#include
#include
@@ -29,168 +29,79 @@ namespace SPTAG
namespace KDT
{
- // node type for storing KDT
- struct KDTNode
- {
- int left;
- int right;
- short split_dim;
- float split_value;
- };
-
template
class Index : public VectorIndex
{
private:
- // Initial data points
- int m_iDataSize;
- int m_iDataDimension;
+ // data points
COMMON::Dataset m_pSamples;
// KDT structures.
- int m_iKDTNumber;
- std::vector m_pKDTStart;
- std::vector m_pKDTRoots;
- int m_numTopDimensionKDTSplit;
- int m_numSamplesKDTSplitConsideration;
+ COMMON::KDTree m_pTrees;
// Graph structure
- int m_iGraphSize;
- int m_iNeighborhoodSize;
- COMMON::Dataset m_pNeighborhoodGraph;
-
- // Variables for building TPTs
- int m_iTPTNumber;
- int m_iTPTLeafSize;
- int m_numTopDimensionTPTSplit;
- int m_numSamplesTPTSplitConsideration;
-
- // Variables for building graph
- int m_iRefineIter;
- int m_iCEF;
- int m_iMaxCheckForRefineGraph;
- int m_iMaxCheck;
- std::unordered_map m_pSampleToCenter;
+ COMMON::RelativeNeighborhoodGraph m_pGraph;
- // Load from files directly
std::string m_sKDTFilename;
std::string m_sGraphFilename;
std::string m_sDataPointsFilename;
- // Load from memory mapped files
- char* m_pKDTMemoryFile;
- char* m_pGraphMemoryFile;
- char* m_pDataPointsMemoryFile;
-
- DistCalcMethod m_iDistCalcMethod;
- float(*m_fComputeDistance)(const T* pX, const T* pY, int length);
-
- int m_iCacheSize;
- int m_iDebugLoad;
-
- int g_iThresholdOfNumberOfContinuousNoBetterPropagation;
- int g_iNumberOfInitialDynamicPivots;
- int g_iNumberOfOtherDynamicPivots;
-
- int m_iNumberOfThreads;
- std::mutex m_dataAllocLock;
- COMMON::FineGrainedLock m_dataUpdateLock;
+ std::mutex m_dataLock; // protect data and graph
tbb::concurrent_unordered_set m_deletedID;
std::unique_ptr m_workSpacePool;
+
+ int m_iNumberOfThreads;
+ DistCalcMethod m_iDistCalcMethod;
+ float(*m_fComputeDistance)(const T* pX, const T* pY, int length);
+
+ int m_iMaxCheck;
+ int m_iThresholdOfNumberOfContinuousNoBetterPropagation;
+ int m_iNumberOfInitialDynamicPivots;
+ int m_iNumberOfOtherDynamicPivots;
public:
- Index() : m_iKDTNumber(1),
- m_numTopDimensionKDTSplit(5),
- m_numSamplesKDTSplitConsideration(100),
- m_iNeighborhoodSize(32),
- m_iTPTNumber(32),
- m_iTPTLeafSize(2000),
- m_numTopDimensionTPTSplit(5),
- m_numSamplesTPTSplitConsideration(1000),
- m_iRefineIter(0),
- m_iCEF(1000),
- m_iMaxCheckForRefineGraph(10000),
- m_iMaxCheck(2048),
- m_pKDTMemoryFile(NULL),
- m_pGraphMemoryFile(NULL),
- m_pDataPointsMemoryFile(NULL),
+ Index() :
m_sKDTFilename("tree.bin"),
m_sGraphFilename("graph.bin"),
m_sDataPointsFilename("vectors.bin"),
m_iNumberOfThreads(1),
m_iDistCalcMethod(DistCalcMethod::Cosine),
m_fComputeDistance(COMMON::DistanceCalcSelector(DistCalcMethod::Cosine)),
- m_iCacheSize(-1),
- m_iDebugLoad(-1),
- g_iThresholdOfNumberOfContinuousNoBetterPropagation(3),
- g_iNumberOfInitialDynamicPivots(50),
- g_iNumberOfOtherDynamicPivots(4) {}
-
- ~Index() {
- m_pKDTRoots.clear();
- }
- int GetNumSamples() const { return m_pSamples.R(); }
- int GetFeatureDim() const { return m_pSamples.C(); }
- int GetNumThreads() const { return m_iNumberOfThreads; }
- int GetCurrMaxCheck() const { return m_iMaxCheck; }
-
- DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; }
- IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::KDT; }
- VectorValueType GetVectorValueType() const { return GetEnumValueType(); }
+ m_iMaxCheck(2048),
+ m_iThresholdOfNumberOfContinuousNoBetterPropagation(3),
+ m_iNumberOfInitialDynamicPivots(50),
+ m_iNumberOfOtherDynamicPivots(4) {}
+
+ ~Index() {}
+
+ inline int GetNumSamples() const { return m_pSamples.R(); }
+ inline int GetFeatureDim() const { return m_pSamples.C(); }
+
+ inline int GetCurrMaxCheck() const { return m_iMaxCheck; }
+ inline int GetNumThreads() const { return m_iNumberOfThreads; }
+ inline DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; }
+ inline IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::KDT; }
+ inline VectorValueType GetVectorValueType() const { return GetEnumValueType(); }
+
+ inline float ComputeDistance(const void* pX, const void* pY) const { return m_fComputeDistance((const T*)pX, (const T*)pY, m_pSamples.C()); }
+ inline const void* GetSample(const int idx) const { return (void*)m_pSamples[idx]; }
ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension);
- ErrorCode LoadIndex(const std::string& p_folderPath);
- ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs);
- ErrorCode SaveIndex(const std::string& p_folderPath);
+ ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs);
- void SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const;
+ ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout);
+ ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader);
ErrorCode SearchIndex(QueryResult &p_query) const;
-
ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension);
ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum);
- ErrorCode RefineIndex(const std::string& p_folderPath);
- ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2);
ErrorCode SetParameter(const char* p_param, const char* p_value);
std::string GetParameter(const char* p_param) const;
private:
- // Functions for loading models from files
- bool LoadDataPoints(std::string sDataPointsFileName);
- bool LoadKDT(std::string sKDTFilename);
- bool LoadGraph(std::string sGraphFilename);
-
- // Functions for loading models from memory mapped files
- bool LoadDataPoints(char* pDataPointsMemFile);
- bool LoadKDT(char* pKDTMemFile);
- bool LoadGraph(char* pGraphMemFile);
-
- bool SaveDataPoints(std::string sDataPointsFileName);
-
- // Functions for building kdtree
- void BuildKDT(std::vector& indices, std::vector& newStart, std::vector& newRoot);
- bool SaveKDT(std::string sKDTFilename, std::vector& newStart, std::vector& newRoot) const;
- void DivideTree(KDTNode* pTree, std::vector& indices,int first, int last,
- int index, int &iTreeSize);
- void ChooseDivision(KDTNode& node, const std::vector& indices, int first, int last);
- int SelectDivisionDimension(const std::vector& varianceValues) const;
- int Subdivide(const KDTNode& node, std::vector& indices, const int first, const int last);
-
- // Functions for building Graph
- void BuildRNG();
- bool SaveRNG(std::string sGraphFilename) const;
- void PartitionByTptree(std::vector &indices,
- const int first,
- const int last,
- std::vector> &leaves);
- void RefineRNG();
- void RefineRNGNode(const int node, COMMON::WorkSpace &space, bool updateNeighbors);
- void RebuildRNGNodeNeighbors(int* nodes, const BasicResult* queryResults, int numResults);
- float GraphAccuracyEstimation(int NSample, bool rng);
-
- // Functions for hybrid search
- void KDTSearch(const int node, const bool isInit, const float distBound,
- COMMON::WorkSpace& space, COMMON::QueryResultSet &query, const tbb::concurrent_unordered_set &deleted) const;
+ ErrorCode RefineIndex(const std::string& p_folderPath);
+ void SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const;
+ void SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const;
};
} // namespace KDT
} // namespace SPTAG
diff --git a/AnnService/inc/Core/KDT/ParameterDefinitionList.h b/AnnService/inc/Core/KDT/ParameterDefinitionList.h
index 932a525f..8ca2ef74 100644
--- a/AnnService/inc/Core/KDT/ParameterDefinitionList.h
+++ b/AnnService/inc/Core/KDT/ParameterDefinitionList.h
@@ -5,25 +5,27 @@ DefineKDTParameter(m_sKDTFilename, std::string, std::string("tree.bin"), "TreeFi
DefineKDTParameter(m_sGraphFilename, std::string, std::string("graph.bin"), "GraphFilePath")
DefineKDTParameter(m_sDataPointsFilename, std::string, std::string("vectors.bin"), "VectorFilePath")
-DefineKDTParameter(m_iKDTNumber, int, 1L, "KDTNumber")
-DefineKDTParameter(m_numTopDimensionKDTSplit, int, 5L, "NumTopDimensionKDTSplit")
-DefineKDTParameter(m_numSamplesKDTSplitConsideration, int, 100L, "NumSamplesKDTSplitConsideration")
-DefineKDTParameter(m_iNeighborhoodSize, int, 32L, "NeighborhoodSize")
-DefineKDTParameter(m_iTPTNumber, int, 32L, "TPTNumber")
-DefineKDTParameter(m_iTPTLeafSize, int, 2000L, "TPTLeafSize")
-DefineKDTParameter(m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTPTSplit")
-DefineKDTParameter(m_numSamplesTPTSplitConsideration, int, 100L, "NumSamplesTPTSplitConsideration")
-DefineKDTParameter(m_iCEF, int, 1000L, "CEF")
-DefineKDTParameter(m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph")
-DefineKDTParameter(m_iMaxCheck, int, 8192L, "MaxCheck")
-DefineKDTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads")
+DefineKDTParameter(m_pTrees.m_iTreeNumber, int, 1L, "KDTNumber")
+DefineKDTParameter(m_pTrees.m_numTopDimensionKDTSplit, int, 5L, "NumTopDimensionKDTSplit")
+DefineKDTParameter(m_pTrees.m_iSamples, int, 100L, "NumSamplesKDTSplitConsideration")
-DefineKDTParameter(g_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation")
-DefineKDTParameter(g_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots")
-DefineKDTParameter(g_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots")
+DefineKDTParameter(m_pGraph.m_iTPTNumber, int, 32L, "TPTNumber")
+DefineKDTParameter(m_pGraph.m_iTPTLeafSize, int, 2000L, "TPTLeafSize")
+DefineKDTParameter(m_pGraph.m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTPTSplit")
+DefineKDTParameter(m_pGraph.m_iNeighborhoodSize, int, 32L, "NeighborhoodSize")
+DefineKDTParameter(m_pGraph.m_iNeighborhoodScale, int, 16L, "GraphNeighborhoodScale")
+DefineKDTParameter(m_pGraph.m_iCEFScale, int, 4L, "GraphCEFScale")
+DefineKDTParameter(m_pGraph.m_iRefineIter, int, 0L, "RefineIterations")
+DefineKDTParameter(m_pGraph.m_iCEF, int, 1000L, "CEF")
+DefineKDTParameter(m_pGraph.m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph")
+
+DefineKDTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads")
DefineKDTParameter(m_iDistCalcMethod, SPTAG::DistCalcMethod, SPTAG::DistCalcMethod::Cosine, "DistCalcMethod")
-DefineKDTParameter(m_iRefineIter, int, 0L, "RefineIterations")
-DefineKDTParameter(m_iDebugLoad, int, -1, "NumTrains")
-DefineKDTParameter(m_iCacheSize, int, -1, "CacheSize")
+
+DefineKDTParameter(m_iMaxCheck, int, 8192L, "MaxCheck")
+DefineKDTParameter(m_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation")
+DefineKDTParameter(m_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots")
+DefineKDTParameter(m_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots")
+
#endif
diff --git a/AnnService/inc/Core/MetadataSet.h b/AnnService/inc/Core/MetadataSet.h
index e9794893..f476531e 100644
--- a/AnnService/inc/Core/MetadataSet.h
+++ b/AnnService/inc/Core/MetadataSet.h
@@ -26,6 +26,8 @@ class MetadataSet
virtual ErrorCode SaveMetadata(const std::string& p_metaFile, const std::string& p_metaindexFile) = 0;
+ virtual ErrorCode RefineMetadata(std::vector& indices, const std::string& p_folderPath);
+
static ErrorCode MetaCopy(const std::string& p_src, const std::string& p_dst);
};
@@ -52,7 +54,7 @@ class FileMetadataSet : public MetadataSet
std::vector m_pOffsets;
- int m_count;
+ SizeType m_count;
std::string m_metaFile;
diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h
index 6f648d36..cbe1b579 100644
--- a/AnnService/inc/Core/VectorIndex.h
+++ b/AnnService/inc/Core/VectorIndex.h
@@ -5,6 +5,7 @@
#include "SearchQuery.h"
#include "VectorSet.h"
#include "MetadataSet.h"
+#include "inc/Helper/SimpleIniReader.h"
namespace SPTAG
{
@@ -16,9 +17,9 @@ class VectorIndex
virtual ~VectorIndex();
- virtual ErrorCode SaveIndex(const std::string& p_folderPath) = 0;
+ virtual ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout) = 0;
- virtual ErrorCode LoadIndex(const std::string& p_folderPath) = 0;
+ virtual ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader) = 0;
virtual ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs) = 0;
@@ -30,14 +31,12 @@ class VectorIndex
virtual ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum) = 0;
- virtual ErrorCode RefineIndex(const std::string& p_folderPath) = 0;
-
- virtual ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2) = 0;
-
//virtual ErrorCode AddIndexWithID(const void* p_vector, const int& p_id) = 0;
//virtual ErrorCode DeleteIndexWithID(const void* p_vector, const int& p_id) = 0;
-
+
+ virtual float ComputeDistance(const void* pX, const void* pY) const = 0;
+ virtual const void* GetSample(const int idx) const = 0;
virtual int GetFeatureDim() const = 0;
virtual int GetNumSamples() const = 0;
@@ -49,6 +48,10 @@ class VectorIndex
virtual std::string GetParameter(const char* p_param) const = 0;
virtual ErrorCode SetParameter(const char* p_param, const char* p_value) = 0;
+ virtual ErrorCode LoadIndex(const std::string& p_folderPath);
+
+ virtual ErrorCode SaveIndex(const std::string& p_folderPath);
+
virtual ErrorCode BuildIndex(std::shared_ptr p_vectorSet, std::shared_ptr p_metadataSet);
virtual ErrorCode SearchIndex(const void* p_vector, int p_neighborCount, std::vector& p_results) const;
@@ -61,16 +64,22 @@ class VectorIndex
virtual ByteArray GetMetadata(IndexType p_vectorID) const;
virtual void SetMetadata(const std::string& p_metadataFilePath, const std::string& p_metadataIndexPath);
- void SetIndexName(const std::string& p_indexName);
-
- const std::string& GetIndexName() const;
+ virtual std::string GetIndexName() const
+ {
+ if (m_sIndexName == "")
+ return Helper::Convert::ConvertToString(GetIndexAlgoType());
+ return m_sIndexName;
+ }
+ virtual void SetIndexName(std::string p_name) { m_sIndexName = p_name; }
static std::shared_ptr CreateInstance(IndexAlgoType p_algo, VectorValueType p_valuetype);
+ static ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2);
+
static ErrorCode LoadIndex(const std::string& p_loaderFilePath, std::shared_ptr& p_vectorIndex);
protected:
- std::string m_indexName;
+ std::string m_sIndexName;
std::shared_ptr m_pMetadata;
};
diff --git a/AnnService/src/Core/BKT/BKTIndex.cpp b/AnnService/src/Core/BKT/BKTIndex.cpp
index 8f9a1862..c6f3d466 100644
--- a/AnnService/src/Core/BKT/BKTIndex.cpp
+++ b/AnnService/src/Core/BKT/BKTIndex.cpp
@@ -1,9 +1,4 @@
#include "inc/Core/BKT/Index.h"
-#include "inc/Core/Common/WorkSpacePool.h"
-#include "inc/Core/MetadataSet.h"
-#include "inc/Helper/StringConvert.h"
-#include "inc/Helper/CommonHelper.h"
-#include "inc/Helper/SimpleIniReader.h"
#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details.
#pragma warning(disable:4242) // '=' : conversion from 'int' to 'short', possible loss of data
@@ -14,307 +9,93 @@ namespace SPTAG
{
namespace BKT
{
-#pragma region Load data points, kd-tree, neighborhood graph
template
ErrorCode Index::LoadIndexFromMemory(const std::vector& p_indexBlobs)
{
- if (!LoadDataPoints((char*)p_indexBlobs[0])) return ErrorCode::FailedParseValue;
- if (!LoadBKT((char*)p_indexBlobs[1])) return ErrorCode::FailedParseValue;
- if (!LoadGraph((char*)p_indexBlobs[2])) return ErrorCode::FailedParseValue;
+ if (!m_pSamples.Load((char*)p_indexBlobs[0])) return ErrorCode::FailedParseValue;
+ if (!m_pTrees.LoadTrees((char*)p_indexBlobs[1])) return ErrorCode::FailedParseValue;
+ if (!m_pGraph.LoadGraph((char*)p_indexBlobs[2])) return ErrorCode::FailedParseValue;
return ErrorCode::Success;
}
template
- ErrorCode Index::LoadIndex(const std::string& p_folderPath)
+ ErrorCode Index::LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader)
{
- std::string folderPath(p_folderPath);
- if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep)
- {
- folderPath += FolderSep;
- }
-
- Helper::IniReader p_configReader;
- if (ErrorCode::Success != p_configReader.LoadIniFile(folderPath + "/indexloader.ini"))
- {
- return ErrorCode::FailedOpenFile;
- }
-
- std::string metadataSection("MetaData");
- if (p_configReader.DoesSectionExist(metadataSection))
- {
- std::string metadataFilePath = p_configReader.GetParameter(metadataSection,
- "MetaDataFilePath",
- std::string());
- std::string metadataIndexFilePath = p_configReader.GetParameter(metadataSection,
- "MetaDataIndexPath",
- std::string());
-
- m_pMetadata.reset(new FileMetadataSet(folderPath + metadataFilePath, folderPath + metadataIndexFilePath));
-
- if (!m_pMetadata->Available())
- {
- std::cerr << "Error: Failed to load metadata." << std::endl;
- return ErrorCode::Fail;
- }
- }
-
#define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \
SetParameter(RepresentStr, \
- p_configReader.GetParameter("Index", \
- RepresentStr, \
- std::string(#DefaultValue)).c_str()); \
+ p_reader.GetParameter("Index", \
+ RepresentStr, \
+ std::string(#DefaultValue)).c_str()); \
#include "inc/Core/BKT/ParameterDefinitionList.h"
#undef DefineBKTParameter
- if (DistCalcMethod::Undefined == m_iDistCalcMethod)
- {
- return ErrorCode::Fail;
- }
-
- if (!LoadDataPoints(folderPath + m_sDataPointsFilename)) return ErrorCode::Fail;
- if (!LoadBKT(folderPath + m_sBKTFilename)) return ErrorCode::Fail;
- if (!LoadGraph(folderPath + m_sGraphFilename)) return ErrorCode::Fail;
-
- m_iDataSize = m_pSamples.R();
- m_iDataDimension = m_pSamples.C();
- m_dataUpdateLock.resize(m_iDataSize);
+ if (!m_pSamples.Load(p_folderPath + m_sDataPointsFilename)) return ErrorCode::Fail;
+ if (!m_pTrees.LoadTrees(p_folderPath + m_sBKTFilename)) return ErrorCode::Fail;
+ if (!m_pGraph.LoadGraph(p_folderPath + m_sGraphFilename)) return ErrorCode::Fail;
m_workSpacePool.reset(new COMMON::WorkSpacePool(m_iMaxCheck, GetNumSamples()));
m_workSpacePool->Init(m_iNumberOfThreads);
return ErrorCode::Success;
}
- template
- bool Index::LoadDataPoints(std::string sDataPointsFileName)
- {
- std::cout << "Load Data Points From " << sDataPointsFileName << std::endl;
- FILE * fp = fopen(sDataPointsFileName.c_str(), "rb");
- if (fp == NULL) return false;
-
- int R, C;
- fread(&R, sizeof(int), 1, fp);
- fread(&C, sizeof(int), 1, fp);
-
- if (m_iDebugLoad > 0 && R > m_iDebugLoad) R = m_iDebugLoad;
-
- m_pSamples.Initialize(R, C);
- int i = 0, batch = 10000;
- while (i + batch < R) {
- fread((m_pSamples)[i], sizeof(T), C * batch, fp);
- i += batch;
- }
- fread((m_pSamples)[i], sizeof(T), C * (R - i), fp);
- fclose(fp);
- std::cout << "Load Data Points (" << m_pSamples.R() << ", " << m_pSamples.C() << ") Finish!" << std::endl;
- return true;
- }
-
- // Functions for loading models from memory mapped files
- template
- bool Index::LoadDataPoints(char* pDataPointsMemFile)
- {
- int R, C;
- R = *((int*)pDataPointsMemFile);
- pDataPointsMemFile += sizeof(int);
-
- C = *((int*)pDataPointsMemFile);
- pDataPointsMemFile += sizeof(int);
-
- m_pSamples.Initialize(R, C, (T*)pDataPointsMemFile);
-
- return true;
- }
-
- template
- bool Index::LoadBKT(std::string sBKTFilename)
- {
- std::cout << "Load BKT From " << sBKTFilename << std::endl;
- FILE *fp = fopen(sBKTFilename.c_str(), "rb");
- if (fp == NULL) return false;
- int realBKTNumber;
- fread(&realBKTNumber, sizeof(int), 1, fp);
- m_pBKTStart.resize(realBKTNumber);
- fread(m_pBKTStart.data(), sizeof(int), realBKTNumber, fp);
- if (realBKTNumber < m_iBKTNumber) m_iBKTNumber = realBKTNumber;
- int treeNodeSize;
- fread(&treeNodeSize, sizeof(int), 1, fp);
- m_pBKTRoots.resize(treeNodeSize);
- for (int i = 0; i < treeNodeSize; i++) {
- fread(&(m_pBKTRoots[i].centerid), sizeof(int), 1, fp);
- fread(&(m_pBKTRoots[i].childStart), sizeof(int), 1, fp);
- fread(&(m_pBKTRoots[i].childEnd), sizeof(int), 1, fp);
- }
- fclose(fp);
- std::cout << "Load BKT (" << m_iBKTNumber << ", " << treeNodeSize << ") Finish!" << std::endl;
- return true;
- }
-
- template
- bool Index::LoadBKT(char* pBKTMemFile)
- {
- int realBKTNumber = *((int*)pBKTMemFile);
- pBKTMemFile += sizeof(int);
- m_pBKTStart.resize(realBKTNumber);
- memcpy(m_pBKTStart.data(), pBKTMemFile, sizeof(int)*realBKTNumber);
- pBKTMemFile += sizeof(int)*realBKTNumber;
- if (realBKTNumber < m_iBKTNumber) m_iBKTNumber = realBKTNumber;
+#pragma region K-NN search
- int treeNodeSize = *((int*)pBKTMemFile);
- pBKTMemFile += sizeof(int);
- m_pBKTRoots.resize(treeNodeSize);
- for (int i = 0; i < treeNodeSize; i++) {
- m_pBKTRoots[i].centerid = *((int*)pBKTMemFile);
- pBKTMemFile += sizeof(int);
- m_pBKTRoots[i].childStart = *((int*)pBKTMemFile);
- pBKTMemFile += sizeof(int);
- m_pBKTRoots[i].childEnd = *((int*)pBKTMemFile);
- pBKTMemFile += sizeof(int);
- }
- return true;
- }
+#define Search(CheckDeleted1) \
+ m_pTrees.InitSearchTrees(this, p_query, p_space); \
+ const int checkPos = m_pGraph.m_iNeighborhoodSize - 1; \
+ while (!p_space.m_SPTQueue.empty()) { \
+ m_pTrees.SearchTrees(this, p_query, p_space, m_iNumberOfOtherDynamicPivots + p_space.m_iNumberOfCheckedLeaves); \
+ while (!p_space.m_NGQueue.empty()) { \
+ COMMON::HeapCell gnode = p_space.m_NGQueue.pop(); \
+ const int *node = m_pGraph[gnode.node]; \
+ _mm_prefetch((const char *)node, _MM_HINT_T0); \
+ CheckDeleted1 { \
+ if (p_query.AddPoint(gnode.node, gnode.distance)) { \
+ p_space.m_iNumOfContinuousNoBetterPropagation = 0; \
+ int checkNode = node[checkPos]; \
+ if (checkNode < -1) { \
+ const COMMON::BKTNode& tnode = m_pTrees[-2 - checkNode]; \
+ for (int i = -tnode.childStart; i < tnode.childEnd; i++) { \
+ if (!p_query.AddPoint(m_pTrees[i].centerid, gnode.distance)) break; \
+ } \
+ } \
+ } \
+ else { \
+ p_space.m_iNumOfContinuousNoBetterPropagation++; \
+ if (p_space.m_iNumOfContinuousNoBetterPropagation > p_space.m_iContinuousLimit || p_space.m_iNumberOfCheckedLeaves > p_space.m_iMaxCheck) { \
+ p_query.SortResult(); return; \
+ } \
+ } \
+ } \
+ for (int i = 0; i <= checkPos; i++) { \
+ _mm_prefetch((const char *)(m_pSamples)[node[i]], _MM_HINT_T0); \
+ } \
+ for (int i = 0; i <= checkPos; i++) { \
+ int nn_index = node[i]; \
+ if (nn_index < 0) break; \
+ if (p_space.CheckAndSet(nn_index)) continue; \
+ float distance2leaf = m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[nn_index], GetFeatureDim()); \
+ p_space.m_iNumberOfCheckedLeaves++; \
+ p_space.m_NGQueue.insert(COMMON::HeapCell(nn_index, distance2leaf)); \
+ } \
+ if (p_space.m_NGQueue.Top().distance > p_space.m_SPTQueue.Top().distance) { \
+ break; \
+ } \
+ } \
+ } \
+ p_query.SortResult(); \
template
- bool Index::LoadGraph(std::string sGraphFilename)
+ void Index::SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const
{
- std::cout << "Load Graph From " << sGraphFilename << std::endl;
- FILE * fp = fopen(sGraphFilename.c_str(), "rb");
- if (fp == NULL) return false;
- fread(&m_iGraphSize, sizeof(int), 1, fp);
- int KNNinGraph;
- fread(&KNNinGraph, sizeof(int), 1, fp);
- if (KNNinGraph < m_iNeighborhoodSize) m_iNeighborhoodSize = KNNinGraph;
-
- m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize);
-
- std::vector unusedData(KNNinGraph);
- for (int i = 0; i < m_iGraphSize; i++)
- {
- fread((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp);
- if (m_iNeighborhoodSize < KNNinGraph)
- {
- fread(&unusedData[0], sizeof(int), KNNinGraph - m_iNeighborhoodSize, fp);
- }
- }
- fclose(fp);
- std::cout << "Load Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl;
- return true;
+ Search(if (p_deleted.find(gnode.node) == p_deleted.end()))
}
template
- bool Index::LoadGraph(char* pGraphMemFile) {
- m_iGraphSize = *((int*)pGraphMemFile);
- pGraphMemFile += sizeof(int);
-
- int KNNinGraph = *((int*)pGraphMemFile);
- pGraphMemFile += sizeof(int);
-
- // In the memory mapped file mode, we'll not accept NeighborhoodSize in graph file that's larger than expected size (m_iNeighborhoodSize)
- // as we don't want to make another copy to fit.
- if (KNNinGraph > m_iNeighborhoodSize) return false;
-
- if (KNNinGraph < m_iNeighborhoodSize) m_iNeighborhoodSize = KNNinGraph;
-
- m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize, (int*)pGraphMemFile);
-
- return true;
- }
-#pragma endregion
-
-#pragma region K-NN search
-
- template
- void Index::SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const
+ void Index::SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const
{
- for (char i = 0; i < m_iBKTNumber; i++) {
- const BKTNode& node = m_pBKTRoots[m_pBKTStart[i]];
- if (node.childStart < 0) {
- p_space.m_SPTQueue.insert(COMMON::HeapCell(m_pBKTStart[i], m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[node.centerid], m_iDataDimension)));
- }
- else {
- for (int begin = node.childStart; begin < node.childEnd; begin++) {
- int index = m_pBKTRoots[begin].centerid;
- p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[index], m_iDataDimension)));
- }
- }
- }
- int checkLimit = g_iNumberOfInitialDynamicPivots;
- const int checkPos = m_iNeighborhoodSize - 1;
- while (!p_space.m_SPTQueue.empty()) {
- do
- {
- COMMON::HeapCell bcell = p_space.m_SPTQueue.pop();
- const BKTNode& tnode = m_pBKTRoots[bcell.node];
-
- if (tnode.childStart < 0) {
- if (!p_space.CheckAndSet(tnode.centerid)) {
- p_space.m_iNumberOfCheckedLeaves++;
- p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance));
- }
- if (p_space.m_iNumberOfCheckedLeaves >= checkLimit) break;
- }
- else {
- if (!p_space.CheckAndSet(tnode.centerid)) {
- p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance));
- }
- for (int begin = tnode.childStart; begin < tnode.childEnd; begin++) {
- int index = m_pBKTRoots[begin].centerid;
- p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[index], m_iDataDimension)));
- }
- }
- } while (!p_space.m_SPTQueue.empty());
- while (!p_space.m_NGQueue.empty()) {
- COMMON::HeapCell gnode = p_space.m_NGQueue.pop();
- const int *node = (m_pNeighborhoodGraph)[gnode.node];
- _mm_prefetch((const char *)node, _MM_HINT_T0);
- if (p_deleted.find(gnode.node) == p_deleted.end()) {
- if (p_query.AddPoint(gnode.node, gnode.distance)) {
- p_space.m_iNumOfContinuousNoBetterPropagation = 0;
-
- int checkNode = node[checkPos];
- if (checkNode < -1) {
- const BKTNode& tnode = m_pBKTRoots[-2 - checkNode];
- for (int i = -tnode.childStart; i < tnode.childEnd; i++) {
- if (p_deleted.find(m_pBKTRoots[i].centerid) == p_deleted.end()) {
- if (!p_query.AddPoint(m_pBKTRoots[i].centerid, gnode.distance)) break;
- }
- }
- }
- }
- else {
- p_space.m_iNumOfContinuousNoBetterPropagation++;
- if (p_space.m_iNumOfContinuousNoBetterPropagation > p_space.m_iContinuousLimit || p_space.m_iNumberOfCheckedLeaves > p_space.m_iMaxCheck) {
- p_query.SortResult(); return;
- }
- }
- }
-
-#ifdef PREFETCH
- for (int i = 0; i <= checkPos; i++) {
- _mm_prefetch((const char *)(m_pSamples)[node[i]], _MM_HINT_T0);
- }
-#endif
-
- for (int i = 0; i <= checkPos; i++)
- {
- int nn_index = node[i];
-
- // do not check it if it has been checked
- if (nn_index < 0) break;
- if (p_space.CheckAndSet(nn_index)) continue;
-
- // count the number of the computed nodes
- float distance2leaf = m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[nn_index], m_iDataDimension);
- p_space.m_iNumberOfCheckedLeaves++;
- p_space.m_NGQueue.insert(COMMON::HeapCell(nn_index, distance2leaf));
- }
- if (p_space.m_NGQueue.Top().distance > p_space.m_SPTQueue.Top().distance) {
- checkLimit = g_iNumberOfOtherDynamicPivots + p_space.m_iNumberOfCheckedLeaves;
- break;
- }
- }
- }
- p_query.SortResult();
+ Search(;)
}
template
@@ -324,633 +105,48 @@ namespace SPTAG
auto workSpace = m_workSpacePool->Rent();
workSpace->Reset(m_iMaxCheck);
- SearchIndex(*((COMMON::QueryResultSet*)&p_query), *workSpace, m_deletedID);
+ if (m_deletedID.size() > 0)
+ SearchIndexWithDeleted(*((COMMON::QueryResultSet*)&p_query), *workSpace, m_deletedID);
+ else
+ SearchIndexWithoutDeleted(*((COMMON::QueryResultSet*)&p_query), *workSpace);
+
m_workSpacePool->Return(workSpace);
if (p_query.WithMeta() && nullptr != m_pMetadata)
{
for (int i = 0; i < p_query.GetResultNum(); ++i)
{
- for (int i = 0; i < p_query.GetResultNum(); ++i)
- {
- int result = p_query.GetResult(i)->VID;
- p_query.SetMetadata(i, (result < 0) ? ByteArray::c_empty : m_pMetadata->GetMetadata(result));
- }
+ int result = p_query.GetResult(i)->VID;
+ p_query.SetMetadata(i, (result < 0) ? ByteArray::c_empty : m_pMetadata->GetMetadata(result));
}
}
-
return ErrorCode::Success;
}
#pragma endregion
-#pragma region Build/Save kd-tree & neighborhood graphs
template
ErrorCode Index::BuildIndex(const void* p_data, int p_vectorNum, int p_dimension)
{
- m_pSamples.Initialize(p_vectorNum, p_dimension);
- std::memcpy(m_pSamples.GetData(), p_data, p_vectorNum * p_dimension * sizeof(T));
- m_iDataSize = m_pSamples.R();
- m_iDataDimension = m_pSamples.C();
- m_dataUpdateLock.resize(m_iDataSize);
+ omp_set_num_threads(m_iNumberOfThreads);
+
+ m_pSamples.Initialize(p_vectorNum, p_dimension, (T*)p_data, false);
if (DistCalcMethod::Cosine == m_iDistCalcMethod)
{
int base = COMMON::Utils::GetBase();
- for (int i = 0; i < m_iDataSize; i++) {
- COMMON::Utils::Normalize(m_pSamples[i], m_iDataDimension, base);
+#pragma omp parallel for
+ for (int i = 0; i < GetNumSamples(); i++) {
+ COMMON::Utils::Normalize(m_pSamples[i], GetFeatureDim(), base);
}
}
- std::vector indices(m_iDataSize);
- for (int i = 0; i < m_iDataSize; i++) indices[i] = i;
- BuildBKT(indices, m_pBKTStart, m_pBKTRoots);
- BuildRNG();
m_workSpacePool.reset(new COMMON::WorkSpacePool(m_iMaxCheck, GetNumSamples()));
m_workSpacePool->Init(m_iNumberOfThreads);
- return ErrorCode::Success;
- }
-
-#pragma region Build/Save kd-tree
- template
- bool Index::SaveBKT(std::string sBKTFilename, std::vector& newStart, std::vector& newRoot) const
- {
- std::cout << "Save BKT to " << sBKTFilename << std::endl;
- FILE *fp = fopen(sBKTFilename.c_str(), "wb");
- if(fp == NULL) return false;
- fwrite(&m_iBKTNumber, sizeof(int), 1, fp);
- fwrite(newStart.data(), sizeof(int), m_iBKTNumber, fp);
- int treeNodeSize = (int)newRoot.size();
- fwrite(&treeNodeSize, sizeof(int), 1, fp);
- for (int i = 0; i < treeNodeSize; i++) {
- fwrite(&(newRoot[i].centerid), sizeof(int), 1, fp);
- fwrite(&(newRoot[i].childStart), sizeof(int), 1, fp);
- fwrite(&(newRoot[i].childEnd), sizeof(int), 1, fp);
- }
- fclose(fp);
- std::cout << "Save BKT Finish!" << std::endl;
- return true;
- }
-
- template
- void Index::BuildBKT(std::vector& indices, std::vector& newStart, std::vector& newRoot)
- {
- omp_set_num_threads(m_iNumberOfThreads);
- struct BKTStackItem {
- int index, first, last;
- BKTStackItem(int index_, int first_, int last_) : index(index_), first(first_), last(last_) {}
- };
- std::stack ss;
-
- KmeansArgs args(m_iBKTKmeansK, m_iDataDimension, (int)indices.size(), m_iNumberOfThreads);
- m_pSampleToCenter.clear();
-
- for (char i = 0; i < m_iBKTNumber; i++)
- {
- std::random_shuffle(indices.begin(), indices.end());
-
- newStart.push_back((int)newRoot.size());
- newRoot.push_back(BKTNode((int)indices.size()));
- std::cout << "Start to build tree " << i + 1 << std::endl;
-
- ss.push(BKTStackItem(newStart[i], 0, (int)indices.size()));
- while (!ss.empty()) {
- BKTStackItem item = ss.top(); ss.pop();
- int newBKTid = (int)newRoot.size();
- newRoot[item.index].childStart = newBKTid;
- if (item.last - item.first <= m_iBKTLeafSize) {
- for (int j = item.first; j < item.last; j++) {
- newRoot.push_back(BKTNode(indices[j]));
- }
- }
- else { // clustering the data into BKTKmeansK clusters
- int numClusters = KmeansClustering(indices, item.first, item.last, args);
- if (numClusters <= 1) {
- int end = min(item.last + 1, (int)indices.size());
- std::sort(indices.begin() + item.first, indices.begin() + end);
- newRoot[item.index].centerid = indices[item.first];
- newRoot[item.index].childStart = -newRoot[item.index].childStart;
- for (int j = item.first + 1; j < end; j++) {
- newRoot.push_back(BKTNode(indices[j]));
- m_pSampleToCenter[indices[j]] = newRoot[item.index].centerid;
- }
- m_pSampleToCenter[-1 - newRoot[item.index].centerid] = item.index;
- }
- else {
- for (int k = 0; k < m_iBKTKmeansK; k++) {
- if (args.counts[k] == 0) continue;
- newRoot.push_back(BKTNode(indices[item.first + args.counts[k] - 1]));
- if (args.counts[k] > 1) ss.push(BKTStackItem(newBKTid++, item.first, item.first + args.counts[k] - 1));
- item.first += args.counts[k];
- }
- }
- }
- newRoot[item.index].childEnd = (int)newRoot.size();
- }
- std::cout << i + 1 << " trees built, " << newRoot.size() - newStart[i] << " " << indices.size() << std::endl;
- }
- }
-
- template
- float Index::KmeansAssign(std::vector& indices, const int first, const int last, KmeansArgs& args, bool updateCenters) {
- float currDist = 0;
- float lambda = (updateCenters) ? COMMON::Utils::GetBase() * COMMON::Utils::GetBase() / (100.0 * (last - first)) : 0;
- int subsize = (last - first - 1) / m_iNumberOfThreads + 1;
-
-#pragma omp parallel for
- for (int tid = 0; tid < m_iNumberOfThreads; tid++)
- {
- int istart = first + tid * subsize;
- int iend = min(first + (tid + 1) * subsize, last);
- int *inewCounts = args.newCounts + tid * m_iBKTKmeansK;
- float *inewCenters = args.newCenters + tid * m_iBKTKmeansK * m_iDataDimension;
- int * iclusterIdx = args.clusterIdx + tid * m_iBKTKmeansK;
- float * iclusterDist = args.clusterDist + tid * m_iBKTKmeansK;
- float idist = 0;
- for (int i = istart; i < iend; i++) {
- int clusterid = 0;
- float smallestDist = MaxDist;
- for (int k = 0; k < m_iBKTKmeansK; k++) {
- float dist = m_fComputeDistance(m_pSamples[indices[i]], args.centers + k*m_iDataDimension, m_iDataDimension) + lambda*args.counts[k];
- if (dist > -MaxDist && dist < smallestDist) {
- clusterid = k; smallestDist = dist;
- }
- }
- args.label[i] = clusterid;
- inewCounts[clusterid]++;
- idist += smallestDist;
- if (updateCenters) {
- for (int j = 0; j < m_iDataDimension; j++) inewCenters[clusterid*m_iDataDimension + j] += m_pSamples[indices[i]][j];
- if (smallestDist > iclusterDist[clusterid]) {
- iclusterDist[clusterid] = smallestDist;
- iclusterIdx[clusterid] = indices[i];
- }
- }
- else {
- if (smallestDist <= iclusterDist[clusterid]) {
- iclusterDist[clusterid] = smallestDist;
- iclusterIdx[clusterid] = indices[i];
- }
- }
- }
- COMMON::Utils::atomic_float_add(&currDist, idist);
- }
-
- for (int i = 1; i < m_iNumberOfThreads; i++) {
- for (int k = 0; k < m_iBKTKmeansK; k++)
- args.newCounts[k] += args.newCounts[i*m_iBKTKmeansK + k];
- }
-
- if (updateCenters) {
- for (int i = 1; i < m_iNumberOfThreads; i++) {
- float* currCenter = args.newCenters + i*m_iBKTKmeansK*m_iDataDimension;
- for (int j = 0; j < m_iBKTKmeansK * m_iDataDimension; j++) args.newCenters[j] += currCenter[j];
- }
-
- int maxcluster = 0;
- for (int k = 1; k < m_iBKTKmeansK; k++) if (args.newCounts[maxcluster] < args.newCounts[k]) maxcluster = k;
-
- int maxid = maxcluster;
- for (int tid = 1; tid < m_iNumberOfThreads; tid++) {
- if (args.clusterDist[maxid] < args.clusterDist[tid * m_iBKTKmeansK + maxcluster]) maxid = tid * m_iBKTKmeansK + maxcluster;
- }
- if (args.clusterIdx[maxid] < 0 || args.clusterIdx[maxid] >= m_iDataSize)
- std::cout << "first:" << first << " last:" << last << " maxcluster:" << maxcluster << "(" << args.newCounts[maxcluster] << ") Error maxid:" << maxid << " dist:" << args.clusterDist[maxid] << std::endl;
- maxid = args.clusterIdx[maxid];
-
- for (int k = 0; k < m_iBKTKmeansK; k++) {
- T* TCenter = args.newTCenters + k * m_iDataDimension;
- if (args.newCounts[k] == 0) {
- //int nextid = Utils::rand_int(last, first);
- //while (args.label[nextid] != maxcluster) nextid = Utils::rand_int(last, first);
- int nextid = maxid;
- std::memcpy(TCenter, m_pSamples[nextid], sizeof(T)*m_iDataDimension);
- }
- else {
- float* currCenters = args.newCenters + k * m_iDataDimension;
- for (int j = 0; j < m_iDataDimension; j++) currCenters[j] /= args.newCounts[k];
-
- if (m_iDistCalcMethod == DistCalcMethod::Cosine) {
- COMMON::Utils::Normalize(currCenters, m_iDataDimension, COMMON::Utils::GetBase());
- }
- for (int j = 0; j < m_iDataDimension; j++) TCenter[j] = (T)(currCenters[j]);
- }
- }
- }
- else {
- for (int i = 1; i < m_iNumberOfThreads; i++) {
- for (int k = 0; k < m_iBKTKmeansK; k++) {
- if (args.clusterIdx[i*m_iBKTKmeansK + k] != -1 && args.clusterDist[i*m_iBKTKmeansK + k] <= args.clusterDist[k]) {
- args.clusterDist[k] = args.clusterDist[i*m_iBKTKmeansK + k];
- args.clusterIdx[k] = args.clusterIdx[i*m_iBKTKmeansK + k];
- }
- }
- }
- }
- return currDist;
- }
-
- template
- int Index::KmeansClustering(std::vector& indices, const int first, const int last, KmeansArgs& args) {
- int iterLimit = 100;
-
- int batchEnd = min(first + m_iSamples, last);
- float currDiff, currDist, minClusterDist = MaxDist;
- for (int numKmeans = 0; numKmeans < 3; numKmeans++) {
- for (int k = 0; k < m_iBKTKmeansK; k++) {
- int randid = COMMON::Utils::rand_int(last, first);
- memcpy(args.centers + k*m_iDataDimension, m_pSamples[indices[randid]], sizeof(T)*m_iDataDimension);
- }
- args.ClearCounts();
- currDist = KmeansAssign(indices, first, batchEnd, args, false);
- if (currDist < minClusterDist) {
- minClusterDist = currDist;
- memcpy(args.newTCenters, args.centers, sizeof(T)*m_iBKTKmeansK*m_iDataDimension);
- memcpy(args.counts, args.newCounts, sizeof(int) * m_iBKTKmeansK);
- }
- }
-
- minClusterDist = MaxDist;
- int noImprovement = 0;
- for (int iter = 0; iter < iterLimit; iter++) {
- std::memcpy(args.centers, args.newTCenters, sizeof(T)*m_iBKTKmeansK*m_iDataDimension);
- std::random_shuffle(indices.begin() + first, indices.begin() + last);
-
- args.ClearCenters();
- args.ClearCounts();
- args.ClearDists(-MaxDist);
- currDist = KmeansAssign(indices, first, batchEnd, args, true);
- memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK);
+
+ m_pTrees.BuildTrees(this);
+ m_pGraph.BuildGraph(this, &(m_pTrees.GetSampleMap()));
- currDiff = 0;
- for (int k = 0; k < m_iBKTKmeansK; k++) {
- currDiff += m_fComputeDistance(args.centers + k*m_iDataDimension, args.newTCenters + k*m_iDataDimension, m_iDataDimension);
- }
-
- if (currDist < minClusterDist) {
- noImprovement = 0;
- minClusterDist = currDist;
- }
- else {
- noImprovement++;
- }
- if (currDiff < 1e-3 || noImprovement >= 5) break;
- }
-
- args.ClearCounts();
- args.ClearDists(MaxDist);
- currDist = KmeansAssign(indices, first, last, args, false);
- memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK);
-
- int numClusters = 0;
- for (int i = 0; i < m_iBKTKmeansK; i++) if (args.counts[i] > 0) numClusters++;
-
- if (numClusters <= 1) {
- //if (last - first > 1) std::cout << "large cluster:" << last - first << " dist:" << currDist << std::endl;
- return numClusters;
- }
- args.Shuffle(indices, first, last);
- return numClusters;
- }
-#pragma endregion
-
-#pragma region Build/Save neighborhood graph
- template
- bool Index::SaveRNG(std::string sGraphFilename) const
- {
- std::cout << "Save Graph To " << sGraphFilename << std::endl;
- FILE *fp = fopen(sGraphFilename.c_str(), "wb");
- if (fp == NULL) return false;
- fwrite(&m_iGraphSize, sizeof(int), 1, fp);
- fwrite(&m_iNeighborhoodSize, sizeof(int), 1, fp);
-
- for (int i = 0; i < m_iGraphSize; i++)
- {
- fwrite((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp);
- }
- fclose(fp);
- std::cout << "Save Graph Finish!" << std::endl;
- return true;
- }
-
- template
- void Index::PartitionByTptree(std::vector& indices,
- const int first,
- const int last,
- std::vector> & leaves)
- {
- if (last - first <= m_iTPTLeafSize)
- {
- leaves.push_back(std::make_pair(first, last));
- }
- else
- {
- std::vector Mean(m_iDataDimension, 0);
-
- int iIteration = 100;
- int end = min(first + m_iSamples, last);
- int count = end - first + 1;
- // calculate the mean of each dimension
- for (int j = first; j <= end; j++)
- {
- T* v = (m_pSamples)[indices[j]];
- for (int k = 0; k < m_iDataDimension; k++)
- {
- Mean[k] += v[k];
- }
- }
- for (int k = 0; k < m_iDataDimension; k++)
- {
- Mean[k] /= count;
- }
- std::vector