diff --git a/AnnService/CoreLibrary.vcxproj b/AnnService/CoreLibrary.vcxproj index 9844c709..a0e884b3 100644 --- a/AnnService/CoreLibrary.vcxproj +++ b/AnnService/CoreLibrary.vcxproj @@ -157,9 +157,14 @@ + + + + + diff --git a/AnnService/CoreLibrary.vcxproj.filters b/AnnService/CoreLibrary.vcxproj.filters index 7d27224d..c411e8ce 100644 --- a/AnnService/CoreLibrary.vcxproj.filters +++ b/AnnService/CoreLibrary.vcxproj.filters @@ -118,6 +118,18 @@ Header Files\Core\Common + + Header Files\Core\Common + + + Header Files\Core\Common + + + Header Files\Core\Common + + + Header Files\Core\Common + @@ -156,5 +168,11 @@ Source Files\Core\KDT + + Source Files\Core\Common + + + + \ No newline at end of file diff --git a/AnnService/inc/Core/BKT/Index.h b/AnnService/inc/Core/BKT/Index.h index c14aa815..2ead4acd 100644 --- a/AnnService/inc/Core/BKT/Index.h +++ b/AnnService/inc/Core/BKT/Index.h @@ -1,267 +1,109 @@ #ifndef _SPTAG_BKT_INDEX_H_ #define _SPTAG_BKT_INDEX_H_ -#include "../SearchQuery.h" -#include "../VectorIndex.h" #include "../Common.h" +#include "../VectorIndex.h" #include "../Common/CommonUtils.h" #include "../Common/DistanceUtils.h" #include "../Common/QueryResultSet.h" -#include "../Common/Heap.h" #include "../Common/Dataset.h" #include "../Common/WorkSpace.h" #include "../Common/WorkSpacePool.h" -#include "../Common/FineGrainedLock.h" -#include "../Common/DataUtils.h" +#include "../Common/RelativeNeighborhoodGraph.h" +#include "../Common/BKTree.h" +#include "inc/Helper/SimpleIniReader.h" +#include "inc/Helper/StringConvert.h" #include #include -#include #include namespace SPTAG { -namespace Helper -{ -class IniReader; -} - - -namespace BKT -{ - // node type for storing BKT - struct BKTNode + namespace Helper { - int centerid; - int childStart; - int childEnd; - - BKTNode(int cid = -1) : centerid(cid), childStart(-1), childEnd(-1) {} - }; - - template - struct KmeansArgs { - int _K; - int _D; - int _T; - T* centers; - int* counts; - float* newCenters; - int* newCounts; - char* label; - int* clusterIdx; - float* clusterDist; - T* newTCenters; - - KmeansArgs(int k, int dim, int datasize, int threadnum): _K(k), _D(dim), _T(threadnum) { - centers = new T[k * dim]; - counts = new int[k]; - newCenters = new float[threadnum * k * dim]; - newCounts = new int[threadnum * k]; - label = new char[datasize]; - clusterIdx = new int[threadnum * k]; - clusterDist = new float[threadnum * k]; - newTCenters = new T[k * dim]; - } - - ~KmeansArgs() { - delete[] centers; - delete[] counts; - delete[] newCenters; - delete[] newCounts; - delete[] label; - delete[] clusterIdx; - delete[] clusterDist; - delete[] newTCenters; - } - - inline void ClearCounts() { - memset(newCounts, 0, sizeof(int) * _T * _K); - } - - inline void ClearCenters() { - memset(newCenters, 0, sizeof(float) * _T * _K * _D); - } - - inline void ClearDists(float dist) { - for (int i = 0; i < _T * _K; i++) { - clusterIdx[i] = -1; - clusterDist[i] = dist; - } - } - - void Shuffle(std::vector& indices, int first, int last) { - int* pos = new int[_K]; - pos[0] = first; - for (int k = 1; k < _K; k++) pos[k] = pos[k - 1] + newCounts[k - 1]; - - for (int k = 0; k < _K; k++) { - if (newCounts[k] == 0) continue; - int i = pos[k]; - while (newCounts[k] > 0) { - int swapid = pos[(int)(label[i])] + newCounts[(int)(label[i])] - 1; - newCounts[(int)(label[i])]--; - std::swap(indices[i], indices[swapid]); - std::swap(label[i], label[swapid]); - } - while (indices[i] != clusterIdx[k]) i++; - std::swap(indices[i], indices[pos[k] + counts[k] - 1]); - } - delete[] pos; - } - }; + class IniReader; + } - template - class Index : public VectorIndex + namespace BKT { - private: - // Initial data points - int m_iDataSize; - int m_iDataDimension; - COMMON::Dataset m_pSamples; + template + class Index : public VectorIndex + { + private: + // data points + COMMON::Dataset m_pSamples; - // BKT structures. - int m_iBKTNumber; - std::vector m_pBKTStart; - std::vector m_pBKTRoots; + // BKT structures. + COMMON::BKTree m_pTrees; - // Graph structure - int m_iGraphSize; - int m_iNeighborhoodSize; - COMMON::Dataset m_pNeighborhoodGraph; + // Graph structure + COMMON::RelativeNeighborhoodGraph m_pGraph; - // Variables for building BKTs and TPTs - int m_iBKTKmeansK; - int m_iBKTLeafSize; - int m_iSamples; - int m_iTptreeNumber; - int m_iTPTLeafSize; - int m_numTopDimensionTpTreeSplit; + std::string m_sBKTFilename; + std::string m_sGraphFilename; + std::string m_sDataPointsFilename; - // Variables for building graph - int m_iRefineIter; - int m_iCEF; - int m_iMaxCheckForRefineGraph; - int m_iMaxCheck; - std::unordered_map m_pSampleToCenter; + std::mutex m_dataLock; // protect data and graph + tbb::concurrent_unordered_set m_deletedID; + std::unique_ptr m_workSpacePool; - // Load from files directly - std::string m_sBKTFilename; - std::string m_sGraphFilename; - std::string m_sDataPointsFilename; - - // Load from memory mapped files - char* m_pBKTMemoryFile; - char* m_pGraphMemoryFile; - char* m_pDataPointsMemoryFile; - - DistCalcMethod m_iDistCalcMethod; - float(*m_fComputeDistance)(const T* pX, const T* pY, int length); - - int m_iCacheSize; - int m_iDebugLoad; - - int g_iThresholdOfNumberOfContinuousNoBetterPropagation; - int g_iNumberOfInitialDynamicPivots; - int g_iNumberOfOtherDynamicPivots; - - int m_iNumberOfThreads; - std::mutex m_dataAllocLock; - COMMON::FineGrainedLock m_dataUpdateLock; - tbb::concurrent_unordered_set m_deletedID; - std::unique_ptr m_workSpacePool; - public: - Index() : m_iBKTNumber(1), - m_iBKTKmeansK(32), - m_iBKTLeafSize(8), - m_iSamples(1000), - m_iNeighborhoodSize(32), - m_iTptreeNumber(32), - m_iTPTLeafSize(2000), - m_numTopDimensionTpTreeSplit(5), - m_iRefineIter(0), - m_iCEF(1000), - m_iMaxCheckForRefineGraph(10000), - m_iMaxCheck(2048), - m_pBKTMemoryFile(NULL), - m_pGraphMemoryFile(NULL), - m_pDataPointsMemoryFile(NULL), - m_sBKTFilename("tree.bin"), - m_sGraphFilename("graph.bin"), - m_sDataPointsFilename("vectors.bin"), - m_iNumberOfThreads(1), - m_iDistCalcMethod(DistCalcMethod::Cosine), - m_fComputeDistance(COMMON::DistanceCalcSelector(DistCalcMethod::Cosine)), - m_iCacheSize(-1), - m_iDebugLoad(-1), - g_iThresholdOfNumberOfContinuousNoBetterPropagation(3), - g_iNumberOfInitialDynamicPivots(50), - g_iNumberOfOtherDynamicPivots(4) {} - - ~Index() { - m_pBKTRoots.clear(); - } - int GetNumSamples() const { return m_pSamples.R(); } - int GetFeatureDim() const { return m_pSamples.C(); } - int GetNumThreads() const { return m_iNumberOfThreads; } - int GetCurrMaxCheck() const { return m_iMaxCheck; } - - DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; } - IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::BKT; } - VectorValueType GetVectorValueType() const { return GetEnumValueType(); } - - ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension); + int m_iNumberOfThreads; + DistCalcMethod m_iDistCalcMethod; + float(*m_fComputeDistance)(const T* pX, const T* pY, int length); - ErrorCode LoadIndex(const std::string& p_folderPath); - ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs); - - ErrorCode SaveIndex(const std::string& p_folderPath); - - void SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const; - ErrorCode SearchIndex(QueryResult &p_query) const; - - ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension); - ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum); - ErrorCode RefineIndex(const std::string& p_folderPath); - ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2); + int m_iMaxCheck; + int m_iThresholdOfNumberOfContinuousNoBetterPropagation; + int m_iNumberOfInitialDynamicPivots; + int m_iNumberOfOtherDynamicPivots; + public: + Index() : + m_sBKTFilename("tree.bin"), + m_sGraphFilename("graph.bin"), + m_sDataPointsFilename("vectors.bin"), + m_iNumberOfThreads(1), + m_iDistCalcMethod(DistCalcMethod::Cosine), + m_fComputeDistance(COMMON::DistanceCalcSelector(DistCalcMethod::Cosine)), + m_iMaxCheck(2048), + m_iThresholdOfNumberOfContinuousNoBetterPropagation(3), + m_iNumberOfInitialDynamicPivots(50), + m_iNumberOfOtherDynamicPivots(4) {} + + ~Index() {} + + inline int GetNumSamples() const { return m_pSamples.R(); } + inline int GetFeatureDim() const { return m_pSamples.C(); } - ErrorCode SetParameter(const char* p_param, const char* p_value); - std::string GetParameter(const char* p_param) const; - - private: - // Functions for loading models from files - bool LoadDataPoints(std::string sDataPointsFileName); - bool LoadBKT(std::string sBKTFilename); - bool LoadGraph(std::string sGraphFilename); - - // Functions for loading models from memory mapped files - bool LoadDataPoints(char* pDataPointsMemFile); - bool LoadBKT(char* pBKTMemFile); - bool LoadGraph(char* pGraphMemFile); - - bool SaveDataPoints(std::string sDataPointsFileName); - - // Functions for building balanced kmeans tree - void BuildBKT(std::vector& indices, std::vector& newStart, std::vector& newRoot); - bool SaveBKT(std::string sBKTFilename, std::vector& newStart, std::vector& newRoot) const; - float KmeansAssign(std::vector& indices, const int first, const int last, KmeansArgs& args, bool updateCenters); - int KmeansClustering(std::vector& indices, const int first, const int last, KmeansArgs& args); - - // Functions for building Graph - void BuildRNG(); - bool SaveRNG(std::string sGraphFilename) const; - void PartitionByTptree(std::vector &indices, - const int first, - const int last, - std::vector> &leaves); - void RefineRNG(); - void RefineRNGNode(const int node, COMMON::WorkSpace &space, bool updateNeighbors); - void RebuildRNGNodeNeighbors(int* nodes, const BasicResult* queryResults, int numResults); - float GraphAccuracyEstimation(int NSample, bool rng); - }; -} // namespace BKT + inline int GetCurrMaxCheck() const { return m_iMaxCheck; } + inline int GetNumThreads() const { return m_iNumberOfThreads; } + inline DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; } + inline IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::BKT; } + inline VectorValueType GetVectorValueType() const { return GetEnumValueType(); } + + inline float ComputeDistance(const void* pX, const void* pY) const { return m_fComputeDistance((const T*)pX, (const T*)pY, m_pSamples.C()); } + inline const void* GetSample(const int idx) const { return (void*)m_pSamples[idx]; } + + ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension); + + ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs); + + ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout); + ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader); + ErrorCode SearchIndex(QueryResult &p_query) const; + ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension); + ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum); + + ErrorCode SetParameter(const char* p_param, const char* p_value); + std::string GetParameter(const char* p_param) const; + + private: + ErrorCode RefineIndex(const std::string& p_folderPath); + void SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const; + void SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const; + }; + } // namespace BKT } // namespace SPTAG #endif // _SPTAG_BKT_INDEX_H_ diff --git a/AnnService/inc/Core/BKT/ParameterDefinitionList.h b/AnnService/inc/Core/BKT/ParameterDefinitionList.h index 237b9da4..15c0cc01 100644 --- a/AnnService/inc/Core/BKT/ParameterDefinitionList.h +++ b/AnnService/inc/Core/BKT/ParameterDefinitionList.h @@ -5,25 +5,29 @@ DefineBKTParameter(m_sBKTFilename, std::string, std::string("tree.bin"), "TreeFi DefineBKTParameter(m_sGraphFilename, std::string, std::string("graph.bin"), "GraphFilePath") DefineBKTParameter(m_sDataPointsFilename, std::string, std::string("vectors.bin"), "VectorFilePath") -DefineBKTParameter(m_iBKTNumber, int, 1L, "BKTNumber") -DefineBKTParameter(m_iBKTKmeansK, int, 32L, "BKTKmeansK") -DefineBKTParameter(m_iNeighborhoodSize, int, 32L, "NeighborhoodSize") -DefineBKTParameter(m_iBKTLeafSize, int, 8L, "BKTLeafSize") -DefineBKTParameter(m_iSamples, int, 1000L, "Samples") -DefineBKTParameter(m_iTptreeNumber, int, 32L, "TpTreeNumber") -DefineBKTParameter(m_iTPTLeafSize, int, 2000L, "TPTLeafSize") -DefineBKTParameter(m_numTopDimensionTpTreeSplit, int, 5L, "NumTopDimensionTpTreeSplit") -DefineBKTParameter(m_iCEF, int, 1000L, "CEF") -DefineBKTParameter(m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph") -DefineBKTParameter(m_iMaxCheck, int, 8192L, "MaxCheck") -DefineBKTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads") +DefineBKTParameter(m_pTrees.m_iTreeNumber, int, 1L, "BKTNumber") +DefineBKTParameter(m_pTrees.m_iBKTKmeansK, int, 32L, "BKTKmeansK") +DefineBKTParameter(m_pTrees.m_iBKTLeafSize, int, 8L, "BKTLeafSize") +DefineBKTParameter(m_pTrees.m_iSamples, int, 1000L, "Samples") + + +DefineBKTParameter(m_pGraph.m_iTPTNumber, int, 32L, "TpTreeNumber") +DefineBKTParameter(m_pGraph.m_iTPTLeafSize, int, 2000L, "TPTLeafSize") +DefineBKTParameter(m_pGraph.m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTpTreeSplit") -DefineBKTParameter(g_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation") -DefineBKTParameter(g_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots") -DefineBKTParameter(g_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots") +DefineBKTParameter(m_pGraph.m_iNeighborhoodSize, int, 32L, "NeighborhoodSize") +DefineBKTParameter(m_pGraph.m_iNeighborhoodScale, int, 16L, "GraphNeighborhoodScale") +DefineBKTParameter(m_pGraph.m_iCEFScale, int, 4L, "GraphCEFScale") +DefineBKTParameter(m_pGraph.m_iRefineIter, int, 0L, "RefineIterations") +DefineBKTParameter(m_pGraph.m_iCEF, int, 1000L, "CEF") +DefineBKTParameter(m_pGraph.m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph") +DefineBKTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads") DefineBKTParameter(m_iDistCalcMethod, SPTAG::DistCalcMethod, SPTAG::DistCalcMethod::Cosine, "DistCalcMethod") -DefineBKTParameter(m_iRefineIter, int, 0L, "RefineIterations") -DefineBKTParameter(m_iDebugLoad, int, -1, "NumTrains") -DefineBKTParameter(m_iCacheSize, int, -1, "CacheSize") + +DefineBKTParameter(m_iMaxCheck, int, 8192L, "MaxCheck") +DefineBKTParameter(m_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation") +DefineBKTParameter(m_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots") +DefineBKTParameter(m_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots") + #endif diff --git a/AnnService/inc/Core/Common/BKTree.h b/AnnService/inc/Core/Common/BKTree.h new file mode 100644 index 00000000..70140621 --- /dev/null +++ b/AnnService/inc/Core/Common/BKTree.h @@ -0,0 +1,461 @@ +#ifndef _SPTAG_COMMON_BKTREE_H_ +#define _SPTAG_COMMON_BKTREE_H_ + +#include +#include +#include +#include + +#include "../VectorIndex.h" + +#include "CommonUtils.h" +#include "QueryResultSet.h" +#include "WorkSpace.h" + +#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. + +namespace SPTAG +{ + namespace COMMON + { + // node type for storing BKT + struct BKTNode + { + int centerid; + int childStart; + int childEnd; + + BKTNode(int cid = -1) : centerid(cid), childStart(-1), childEnd(-1) {} + }; + + template + struct KmeansArgs { + int _K; + int _D; + int _T; + T* centers; + int* counts; + float* newCenters; + int* newCounts; + char* label; + int* clusterIdx; + float* clusterDist; + T* newTCenters; + + KmeansArgs(int k, int dim, int datasize, int threadnum) : _K(k), _D(dim), _T(threadnum) { + centers = new T[k * dim]; + counts = new int[k]; + newCenters = new float[threadnum * k * dim]; + newCounts = new int[threadnum * k]; + label = new char[datasize]; + clusterIdx = new int[threadnum * k]; + clusterDist = new float[threadnum * k]; + newTCenters = new T[k * dim]; + } + + ~KmeansArgs() { + delete[] centers; + delete[] counts; + delete[] newCenters; + delete[] newCounts; + delete[] label; + delete[] clusterIdx; + delete[] clusterDist; + delete[] newTCenters; + } + + inline void ClearCounts() { + memset(newCounts, 0, sizeof(int) * _T * _K); + } + + inline void ClearCenters() { + memset(newCenters, 0, sizeof(float) * _T * _K * _D); + } + + inline void ClearDists(float dist) { + for (int i = 0; i < _T * _K; i++) { + clusterIdx[i] = -1; + clusterDist[i] = dist; + } + } + + void Shuffle(std::vector& indices, int first, int last) { + int* pos = new int[_K]; + pos[0] = first; + for (int k = 1; k < _K; k++) pos[k] = pos[k - 1] + newCounts[k - 1]; + + for (int k = 0; k < _K; k++) { + if (newCounts[k] == 0) continue; + int i = pos[k]; + while (newCounts[k] > 0) { + int swapid = pos[(int)(label[i])] + newCounts[(int)(label[i])] - 1; + newCounts[(int)(label[i])]--; + std::swap(indices[i], indices[swapid]); + std::swap(label[i], label[swapid]); + } + while (indices[i] != clusterIdx[k]) i++; + std::swap(indices[i], indices[pos[k] + counts[k] - 1]); + } + delete[] pos; + } + }; + + class BKTree + { + public: + BKTree(): m_iTreeNumber(1), m_iBKTKmeansK(32), m_iBKTLeafSize(8), m_iSamples(1000) {} + + BKTree(BKTree& other): m_iTreeNumber(other.m_iTreeNumber), + m_iBKTKmeansK(other.m_iBKTKmeansK), + m_iBKTLeafSize(other.m_iBKTLeafSize), + m_iSamples(other.m_iSamples) {} + ~BKTree() {} + + inline const BKTNode& operator[](int index) const { return m_pTreeRoots[index]; } + inline BKTNode& operator[](int index) { return m_pTreeRoots[index]; } + + inline int size() const { return (int)m_pTreeRoots.size(); } + + inline const std::unordered_map& GetSampleMap() const { return m_pSampleCenterMap; } + + template + void BuildTrees(VectorIndex* index, std::vector* indices = nullptr) + { + struct BKTStackItem { + int index, first, last; + BKTStackItem(int index_, int first_, int last_) : index(index_), first(first_), last(last_) {} + }; + std::stack ss; + + std::vector localindices; + if (indices == nullptr) { + localindices.resize(index->GetNumSamples()); + for (int i = 0; i < index->GetNumSamples(); i++) localindices[i] = i; + } + else { + localindices.assign(indices->begin(), indices->end()); + } + KmeansArgs args(m_iBKTKmeansK, index->GetFeatureDim(), (int)localindices.size(), omp_get_num_threads()); + + m_pSampleCenterMap.clear(); + for (char i = 0; i < m_iTreeNumber; i++) + { + std::random_shuffle(localindices.begin(), localindices.end()); + + m_pTreeStart.push_back((int)m_pTreeRoots.size()); + m_pTreeRoots.push_back(BKTNode((int)localindices.size())); + std::cout << "Start to build BKTree " << i + 1 << std::endl; + + ss.push(BKTStackItem(m_pTreeStart[i], 0, (int)localindices.size())); + while (!ss.empty()) { + BKTStackItem item = ss.top(); ss.pop(); + int newBKTid = (int)m_pTreeRoots.size(); + m_pTreeRoots[item.index].childStart = newBKTid; + if (item.last - item.first <= m_iBKTLeafSize) { + for (int j = item.first; j < item.last; j++) { + m_pTreeRoots.push_back(BKTNode(localindices[j])); + } + } + else { // clustering the data into BKTKmeansK clusters + int numClusters = KmeansClustering(index, localindices, item.first, item.last, args); + if (numClusters <= 1) { + int end = min(item.last + 1, (int)localindices.size()); + std::sort(localindices.begin() + item.first, localindices.begin() + end); + m_pTreeRoots[item.index].centerid = localindices[item.first]; + m_pTreeRoots[item.index].childStart = -m_pTreeRoots[item.index].childStart; + for (int j = item.first + 1; j < end; j++) { + m_pTreeRoots.push_back(BKTNode(localindices[j])); + m_pSampleCenterMap[localindices[j]] = m_pTreeRoots[item.index].centerid; + } + m_pSampleCenterMap[-1 - m_pTreeRoots[item.index].centerid] = item.index; + } + else { + for (int k = 0; k < m_iBKTKmeansK; k++) { + if (args.counts[k] == 0) continue; + m_pTreeRoots.push_back(BKTNode(localindices[item.first + args.counts[k] - 1])); + if (args.counts[k] > 1) ss.push(BKTStackItem(newBKTid++, item.first, item.first + args.counts[k] - 1)); + item.first += args.counts[k]; + } + } + } + m_pTreeRoots[item.index].childEnd = (int)m_pTreeRoots.size(); + } + std::cout << i + 1 << " BKTree built, " << m_pTreeRoots.size() - m_pTreeStart[i] << " " << localindices.size() << std::endl; + } + } + + bool SaveTrees(std::string sTreeFileName) const + { + std::cout << "Save BKT to " << sTreeFileName << std::endl; + FILE *fp = fopen(sTreeFileName.c_str(), "wb"); + if (fp == NULL) return false; + + fwrite(&m_iTreeNumber, sizeof(int), 1, fp); + fwrite(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp); + int treeNodeSize = (int)m_pTreeRoots.size(); + fwrite(&treeNodeSize, sizeof(int), 1, fp); + fwrite(m_pTreeRoots.data(), sizeof(BKTNode), treeNodeSize, fp); + fclose(fp); + std::cout << "Save BKT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl; + return true; + } + + bool LoadTrees(char* pBKTMemFile) + { + m_iTreeNumber = *((int*)pBKTMemFile); + pBKTMemFile += sizeof(int); + m_pTreeStart.resize(m_iTreeNumber); + memcpy(m_pTreeStart.data(), pBKTMemFile, sizeof(int) * m_iTreeNumber); + pBKTMemFile += sizeof(int)*m_iTreeNumber; + + int treeNodeSize = *((int*)pBKTMemFile); + pBKTMemFile += sizeof(int); + m_pTreeRoots.resize(treeNodeSize); + memcpy(m_pTreeRoots.data(), pBKTMemFile, sizeof(BKTNode) * treeNodeSize); + return true; + } + + bool LoadTrees(std::string sTreeFileName) + { + std::cout << "Load BKT From " << sTreeFileName << std::endl; + FILE *fp = fopen(sTreeFileName.c_str(), "rb"); + if (fp == NULL) return false; + + fread(&m_iTreeNumber, sizeof(int), 1, fp); + m_pTreeStart.resize(m_iTreeNumber); + fread(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp); + + int treeNodeSize; + fread(&treeNodeSize, sizeof(int), 1, fp); + m_pTreeRoots.resize(treeNodeSize); + fread(m_pTreeRoots.data(), sizeof(BKTNode), treeNodeSize, fp); + fclose(fp); + std::cout << "Load BKT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl; + return true; + } + + template + void InitSearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const + { + for (char i = 0; i < m_iTreeNumber; i++) { + const BKTNode& node = m_pTreeRoots[m_pTreeStart[i]]; + if (node.childStart < 0) { + p_space.m_SPTQueue.insert(COMMON::HeapCell(m_pTreeStart[i], p_index->ComputeDistance((const void*)p_query.GetTarget(), p_index->GetSample(node.centerid)))); + } + else { + for (int begin = node.childStart; begin < node.childEnd; begin++) { + int index = m_pTreeRoots[begin].centerid; + p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, p_index->ComputeDistance((const void*)p_query.GetTarget(), p_index->GetSample(index)))); + } + } + } + } + + template + void SearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, + COMMON::WorkSpace &p_space, const int p_limits) const + { + do + { + COMMON::HeapCell bcell = p_space.m_SPTQueue.pop(); + const BKTNode& tnode = m_pTreeRoots[bcell.node]; + if (tnode.childStart < 0) { + if (!p_space.CheckAndSet(tnode.centerid)) { + p_space.m_iNumberOfCheckedLeaves++; + p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance)); + } + if (p_space.m_iNumberOfCheckedLeaves >= p_limits) break; + } + else { + if (!p_space.CheckAndSet(tnode.centerid)) { + p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance)); + } + for (int begin = tnode.childStart; begin < tnode.childEnd; begin++) { + int index = m_pTreeRoots[begin].centerid; + p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, p_index->ComputeDistance((const void*)p_query.GetTarget(), p_index->GetSample(index)))); + } + } + } while (!p_space.m_SPTQueue.empty()); + } + + private: + + template + float KmeansAssign(VectorIndex* p_index, + std::vector& indices, + const int first, const int last, KmeansArgs& args, const bool updateCenters) const { + float currDist = 0; + int threads = omp_get_num_threads(); + float lambda = (updateCenters) ? COMMON::Utils::GetBase() * COMMON::Utils::GetBase() / (100.0f * (last - first)) : 0.0f; + int subsize = (last - first - 1) / threads + 1; + +#pragma omp parallel for + for (int tid = 0; tid < threads; tid++) + { + int istart = first + tid * subsize; + int iend = min(first + (tid + 1) * subsize, last); + int *inewCounts = args.newCounts + tid * m_iBKTKmeansK; + float *inewCenters = args.newCenters + tid * m_iBKTKmeansK * p_index->GetFeatureDim(); + int * iclusterIdx = args.clusterIdx + tid * m_iBKTKmeansK; + float * iclusterDist = args.clusterDist + tid * m_iBKTKmeansK; + float idist = 0; + for (int i = istart; i < iend; i++) { + int clusterid = 0; + float smallestDist = MaxDist; + for (int k = 0; k < m_iBKTKmeansK; k++) { + float dist = p_index->ComputeDistance(p_index->GetSample(indices[i]), (const void*)(args.centers + k*p_index->GetFeatureDim())) + lambda*args.counts[k]; + if (dist > -MaxDist && dist < smallestDist) { + clusterid = k; smallestDist = dist; + } + } + args.label[i] = clusterid; + inewCounts[clusterid]++; + idist += smallestDist; + if (updateCenters) { + const T* v = (const T*)p_index->GetSample(indices[i]); + float* center = inewCenters + clusterid*p_index->GetFeatureDim(); + for (int j = 0; j < p_index->GetFeatureDim(); j++) center[j] += v[j]; + if (smallestDist > iclusterDist[clusterid]) { + iclusterDist[clusterid] = smallestDist; + iclusterIdx[clusterid] = indices[i]; + } + } + else { + if (smallestDist <= iclusterDist[clusterid]) { + iclusterDist[clusterid] = smallestDist; + iclusterIdx[clusterid] = indices[i]; + } + } + } + COMMON::Utils::atomic_float_add(&currDist, idist); + } + + for (int i = 1; i < threads; i++) { + for (int k = 0; k < m_iBKTKmeansK; k++) + args.newCounts[k] += args.newCounts[i*m_iBKTKmeansK + k]; + } + + if (updateCenters) { + for (int i = 1; i < threads; i++) { + float* currCenter = args.newCenters + i*m_iBKTKmeansK*p_index->GetFeatureDim(); + for (int j = 0; j < m_iBKTKmeansK * p_index->GetFeatureDim(); j++) args.newCenters[j] += currCenter[j]; + } + + int maxcluster = 0; + for (int k = 1; k < m_iBKTKmeansK; k++) if (args.newCounts[maxcluster] < args.newCounts[k]) maxcluster = k; + + int maxid = maxcluster; + for (int tid = 1; tid < threads; tid++) { + if (args.clusterDist[maxid] < args.clusterDist[tid * m_iBKTKmeansK + maxcluster]) maxid = tid * m_iBKTKmeansK + maxcluster; + } + if (args.clusterIdx[maxid] < 0 || args.clusterIdx[maxid] >= p_index->GetNumSamples()) + std::cout << "first:" << first << " last:" << last << " maxcluster:" << maxcluster << "(" << args.newCounts[maxcluster] << ") Error maxid:" << maxid << " dist:" << args.clusterDist[maxid] << std::endl; + maxid = args.clusterIdx[maxid]; + + for (int k = 0; k < m_iBKTKmeansK; k++) { + T* TCenter = args.newTCenters + k * p_index->GetFeatureDim(); + if (args.newCounts[k] == 0) { + //int nextid = Utils::rand_int(last, first); + //while (args.label[nextid] != maxcluster) nextid = Utils::rand_int(last, first); + int nextid = maxid; + std::memcpy(TCenter, p_index->GetSample(nextid), sizeof(T)*p_index->GetFeatureDim()); + } + else { + float* currCenters = args.newCenters + k * p_index->GetFeatureDim(); + for (int j = 0; j < p_index->GetFeatureDim(); j++) currCenters[j] /= args.newCounts[k]; + + if (p_index->GetDistCalcMethod() == DistCalcMethod::Cosine) { + COMMON::Utils::Normalize(currCenters, p_index->GetFeatureDim(), COMMON::Utils::GetBase()); + } + for (int j = 0; j < p_index->GetFeatureDim(); j++) TCenter[j] = (T)(currCenters[j]); + } + } + } + else { + for (int i = 1; i < threads; i++) { + for (int k = 0; k < m_iBKTKmeansK; k++) { + if (args.clusterIdx[i*m_iBKTKmeansK + k] != -1 && args.clusterDist[i*m_iBKTKmeansK + k] <= args.clusterDist[k]) { + args.clusterDist[k] = args.clusterDist[i*m_iBKTKmeansK + k]; + args.clusterIdx[k] = args.clusterIdx[i*m_iBKTKmeansK + k]; + } + } + } + } + return currDist; + } + + template + int KmeansClustering(VectorIndex* p_index, + std::vector& indices, const int first, const int last, KmeansArgs& args) const { + int iterLimit = 100; + + int batchEnd = min(first + m_iSamples, last); + float currDiff, currDist, minClusterDist = MaxDist; + for (int numKmeans = 0; numKmeans < 3; numKmeans++) { + for (int k = 0; k < m_iBKTKmeansK; k++) { + int randid = COMMON::Utils::rand_int(last, first); + std::memcpy(args.centers + k*p_index->GetFeatureDim(), p_index->GetSample(indices[randid]), sizeof(T)*p_index->GetFeatureDim()); + } + args.ClearCounts(); + currDist = KmeansAssign(p_index, indices, first, batchEnd, args, false); + if (currDist < minClusterDist) { + minClusterDist = currDist; + memcpy(args.newTCenters, args.centers, sizeof(T)*m_iBKTKmeansK*p_index->GetFeatureDim()); + memcpy(args.counts, args.newCounts, sizeof(int) * m_iBKTKmeansK); + } + } + + minClusterDist = MaxDist; + int noImprovement = 0; + for (int iter = 0; iter < iterLimit; iter++) { + std::memcpy(args.centers, args.newTCenters, sizeof(T)*m_iBKTKmeansK*p_index->GetFeatureDim()); + std::random_shuffle(indices.begin() + first, indices.begin() + last); + + args.ClearCenters(); + args.ClearCounts(); + args.ClearDists(-MaxDist); + currDist = KmeansAssign(p_index, indices, first, batchEnd, args, true); + memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK); + + currDiff = 0; + for (int k = 0; k < m_iBKTKmeansK; k++) { + currDiff += p_index->ComputeDistance((const void*)(args.centers + k*p_index->GetFeatureDim()), (const void*)(args.newTCenters + k*p_index->GetFeatureDim())); + } + + if (currDist < minClusterDist) { + noImprovement = 0; + minClusterDist = currDist; + } + else { + noImprovement++; + } + if (currDiff < 1e-3 || noImprovement >= 5) break; + } + + args.ClearCounts(); + args.ClearDists(MaxDist); + currDist = KmeansAssign(p_index, indices, first, last, args, false); + memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK); + + int numClusters = 0; + for (int i = 0; i < m_iBKTKmeansK; i++) if (args.counts[i] > 0) numClusters++; + + if (numClusters <= 1) { + //if (last - first > 1) std::cout << "large cluster:" << last - first << " dist:" << currDist << std::endl; + return numClusters; + } + args.Shuffle(indices, first, last); + return numClusters; + } + + private: + std::vector m_pTreeStart; + std::vector m_pTreeRoots; + std::unordered_map m_pSampleCenterMap; + + public: + int m_iTreeNumber, m_iBKTKmeansK, m_iBKTLeafSize, m_iSamples; + }; + } +} +#endif diff --git a/AnnService/inc/Core/Common/Dataset.h b/AnnService/inc/Core/Common/Dataset.h index 4753b088..fd7817f8 100644 --- a/AnnService/inc/Core/Common/Dataset.h +++ b/AnnService/inc/Core/Common/Dataset.h @@ -3,11 +3,19 @@ #include +#if defined(_MSC_VER) || defined(__INTEL_COMPILER) +#include +#else +#include +#endif // defined(__GNUC__) + #define ALIGN 32 #define aligned_malloc(a, b) _mm_malloc(a, b) #define aligned_free(a) _mm_free(a) +#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. + namespace SPTAG { namespace COMMON @@ -21,86 +29,158 @@ namespace SPTAG int cols; bool ownData = false; T* data = nullptr; - std::vector* dataIncremental = nullptr; + std::vector dataIncremental; public: - Dataset() {} - Dataset(int rows_, int cols_, T* data_ = nullptr) + Dataset(): rows(0), cols(1) {} + Dataset(int rows_, int cols_, T* data_ = nullptr, bool transferOnwership_ = true) { - Initialize(rows_, cols_, data_); + Initialize(rows_, cols_, data_, transferOnwership_); } ~Dataset() { if (ownData) aligned_free(data); - if (dataIncremental) { - dataIncremental->clear(); - delete dataIncremental; - } } - void Initialize(int rows_, int cols_, T* data_ = nullptr) + void Initialize(int rows_, int cols_, T* data_ = nullptr, bool transferOnwership_ = true) { rows = rows_; cols = cols_; data = data_; - if (data == nullptr) + if (data_ == nullptr || !transferOnwership_) { ownData = true; data = (T*)aligned_malloc(sizeof(T) * rows * cols, ALIGN); + if (data_ != nullptr) memcpy(data, data_, rows * cols * sizeof(T)); + else std::memset(data, -1, rows * cols * sizeof(T)); } - dataIncremental = new std::vector(); } void SetR(int R_) { if (R_ >= rows) - dataIncremental->resize((R_ - rows) * cols); + dataIncremental.resize((R_ - rows) * cols); else { rows = R_; - dataIncremental->clear(); + dataIncremental.clear(); } } - int R() const { return (int)(rows + dataIncremental->size() / cols); } - int C() const { return cols; } + inline int R() const { return (int)(rows + dataIncremental.size() / cols); } + inline int C() const { return cols; } T* operator[](int index) { if (index >= rows) { - return dataIncremental->data() + (size_t)(index - rows)*cols; + return dataIncremental.data() + (size_t)(index - rows)*cols; } return data + (size_t)index*cols; } + const T* operator[](int index) const { if (index >= rows) { - return dataIncremental->data() + (size_t)(index - rows)*cols; + return dataIncremental.data() + (size_t)(index - rows)*cols; } return data + (size_t)index*cols; } - T* GetData() + void AddBatch(const T* pData, int num) { - return data; + dataIncremental.insert(dataIncremental.end(), pData, pData + num*cols); } - void reset() + void AddBatch(int num) { - if (ownData) { - aligned_free(data); - ownData = false; + dataIncremental.insert(dataIncremental.end(), (size_t)num*cols, T(-1)); + } + + bool Save(std::string sDataPointsFileName) + { + std::cout << "Save Data To " << sDataPointsFileName << std::endl; + FILE * fp = fopen(sDataPointsFileName.c_str(), "wb"); + if (fp == NULL) return false; + + int CR = R(); + fwrite(&CR, sizeof(int), 1, fp); + fwrite(&cols, sizeof(int), 1, fp); + + T* ptr = data; + int toWrite = rows; + while (toWrite > 0) + { + size_t write = fwrite(ptr, sizeof(T) * cols, toWrite, fp); + ptr += write * cols; + toWrite -= (int)write; } - if (dataIncremental) { - dataIncremental->clear(); - delete dataIncremental; + ptr = dataIncremental.data(); + toWrite = CR - rows; + while (toWrite > 0) + { + size_t write = fwrite(ptr, sizeof(T) * cols, toWrite, fp); + ptr += write * cols; + toWrite -= (int)write; } + fclose(fp); + + std::cout << "Save Data (" << CR << ", " << cols << ") Finish!" << std::endl; + return true; } - - void AddBatch(const T* pData, int num) + + bool Load(std::string sDataPointsFileName) { - dataIncremental->insert(dataIncremental->end(), pData, pData + num*cols); + std::cout << "Load Data From " << sDataPointsFileName << std::endl; + FILE * fp = fopen(sDataPointsFileName.c_str(), "rb"); + if (fp == NULL) return false; + + int R, C; + fread(&R, sizeof(int), 1, fp); + fread(&C, sizeof(int), 1, fp); + + Initialize(R, C); + T* ptr = data; + while (R > 0) { + size_t read = fread(ptr, sizeof(T) * C, R, fp); + ptr += read * C; + R -= (int)read; + } + fclose(fp); + std::cout << "Load Data (" << rows << ", " << cols << ") Finish!" << std::endl; + return true; } - void AddBatch(int num) + // Functions for loading models from memory mapped files + bool Load(char* pDataPointsMemFile) { - dataIncremental->insert(dataIncremental->end(), (size_t)num*cols, T(-1)); + int R, C; + R = *((int*)pDataPointsMemFile); + pDataPointsMemFile += sizeof(int); + + C = *((int*)pDataPointsMemFile); + pDataPointsMemFile += sizeof(int); + + Initialize(R, C, (T*)pDataPointsMemFile); + return true; + } + + bool Refine(const std::vector& indices, std::string sDataPointsFileName) + { + std::cout << "Save Refine Data To " << sDataPointsFileName << std::endl; + FILE * fp = fopen(sDataPointsFileName.c_str(), "wb"); + if (fp == NULL) return false; + + int R = (int)(indices.size()); + fwrite(&R, sizeof(int), 1, fp); + fwrite(&cols, sizeof(int), 1, fp); + + // write point one by one in case for cache miss + for (int i = 0; i < R; i++) { + if (indices[i] < rows) + fwrite(data + (size_t)indices[i] * cols, sizeof(T) * cols, 1, fp); + else + fwrite(dataIncremental.data() + (size_t)(indices[i] - rows) * cols, sizeof(T) * cols, 1, fp); + } + fclose(fp); + + std::cout << "Save Refine Data (" << R << ", " << cols << ") Finish!" << std::endl; + return true; } }; } diff --git a/AnnService/inc/Core/Common/KDTree.h b/AnnService/inc/Core/Common/KDTree.h new file mode 100644 index 00000000..ab2e1779 --- /dev/null +++ b/AnnService/inc/Core/Common/KDTree.h @@ -0,0 +1,327 @@ +#ifndef _SPTAG_COMMON_KDTREE_H_ +#define _SPTAG_COMMON_KDTREE_H_ + +#include +#include +#include + +#include "../VectorIndex.h" + +#include "CommonUtils.h" +#include "QueryResultSet.h" +#include "WorkSpace.h" + +#pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. + +namespace SPTAG +{ + namespace COMMON + { + // node type for storing KDT + struct KDTNode + { + int left; + int right; + short split_dim; + float split_value; + }; + + class KDTree + { + public: + KDTree() : m_iTreeNumber(2), m_numTopDimensionKDTSplit(5), m_iSamples(1000) {} + + KDTree(KDTree& other) : m_iTreeNumber(other.m_iTreeNumber), + m_numTopDimensionKDTSplit(other.m_numTopDimensionKDTSplit), + m_iSamples(other.m_iSamples) {} + ~KDTree() {} + + inline const KDTNode& operator[](int index) const { return m_pTreeRoots[index]; } + inline KDTNode& operator[](int index) { return m_pTreeRoots[index]; } + + inline int size() const { return (int)m_pTreeRoots.size(); } + + template + void BuildTrees(VectorIndex* p_index, std::vector* indices = nullptr) + { + std::vector localindices; + if (indices == nullptr) { + localindices.resize(p_index->GetNumSamples()); + for (int i = 0; i < p_index->GetNumSamples(); i++) localindices[i] = i; + } + else { + localindices.assign(indices->begin(), indices->end()); + } + + m_pTreeRoots.resize(m_iTreeNumber * localindices.size()); + m_pTreeStart.resize(m_iTreeNumber, 0); +#pragma omp parallel for + for (int i = 0; i < m_iTreeNumber; i++) + { + Sleep(i * 100); std::srand(clock()); + + std::vector pindices(localindices.begin(), localindices.end()); + std::random_shuffle(pindices.begin(), pindices.end()); + + m_pTreeStart[i] = i * (int)pindices.size(); + std::cout << "Start to build KDTree " << i + 1 << std::endl; + int iTreeSize = m_pTreeStart[i]; + DivideTree(p_index, pindices, 0, (int)pindices.size() - 1, m_pTreeStart[i], iTreeSize); + std::cout << i + 1 << " KDTree built, " << iTreeSize - m_pTreeStart[i] << " " << pindices.size() << std::endl; + } + } + + bool SaveTrees(std::string sTreeFileName) const + { + std::cout << "Save KDT to " << sTreeFileName << std::endl; + FILE *fp = fopen(sTreeFileName.c_str(), "wb"); + if (fp == NULL) return false; + + fwrite(&m_iTreeNumber, sizeof(int), 1, fp); + fwrite(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp); + int treeNodeSize = (int)m_pTreeRoots.size(); + fwrite(&treeNodeSize, sizeof(int), 1, fp); + fwrite(m_pTreeRoots.data(), sizeof(KDTNode), treeNodeSize, fp); + fclose(fp); + std::cout << "Save KDT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl; + return true; + } + + bool LoadTrees(char* pKDTMemFile) + { + m_iTreeNumber = *((int*)pKDTMemFile); + pKDTMemFile += sizeof(int); + m_pTreeStart.resize(m_iTreeNumber); + memcpy(m_pTreeStart.data(), pKDTMemFile, sizeof(int) * m_iTreeNumber); + pKDTMemFile += sizeof(int)*m_iTreeNumber; + + int treeNodeSize = *((int*)pKDTMemFile); + pKDTMemFile += sizeof(int); + m_pTreeRoots.resize(treeNodeSize); + memcpy(m_pTreeRoots.data(), pKDTMemFile, sizeof(KDTNode) * treeNodeSize); + return true; + } + + bool LoadTrees(std::string sTreeFileName) + { + std::cout << "Load KDT From " << sTreeFileName << std::endl; + FILE *fp = fopen(sTreeFileName.c_str(), "rb"); + if (fp == NULL) return false; + + fread(&m_iTreeNumber, sizeof(int), 1, fp); + m_pTreeStart.resize(m_iTreeNumber); + fread(m_pTreeStart.data(), sizeof(int), m_iTreeNumber, fp); + + int treeNodeSize; + fread(&treeNodeSize, sizeof(int), 1, fp); + m_pTreeRoots.resize(treeNodeSize); + fread(m_pTreeRoots.data(), sizeof(KDTNode), treeNodeSize, fp); + fclose(fp); + std::cout << "Load KDT (" << m_iTreeNumber << "," << treeNodeSize << ") Finish!" << std::endl; + return true; + } + + template + void InitSearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const int p_limits) const + { + for (char i = 0; i < m_iTreeNumber; i++) { + KDTSearch(p_index, p_query, p_space, m_pTreeStart[i], true, 0); + } + + while (!p_space.m_SPTQueue.empty() && p_space.m_iNumberOfCheckedLeaves < p_limits) + { + auto& tcell = p_space.m_SPTQueue.pop(); + if (p_query.worstDist() < tcell.distance) break; + KDTSearch(p_index, p_query, p_space, tcell.node, true, tcell.distance); + } + } + + template + void SearchTrees(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const int p_limits) const + { + while (!p_space.m_SPTQueue.empty() && p_space.m_iNumberOfCheckedLeaves < p_limits) + { + auto& tcell = p_space.m_SPTQueue.pop(); + KDTSearch(p_index, p_query, p_space, tcell.node, false, tcell.distance); + } + } + + private: + + template + void KDTSearch(const VectorIndex* p_index, const COMMON::QueryResultSet &p_query, + COMMON::WorkSpace& p_space, const int node, const bool isInit, const float distBound) const { + if (node < 0) + { + int index = -node - 1; + if (index >= p_index->GetNumSamples()) return; +#ifdef PREFETCH + const char* data = (const char *)(p_index->GetSample(index)); + _mm_prefetch(data, _MM_HINT_T0); + _mm_prefetch(data + 64, _MM_HINT_T0); +#endif + if (p_space.CheckAndSet(index)) return; + + ++p_space.m_iNumberOfTreeCheckedLeaves; + ++p_space.m_iNumberOfCheckedLeaves; + p_space.m_NGQueue.insert(COMMON::HeapCell(index, p_index->ComputeDistance((const void*)p_query.GetTarget(), (const void*)data))); + return; + } + + auto& tnode = m_pTreeRoots[node]; + + float diff = (p_query.GetTarget())[tnode.split_dim] - tnode.split_value; + float distanceBound = distBound + diff * diff; + int otherChild, bestChild; + if (diff < 0) + { + bestChild = tnode.left; + otherChild = tnode.right; + } + else + { + otherChild = tnode.left; + bestChild = tnode.right; + } + + if (!isInit || distanceBound < p_query.worstDist()) + { + p_space.m_SPTQueue.insert(COMMON::HeapCell(otherChild, distanceBound)); + } + KDTSearch(p_index, p_query, p_space, bestChild, isInit, distBound); + } + + + template + void DivideTree(VectorIndex* p_index, std::vector& indices, int first, int last, + int index, int &iTreeSize) { + ChooseDivision(p_index, m_pTreeRoots[index], indices, first, last); + int i = Subdivide(p_index, m_pTreeRoots[index], indices, first, last); + if (i - 1 <= first) + { + m_pTreeRoots[index].left = -indices[first] - 1; + } + else + { + iTreeSize++; + m_pTreeRoots[index].left = iTreeSize; + DivideTree(p_index, indices, first, i - 1, iTreeSize, iTreeSize); + } + if (last == i) + { + m_pTreeRoots[index].right = -indices[last] - 1; + } + else + { + iTreeSize++; + m_pTreeRoots[index].right = iTreeSize; + DivideTree(p_index, indices, i, last, iTreeSize, iTreeSize); + } + } + + template + void ChooseDivision(VectorIndex* p_index, KDTNode& node, const std::vector& indices, const int first, const int last) + { + std::vector meanValues(p_index->GetFeatureDim(), 0); + std::vector varianceValues(p_index->GetFeatureDim(), 0); + int end = min(first + m_iSamples, last); + int count = end - first + 1; + // calculate the mean of each dimension + for (int j = first; j <= end; j++) + { + const T* v = (const T*)p_index->GetSample(indices[j]); + for (int k = 0; k < p_index->GetFeatureDim(); k++) + { + meanValues[k] += v[k]; + } + } + for (int k = 0; k < p_index->GetFeatureDim(); k++) + { + meanValues[k] /= count; + } + // calculate the variance of each dimension + for (int j = first; j <= end; j++) + { + const T* v = (const T*)p_index->GetSample(indices[j]); + for (int k = 0; k < p_index->GetFeatureDim(); k++) + { + float dist = v[k] - meanValues[k]; + varianceValues[k] += dist*dist; + } + } + // choose the split dimension as one of the dimension inside TOP_DIM maximum variance + node.split_dim = SelectDivisionDimension(varianceValues); + // determine the threshold + node.split_value = meanValues[node.split_dim]; + } + + int SelectDivisionDimension(const std::vector& varianceValues) const + { + // Record the top maximum variances + std::vector topind(m_numTopDimensionKDTSplit); + int num = 0; + // order the variances + for (int i = 0; i < varianceValues.size(); i++) + { + if (num < m_numTopDimensionKDTSplit || varianceValues[i] > varianceValues[topind[num - 1]]) + { + if (num < m_numTopDimensionKDTSplit) + { + topind[num++] = i; + } + else + { + topind[num - 1] = i; + } + int j = num - 1; + // order the TOP_DIM variances + while (j > 0 && varianceValues[topind[j]] > varianceValues[topind[j - 1]]) + { + std::swap(topind[j], topind[j - 1]); + j--; + } + } + } + // randomly choose a dimension from TOP_DIM + return topind[COMMON::Utils::rand_int(num)]; + } + + template + int Subdivide(VectorIndex* p_index, const KDTNode& node, std::vector& indices, const int first, const int last) const + { + int i = first; + int j = last; + // decide which child one point belongs + while (i <= j) + { + int ind = indices[i]; + const T* v = (const T*)p_index->GetSample(ind); + float val = v[node.split_dim]; + if (val < node.split_value) + { + i++; + } + else + { + std::swap(indices[i], indices[j]); + j--; + } + } + // if all the points in the node are equal,equally split the node into 2 + if ((i == first) || (i == last + 1)) + { + i = (first + last + 1) / 2; + } + return i; + } + + private: + std::vector m_pTreeStart; + std::vector m_pTreeRoots; + + public: + int m_iTreeNumber, m_numTopDimensionKDTSplit, m_iSamples; + }; + } +} +#endif diff --git a/AnnService/inc/Core/Common/NeighborhoodGraph.h b/AnnService/inc/Core/Common/NeighborhoodGraph.h new file mode 100644 index 00000000..9e646b69 --- /dev/null +++ b/AnnService/inc/Core/Common/NeighborhoodGraph.h @@ -0,0 +1,408 @@ +#ifndef _SPTAG_COMMON_NG_H_ +#define _SPTAG_COMMON_NG_H_ + +#include "../VectorIndex.h" + +#include "CommonUtils.h" +#include "Dataset.h" +#include "FineGrainedLock.h" +#include "QueryResultSet.h" + +namespace SPTAG +{ + namespace COMMON + { + class NeighborhoodGraph + { + public: + NeighborhoodGraph(): m_iTPTNumber(32), + m_iTPTLeafSize(2000), + m_iSamples(1000), + m_numTopDimensionTPTSplit(5), + m_iNeighborhoodSize(32), + m_iNeighborhoodScale(16), + m_iCEFScale(4), + m_iRefineIter(0), + m_iCEF(1000), + m_iMaxCheckForRefineGraph(10000) {} + + ~NeighborhoodGraph() {} + + virtual void InsertNeighbors(VectorIndex* index, const int node, int insertNode, float insertDist) = 0; + + virtual void RebuildNeighbors(VectorIndex* index, const int node, int* nodes, const BasicResult* queryResults, const int numResults) = 0; + + virtual float GraphAccuracyEstimation(VectorIndex* index, const int samples, const std::unordered_map* idmap = nullptr) = 0; + + template + void BuildGraph(VectorIndex* index, const std::unordered_map* idmap = nullptr) + { + std::cout << "build RNG graph!" << std::endl; + + m_iGraphSize = index->GetNumSamples(); + m_iNeighborhoodSize = m_iNeighborhoodSize * m_iNeighborhoodScale; + m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize); + m_dataUpdateLock.resize(m_iGraphSize); + + if (m_iGraphSize < 1000) { + RefineGraph(index, idmap); + std::cout << "Build RNG Graph end!" << std::endl; + return; + } + + { + COMMON::Dataset NeighborhoodDists(m_iGraphSize, m_iNeighborhoodSize); + std::vector> TptreeDataIndices(m_iTPTNumber, std::vector(m_iGraphSize)); + std::vector>> TptreeLeafNodes(m_iTPTNumber, std::vector>()); + + for (int i = 0; i < m_iGraphSize; i++) + for (int j = 0; j < m_iNeighborhoodSize; j++) + (NeighborhoodDists)[i][j] = MaxDist; + + std::cout << "Parallel TpTree Partition begin " << std::endl; +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < m_iTPTNumber; i++) + { + Sleep(i * 100); std::srand(clock()); + for (int j = 0; j < m_iGraphSize; j++) TptreeDataIndices[i][j] = j; + std::random_shuffle(TptreeDataIndices[i].begin(), TptreeDataIndices[i].end()); + PartitionByTptree(index, TptreeDataIndices[i], 0, m_iGraphSize - 1, TptreeLeafNodes[i]); + std::cout << "Finish Getting Leaves for Tree " << i << std::endl; + } + std::cout << "Parallel TpTree Partition done" << std::endl; + + for (int i = 0; i < m_iTPTNumber; i++) + { +#pragma omp parallel for schedule(dynamic) + for (int j = 0; j < TptreeLeafNodes[i].size(); j++) + { + int start_index = TptreeLeafNodes[i][j].first; + int end_index = TptreeLeafNodes[i][j].second; + if (omp_get_thread_num() == 0) std::cout << "\rProcessing Tree " << i << ' ' << j * 100 / TptreeLeafNodes[i].size() << '%'; + for (int x = start_index; x < end_index; x++) + { + for (int y = x + 1; y <= end_index; y++) + { + int p1 = TptreeDataIndices[i][x]; + int p2 = TptreeDataIndices[i][y]; + float dist = index->ComputeDistance(index->GetSample(p1), index->GetSample(p2)); + if (idmap != nullptr) { + p1 = (idmap->find(p1) == idmap->end()) ? p1 : idmap->at(p1); + p2 = (idmap->find(p2) == idmap->end()) ? p2 : idmap->at(p2); + } + COMMON::Utils::AddNeighbor(p2, dist, (m_pNeighborhoodGraph)[p1], (NeighborhoodDists)[p1], m_iNeighborhoodSize); + COMMON::Utils::AddNeighbor(p1, dist, (m_pNeighborhoodGraph)[p2], (NeighborhoodDists)[p2], m_iNeighborhoodSize); + } + } + } + TptreeDataIndices[i].clear(); + TptreeLeafNodes[i].clear(); + std::cout << std::endl; + } + TptreeDataIndices.clear(); + TptreeLeafNodes.clear(); + } + + if (m_iMaxCheckForRefineGraph > 0) { + RefineGraph(index, idmap); + } + } + + template + void RefineGraph(VectorIndex* index, const std::unordered_map* idmap = nullptr) + { + m_iCEF *= m_iCEFScale; + m_iMaxCheckForRefineGraph *= m_iCEFScale; + +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < m_iGraphSize; i++) + { + RefineNode(index, i, false); + } + std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(index, 100, idmap) << std::endl; + + m_iCEF /= m_iCEFScale; + m_iMaxCheckForRefineGraph /= m_iCEFScale; + m_iNeighborhoodSize /= m_iNeighborhoodScale; + +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < m_iGraphSize; i++) + { + RefineNode(index, i, false); + } + std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(index, 100, idmap) << std::endl; + + if (idmap != nullptr) { + for (auto iter = idmap->begin(); iter != idmap->end(); iter++) + if (iter->first < 0) + { + m_pNeighborhoodGraph[-1 - iter->first][m_iNeighborhoodSize - 1] = -2 - iter->second; + } + } + } + + template + ErrorCode RefineGraph(VectorIndex* index, std::vector& indices, std::vector& reverseIndices, + std::string graphFileName, const std::unordered_map* idmap = nullptr) + { + int R = (int)indices.size(); + +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < R; i++) + { + RefineNode(index, indices[i], false); + int* nodes = m_pNeighborhoodGraph[indices[i]]; + for (int j = 0; j < m_iNeighborhoodSize; j++) + { + if (nodes[j] < 0) nodes[j] = -1; + else nodes[j] = reverseIndices[nodes[j]]; + } + if (idmap == nullptr || idmap->find(-1 - indices[i]) == idmap->end()) continue; + nodes[m_iNeighborhoodSize - 1] = -2 - idmap->at(-1 - indices[i]); + } + + std::ofstream graphOut(graphFileName, std::ios::binary); + if (!graphOut.is_open()) return ErrorCode::FailedCreateFile; + graphOut.write((char*)&R, sizeof(int)); + graphOut.write((char*)&m_iNeighborhoodSize, sizeof(int)); + for (int i = 0; i < R; i++) { + graphOut.write((char*)m_pNeighborhoodGraph[indices[i]], sizeof(int) * m_iNeighborhoodSize); + } + graphOut.close(); + return ErrorCode::Success; + } + + + template + void RefineNode(VectorIndex* index, const int node, bool updateNeighbors) + { + COMMON::QueryResultSet query((const T*)index->GetSample(node), m_iCEF + 1); + index->SearchIndex(query); + RebuildNeighbors(index, node, m_pNeighborhoodGraph[node], query.GetResults(), m_iCEF + 1); + + if (updateNeighbors) { + // update neighbors + for (int j = 0; j <= m_iCEF; j++) + { + BasicResult* item = query.GetResult(j); + if (item->VID < 0) break; + if (item->VID == node) continue; + + std::lock_guard lock(m_dataUpdateLock[item->VID]); + InsertNeighbors(index, item->VID, node, item->Dist); + } + } + } + + template + void PartitionByTptree(VectorIndex* index, std::vector& indices, const int first, const int last, + std::vector> & leaves) + { + if (last - first <= m_iTPTLeafSize) + { + leaves.push_back(std::make_pair(first, last)); + } + else + { + std::vector Mean(index->GetFeatureDim(), 0); + + int iIteration = 100; + int end = min(first + m_iSamples, last); + int count = end - first + 1; + // calculate the mean of each dimension + for (int j = first; j <= end; j++) + { + const T* v = (const T*)index->GetSample(indices[j]); + for (int k = 0; k < index->GetFeatureDim(); k++) + { + Mean[k] += v[k]; + } + } + for (int k = 0; k < index->GetFeatureDim(); k++) + { + Mean[k] /= count; + } + std::vector Variance; + Variance.reserve(index->GetFeatureDim()); + for (int j = 0; j < index->GetFeatureDim(); j++) + { + Variance.push_back(BasicResult(j, 0)); + } + // calculate the variance of each dimension + for (int j = first; j <= end; j++) + { + const T* v = (const T*)index->GetSample(indices[j]); + for (int k = 0; k < index->GetFeatureDim(); k++) + { + float dist = v[k] - Mean[k]; + Variance[k].Dist += dist*dist; + } + } + std::sort(Variance.begin(), Variance.end(), COMMON::Compare); + std::vector indexs(m_numTopDimensionTPTSplit); + std::vector weight(m_numTopDimensionTPTSplit), bestweight(m_numTopDimensionTPTSplit); + float bestvariance = Variance[index->GetFeatureDim() - 1].Dist; + for (int i = 0; i < m_numTopDimensionTPTSplit; i++) + { + indexs[i] = Variance[index->GetFeatureDim() - 1 - i].VID; + bestweight[i] = 0; + } + bestweight[0] = 1; + float bestmean = Mean[indexs[0]]; + + std::vector Val(count); + for (int i = 0; i < iIteration; i++) + { + float sumweight = 0; + for (int j = 0; j < m_numTopDimensionTPTSplit; j++) + { + weight[j] = float(rand() % 10000) / 5000.0f - 1.0f; + sumweight += weight[j] * weight[j]; + } + sumweight = sqrt(sumweight); + for (int j = 0; j < m_numTopDimensionTPTSplit; j++) + { + weight[j] /= sumweight; + } + float mean = 0; + for (int j = 0; j < count; j++) + { + Val[j] = 0; + const T* v = (const T*)index->GetSample(indices[first + j]); + for (int k = 0; k < m_numTopDimensionTPTSplit; k++) + { + Val[j] += weight[k] * v[indexs[k]]; + } + mean += Val[j]; + } + mean /= count; + float var = 0; + for (int j = 0; j < count; j++) + { + float dist = Val[j] - mean; + var += dist * dist; + } + if (var > bestvariance) + { + bestvariance = var; + bestmean = mean; + for (int j = 0; j < m_numTopDimensionTPTSplit; j++) + { + bestweight[j] = weight[j]; + } + } + } + int i = first; + int j = last; + // decide which child one point belongs + while (i <= j) + { + float val = 0; + const T* v = (const T*)index->GetSample(indices[i]); + for (int k = 0; k < m_numTopDimensionTPTSplit; k++) + { + val += bestweight[k] * v[indexs[k]]; + } + if (val < bestmean) + { + i++; + } + else + { + std::swap(indices[i], indices[j]); + j--; + } + } + // if all the points in the node are equal,equally split the node into 2 + if ((i == first) || (i == last + 1)) + { + i = (first + last + 1) / 2; + } + + Mean.clear(); + Variance.clear(); + Val.clear(); + indexs.clear(); + weight.clear(); + bestweight.clear(); + + PartitionByTptree(index, indices, first, i - 1, leaves); + PartitionByTptree(index, indices, i, last, leaves); + } + } + + bool LoadGraph(std::string sGraphFilename) + { + std::cout << "Load Graph From " << sGraphFilename << std::endl; + FILE * fp = fopen(sGraphFilename.c_str(), "rb"); + if (fp == NULL) return false; + + fread(&m_iGraphSize, sizeof(int), 1, fp); + fread(&m_iNeighborhoodSize, sizeof(int), 1, fp); + m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize); + m_dataUpdateLock.resize(m_iGraphSize); + + for (int i = 0; i < m_iGraphSize; i++) + { + fread((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp); + } + fclose(fp); + std::cout << "Load Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl; + return true; + } + + bool SetGraph(char* pGraphMemFile) + { + m_iGraphSize = *((int*)pGraphMemFile); + pGraphMemFile += sizeof(int); + + m_iNeighborhoodSize = *((int*)pGraphMemFile); + pGraphMemFile += sizeof(int); + + m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize, (int*)pGraphMemFile); + m_dataUpdateLock.resize(m_iGraphSize); + return true; + } + + bool SaveGraph(std::string sGraphFilename) const + { + std::cout << "Save Graph To " << sGraphFilename << std::endl; + FILE *fp = fopen(sGraphFilename.c_str(), "wb"); + if (fp == NULL) return false; + + fwrite(&m_iGraphSize, sizeof(int), 1, fp); + fwrite(&m_iNeighborhoodSize, sizeof(int), 1, fp); + for (int i = 0; i < m_iGraphSize; i++) + { + fwrite((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp); + } + fclose(fp); + std::cout << "Save Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl; + return true; + } + + inline void AddBatch(int num) { m_pNeighborhoodGraph.AddBatch(num); m_iGraphSize += num; m_dataUpdateLock.resize(m_iGraphSize); } + + inline int* operator[](int index) { return m_pNeighborhoodGraph[index]; } + + inline const int* operator[](int index) const { return m_pNeighborhoodGraph[index]; } + + inline void SetR(int rows) { m_pNeighborhoodGraph.SetR(rows); m_iGraphSize = rows; m_dataUpdateLock.resize(m_iGraphSize); } + + inline int R() const { return m_iGraphSize; } + + static std::shared_ptr CreateInstance(std::string type); + + protected: + // Graph structure + int m_iGraphSize; + COMMON::Dataset m_pNeighborhoodGraph; + COMMON::FineGrainedLock m_dataUpdateLock; // protect one row of the graph + + public: + int m_iTPTNumber, m_iTPTLeafSize, m_iSamples, m_numTopDimensionTPTSplit; + int m_iNeighborhoodSize, m_iNeighborhoodScale, m_iCEFScale, m_iRefineIter, m_iCEF, m_iMaxCheckForRefineGraph; + }; + } +} +#endif diff --git a/AnnService/inc/Core/Common/QueryResultSet.h b/AnnService/inc/Core/Common/QueryResultSet.h index 33dcf5c7..f410b29d 100644 --- a/AnnService/inc/Core/Common/QueryResultSet.h +++ b/AnnService/inc/Core/Common/QueryResultSet.h @@ -38,7 +38,7 @@ class QueryResultSet : public QueryResult m_target = p_target; } - inline const T* GetTarget() + inline const T* GetTarget() const { return reinterpret_cast(m_target); } diff --git a/AnnService/inc/Core/Common/RelativeNeighborhoodGraph.h b/AnnService/inc/Core/Common/RelativeNeighborhoodGraph.h new file mode 100644 index 00000000..83d5ee4a --- /dev/null +++ b/AnnService/inc/Core/Common/RelativeNeighborhoodGraph.h @@ -0,0 +1,120 @@ +#ifndef _SPTAG_COMMON_RNG_H_ +#define _SPTAG_COMMON_RNG_H_ + +#include "NeighborhoodGraph.h" + +namespace SPTAG +{ + namespace COMMON + { + class RelativeNeighborhoodGraph: public NeighborhoodGraph + { + public: + void RebuildNeighbors(VectorIndex* index, const int node, int* nodes, const BasicResult* queryResults, const int numResults) { + int count = 0; + for (int j = 0; j < numResults && count < m_iNeighborhoodSize; j++) { + const BasicResult& item = queryResults[j]; + if (item.VID < 0) break; + if (item.VID == node) continue; + + bool good = true; + for (int k = 0; k < count; k++) { + if (index->ComputeDistance(index->GetSample(nodes[k]), index->GetSample(item.VID)) <= item.Dist) { + good = false; + break; + } + } + if (good) nodes[count++] = item.VID; + } + for (int j = count; j < m_iNeighborhoodSize; j++) nodes[j] = -1; + } + + void InsertNeighbors(VectorIndex* index, const int node, int insertNode, float insertDist) + { + int* nodes = m_pNeighborhoodGraph[node]; + for (int k = 0; k < m_iNeighborhoodSize; k++) + { + int tmpNode = nodes[k]; + if (tmpNode < -1) continue; + + if (tmpNode < 0) + { + bool good = true; + for (int t = 0; t < k; t++) { + if (index->ComputeDistance(index->GetSample(insertNode), index->GetSample(nodes[t])) < insertDist) { + good = false; + break; + } + } + if (good) { + nodes[k] = insertNode; + } + break; + } + float tmpDist = index->ComputeDistance(index->GetSample(node), index->GetSample(tmpNode)); + if (insertDist < tmpDist || (insertDist == tmpDist && insertNode < tmpNode)) + { + bool good = true; + for (int t = 0; t < k; t++) { + if (index->ComputeDistance(index->GetSample(insertNode), index->GetSample(nodes[t])) < insertDist) { + good = false; + break; + } + } + if (good) { + nodes[k] = insertNode; + insertNode = tmpNode; + insertDist = tmpDist; + } + else { + break; + } + } + } + } + + float GraphAccuracyEstimation(VectorIndex* index, const int samples, const std::unordered_map* idmap = nullptr) + { + int* correct = new int[samples]; + +#pragma omp parallel for schedule(dynamic) + for (int i = 0; i < samples; i++) + { + int x = COMMON::Utils::rand_int(m_iGraphSize); + //int x = i; + COMMON::QueryResultSet query(nullptr, m_iCEF); + for (int y = 0; y < m_iGraphSize; y++) + { + if ((idmap != nullptr && idmap->find(y) != idmap->end())) continue; + float dist = index->ComputeDistance(index->GetSample(x), index->GetSample(y)); + query.AddPoint(y, dist); + } + query.SortResult(); + int * exact_rng = new int[m_iNeighborhoodSize]; + RebuildNeighbors(index, x, exact_rng, query.GetResults(), m_iCEF); + + correct[i] = 0; + for (int j = 0; j < m_iNeighborhoodSize; j++) { + if (exact_rng[j] == -1) { + correct[i] += m_iNeighborhoodSize - j; + break; + } + for (int k = 0; k < m_iNeighborhoodSize; k++) + if ((m_pNeighborhoodGraph)[x][k] == exact_rng[j]) { + correct[i]++; + break; + } + } + delete[] exact_rng; + } + float acc = 0; + for (int i = 0; i < samples; i++) acc += float(correct[i]); + acc = acc / samples / m_iNeighborhoodSize; + delete[] correct; + return acc; + } + + }; + } +} +#endif \ No newline at end of file diff --git a/AnnService/inc/Core/Common/WorkSpace.h b/AnnService/inc/Core/Common/WorkSpace.h index f17ff0bb..f2ce87a0 100644 --- a/AnnService/inc/Core/Common/WorkSpace.h +++ b/AnnService/inc/Core/Common/WorkSpace.h @@ -126,49 +126,6 @@ namespace SPTAG } }; - template - class CountVector - { - size_t m_bytes; - T* m_data; - T m_count; - T MAX; - - public: - void Init(int size) - { - m_bytes = sizeof(T) * size; - m_data = new T[size]; - m_count = 0; - MAX = ((std::numeric_limits::max)()); - memset(m_data, 0, m_bytes); - } - - CountVector() :m_data(nullptr) {} - CountVector(int size) { Init(size); } - ~CountVector() { if (m_data != nullptr) delete[] m_data; } - - inline void clear() - { - if (m_count == MAX) - { - memset(m_data, 0, m_bytes); - m_count = 1; - } - else - { - m_count++; - } - } - - inline bool CheckAndSet(int idx) - { - if (m_data[idx] == m_count) return true; - m_data[idx] = m_count; - return false; - } - }; - // Variables for each single NN search struct WorkSpace { diff --git a/AnnService/inc/Core/KDT/Index.h b/AnnService/inc/Core/KDT/Index.h index d21e76b0..5dd094e1 100644 --- a/AnnService/inc/Core/KDT/Index.h +++ b/AnnService/inc/Core/KDT/Index.h @@ -1,19 +1,19 @@ #ifndef _SPTAG_KDT_INDEX_H_ #define _SPTAG_KDT_INDEX_H_ -#include "../SearchQuery.h" -#include "../VectorIndex.h" #include "../Common.h" +#include "../VectorIndex.h" #include "../Common/CommonUtils.h" #include "../Common/DistanceUtils.h" #include "../Common/QueryResultSet.h" -#include "../Common/Heap.h" #include "../Common/Dataset.h" #include "../Common/WorkSpace.h" #include "../Common/WorkSpacePool.h" -#include "../Common/FineGrainedLock.h" -#include "../Common/DataUtils.h" +#include "../Common/RelativeNeighborhoodGraph.h" +#include "../Common/KDTree.h" +#include "inc/Helper/StringConvert.h" +#include "inc/Helper/SimpleIniReader.h" #include #include @@ -29,168 +29,79 @@ namespace SPTAG namespace KDT { - // node type for storing KDT - struct KDTNode - { - int left; - int right; - short split_dim; - float split_value; - }; - template class Index : public VectorIndex { private: - // Initial data points - int m_iDataSize; - int m_iDataDimension; + // data points COMMON::Dataset m_pSamples; // KDT structures. - int m_iKDTNumber; - std::vector m_pKDTStart; - std::vector m_pKDTRoots; - int m_numTopDimensionKDTSplit; - int m_numSamplesKDTSplitConsideration; + COMMON::KDTree m_pTrees; // Graph structure - int m_iGraphSize; - int m_iNeighborhoodSize; - COMMON::Dataset m_pNeighborhoodGraph; - - // Variables for building TPTs - int m_iTPTNumber; - int m_iTPTLeafSize; - int m_numTopDimensionTPTSplit; - int m_numSamplesTPTSplitConsideration; - - // Variables for building graph - int m_iRefineIter; - int m_iCEF; - int m_iMaxCheckForRefineGraph; - int m_iMaxCheck; - std::unordered_map m_pSampleToCenter; + COMMON::RelativeNeighborhoodGraph m_pGraph; - // Load from files directly std::string m_sKDTFilename; std::string m_sGraphFilename; std::string m_sDataPointsFilename; - // Load from memory mapped files - char* m_pKDTMemoryFile; - char* m_pGraphMemoryFile; - char* m_pDataPointsMemoryFile; - - DistCalcMethod m_iDistCalcMethod; - float(*m_fComputeDistance)(const T* pX, const T* pY, int length); - - int m_iCacheSize; - int m_iDebugLoad; - - int g_iThresholdOfNumberOfContinuousNoBetterPropagation; - int g_iNumberOfInitialDynamicPivots; - int g_iNumberOfOtherDynamicPivots; - - int m_iNumberOfThreads; - std::mutex m_dataAllocLock; - COMMON::FineGrainedLock m_dataUpdateLock; + std::mutex m_dataLock; // protect data and graph tbb::concurrent_unordered_set m_deletedID; std::unique_ptr m_workSpacePool; + + int m_iNumberOfThreads; + DistCalcMethod m_iDistCalcMethod; + float(*m_fComputeDistance)(const T* pX, const T* pY, int length); + + int m_iMaxCheck; + int m_iThresholdOfNumberOfContinuousNoBetterPropagation; + int m_iNumberOfInitialDynamicPivots; + int m_iNumberOfOtherDynamicPivots; public: - Index() : m_iKDTNumber(1), - m_numTopDimensionKDTSplit(5), - m_numSamplesKDTSplitConsideration(100), - m_iNeighborhoodSize(32), - m_iTPTNumber(32), - m_iTPTLeafSize(2000), - m_numTopDimensionTPTSplit(5), - m_numSamplesTPTSplitConsideration(1000), - m_iRefineIter(0), - m_iCEF(1000), - m_iMaxCheckForRefineGraph(10000), - m_iMaxCheck(2048), - m_pKDTMemoryFile(NULL), - m_pGraphMemoryFile(NULL), - m_pDataPointsMemoryFile(NULL), + Index() : m_sKDTFilename("tree.bin"), m_sGraphFilename("graph.bin"), m_sDataPointsFilename("vectors.bin"), m_iNumberOfThreads(1), m_iDistCalcMethod(DistCalcMethod::Cosine), m_fComputeDistance(COMMON::DistanceCalcSelector(DistCalcMethod::Cosine)), - m_iCacheSize(-1), - m_iDebugLoad(-1), - g_iThresholdOfNumberOfContinuousNoBetterPropagation(3), - g_iNumberOfInitialDynamicPivots(50), - g_iNumberOfOtherDynamicPivots(4) {} - - ~Index() { - m_pKDTRoots.clear(); - } - int GetNumSamples() const { return m_pSamples.R(); } - int GetFeatureDim() const { return m_pSamples.C(); } - int GetNumThreads() const { return m_iNumberOfThreads; } - int GetCurrMaxCheck() const { return m_iMaxCheck; } - - DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; } - IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::KDT; } - VectorValueType GetVectorValueType() const { return GetEnumValueType(); } + m_iMaxCheck(2048), + m_iThresholdOfNumberOfContinuousNoBetterPropagation(3), + m_iNumberOfInitialDynamicPivots(50), + m_iNumberOfOtherDynamicPivots(4) {} + + ~Index() {} + + inline int GetNumSamples() const { return m_pSamples.R(); } + inline int GetFeatureDim() const { return m_pSamples.C(); } + + inline int GetCurrMaxCheck() const { return m_iMaxCheck; } + inline int GetNumThreads() const { return m_iNumberOfThreads; } + inline DistCalcMethod GetDistCalcMethod() const { return m_iDistCalcMethod; } + inline IndexAlgoType GetIndexAlgoType() const { return IndexAlgoType::KDT; } + inline VectorValueType GetVectorValueType() const { return GetEnumValueType(); } + + inline float ComputeDistance(const void* pX, const void* pY) const { return m_fComputeDistance((const T*)pX, (const T*)pY, m_pSamples.C()); } + inline const void* GetSample(const int idx) const { return (void*)m_pSamples[idx]; } ErrorCode BuildIndex(const void* p_data, int p_vectorNum, int p_dimension); - ErrorCode LoadIndex(const std::string& p_folderPath); - ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs); - ErrorCode SaveIndex(const std::string& p_folderPath); + ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs); - void SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const; + ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout); + ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader); ErrorCode SearchIndex(QueryResult &p_query) const; - ErrorCode AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension); ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum); - ErrorCode RefineIndex(const std::string& p_folderPath); - ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2); ErrorCode SetParameter(const char* p_param, const char* p_value); std::string GetParameter(const char* p_param) const; private: - // Functions for loading models from files - bool LoadDataPoints(std::string sDataPointsFileName); - bool LoadKDT(std::string sKDTFilename); - bool LoadGraph(std::string sGraphFilename); - - // Functions for loading models from memory mapped files - bool LoadDataPoints(char* pDataPointsMemFile); - bool LoadKDT(char* pKDTMemFile); - bool LoadGraph(char* pGraphMemFile); - - bool SaveDataPoints(std::string sDataPointsFileName); - - // Functions for building kdtree - void BuildKDT(std::vector& indices, std::vector& newStart, std::vector& newRoot); - bool SaveKDT(std::string sKDTFilename, std::vector& newStart, std::vector& newRoot) const; - void DivideTree(KDTNode* pTree, std::vector& indices,int first, int last, - int index, int &iTreeSize); - void ChooseDivision(KDTNode& node, const std::vector& indices, int first, int last); - int SelectDivisionDimension(const std::vector& varianceValues) const; - int Subdivide(const KDTNode& node, std::vector& indices, const int first, const int last); - - // Functions for building Graph - void BuildRNG(); - bool SaveRNG(std::string sGraphFilename) const; - void PartitionByTptree(std::vector &indices, - const int first, - const int last, - std::vector> &leaves); - void RefineRNG(); - void RefineRNGNode(const int node, COMMON::WorkSpace &space, bool updateNeighbors); - void RebuildRNGNodeNeighbors(int* nodes, const BasicResult* queryResults, int numResults); - float GraphAccuracyEstimation(int NSample, bool rng); - - // Functions for hybrid search - void KDTSearch(const int node, const bool isInit, const float distBound, - COMMON::WorkSpace& space, COMMON::QueryResultSet &query, const tbb::concurrent_unordered_set &deleted) const; + ErrorCode RefineIndex(const std::string& p_folderPath); + void SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const; + void SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const; }; } // namespace KDT } // namespace SPTAG diff --git a/AnnService/inc/Core/KDT/ParameterDefinitionList.h b/AnnService/inc/Core/KDT/ParameterDefinitionList.h index 932a525f..8ca2ef74 100644 --- a/AnnService/inc/Core/KDT/ParameterDefinitionList.h +++ b/AnnService/inc/Core/KDT/ParameterDefinitionList.h @@ -5,25 +5,27 @@ DefineKDTParameter(m_sKDTFilename, std::string, std::string("tree.bin"), "TreeFi DefineKDTParameter(m_sGraphFilename, std::string, std::string("graph.bin"), "GraphFilePath") DefineKDTParameter(m_sDataPointsFilename, std::string, std::string("vectors.bin"), "VectorFilePath") -DefineKDTParameter(m_iKDTNumber, int, 1L, "KDTNumber") -DefineKDTParameter(m_numTopDimensionKDTSplit, int, 5L, "NumTopDimensionKDTSplit") -DefineKDTParameter(m_numSamplesKDTSplitConsideration, int, 100L, "NumSamplesKDTSplitConsideration") -DefineKDTParameter(m_iNeighborhoodSize, int, 32L, "NeighborhoodSize") -DefineKDTParameter(m_iTPTNumber, int, 32L, "TPTNumber") -DefineKDTParameter(m_iTPTLeafSize, int, 2000L, "TPTLeafSize") -DefineKDTParameter(m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTPTSplit") -DefineKDTParameter(m_numSamplesTPTSplitConsideration, int, 100L, "NumSamplesTPTSplitConsideration") -DefineKDTParameter(m_iCEF, int, 1000L, "CEF") -DefineKDTParameter(m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph") -DefineKDTParameter(m_iMaxCheck, int, 8192L, "MaxCheck") -DefineKDTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads") +DefineKDTParameter(m_pTrees.m_iTreeNumber, int, 1L, "KDTNumber") +DefineKDTParameter(m_pTrees.m_numTopDimensionKDTSplit, int, 5L, "NumTopDimensionKDTSplit") +DefineKDTParameter(m_pTrees.m_iSamples, int, 100L, "NumSamplesKDTSplitConsideration") -DefineKDTParameter(g_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation") -DefineKDTParameter(g_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots") -DefineKDTParameter(g_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots") +DefineKDTParameter(m_pGraph.m_iTPTNumber, int, 32L, "TPTNumber") +DefineKDTParameter(m_pGraph.m_iTPTLeafSize, int, 2000L, "TPTLeafSize") +DefineKDTParameter(m_pGraph.m_numTopDimensionTPTSplit, int, 5L, "NumTopDimensionTPTSplit") +DefineKDTParameter(m_pGraph.m_iNeighborhoodSize, int, 32L, "NeighborhoodSize") +DefineKDTParameter(m_pGraph.m_iNeighborhoodScale, int, 16L, "GraphNeighborhoodScale") +DefineKDTParameter(m_pGraph.m_iCEFScale, int, 4L, "GraphCEFScale") +DefineKDTParameter(m_pGraph.m_iRefineIter, int, 0L, "RefineIterations") +DefineKDTParameter(m_pGraph.m_iCEF, int, 1000L, "CEF") +DefineKDTParameter(m_pGraph.m_iMaxCheckForRefineGraph, int, 10000L, "MaxCheckForRefineGraph") + +DefineKDTParameter(m_iNumberOfThreads, int, 1L, "NumberOfThreads") DefineKDTParameter(m_iDistCalcMethod, SPTAG::DistCalcMethod, SPTAG::DistCalcMethod::Cosine, "DistCalcMethod") -DefineKDTParameter(m_iRefineIter, int, 0L, "RefineIterations") -DefineKDTParameter(m_iDebugLoad, int, -1, "NumTrains") -DefineKDTParameter(m_iCacheSize, int, -1, "CacheSize") + +DefineKDTParameter(m_iMaxCheck, int, 8192L, "MaxCheck") +DefineKDTParameter(m_iThresholdOfNumberOfContinuousNoBetterPropagation, int, 3L, "ThresholdOfNumberOfContinuousNoBetterPropagation") +DefineKDTParameter(m_iNumberOfInitialDynamicPivots, int, 50L, "NumberOfInitialDynamicPivots") +DefineKDTParameter(m_iNumberOfOtherDynamicPivots, int, 4L, "NumberOfOtherDynamicPivots") + #endif diff --git a/AnnService/inc/Core/MetadataSet.h b/AnnService/inc/Core/MetadataSet.h index e9794893..f476531e 100644 --- a/AnnService/inc/Core/MetadataSet.h +++ b/AnnService/inc/Core/MetadataSet.h @@ -26,6 +26,8 @@ class MetadataSet virtual ErrorCode SaveMetadata(const std::string& p_metaFile, const std::string& p_metaindexFile) = 0; + virtual ErrorCode RefineMetadata(std::vector& indices, const std::string& p_folderPath); + static ErrorCode MetaCopy(const std::string& p_src, const std::string& p_dst); }; @@ -52,7 +54,7 @@ class FileMetadataSet : public MetadataSet std::vector m_pOffsets; - int m_count; + SizeType m_count; std::string m_metaFile; diff --git a/AnnService/inc/Core/VectorIndex.h b/AnnService/inc/Core/VectorIndex.h index 6f648d36..cbe1b579 100644 --- a/AnnService/inc/Core/VectorIndex.h +++ b/AnnService/inc/Core/VectorIndex.h @@ -5,6 +5,7 @@ #include "SearchQuery.h" #include "VectorSet.h" #include "MetadataSet.h" +#include "inc/Helper/SimpleIniReader.h" namespace SPTAG { @@ -16,9 +17,9 @@ class VectorIndex virtual ~VectorIndex(); - virtual ErrorCode SaveIndex(const std::string& p_folderPath) = 0; + virtual ErrorCode SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout) = 0; - virtual ErrorCode LoadIndex(const std::string& p_folderPath) = 0; + virtual ErrorCode LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader) = 0; virtual ErrorCode LoadIndexFromMemory(const std::vector& p_indexBlobs) = 0; @@ -30,14 +31,12 @@ class VectorIndex virtual ErrorCode DeleteIndex(const void* p_vectors, int p_vectorNum) = 0; - virtual ErrorCode RefineIndex(const std::string& p_folderPath) = 0; - - virtual ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2) = 0; - //virtual ErrorCode AddIndexWithID(const void* p_vector, const int& p_id) = 0; //virtual ErrorCode DeleteIndexWithID(const void* p_vector, const int& p_id) = 0; - + + virtual float ComputeDistance(const void* pX, const void* pY) const = 0; + virtual const void* GetSample(const int idx) const = 0; virtual int GetFeatureDim() const = 0; virtual int GetNumSamples() const = 0; @@ -49,6 +48,10 @@ class VectorIndex virtual std::string GetParameter(const char* p_param) const = 0; virtual ErrorCode SetParameter(const char* p_param, const char* p_value) = 0; + virtual ErrorCode LoadIndex(const std::string& p_folderPath); + + virtual ErrorCode SaveIndex(const std::string& p_folderPath); + virtual ErrorCode BuildIndex(std::shared_ptr p_vectorSet, std::shared_ptr p_metadataSet); virtual ErrorCode SearchIndex(const void* p_vector, int p_neighborCount, std::vector& p_results) const; @@ -61,16 +64,22 @@ class VectorIndex virtual ByteArray GetMetadata(IndexType p_vectorID) const; virtual void SetMetadata(const std::string& p_metadataFilePath, const std::string& p_metadataIndexPath); - void SetIndexName(const std::string& p_indexName); - - const std::string& GetIndexName() const; + virtual std::string GetIndexName() const + { + if (m_sIndexName == "") + return Helper::Convert::ConvertToString(GetIndexAlgoType()); + return m_sIndexName; + } + virtual void SetIndexName(std::string p_name) { m_sIndexName = p_name; } static std::shared_ptr CreateInstance(IndexAlgoType p_algo, VectorValueType p_valuetype); + static ErrorCode MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2); + static ErrorCode LoadIndex(const std::string& p_loaderFilePath, std::shared_ptr& p_vectorIndex); protected: - std::string m_indexName; + std::string m_sIndexName; std::shared_ptr m_pMetadata; }; diff --git a/AnnService/src/Core/BKT/BKTIndex.cpp b/AnnService/src/Core/BKT/BKTIndex.cpp index 8f9a1862..c6f3d466 100644 --- a/AnnService/src/Core/BKT/BKTIndex.cpp +++ b/AnnService/src/Core/BKT/BKTIndex.cpp @@ -1,9 +1,4 @@ #include "inc/Core/BKT/Index.h" -#include "inc/Core/Common/WorkSpacePool.h" -#include "inc/Core/MetadataSet.h" -#include "inc/Helper/StringConvert.h" -#include "inc/Helper/CommonHelper.h" -#include "inc/Helper/SimpleIniReader.h" #pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. #pragma warning(disable:4242) // '=' : conversion from 'int' to 'short', possible loss of data @@ -14,307 +9,93 @@ namespace SPTAG { namespace BKT { -#pragma region Load data points, kd-tree, neighborhood graph template ErrorCode Index::LoadIndexFromMemory(const std::vector& p_indexBlobs) { - if (!LoadDataPoints((char*)p_indexBlobs[0])) return ErrorCode::FailedParseValue; - if (!LoadBKT((char*)p_indexBlobs[1])) return ErrorCode::FailedParseValue; - if (!LoadGraph((char*)p_indexBlobs[2])) return ErrorCode::FailedParseValue; + if (!m_pSamples.Load((char*)p_indexBlobs[0])) return ErrorCode::FailedParseValue; + if (!m_pTrees.LoadTrees((char*)p_indexBlobs[1])) return ErrorCode::FailedParseValue; + if (!m_pGraph.LoadGraph((char*)p_indexBlobs[2])) return ErrorCode::FailedParseValue; return ErrorCode::Success; } template - ErrorCode Index::LoadIndex(const std::string& p_folderPath) + ErrorCode Index::LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader) { - std::string folderPath(p_folderPath); - if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) - { - folderPath += FolderSep; - } - - Helper::IniReader p_configReader; - if (ErrorCode::Success != p_configReader.LoadIniFile(folderPath + "/indexloader.ini")) - { - return ErrorCode::FailedOpenFile; - } - - std::string metadataSection("MetaData"); - if (p_configReader.DoesSectionExist(metadataSection)) - { - std::string metadataFilePath = p_configReader.GetParameter(metadataSection, - "MetaDataFilePath", - std::string()); - std::string metadataIndexFilePath = p_configReader.GetParameter(metadataSection, - "MetaDataIndexPath", - std::string()); - - m_pMetadata.reset(new FileMetadataSet(folderPath + metadataFilePath, folderPath + metadataIndexFilePath)); - - if (!m_pMetadata->Available()) - { - std::cerr << "Error: Failed to load metadata." << std::endl; - return ErrorCode::Fail; - } - } - #define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \ SetParameter(RepresentStr, \ - p_configReader.GetParameter("Index", \ - RepresentStr, \ - std::string(#DefaultValue)).c_str()); \ + p_reader.GetParameter("Index", \ + RepresentStr, \ + std::string(#DefaultValue)).c_str()); \ #include "inc/Core/BKT/ParameterDefinitionList.h" #undef DefineBKTParameter - if (DistCalcMethod::Undefined == m_iDistCalcMethod) - { - return ErrorCode::Fail; - } - - if (!LoadDataPoints(folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; - if (!LoadBKT(folderPath + m_sBKTFilename)) return ErrorCode::Fail; - if (!LoadGraph(folderPath + m_sGraphFilename)) return ErrorCode::Fail; - - m_iDataSize = m_pSamples.R(); - m_iDataDimension = m_pSamples.C(); - m_dataUpdateLock.resize(m_iDataSize); + if (!m_pSamples.Load(p_folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; + if (!m_pTrees.LoadTrees(p_folderPath + m_sBKTFilename)) return ErrorCode::Fail; + if (!m_pGraph.LoadGraph(p_folderPath + m_sGraphFilename)) return ErrorCode::Fail; m_workSpacePool.reset(new COMMON::WorkSpacePool(m_iMaxCheck, GetNumSamples())); m_workSpacePool->Init(m_iNumberOfThreads); return ErrorCode::Success; } - template - bool Index::LoadDataPoints(std::string sDataPointsFileName) - { - std::cout << "Load Data Points From " << sDataPointsFileName << std::endl; - FILE * fp = fopen(sDataPointsFileName.c_str(), "rb"); - if (fp == NULL) return false; - - int R, C; - fread(&R, sizeof(int), 1, fp); - fread(&C, sizeof(int), 1, fp); - - if (m_iDebugLoad > 0 && R > m_iDebugLoad) R = m_iDebugLoad; - - m_pSamples.Initialize(R, C); - int i = 0, batch = 10000; - while (i + batch < R) { - fread((m_pSamples)[i], sizeof(T), C * batch, fp); - i += batch; - } - fread((m_pSamples)[i], sizeof(T), C * (R - i), fp); - fclose(fp); - std::cout << "Load Data Points (" << m_pSamples.R() << ", " << m_pSamples.C() << ") Finish!" << std::endl; - return true; - } - - // Functions for loading models from memory mapped files - template - bool Index::LoadDataPoints(char* pDataPointsMemFile) - { - int R, C; - R = *((int*)pDataPointsMemFile); - pDataPointsMemFile += sizeof(int); - - C = *((int*)pDataPointsMemFile); - pDataPointsMemFile += sizeof(int); - - m_pSamples.Initialize(R, C, (T*)pDataPointsMemFile); - - return true; - } - - template - bool Index::LoadBKT(std::string sBKTFilename) - { - std::cout << "Load BKT From " << sBKTFilename << std::endl; - FILE *fp = fopen(sBKTFilename.c_str(), "rb"); - if (fp == NULL) return false; - int realBKTNumber; - fread(&realBKTNumber, sizeof(int), 1, fp); - m_pBKTStart.resize(realBKTNumber); - fread(m_pBKTStart.data(), sizeof(int), realBKTNumber, fp); - if (realBKTNumber < m_iBKTNumber) m_iBKTNumber = realBKTNumber; - int treeNodeSize; - fread(&treeNodeSize, sizeof(int), 1, fp); - m_pBKTRoots.resize(treeNodeSize); - for (int i = 0; i < treeNodeSize; i++) { - fread(&(m_pBKTRoots[i].centerid), sizeof(int), 1, fp); - fread(&(m_pBKTRoots[i].childStart), sizeof(int), 1, fp); - fread(&(m_pBKTRoots[i].childEnd), sizeof(int), 1, fp); - } - fclose(fp); - std::cout << "Load BKT (" << m_iBKTNumber << ", " << treeNodeSize << ") Finish!" << std::endl; - return true; - } - - template - bool Index::LoadBKT(char* pBKTMemFile) - { - int realBKTNumber = *((int*)pBKTMemFile); - pBKTMemFile += sizeof(int); - m_pBKTStart.resize(realBKTNumber); - memcpy(m_pBKTStart.data(), pBKTMemFile, sizeof(int)*realBKTNumber); - pBKTMemFile += sizeof(int)*realBKTNumber; - if (realBKTNumber < m_iBKTNumber) m_iBKTNumber = realBKTNumber; +#pragma region K-NN search - int treeNodeSize = *((int*)pBKTMemFile); - pBKTMemFile += sizeof(int); - m_pBKTRoots.resize(treeNodeSize); - for (int i = 0; i < treeNodeSize; i++) { - m_pBKTRoots[i].centerid = *((int*)pBKTMemFile); - pBKTMemFile += sizeof(int); - m_pBKTRoots[i].childStart = *((int*)pBKTMemFile); - pBKTMemFile += sizeof(int); - m_pBKTRoots[i].childEnd = *((int*)pBKTMemFile); - pBKTMemFile += sizeof(int); - } - return true; - } +#define Search(CheckDeleted1) \ + m_pTrees.InitSearchTrees(this, p_query, p_space); \ + const int checkPos = m_pGraph.m_iNeighborhoodSize - 1; \ + while (!p_space.m_SPTQueue.empty()) { \ + m_pTrees.SearchTrees(this, p_query, p_space, m_iNumberOfOtherDynamicPivots + p_space.m_iNumberOfCheckedLeaves); \ + while (!p_space.m_NGQueue.empty()) { \ + COMMON::HeapCell gnode = p_space.m_NGQueue.pop(); \ + const int *node = m_pGraph[gnode.node]; \ + _mm_prefetch((const char *)node, _MM_HINT_T0); \ + CheckDeleted1 { \ + if (p_query.AddPoint(gnode.node, gnode.distance)) { \ + p_space.m_iNumOfContinuousNoBetterPropagation = 0; \ + int checkNode = node[checkPos]; \ + if (checkNode < -1) { \ + const COMMON::BKTNode& tnode = m_pTrees[-2 - checkNode]; \ + for (int i = -tnode.childStart; i < tnode.childEnd; i++) { \ + if (!p_query.AddPoint(m_pTrees[i].centerid, gnode.distance)) break; \ + } \ + } \ + } \ + else { \ + p_space.m_iNumOfContinuousNoBetterPropagation++; \ + if (p_space.m_iNumOfContinuousNoBetterPropagation > p_space.m_iContinuousLimit || p_space.m_iNumberOfCheckedLeaves > p_space.m_iMaxCheck) { \ + p_query.SortResult(); return; \ + } \ + } \ + } \ + for (int i = 0; i <= checkPos; i++) { \ + _mm_prefetch((const char *)(m_pSamples)[node[i]], _MM_HINT_T0); \ + } \ + for (int i = 0; i <= checkPos; i++) { \ + int nn_index = node[i]; \ + if (nn_index < 0) break; \ + if (p_space.CheckAndSet(nn_index)) continue; \ + float distance2leaf = m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[nn_index], GetFeatureDim()); \ + p_space.m_iNumberOfCheckedLeaves++; \ + p_space.m_NGQueue.insert(COMMON::HeapCell(nn_index, distance2leaf)); \ + } \ + if (p_space.m_NGQueue.Top().distance > p_space.m_SPTQueue.Top().distance) { \ + break; \ + } \ + } \ + } \ + p_query.SortResult(); \ template - bool Index::LoadGraph(std::string sGraphFilename) + void Index::SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const { - std::cout << "Load Graph From " << sGraphFilename << std::endl; - FILE * fp = fopen(sGraphFilename.c_str(), "rb"); - if (fp == NULL) return false; - fread(&m_iGraphSize, sizeof(int), 1, fp); - int KNNinGraph; - fread(&KNNinGraph, sizeof(int), 1, fp); - if (KNNinGraph < m_iNeighborhoodSize) m_iNeighborhoodSize = KNNinGraph; - - m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize); - - std::vector unusedData(KNNinGraph); - for (int i = 0; i < m_iGraphSize; i++) - { - fread((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp); - if (m_iNeighborhoodSize < KNNinGraph) - { - fread(&unusedData[0], sizeof(int), KNNinGraph - m_iNeighborhoodSize, fp); - } - } - fclose(fp); - std::cout << "Load Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl; - return true; + Search(if (p_deleted.find(gnode.node) == p_deleted.end())) } template - bool Index::LoadGraph(char* pGraphMemFile) { - m_iGraphSize = *((int*)pGraphMemFile); - pGraphMemFile += sizeof(int); - - int KNNinGraph = *((int*)pGraphMemFile); - pGraphMemFile += sizeof(int); - - // In the memory mapped file mode, we'll not accept NeighborhoodSize in graph file that's larger than expected size (m_iNeighborhoodSize) - // as we don't want to make another copy to fit. - if (KNNinGraph > m_iNeighborhoodSize) return false; - - if (KNNinGraph < m_iNeighborhoodSize) m_iNeighborhoodSize = KNNinGraph; - - m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize, (int*)pGraphMemFile); - - return true; - } -#pragma endregion - -#pragma region K-NN search - - template - void Index::SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const + void Index::SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const { - for (char i = 0; i < m_iBKTNumber; i++) { - const BKTNode& node = m_pBKTRoots[m_pBKTStart[i]]; - if (node.childStart < 0) { - p_space.m_SPTQueue.insert(COMMON::HeapCell(m_pBKTStart[i], m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[node.centerid], m_iDataDimension))); - } - else { - for (int begin = node.childStart; begin < node.childEnd; begin++) { - int index = m_pBKTRoots[begin].centerid; - p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[index], m_iDataDimension))); - } - } - } - int checkLimit = g_iNumberOfInitialDynamicPivots; - const int checkPos = m_iNeighborhoodSize - 1; - while (!p_space.m_SPTQueue.empty()) { - do - { - COMMON::HeapCell bcell = p_space.m_SPTQueue.pop(); - const BKTNode& tnode = m_pBKTRoots[bcell.node]; - - if (tnode.childStart < 0) { - if (!p_space.CheckAndSet(tnode.centerid)) { - p_space.m_iNumberOfCheckedLeaves++; - p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance)); - } - if (p_space.m_iNumberOfCheckedLeaves >= checkLimit) break; - } - else { - if (!p_space.CheckAndSet(tnode.centerid)) { - p_space.m_NGQueue.insert(COMMON::HeapCell(tnode.centerid, bcell.distance)); - } - for (int begin = tnode.childStart; begin < tnode.childEnd; begin++) { - int index = m_pBKTRoots[begin].centerid; - p_space.m_SPTQueue.insert(COMMON::HeapCell(begin, m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[index], m_iDataDimension))); - } - } - } while (!p_space.m_SPTQueue.empty()); - while (!p_space.m_NGQueue.empty()) { - COMMON::HeapCell gnode = p_space.m_NGQueue.pop(); - const int *node = (m_pNeighborhoodGraph)[gnode.node]; - _mm_prefetch((const char *)node, _MM_HINT_T0); - if (p_deleted.find(gnode.node) == p_deleted.end()) { - if (p_query.AddPoint(gnode.node, gnode.distance)) { - p_space.m_iNumOfContinuousNoBetterPropagation = 0; - - int checkNode = node[checkPos]; - if (checkNode < -1) { - const BKTNode& tnode = m_pBKTRoots[-2 - checkNode]; - for (int i = -tnode.childStart; i < tnode.childEnd; i++) { - if (p_deleted.find(m_pBKTRoots[i].centerid) == p_deleted.end()) { - if (!p_query.AddPoint(m_pBKTRoots[i].centerid, gnode.distance)) break; - } - } - } - } - else { - p_space.m_iNumOfContinuousNoBetterPropagation++; - if (p_space.m_iNumOfContinuousNoBetterPropagation > p_space.m_iContinuousLimit || p_space.m_iNumberOfCheckedLeaves > p_space.m_iMaxCheck) { - p_query.SortResult(); return; - } - } - } - -#ifdef PREFETCH - for (int i = 0; i <= checkPos; i++) { - _mm_prefetch((const char *)(m_pSamples)[node[i]], _MM_HINT_T0); - } -#endif - - for (int i = 0; i <= checkPos; i++) - { - int nn_index = node[i]; - - // do not check it if it has been checked - if (nn_index < 0) break; - if (p_space.CheckAndSet(nn_index)) continue; - - // count the number of the computed nodes - float distance2leaf = m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[nn_index], m_iDataDimension); - p_space.m_iNumberOfCheckedLeaves++; - p_space.m_NGQueue.insert(COMMON::HeapCell(nn_index, distance2leaf)); - } - if (p_space.m_NGQueue.Top().distance > p_space.m_SPTQueue.Top().distance) { - checkLimit = g_iNumberOfOtherDynamicPivots + p_space.m_iNumberOfCheckedLeaves; - break; - } - } - } - p_query.SortResult(); + Search(;) } template @@ -324,633 +105,48 @@ namespace SPTAG auto workSpace = m_workSpacePool->Rent(); workSpace->Reset(m_iMaxCheck); - SearchIndex(*((COMMON::QueryResultSet*)&p_query), *workSpace, m_deletedID); + if (m_deletedID.size() > 0) + SearchIndexWithDeleted(*((COMMON::QueryResultSet*)&p_query), *workSpace, m_deletedID); + else + SearchIndexWithoutDeleted(*((COMMON::QueryResultSet*)&p_query), *workSpace); + m_workSpacePool->Return(workSpace); if (p_query.WithMeta() && nullptr != m_pMetadata) { for (int i = 0; i < p_query.GetResultNum(); ++i) { - for (int i = 0; i < p_query.GetResultNum(); ++i) - { - int result = p_query.GetResult(i)->VID; - p_query.SetMetadata(i, (result < 0) ? ByteArray::c_empty : m_pMetadata->GetMetadata(result)); - } + int result = p_query.GetResult(i)->VID; + p_query.SetMetadata(i, (result < 0) ? ByteArray::c_empty : m_pMetadata->GetMetadata(result)); } } - return ErrorCode::Success; } #pragma endregion -#pragma region Build/Save kd-tree & neighborhood graphs template ErrorCode Index::BuildIndex(const void* p_data, int p_vectorNum, int p_dimension) { - m_pSamples.Initialize(p_vectorNum, p_dimension); - std::memcpy(m_pSamples.GetData(), p_data, p_vectorNum * p_dimension * sizeof(T)); - m_iDataSize = m_pSamples.R(); - m_iDataDimension = m_pSamples.C(); - m_dataUpdateLock.resize(m_iDataSize); + omp_set_num_threads(m_iNumberOfThreads); + + m_pSamples.Initialize(p_vectorNum, p_dimension, (T*)p_data, false); if (DistCalcMethod::Cosine == m_iDistCalcMethod) { int base = COMMON::Utils::GetBase(); - for (int i = 0; i < m_iDataSize; i++) { - COMMON::Utils::Normalize(m_pSamples[i], m_iDataDimension, base); +#pragma omp parallel for + for (int i = 0; i < GetNumSamples(); i++) { + COMMON::Utils::Normalize(m_pSamples[i], GetFeatureDim(), base); } } - std::vector indices(m_iDataSize); - for (int i = 0; i < m_iDataSize; i++) indices[i] = i; - BuildBKT(indices, m_pBKTStart, m_pBKTRoots); - BuildRNG(); m_workSpacePool.reset(new COMMON::WorkSpacePool(m_iMaxCheck, GetNumSamples())); m_workSpacePool->Init(m_iNumberOfThreads); - return ErrorCode::Success; - } - -#pragma region Build/Save kd-tree - template - bool Index::SaveBKT(std::string sBKTFilename, std::vector& newStart, std::vector& newRoot) const - { - std::cout << "Save BKT to " << sBKTFilename << std::endl; - FILE *fp = fopen(sBKTFilename.c_str(), "wb"); - if(fp == NULL) return false; - fwrite(&m_iBKTNumber, sizeof(int), 1, fp); - fwrite(newStart.data(), sizeof(int), m_iBKTNumber, fp); - int treeNodeSize = (int)newRoot.size(); - fwrite(&treeNodeSize, sizeof(int), 1, fp); - for (int i = 0; i < treeNodeSize; i++) { - fwrite(&(newRoot[i].centerid), sizeof(int), 1, fp); - fwrite(&(newRoot[i].childStart), sizeof(int), 1, fp); - fwrite(&(newRoot[i].childEnd), sizeof(int), 1, fp); - } - fclose(fp); - std::cout << "Save BKT Finish!" << std::endl; - return true; - } - - template - void Index::BuildBKT(std::vector& indices, std::vector& newStart, std::vector& newRoot) - { - omp_set_num_threads(m_iNumberOfThreads); - struct BKTStackItem { - int index, first, last; - BKTStackItem(int index_, int first_, int last_) : index(index_), first(first_), last(last_) {} - }; - std::stack ss; - - KmeansArgs args(m_iBKTKmeansK, m_iDataDimension, (int)indices.size(), m_iNumberOfThreads); - m_pSampleToCenter.clear(); - - for (char i = 0; i < m_iBKTNumber; i++) - { - std::random_shuffle(indices.begin(), indices.end()); - - newStart.push_back((int)newRoot.size()); - newRoot.push_back(BKTNode((int)indices.size())); - std::cout << "Start to build tree " << i + 1 << std::endl; - - ss.push(BKTStackItem(newStart[i], 0, (int)indices.size())); - while (!ss.empty()) { - BKTStackItem item = ss.top(); ss.pop(); - int newBKTid = (int)newRoot.size(); - newRoot[item.index].childStart = newBKTid; - if (item.last - item.first <= m_iBKTLeafSize) { - for (int j = item.first; j < item.last; j++) { - newRoot.push_back(BKTNode(indices[j])); - } - } - else { // clustering the data into BKTKmeansK clusters - int numClusters = KmeansClustering(indices, item.first, item.last, args); - if (numClusters <= 1) { - int end = min(item.last + 1, (int)indices.size()); - std::sort(indices.begin() + item.first, indices.begin() + end); - newRoot[item.index].centerid = indices[item.first]; - newRoot[item.index].childStart = -newRoot[item.index].childStart; - for (int j = item.first + 1; j < end; j++) { - newRoot.push_back(BKTNode(indices[j])); - m_pSampleToCenter[indices[j]] = newRoot[item.index].centerid; - } - m_pSampleToCenter[-1 - newRoot[item.index].centerid] = item.index; - } - else { - for (int k = 0; k < m_iBKTKmeansK; k++) { - if (args.counts[k] == 0) continue; - newRoot.push_back(BKTNode(indices[item.first + args.counts[k] - 1])); - if (args.counts[k] > 1) ss.push(BKTStackItem(newBKTid++, item.first, item.first + args.counts[k] - 1)); - item.first += args.counts[k]; - } - } - } - newRoot[item.index].childEnd = (int)newRoot.size(); - } - std::cout << i + 1 << " trees built, " << newRoot.size() - newStart[i] << " " << indices.size() << std::endl; - } - } - - template - float Index::KmeansAssign(std::vector& indices, const int first, const int last, KmeansArgs& args, bool updateCenters) { - float currDist = 0; - float lambda = (updateCenters) ? COMMON::Utils::GetBase() * COMMON::Utils::GetBase() / (100.0 * (last - first)) : 0; - int subsize = (last - first - 1) / m_iNumberOfThreads + 1; - -#pragma omp parallel for - for (int tid = 0; tid < m_iNumberOfThreads; tid++) - { - int istart = first + tid * subsize; - int iend = min(first + (tid + 1) * subsize, last); - int *inewCounts = args.newCounts + tid * m_iBKTKmeansK; - float *inewCenters = args.newCenters + tid * m_iBKTKmeansK * m_iDataDimension; - int * iclusterIdx = args.clusterIdx + tid * m_iBKTKmeansK; - float * iclusterDist = args.clusterDist + tid * m_iBKTKmeansK; - float idist = 0; - for (int i = istart; i < iend; i++) { - int clusterid = 0; - float smallestDist = MaxDist; - for (int k = 0; k < m_iBKTKmeansK; k++) { - float dist = m_fComputeDistance(m_pSamples[indices[i]], args.centers + k*m_iDataDimension, m_iDataDimension) + lambda*args.counts[k]; - if (dist > -MaxDist && dist < smallestDist) { - clusterid = k; smallestDist = dist; - } - } - args.label[i] = clusterid; - inewCounts[clusterid]++; - idist += smallestDist; - if (updateCenters) { - for (int j = 0; j < m_iDataDimension; j++) inewCenters[clusterid*m_iDataDimension + j] += m_pSamples[indices[i]][j]; - if (smallestDist > iclusterDist[clusterid]) { - iclusterDist[clusterid] = smallestDist; - iclusterIdx[clusterid] = indices[i]; - } - } - else { - if (smallestDist <= iclusterDist[clusterid]) { - iclusterDist[clusterid] = smallestDist; - iclusterIdx[clusterid] = indices[i]; - } - } - } - COMMON::Utils::atomic_float_add(&currDist, idist); - } - - for (int i = 1; i < m_iNumberOfThreads; i++) { - for (int k = 0; k < m_iBKTKmeansK; k++) - args.newCounts[k] += args.newCounts[i*m_iBKTKmeansK + k]; - } - - if (updateCenters) { - for (int i = 1; i < m_iNumberOfThreads; i++) { - float* currCenter = args.newCenters + i*m_iBKTKmeansK*m_iDataDimension; - for (int j = 0; j < m_iBKTKmeansK * m_iDataDimension; j++) args.newCenters[j] += currCenter[j]; - } - - int maxcluster = 0; - for (int k = 1; k < m_iBKTKmeansK; k++) if (args.newCounts[maxcluster] < args.newCounts[k]) maxcluster = k; - - int maxid = maxcluster; - for (int tid = 1; tid < m_iNumberOfThreads; tid++) { - if (args.clusterDist[maxid] < args.clusterDist[tid * m_iBKTKmeansK + maxcluster]) maxid = tid * m_iBKTKmeansK + maxcluster; - } - if (args.clusterIdx[maxid] < 0 || args.clusterIdx[maxid] >= m_iDataSize) - std::cout << "first:" << first << " last:" << last << " maxcluster:" << maxcluster << "(" << args.newCounts[maxcluster] << ") Error maxid:" << maxid << " dist:" << args.clusterDist[maxid] << std::endl; - maxid = args.clusterIdx[maxid]; - - for (int k = 0; k < m_iBKTKmeansK; k++) { - T* TCenter = args.newTCenters + k * m_iDataDimension; - if (args.newCounts[k] == 0) { - //int nextid = Utils::rand_int(last, first); - //while (args.label[nextid] != maxcluster) nextid = Utils::rand_int(last, first); - int nextid = maxid; - std::memcpy(TCenter, m_pSamples[nextid], sizeof(T)*m_iDataDimension); - } - else { - float* currCenters = args.newCenters + k * m_iDataDimension; - for (int j = 0; j < m_iDataDimension; j++) currCenters[j] /= args.newCounts[k]; - - if (m_iDistCalcMethod == DistCalcMethod::Cosine) { - COMMON::Utils::Normalize(currCenters, m_iDataDimension, COMMON::Utils::GetBase()); - } - for (int j = 0; j < m_iDataDimension; j++) TCenter[j] = (T)(currCenters[j]); - } - } - } - else { - for (int i = 1; i < m_iNumberOfThreads; i++) { - for (int k = 0; k < m_iBKTKmeansK; k++) { - if (args.clusterIdx[i*m_iBKTKmeansK + k] != -1 && args.clusterDist[i*m_iBKTKmeansK + k] <= args.clusterDist[k]) { - args.clusterDist[k] = args.clusterDist[i*m_iBKTKmeansK + k]; - args.clusterIdx[k] = args.clusterIdx[i*m_iBKTKmeansK + k]; - } - } - } - } - return currDist; - } - - template - int Index::KmeansClustering(std::vector& indices, const int first, const int last, KmeansArgs& args) { - int iterLimit = 100; - - int batchEnd = min(first + m_iSamples, last); - float currDiff, currDist, minClusterDist = MaxDist; - for (int numKmeans = 0; numKmeans < 3; numKmeans++) { - for (int k = 0; k < m_iBKTKmeansK; k++) { - int randid = COMMON::Utils::rand_int(last, first); - memcpy(args.centers + k*m_iDataDimension, m_pSamples[indices[randid]], sizeof(T)*m_iDataDimension); - } - args.ClearCounts(); - currDist = KmeansAssign(indices, first, batchEnd, args, false); - if (currDist < minClusterDist) { - minClusterDist = currDist; - memcpy(args.newTCenters, args.centers, sizeof(T)*m_iBKTKmeansK*m_iDataDimension); - memcpy(args.counts, args.newCounts, sizeof(int) * m_iBKTKmeansK); - } - } - - minClusterDist = MaxDist; - int noImprovement = 0; - for (int iter = 0; iter < iterLimit; iter++) { - std::memcpy(args.centers, args.newTCenters, sizeof(T)*m_iBKTKmeansK*m_iDataDimension); - std::random_shuffle(indices.begin() + first, indices.begin() + last); - - args.ClearCenters(); - args.ClearCounts(); - args.ClearDists(-MaxDist); - currDist = KmeansAssign(indices, first, batchEnd, args, true); - memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK); + + m_pTrees.BuildTrees(this); + m_pGraph.BuildGraph(this, &(m_pTrees.GetSampleMap())); - currDiff = 0; - for (int k = 0; k < m_iBKTKmeansK; k++) { - currDiff += m_fComputeDistance(args.centers + k*m_iDataDimension, args.newTCenters + k*m_iDataDimension, m_iDataDimension); - } - - if (currDist < minClusterDist) { - noImprovement = 0; - minClusterDist = currDist; - } - else { - noImprovement++; - } - if (currDiff < 1e-3 || noImprovement >= 5) break; - } - - args.ClearCounts(); - args.ClearDists(MaxDist); - currDist = KmeansAssign(indices, first, last, args, false); - memcpy(args.counts, args.newCounts, sizeof(int)*m_iBKTKmeansK); - - int numClusters = 0; - for (int i = 0; i < m_iBKTKmeansK; i++) if (args.counts[i] > 0) numClusters++; - - if (numClusters <= 1) { - //if (last - first > 1) std::cout << "large cluster:" << last - first << " dist:" << currDist << std::endl; - return numClusters; - } - args.Shuffle(indices, first, last); - return numClusters; - } -#pragma endregion - -#pragma region Build/Save neighborhood graph - template - bool Index::SaveRNG(std::string sGraphFilename) const - { - std::cout << "Save Graph To " << sGraphFilename << std::endl; - FILE *fp = fopen(sGraphFilename.c_str(), "wb"); - if (fp == NULL) return false; - fwrite(&m_iGraphSize, sizeof(int), 1, fp); - fwrite(&m_iNeighborhoodSize, sizeof(int), 1, fp); - - for (int i = 0; i < m_iGraphSize; i++) - { - fwrite((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp); - } - fclose(fp); - std::cout << "Save Graph Finish!" << std::endl; - return true; - } - - template - void Index::PartitionByTptree(std::vector& indices, - const int first, - const int last, - std::vector> & leaves) - { - if (last - first <= m_iTPTLeafSize) - { - leaves.push_back(std::make_pair(first, last)); - } - else - { - std::vector Mean(m_iDataDimension, 0); - - int iIteration = 100; - int end = min(first + m_iSamples, last); - int count = end - first + 1; - // calculate the mean of each dimension - for (int j = first; j <= end; j++) - { - T* v = (m_pSamples)[indices[j]]; - for (int k = 0; k < m_iDataDimension; k++) - { - Mean[k] += v[k]; - } - } - for (int k = 0; k < m_iDataDimension; k++) - { - Mean[k] /= count; - } - std::vector Variance; - Variance.reserve(m_iDataDimension); - for (int j = 0; j < m_iDataDimension; j++) - { - Variance.push_back(BasicResult(j, 0)); - } - // calculate the variance of each dimension - for (int j = first; j <= end; j++) - { - T* v = (m_pSamples)[indices[j]]; - for (int k = 0; k < m_iDataDimension; k++) - { - float dist = v[k] - Mean[k]; - Variance[k].Dist += dist*dist; - } - } - std::sort(Variance.begin(), Variance.end(), COMMON::Compare); - std::vector index(m_numTopDimensionTpTreeSplit); - std::vector weight(m_numTopDimensionTpTreeSplit), bestweight(m_numTopDimensionTpTreeSplit); - float bestvariance = Variance[m_iDataDimension - 1].Dist; - for (int i = 0; i < m_numTopDimensionTpTreeSplit; i++) - { - index[i] = Variance[m_iDataDimension - 1 - i].VID; - bestweight[i] = 0; - } - bestweight[0] = 1; - float bestmean = Mean[index[0]]; - - std::vector Val(count); - for (int i = 0; i < iIteration; i++) - { - float sumweight = 0; - for (int j = 0; j < m_numTopDimensionTpTreeSplit; j++) - { - weight[j] = float(rand() % 10000) / 5000.0f - 1.0f; - sumweight += weight[j] * weight[j]; - } - sumweight = sqrt(sumweight); - for (int j = 0; j < m_numTopDimensionTpTreeSplit; j++) - { - weight[j] /= sumweight; - } - float mean = 0; - for (int j = 0; j < count; j++) - { - Val[j] = 0; - for (int k = 0; k < m_numTopDimensionTpTreeSplit; k++) - { - Val[j] += weight[k] * (m_pSamples)[indices[first + j]][index[k]]; - } - mean += Val[j]; - } - mean /= count; - float var = 0; - for (int j = 0; j < count; j++) - { - float dist = Val[j] - mean; - var += dist * dist; - } - if (var > bestvariance) - { - bestvariance = var; - bestmean = mean; - for (int j = 0; j < m_numTopDimensionTpTreeSplit; j++) - { - bestweight[j] = weight[j]; - } - } - } - int i = first; - int j = last; - // decide which child one point belongs - while (i <= j) - { - float val = 0; - for (int k = 0; k < m_numTopDimensionTpTreeSplit; k++) - { - val += bestweight[k] * (m_pSamples)[indices[i]][index[k]]; - } - if (val < bestmean) - { - i++; - } - else - { - std::swap(indices[i], indices[j]); - j--; - } - } - // if all the points in the node are equal,equally split the node into 2 - if ((i == first) || (i == last + 1)) - { - i = (first + last + 1) / 2; - } - - Mean.clear(); - Variance.clear(); - Val.clear(); - index.clear(); - weight.clear(); - bestweight.clear(); - - PartitionByTptree(indices, first, i - 1, leaves); - PartitionByTptree(indices, i, last, leaves); - } - } - - template - void Index::RefineRNG() { - std::vector spaces(m_iNumberOfThreads); - for (int i = 0; i < m_iNumberOfThreads; i++) spaces[i].Initialize(m_iMaxCheckForRefineGraph, m_iGraphSize); - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < m_iGraphSize; i++) - { - RefineRNGNode(i, spaces[omp_get_thread_num()], false); - } - } - - template - void Index::BuildRNG() - { - std::cout << "build RNG graph!" << std::endl; - - omp_set_num_threads(m_iNumberOfThreads); - - int graphScale = 16; - int cefScale = 4; - m_iNeighborhoodSize *= graphScale; - m_iGraphSize = m_iDataSize; - - m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize); - if (m_iGraphSize < 1000) { - std::memset(m_pNeighborhoodGraph.GetData(), -1, m_iGraphSize * m_iNeighborhoodSize * sizeof(int)); - m_iNeighborhoodSize /= graphScale; - RefineRNG(); - for (int i = 0; i < m_iGraphSize; i++) { - if (m_pSampleToCenter.find(-1 - i) != m_pSampleToCenter.end()) - m_pNeighborhoodGraph[i][m_iNeighborhoodSize - 1] = -2 - m_pSampleToCenter[-1 - i]; - } - std::cout << "Build RNG Graph end!" << std::endl; - return; - } - - { - COMMON::Dataset NeighborhoodDists(m_iGraphSize, m_iNeighborhoodSize); - std::vector> TptreeDataIndices(m_iTptreeNumber, std::vector(m_iGraphSize)); - std::vector>> TptreeLeafNodes(m_iTptreeNumber, std::vector>()); - for (int i = 0; i < m_iGraphSize; i++) - { - for (int j = 0; j < m_iNeighborhoodSize; j++) - { - (m_pNeighborhoodGraph)[i][j] = -1; - (NeighborhoodDists)[i][j] = MaxDist; - } - TptreeDataIndices[0][i] = i; - } - for (int i = 1; i < m_iTptreeNumber; i++) { - std::memcpy(TptreeDataIndices[i].data(), TptreeDataIndices[0].data(), sizeof(int) * m_iGraphSize); - } - - std::cout << "Parallel TpTree Partition begin " << std::endl; -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < m_iTptreeNumber; i++) - { - Sleep(i * 100); std::srand(clock()); - std::random_shuffle(TptreeDataIndices[i].begin(), TptreeDataIndices[i].end()); - PartitionByTptree(TptreeDataIndices[i], 0, m_iGraphSize - 1, TptreeLeafNodes[i]); - std::cout << "Finish Getting Leaves for Tree " << i << std::endl; - } - std::cout << "Parallel TpTree Partition done" << std::endl; - - for (int i = 0; i < m_iTptreeNumber; i++) - { -#pragma omp parallel for schedule(dynamic) - for (int j = 0; j < TptreeLeafNodes[i].size(); j++) - { - int start_index = TptreeLeafNodes[i][j].first; - int end_index = TptreeLeafNodes[i][j].second; - if (omp_get_thread_num() == 0) std::cout << "\rProcessing Tree " << i << ' ' << j * 100 / TptreeLeafNodes[i].size() << '%'; - for (int x = start_index; x < end_index; x++) - { - for (int y = x + 1; y <= end_index; y++) - { - int p1 = TptreeDataIndices[i][x]; - int p2 = TptreeDataIndices[i][y]; - float dist = m_fComputeDistance((m_pSamples)[p1], (m_pSamples)[p2], m_iDataDimension); - if (m_pSampleToCenter.find(p2) == m_pSampleToCenter.end()) - COMMON::Utils::AddNeighbor(p2, dist, (m_pNeighborhoodGraph)[p1], (NeighborhoodDists)[p1], m_iNeighborhoodSize); - else - COMMON::Utils::AddNeighbor(m_pSampleToCenter[p2], dist, (m_pNeighborhoodGraph)[p1], (NeighborhoodDists)[p1], m_iNeighborhoodSize); - if (m_pSampleToCenter.find(p1) == m_pSampleToCenter.end()) - COMMON::Utils::AddNeighbor(p1, dist, (m_pNeighborhoodGraph)[p2], (NeighborhoodDists)[p2], m_iNeighborhoodSize); - else - COMMON::Utils::AddNeighbor(m_pSampleToCenter[p1], dist, (m_pNeighborhoodGraph)[p2], (NeighborhoodDists)[p2], m_iNeighborhoodSize); - } - } - } - TptreeDataIndices[i].clear(); - TptreeLeafNodes[i].clear(); - std::cout << std::endl; - } - TptreeDataIndices.clear(); - TptreeLeafNodes.clear(); - - for (int i = 0; i < m_iDataSize; i++) { - if (m_pSampleToCenter.find(-1 - i) != m_pSampleToCenter.end()) { - BKTNode& tnode = m_pBKTRoots[m_pSampleToCenter[-1 - i]]; - for (int iter = -tnode.childStart; iter != tnode.childEnd; iter++) { - int node = m_pBKTRoots[iter].centerid; - for (int j = 0; j < m_iNeighborhoodSize; j++) { - int index = m_pNeighborhoodGraph[node][j]; - if (index == i) continue; - COMMON::Utils::AddNeighbor(index, NeighborhoodDists[node][j], (m_pNeighborhoodGraph)[i], (NeighborhoodDists)[i], m_iNeighborhoodSize); - } - } - } - } - } - std::cout << "NNG acc:" << GraphAccuracyEstimation(100, false) << std::endl; - if (m_iMaxCheckForRefineGraph > 0) { - m_iCEF *= cefScale; - m_iMaxCheckForRefineGraph *= cefScale; - RefineRNG(); - std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(100, true) << std::endl; - - m_iCEF /= cefScale; - m_iMaxCheckForRefineGraph /= cefScale; - m_iNeighborhoodSize /= graphScale; - - //RefineRNG(); - //std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(100, true) << std::endl; - RefineRNG(); - std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(100, true) << std::endl; - - for (int i = 0; i < m_iGraphSize; i++) { - if (m_pSampleToCenter.find(-1 - i) != m_pSampleToCenter.end()) - m_pNeighborhoodGraph[i][m_iNeighborhoodSize - 1] = -2 - m_pSampleToCenter[-1 - i]; - } - } - std::cout << "Build RNG Graph end!" << std::endl; - } - - template - float Index::GraphAccuracyEstimation(int NSample, bool rng) { - int* correct = new int[NSample]; - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < NSample; i++) - { - int x = COMMON::Utils::rand_int(m_iGraphSize); - //int x = i; - COMMON::QueryResultSet query((m_pSamples)[x], m_iCEF); - for (int y = 0; y < m_iGraphSize; y++) - { - if (m_pSampleToCenter.find(y) != m_pSampleToCenter.end() || y == x) continue; - float dist = m_fComputeDistance(query.GetTarget(), (m_pSamples)[y], m_iDataDimension); - query.AddPoint(y, dist); - } - query.SortResult(); - int * exact_rng = new int[m_iNeighborhoodSize]; - if (rng) { - RebuildRNGNodeNeighbors(exact_rng, query.GetResults(), m_iCEF); - } - else { - for (int j = 0; j < m_iNeighborhoodSize && j < m_iCEF; j++) { - exact_rng[j] = query.GetResult(j)->VID; - } - for (int j = m_iCEF; j < m_iNeighborhoodSize; j++) exact_rng[j] = -1; - } - correct[i] = 0; - for (int j = 0; j < m_iNeighborhoodSize; j++) { - if (exact_rng[j] == -1) { - correct[i] += m_iNeighborhoodSize - j; - break; - } - for (int k = 0; k < m_iNeighborhoodSize; k++) - if ((m_pNeighborhoodGraph)[x][k] == exact_rng[j]) { - correct[i]++; - break; - } - } - delete[] exact_rng; - } - float acc = 0; - for (int i = 0; i < NSample; i++) acc += float(correct[i]); - acc = acc / NSample / m_iNeighborhoodSize; - delete[] correct; - return acc; + return ErrorCode::Success; } template @@ -966,136 +162,41 @@ namespace SPTAG { mkdir(folderPath.c_str()); } - tbb::concurrent_unordered_set deleted(m_deletedID.begin(), m_deletedID.end()); + + std::lock_guard lock(m_dataLock); + int newR = GetNumSamples(); + std::vector indices; - std::unordered_map old2new; - int newR = m_iDataSize; + std::vector reverseIndices(newR); for (int i = 0; i < newR; i++) { - if (deleted.find(i) == deleted.end()) { + if (m_deletedID.find(i) == m_deletedID.end()) { indices.push_back(i); - old2new[i] = i; + reverseIndices[i] = i; } else { - while (deleted.find(newR - 1) != deleted.end() && newR > i) newR--; + while (m_deletedID.find(newR - 1) != m_deletedID.end() && newR > i) newR--; if (newR == i) break; indices.push_back(newR - 1); - old2new[newR - 1] = i; + reverseIndices[newR - 1] = i; newR--; } } - old2new[-1] = -1; - std::cout << "Refine... from " << m_iDataSize << "->" << newR << std::endl; - std::ofstream vecOut(folderPath + m_sDataPointsFilename, std::ios::binary); - if (!vecOut.is_open()) return ErrorCode::FailedCreateFile; - vecOut.write((char*)&newR, sizeof(int)); - vecOut.write((char*)&m_iDataDimension, sizeof(int)); - for (int i = 0; i < newR; i++) { - vecOut.write((char*)m_pSamples[indices[i]], sizeof(T)*m_iDataDimension); - } - vecOut.close(); - - if (nullptr != m_pMetadata) - { - std::ofstream metaOut(folderPath + "metadata.bin_tmp", std::ios::binary); - std::ofstream metaIndexOut(folderPath + "metadataIndex.bin", std::ios::binary); - if (!metaOut.is_open() || !metaIndexOut.is_open()) return ErrorCode::FailedCreateFile; - metaIndexOut.write((char*)&newR, sizeof(int)); - std::uint64_t offset = 0; - for (int i = 0; i < newR; i++) { - metaIndexOut.write((char*)&offset, sizeof(std::uint64_t)); - ByteArray meta = m_pMetadata->GetMetadata(indices[i]); - metaOut.write((char*)meta.Data(), sizeof(uint8_t)*meta.Length()); - offset += meta.Length(); - } - metaOut.close(); - metaIndexOut.write((char*)&offset, sizeof(std::uint64_t)); - metaIndexOut.close(); - - SPTAG::MetadataSet::MetaCopy(folderPath + "metadata.bin_tmp", folderPath + "metadata.bin"); - } + std::cout << "Refine... from " << GetNumSamples() << "->" << newR << std::endl; - std::vector newRoot; - std::vector newStart; - std::vector tmpindices(indices.begin(), indices.end()); - BuildBKT(tmpindices, newStart, newRoot); + if (false == m_pSamples.Refine(indices, folderPath + m_sDataPointsFilename)) return ErrorCode::FailedCreateFile; + if (nullptr != m_pMetadata && ErrorCode::Success != m_pMetadata->RefineMetadata(indices, folderPath)) return ErrorCode::FailedCreateFile; + + COMMON::BKTree newTrees(m_pTrees); + newTrees.BuildTrees(this, &indices); #pragma omp parallel for - for (int i = 0; i < newRoot.size(); i++) { - newRoot[i].centerid = old2new[newRoot[i].centerid]; + for (int i = 0; i < newTrees.size(); i++) { + newTrees[i].centerid = reverseIndices[newTrees[i].centerid]; } - SaveBKT(folderPath + m_sBKTFilename, newStart, newRoot); - - std::ofstream graphOut(folderPath + m_sGraphFilename, std::ios::binary); - if (!graphOut.is_open()) return ErrorCode::FailedCreateFile; - graphOut.write((char*)&newR, sizeof(int)); - graphOut.write((char*)&m_iNeighborhoodSize, sizeof(int)); - - int *neighbors = new int[m_iNeighborhoodSize]; - COMMON::WorkSpace space; - space.Initialize(m_iMaxCheckForRefineGraph, m_iDataSize); - for (int i = 0; i < newR; i++) { - space.Reset(m_iMaxCheckForRefineGraph); - COMMON::QueryResultSet query((m_pSamples)[indices[i]], m_iCEF); - space.CheckAndSet(indices[i]); - for (int j = 0; j < m_iNeighborhoodSize; j++) { - int index = m_pNeighborhoodGraph[indices[i]][j]; - if (index < 0 || space.CheckAndSet(index)) continue; - space.m_NGQueue.insert(COMMON::HeapCell(index, m_fComputeDistance(query.GetTarget(), m_pSamples[index], m_iDataDimension))); - } - SearchIndex(query, space, deleted); - RebuildRNGNodeNeighbors(neighbors, query.GetResults(), m_iCEF); - for (int j = 0; j < m_iNeighborhoodSize; j++) - neighbors[j] = old2new[neighbors[j]]; - if (m_pSampleToCenter.find(-1 - indices[i]) != m_pSampleToCenter.end()) { - neighbors[m_iNeighborhoodSize - 1] = -2 - m_pSampleToCenter[-1 - indices[i]]; - } - graphOut.write((char*)neighbors, sizeof(int) * m_iNeighborhoodSize); - } - delete[]neighbors; - graphOut.close(); + newTrees.SaveTrees(folderPath + m_sBKTFilename); - return ErrorCode::Success; - } - - template - ErrorCode Index::MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2) { - std::string folderPath1(p_indexFilePath1), folderPath2(p_indexFilePath2); - if (!folderPath1.empty() && *(folderPath1.rbegin()) != FolderSep) folderPath1 += FolderSep; - if (!folderPath2.empty() && *(folderPath2.rbegin()) != FolderSep) folderPath2 += FolderSep; - - Helper::IniReader p_configReader1, p_configReader2; - if (ErrorCode::Success != p_configReader1.LoadIniFile(folderPath1 + "/indexloader.ini")) - return ErrorCode::FailedOpenFile; - - if (ErrorCode::Success != p_configReader2.LoadIniFile(folderPath2 + "/indexloader.ini")) - return ErrorCode::FailedOpenFile; - - std::string empty(""); - if (!COMMON::DataUtils::MergeIndex(folderPath1 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), - folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), - folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty), - folderPath2 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), - folderPath2 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), - folderPath2 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty))) - return ErrorCode::Fail; - -#define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \ - SetParameter(RepresentStr, \ - p_configReader1.GetParameter("Index", \ - RepresentStr, \ - std::string(#DefaultValue)).c_str()); \ - -#include "inc/Core/BKT/ParameterDefinitionList.h" -#undef DefineBKTParameter - - if (!LoadDataPoints(folderPath1 + p_configReader1.GetParameter("Index", "VectorFilePath", empty))) return ErrorCode::FailedOpenFile; - std::vector indices(m_iDataSize); - for (int i = 0; i < m_iDataSize; i++) indices[i] = i; - BuildBKT(indices, m_pBKTStart, m_pBKTRoots); - BuildRNG(); - - SaveBKT(folderPath1 + p_configReader1.GetParameter("Index", "TreeFilePath", empty), m_pBKTStart, m_pBKTRoots); - SaveRNG(folderPath1 + p_configReader1.GetParameter("Index", "GraphFilePath", empty)); + m_pGraph.RefineGraph(this, indices, reverseIndices, folderPath + m_sGraphFilename, + &(newTrees.GetSampleMap())); return ErrorCode::Success; } @@ -1104,10 +205,12 @@ namespace SPTAG const T* ptr_v = (const T*)p_vectors; #pragma omp parallel for schedule(dynamic) for (int i = 0; i < p_vectorNum; i++) { - COMMON::QueryResultSet query(ptr_v + i * m_iDataDimension, m_iCEF); + COMMON::QueryResultSet query(ptr_v + i * GetFeatureDim(), m_pGraph.m_iCEF); SearchIndex(query); - for (int i = 0; i < m_iCEF; i++) { + + for (int i = 0; i < m_pGraph.m_iCEF; i++) { if (query.GetResult(i)->Dist < 1e-6) { + std::lock_guard lock(m_dataLock); m_deletedID.insert(query.GetResult(i)->VID); } } @@ -1116,232 +219,79 @@ namespace SPTAG } template - ErrorCode Index::AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension) { - if (m_pBKTRoots.size() == 0) { - return BuildIndex(p_vectors, p_vectorNum, p_dimension); - } - if (p_dimension != m_iDataDimension) return ErrorCode::FailedParseValue; - + ErrorCode Index::AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension) + { int begin, end; { - std::lock_guard lock(m_dataAllocLock); - + std::lock_guard lock(m_dataLock); + + if (GetNumSamples() == 0) + return BuildIndex(p_vectors, p_vectorNum, p_dimension); + + if (p_dimension != GetFeatureDim()) + return ErrorCode::FailedParseValue; + + begin = GetNumSamples(); + end = GetNumSamples() + p_vectorNum; + m_pSamples.AddBatch((const T*)p_vectors, p_vectorNum); - m_pNeighborhoodGraph.AddBatch(p_vectorNum); + m_pGraph.AddBatch(p_vectorNum); - end = m_iDataSize + p_vectorNum; - if (m_pSamples.R() != end || m_pNeighborhoodGraph.R() != end) { + if (m_pSamples.R() != end || m_pGraph.R() != end) { std::cout << "Memory Error: Cannot alloc space for vectors" << std::endl; - m_pSamples.SetR(m_iDataSize); - m_pNeighborhoodGraph.SetR(m_iDataSize); + m_pSamples.SetR(begin); + m_pGraph.SetR(begin); return ErrorCode::Fail; } - begin = m_iDataSize; - m_iDataSize = end; - m_iGraphSize = end; - m_dataUpdateLock.resize(m_iDataSize); - } - if (DistCalcMethod::Cosine == m_iDistCalcMethod) - { - int base = COMMON::Utils::GetBase(); - for (int i = begin; i < end; i++) { - COMMON::Utils::Normalize((T*)m_pSamples[i], m_iDataDimension, base); + if (DistCalcMethod::Cosine == m_iDistCalcMethod) + { + int base = COMMON::Utils::GetBase(); + for (int i = begin; i < end; i++) { + COMMON::Utils::Normalize((T*)m_pSamples[i], GetFeatureDim(), base); + } } } - auto space = m_workSpacePool->Rent(); for (int node = begin; node < end; node++) { - RefineRNGNode(node, *(space.get()), true); + m_pGraph.RefineNode(this, node, true); } - m_workSpacePool->Return(space); std::cout << "Add " << p_vectorNum << " vectors" << std::endl; return ErrorCode::Success; } - template - void Index::RefineRNGNode(const int node, COMMON::WorkSpace &space, bool updateNeighbors) { - space.Reset(m_iMaxCheckForRefineGraph); - COMMON::QueryResultSet query((m_pSamples)[node], m_iCEF); - space.CheckAndSet(node); - for (int i = 0; i < m_iNeighborhoodSize; i++) { - int index = m_pNeighborhoodGraph[node][i]; - if (index < 0 || space.CheckAndSet(index)) continue; - space.m_NGQueue.insert(COMMON::HeapCell(index, m_fComputeDistance(query.GetTarget(), m_pSamples[index], m_iDataDimension))); - } - SearchIndex(query, space, m_deletedID); - RebuildRNGNodeNeighbors(m_pNeighborhoodGraph[node], query.GetResults(), m_iCEF); - - if (updateNeighbors) { - // update neighbors - for (int j = 0; j < m_iCEF; j++) - { - BasicResult* item = query.GetResult(j); - if (item->VID < 0) break; - - int insertID = node; - int* nodes = m_pNeighborhoodGraph[item->VID]; - std::lock_guard lock(m_dataUpdateLock[item->VID]); - for (int k = 0; k < m_iNeighborhoodSize; k++) - { - int tmpNode = nodes[k]; - if (tmpNode < -1) continue; - - if (tmpNode < 0) - { - bool good = true; - for (int t = 0; t < k; t++) { - if (m_fComputeDistance((m_pSamples)[insertID], (m_pSamples)[nodes[t]], m_iDataDimension) < item->Dist) { - good = false; - break; - } - } - if (good) { - nodes[k] = insertID; - } - break; - } - float tmpDist = m_fComputeDistance(m_pSamples[item->VID], m_pSamples[tmpNode], m_iDataDimension); - if (item->Dist < tmpDist || (item->Dist == tmpDist && insertID < tmpNode)) - { - bool good = true; - for (int t = 0; t < k; t++) { - if (m_fComputeDistance((m_pSamples)[insertID], (m_pSamples)[nodes[t]], m_iDataDimension) < item->Dist) { - good = false; - break; - } - } - if (good) { - nodes[k] = insertID; - insertID = tmpNode; - item->Dist = tmpDist; - } - else { - break; - } - } - } - } - } - } - - template - void Index::RebuildRNGNodeNeighbors(int* nodes, const BasicResult* queryResults, int numResults) { - int count = 0; - for (int j = 0; j < numResults && count < m_iNeighborhoodSize; j++) { - const BasicResult& item = queryResults[j]; - if (item.VID < 0) continue; - - bool good = true; - for (int k = 0; k < count; k++) { - if (m_fComputeDistance((m_pSamples)[nodes[k]], (m_pSamples)[item.VID], m_iDataDimension) <= item.Dist) { - good = false; - break; - } - } - if (good) nodes[count++] = item.VID; - } - for (int j = count; j < m_iNeighborhoodSize; j++) nodes[j] = -1; - } - - template - bool Index::SaveDataPoints(std::string sDataPointsFileName) - { - std::cout << "Save Data Points To " << sDataPointsFileName << std::endl; - - FILE * fp = fopen(sDataPointsFileName.c_str(), "wb"); - if (fp == NULL) return false; - - int R = m_pSamples.R(), C = m_pSamples.C(); - fwrite(&R, sizeof(int), 1, fp); - fwrite(&C, sizeof(int), 1, fp); - - // write point one by one in case for cache miss - for (int i = 0; i < R; i++) { - fwrite((m_pSamples)[i], sizeof(T), C, fp); - } - fclose(fp); - - std::cout << "Save Data Points (" << m_pSamples.R() << ", " << m_pSamples.C() << ") Finish!" << std::endl; - return true; - } - template ErrorCode - Index::SaveIndex(const std::string& p_folderPath) + Index::SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout) { - std::string folderPath(p_folderPath); - if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) - { - folderPath += FolderSep; - } - - if (!direxists(folderPath.c_str())) - { - mkdir(folderPath.c_str()); - } - - std::string loaderFilePath = folderPath + "indexloader.ini"; - - std::ofstream loaderFile(loaderFilePath); - if (!loaderFile.is_open()) - { - return ErrorCode::FailedCreateFile; - } - m_sDataPointsFilename = "vectors.bin"; m_sBKTFilename = "tree.bin"; m_sGraphFilename = "graph.bin"; - std::string metadataFile = "metadata.bin"; - std::string metadataIndexFile = "metadataIndex.bin"; - - loaderFile << "[Index]" << std::endl; - loaderFile << "IndexAlgoType=" << Helper::Convert::ConvertToString(IndexAlgoType::BKT) << std::endl; - loaderFile << "ValueType=" << Helper::Convert::ConvertToString(GetEnumValueType()) << std::endl; - loaderFile << std::endl; #define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \ - loaderFile << RepresentStr << "=" << GetParameter(RepresentStr) << std::endl; + p_configout << RepresentStr << "=" << GetParameter(RepresentStr) << std::endl; #include "inc/Core/BKT/ParameterDefinitionList.h" #undef DefineBKTParameter - loaderFile << std::endl; - - if (nullptr != m_pMetadata) - { - loaderFile << "[MetaData]" << std::endl; - loaderFile << "MetaDataFilePath=" << metadataFile << std::endl; - loaderFile << "MetaDataIndexPath=" << metadataIndexFile << std::endl; - loaderFile << std::endl; - } - loaderFile.close(); + p_configout << std::endl; if (m_deletedID.size() > 0) { - RefineIndex(folderPath); + RefineIndex(p_folderPath); } - else { - if (!SaveDataPoints(folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; - if (!SaveBKT(folderPath + m_sBKTFilename, m_pBKTStart, m_pBKTRoots)) return ErrorCode::Fail; - if (!SaveRNG(folderPath + m_sGraphFilename)) return ErrorCode::Fail; - if (nullptr != m_pMetadata) - { - m_pMetadata->SaveMetadata(folderPath + metadataFile, folderPath + metadataIndexFile); - } + else { + if (!m_pSamples.Save(p_folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; + if (!m_pTrees.SaveTrees(p_folderPath + m_sBKTFilename)) return ErrorCode::Fail; + if (!m_pGraph.SaveGraph(p_folderPath + m_sGraphFilename)) return ErrorCode::Fail; } return ErrorCode::Success; } -#pragma endregion -#pragma endregion template ErrorCode Index::SetParameter(const char* p_param, const char* p_value) { - if (nullptr == p_param || nullptr == p_value) - { - return ErrorCode::Fail; - } + if (nullptr == p_param || nullptr == p_value) return ErrorCode::Fail; #define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \ else if (SPTAG::Helper::StrUtils::StrEqualIgnoreCase(p_param, RepresentStr)) \ @@ -1366,10 +316,7 @@ namespace SPTAG std::string Index::GetParameter(const char* p_param) const { - if (nullptr == p_param) - { - return std::string(); - } + if (nullptr == p_param) return std::string(); #define DefineBKTParameter(VarName, VarType, DefaultValue, RepresentStr) \ else if (SPTAG::Helper::StrUtils::StrEqualIgnoreCase(p_param, RepresentStr)) \ diff --git a/AnnService/src/Core/Common/NeighborhoodGraph.cpp b/AnnService/src/Core/Common/NeighborhoodGraph.cpp new file mode 100644 index 00000000..20ed8630 --- /dev/null +++ b/AnnService/src/Core/Common/NeighborhoodGraph.cpp @@ -0,0 +1,14 @@ +#include "inc/Core/Common/NeighborhoodGraph.h" +#include "inc/Core/Common/RelativeNeighborhoodGraph.h" + +using namespace SPTAG::COMMON; + +std::shared_ptr NeighborhoodGraph::CreateInstance(std::string type) +{ + std::shared_ptr res; + if (type == "RNG") + { + res.reset(new RelativeNeighborhoodGraph); + } + return res; +} \ No newline at end of file diff --git a/AnnService/src/Core/KDT/KDTIndex.cpp b/AnnService/src/Core/KDT/KDTIndex.cpp index a6db67d1..00bb877f 100644 --- a/AnnService/src/Core/KDT/KDTIndex.cpp +++ b/AnnService/src/Core/KDT/KDTIndex.cpp @@ -1,9 +1,4 @@ #include "inc/Core/KDT/Index.h" -#include "inc/Core/Common/WorkSpacePool.h" -#include "inc/Core/MetadataSet.h" -#include "inc/Helper/StringConvert.h" -#include "inc/Helper/CommonHelper.h" -#include "inc/Helper/SimpleIniReader.h" #pragma warning(disable:4996) // 'fopen': This function or variable may be unsafe. Consider using fopen_s instead. To disable deprecation, use _CRT_SECURE_NO_WARNINGS. See online help for details. #pragma warning(disable:4242) // '=' : conversion from 'int' to 'short', possible loss of data @@ -14,322 +9,83 @@ namespace SPTAG { namespace KDT { -#pragma region Load data points, kd-tree, neighborhood graph template ErrorCode Index::LoadIndexFromMemory(const std::vector& p_indexBlobs) { - if (!LoadDataPoints((char*)p_indexBlobs[0])) return ErrorCode::FailedParseValue; - if (!LoadKDT((char*)p_indexBlobs[1])) return ErrorCode::FailedParseValue; - if (!LoadGraph((char*)p_indexBlobs[2])) return ErrorCode::FailedParseValue; + if (!m_pSamples.Load((char*)p_indexBlobs[0])) return ErrorCode::FailedParseValue; + if (!m_pTrees.LoadTrees((char*)p_indexBlobs[1])) return ErrorCode::FailedParseValue; + if (!m_pGraph.LoadGraph((char*)p_indexBlobs[2])) return ErrorCode::FailedParseValue; return ErrorCode::Success; } template - ErrorCode Index::LoadIndex(const std::string& p_folderPath) + ErrorCode Index::LoadIndex(const std::string& p_folderPath, Helper::IniReader& p_reader) { - std::string folderPath(p_folderPath); - if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) - { - folderPath += FolderSep; - } - - Helper::IniReader p_configReader; - if (ErrorCode::Success != p_configReader.LoadIniFile(folderPath + "/indexloader.ini")) - { - return ErrorCode::FailedOpenFile; - } - - std::string metadataSection("MetaData"); - if (p_configReader.DoesSectionExist(metadataSection)) - { - std::string metadataFilePath = p_configReader.GetParameter(metadataSection, - "MetaDataFilePath", - std::string()); - std::string metadataIndexFilePath = p_configReader.GetParameter(metadataSection, - "MetaDataIndexPath", - std::string()); - - m_pMetadata.reset(new FileMetadataSet(folderPath + metadataFilePath, folderPath + metadataIndexFilePath)); - - if (!m_pMetadata->Available()) - { - std::cerr << "Error: Failed to load metadata." << std::endl; - return ErrorCode::Fail; - } - } - #define DefineKDTParameter(VarName, VarType, DefaultValue, RepresentStr) \ SetParameter(RepresentStr, \ - p_configReader.GetParameter("Index", \ - RepresentStr, \ - std::string(#DefaultValue)).c_str()); \ + p_reader.GetParameter("Index", \ + RepresentStr, \ + std::string(#DefaultValue)).c_str()); \ #include "inc/Core/KDT/ParameterDefinitionList.h" #undef DefineKDTParameter - if (DistCalcMethod::Undefined == m_iDistCalcMethod) - { - return ErrorCode::Fail; - } - - if (!LoadDataPoints(folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; - if (!LoadKDT(folderPath + m_sKDTFilename)) return ErrorCode::Fail; - if (!LoadGraph(folderPath + m_sGraphFilename)) return ErrorCode::Fail; - - m_iDataSize = m_pSamples.R(); - m_iDataDimension = m_pSamples.C(); - m_dataUpdateLock.resize(m_iDataSize); + if (!m_pSamples.Load(p_folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; + if (!m_pTrees.LoadTrees(p_folderPath + m_sKDTFilename)) return ErrorCode::Fail; + if (!m_pGraph.LoadGraph(p_folderPath + m_sGraphFilename)) return ErrorCode::Fail; m_workSpacePool.reset(new COMMON::WorkSpacePool(m_iMaxCheck, GetNumSamples())); m_workSpacePool->Init(m_iNumberOfThreads); return ErrorCode::Success; } - template - bool Index::LoadDataPoints(std::string sDataPointsFileName) - { - std::cout << "Load Data Points From " << sDataPointsFileName << std::endl; - FILE * fp = fopen(sDataPointsFileName.c_str(), "rb"); - if (fp == NULL) return false; - - int R, C; - fread(&R, sizeof(int), 1, fp); - fread(&C, sizeof(int), 1, fp); - - if (m_iDebugLoad > 0 && R > m_iDebugLoad) R = m_iDebugLoad; - - m_pSamples.Initialize(R, C); - int i = 0, batch = 10000; - while (i + batch < R) { - fread((m_pSamples)[i], sizeof(T), C * batch, fp); - i += batch; - } - fread((m_pSamples)[i], sizeof(T), C * (R - i), fp); - fclose(fp); - std::cout << "Load Data Points (" << m_pSamples.R() << ", " << m_pSamples.C() << ") Finish!" << std::endl; - return true; - } - - // Functions for loading models from memory mapped files - template - bool Index::LoadDataPoints(char* pDataPointsMemFile) - { - int R, C; - R = *((int*)pDataPointsMemFile); - pDataPointsMemFile += sizeof(int); - - C = *((int*)pDataPointsMemFile); - pDataPointsMemFile += sizeof(int); - - m_pSamples.Initialize(R, C, (T*)pDataPointsMemFile); - - return true; - } +#pragma region K-NN search - template - bool Index::LoadKDT(std::string sKDTFilename) - { - std::cout << "Load KDT From " << sKDTFilename << std::endl; - FILE *fp = fopen(sKDTFilename.c_str(), "rb"); - if (fp == NULL) return false; - int realKDTNumber; - fread(&realKDTNumber, sizeof(int), 1, fp); - if (realKDTNumber < m_iKDTNumber) m_iKDTNumber = realKDTNumber; - m_pKDTStart.resize(m_iKDTNumber + 1, -1); - for (int i = 0; i < m_iKDTNumber; i++) { - int treeNodeSize; - fread(&treeNodeSize, sizeof(int), 1, fp); - if (treeNodeSize > 0) { - m_pKDTStart[i] = (int)(m_pKDTRoots.size()); - m_pKDTRoots.resize(m_pKDTRoots.size() + treeNodeSize); - fread(&(m_pKDTRoots[m_pKDTStart[i]]), sizeof(KDTNode), treeNodeSize, fp); - } - } - if (m_pKDTRoots.size() > 0) m_pKDTStart[m_iKDTNumber] = (int)(m_pKDTRoots.size()); - fclose(fp); - std::cout << "Load KDT (" << m_iKDTNumber << ", " << m_pKDTRoots.size() << ") Finish!" << std::endl; - return true; - } +#define Search(CheckDeleted1) \ + m_pTrees.InitSearchTrees(this, p_query, p_space, m_iNumberOfInitialDynamicPivots); \ + while (!p_space.m_NGQueue.empty()) { \ + COMMON::HeapCell gnode = p_space.m_NGQueue.pop(); \ + const int *node = m_pGraph[gnode.node]; \ + _mm_prefetch((const char *)node, _MM_HINT_T0); \ + CheckDeleted1 { \ + if (!p_query.AddPoint(gnode.node, gnode.distance) && p_space.m_iNumberOfCheckedLeaves > p_space.m_iMaxCheck) { \ + p_query.SortResult(); return; \ + } \ + } \ + for (int i = 0; i < m_pGraph.m_iNeighborhoodSize; i++) \ + _mm_prefetch((const char *)(m_pSamples)[node[i]], _MM_HINT_T0); \ + bool bLocalOpt = true; \ + for (int i = 0; i < m_pGraph.m_iNeighborhoodSize; i++) { \ + int nn_index = node[i]; \ + if (nn_index < 0) break; \ + if (p_space.CheckAndSet(nn_index)) continue; \ + float distance2leaf = m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[nn_index], GetFeatureDim()); \ + if (distance2leaf <= p_query.worstDist() || distance2leaf < gnode.distance) bLocalOpt = false; \ + p_space.m_iNumberOfCheckedLeaves++; \ + p_space.m_NGQueue.insert(COMMON::HeapCell(nn_index, distance2leaf)); \ + } \ + if (bLocalOpt) p_space.m_iNumOfContinuousNoBetterPropagation++; \ + else p_space.m_iNumOfContinuousNoBetterPropagation = 0; \ + if (p_space.m_iNumOfContinuousNoBetterPropagation > m_iThresholdOfNumberOfContinuousNoBetterPropagation) { \ + if (p_space.m_iNumberOfTreeCheckedLeaves <= p_space.m_iNumberOfCheckedLeaves / 10) { \ + m_pTrees.SearchTrees(this, p_query, p_space, m_iNumberOfOtherDynamicPivots + p_space.m_iNumberOfCheckedLeaves); \ + } else if (gnode.distance > p_query.worstDist()) { \ + break; \ + } \ + } \ + } \ + p_query.SortResult(); \ template - bool Index::LoadKDT(char* pKDTMemFile) + void Index::SearchIndexWithDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const { - int realKDTNumber = *((int*)pKDTMemFile); - pKDTMemFile += sizeof(int); - if (realKDTNumber < m_iKDTNumber) m_iKDTNumber = realKDTNumber; - m_pKDTStart.clear(); - for (int i = 0; i < m_iKDTNumber; i++) { - m_pKDTStart.push_back((int)(m_pKDTRoots.size())); - - int treeNodeSize = *((int*)pKDTMemFile); - pKDTMemFile += sizeof(int); - m_pKDTRoots.resize(m_pKDTRoots.size() + treeNodeSize); - std::memcpy(&(m_pKDTRoots[m_pKDTStart[i]]), pKDTMemFile, sizeof(KDTNode)*treeNodeSize); - pKDTMemFile += sizeof(KDTNode)*treeNodeSize; - } - m_pKDTStart.push_back((int)(m_pKDTRoots.size())); - return true; + Search(if (p_deleted.find(gnode.node) == p_deleted.end())) } template - bool Index::LoadGraph(std::string sGraphFilename) + void Index::SearchIndexWithoutDeleted(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space) const { - std::cout << "Load Graph From " << sGraphFilename << std::endl; - FILE * fp = fopen(sGraphFilename.c_str(), "rb"); - if (fp == NULL) return false; - fread(&m_iGraphSize, sizeof(int), 1, fp); - int KNNinGraph; - fread(&KNNinGraph, sizeof(int), 1, fp); - if (KNNinGraph < m_iNeighborhoodSize) m_iNeighborhoodSize = KNNinGraph; - - m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize); - - std::vector unusedData(KNNinGraph); - for (int i = 0; i < m_iGraphSize; i++) - { - fread((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp); - if (m_iNeighborhoodSize < KNNinGraph) - { - fread(&unusedData[0], sizeof(int), KNNinGraph - m_iNeighborhoodSize, fp); - } - } - fclose(fp); - std::cout << "Load Graph (" << m_iGraphSize << "," << m_iNeighborhoodSize << ") Finish!" << std::endl; - return true; - } - - template - bool Index::LoadGraph(char* pGraphMemFile) { - m_iGraphSize = *((int*)pGraphMemFile); - pGraphMemFile += sizeof(int); - - int KNNinGraph = *((int*)pGraphMemFile); - pGraphMemFile += sizeof(int); - - // In the memory mapped file mode, we'll not accept NeighborhoodSize in graph file that's larger than expected size (m_iNeighborhoodSize) - // as we don't want to make another copy to fit. - if (KNNinGraph > m_iNeighborhoodSize) return false; - - if (KNNinGraph < m_iNeighborhoodSize) m_iNeighborhoodSize = KNNinGraph; - - m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize, (int*)pGraphMemFile); - - return true; - } -#pragma endregion - -#pragma region K-NN search - template - void Index::KDTSearch(const int node, const bool isInit, const float distBound, - COMMON::WorkSpace& p_space, COMMON::QueryResultSet &p_query, const tbb::concurrent_unordered_set &p_deleted) const { - if (node < 0) - { - int index = -node - 1; - if (index >= m_iDataSize) return; - -#ifdef PREFETCH - const char* data = (const char *)(m_pSamples[index]); - _mm_prefetch(data, _MM_HINT_T0); - _mm_prefetch(data + 64, _MM_HINT_T0); -#endif - if (p_space.CheckAndSet(index)) return; - - float distance = m_fComputeDistance(p_query.GetTarget(), (T*)data, m_iDataDimension); - if (p_deleted.find(index) == p_deleted.end()) p_query.AddPoint(index, distance); - ++p_space.m_iNumberOfTreeCheckedLeaves; - ++p_space.m_iNumberOfCheckedLeaves; - p_space.m_NGQueue.insert(COMMON::HeapCell(index, distance)); - return; - } - - auto& tnode = m_pKDTRoots[node]; - - float diff = (p_query.GetTarget())[tnode.split_dim] - tnode.split_value; - float distanceBound = distBound + diff * diff; - int otherChild, bestChild; - if (diff < 0) - { - bestChild = tnode.left; - otherChild = tnode.right; - } - else - { - otherChild = tnode.left; - bestChild = tnode.right; - } - - if (!isInit || distanceBound < p_query.worstDist()) - { - p_space.m_SPTQueue.insert(COMMON::HeapCell(otherChild, distanceBound)); - } - KDTSearch(bestChild, isInit, distBound, p_space, p_query, p_deleted); - } - - template - void Index::SearchIndex(COMMON::QueryResultSet &p_query, COMMON::WorkSpace &p_space, const tbb::concurrent_unordered_set &p_deleted) const - { - for (char i = 0; i < m_iKDTNumber; i++) { - KDTSearch(m_pKDTStart[i], true, 0, p_space, p_query, p_deleted); - } - - while (!p_space.m_SPTQueue.empty() && p_space.m_iNumberOfCheckedLeaves < g_iNumberOfInitialDynamicPivots) - { - auto& tcell = p_space.m_SPTQueue.pop(); - if (p_query.worstDist() < tcell.distance) break; - KDTSearch(tcell.node, true, tcell.distance, p_space, p_query, p_deleted); - } - - while (!p_space.m_NGQueue.empty()) { - bool bLocalOpt = true; - COMMON::HeapCell gnode = p_space.m_NGQueue.pop(); - const int *node = (m_pNeighborhoodGraph)[gnode.node]; - -#ifdef PREFETCH - _mm_prefetch((const char *)node, _MM_HINT_T0); - for (int i = 0; i < m_iNeighborhoodSize; i++) - { - _mm_prefetch((const char *)(m_pSamples)[node[i]], _MM_HINT_T0); - } -#endif - - for (int i = 0; i < m_iNeighborhoodSize; i++) - { - int nn_index = node[i]; - - // do not check it if it has been checked - if (nn_index < 0) break; - if (p_space.CheckAndSet(nn_index)) continue; - - // count the number of the computed nodes - float distance2leaf = m_fComputeDistance(p_query.GetTarget(), (m_pSamples)[nn_index], m_iDataDimension); - - if (p_deleted.find(nn_index) == p_deleted.end()) p_query.AddPoint(nn_index, distance2leaf); - if (distance2leaf <= p_query.worstDist()|| distance2leaf < gnode.distance) bLocalOpt = false; - p_space.m_iNumberOfCheckedLeaves++; - p_space.m_NGQueue.insert(COMMON::HeapCell(nn_index, distance2leaf)); - } - - if (bLocalOpt) p_space.m_iNumOfContinuousNoBetterPropagation++; - else p_space.m_iNumOfContinuousNoBetterPropagation = 0; - - if (p_space.m_iNumOfContinuousNoBetterPropagation > g_iThresholdOfNumberOfContinuousNoBetterPropagation) - { - if (p_space.m_iNumberOfTreeCheckedLeaves < p_space.m_iNumberOfCheckedLeaves / 10) - { - int nextNumberOfCheckedLeaves = g_iNumberOfOtherDynamicPivots + p_space.m_iNumberOfCheckedLeaves; - while (!p_space.m_SPTQueue.empty() && p_space.m_iNumberOfCheckedLeaves < nextNumberOfCheckedLeaves) - { - auto& tcell = p_space.m_SPTQueue.pop(); - KDTSearch(tcell.node, false, tcell.distance, p_space, p_query, p_deleted); - } - } - else if (gnode.distance > p_query.worstDist()) { - break; - } - } - if (p_space.m_iNumberOfCheckedLeaves >= p_space.m_iMaxCheck) break; - } - p_query.SortResult(); + Search(;) } template @@ -339,7 +95,11 @@ namespace SPTAG auto workSpace = m_workSpacePool->Rent(); workSpace->Reset(m_iMaxCheck); - SearchIndex(*((COMMON::QueryResultSet*)&p_query), *workSpace, m_deletedID); + if (m_deletedID.size() > 0) + SearchIndexWithDeleted(*((COMMON::QueryResultSet*)&p_query), *workSpace, m_deletedID); + else + SearchIndexWithoutDeleted(*((COMMON::QueryResultSet*)&p_query), *workSpace); + m_workSpacePool->Return(workSpace); if (p_query.WithMeta() && nullptr != m_pMetadata) @@ -350,523 +110,33 @@ namespace SPTAG p_query.SetMetadata(i, (result < 0) ? ByteArray::c_empty : m_pMetadata->GetMetadata(result)); } } - return ErrorCode::Success; } #pragma endregion -#pragma region Build/Save kd-tree & neighborhood graphs template ErrorCode Index::BuildIndex(const void* p_data, int p_vectorNum, int p_dimension) { - m_pSamples.Initialize(p_vectorNum, p_dimension); - std::memcpy(m_pSamples.GetData(), p_data, p_vectorNum * p_dimension * sizeof(T)); - m_iDataSize = m_pSamples.R(); - m_iDataDimension = m_pSamples.C(); - m_dataUpdateLock.resize(m_iDataSize); + omp_set_num_threads(m_iNumberOfThreads); + + m_pSamples.Initialize(p_vectorNum, p_dimension, (T*)p_data, false); if (DistCalcMethod::Cosine == m_iDistCalcMethod) { int base = COMMON::Utils::GetBase(); - for (int i = 0; i < m_iDataSize; i++) { - COMMON::Utils::Normalize(m_pSamples[i], m_iDataDimension, base); +#pragma omp parallel for + for (int i = 0; i < GetNumSamples(); i++) { + COMMON::Utils::Normalize(m_pSamples[i], GetFeatureDim(), base); } } - std::vector indices(m_iDataSize); - for (int j = 0; j < m_iDataSize; j++) indices[j] = j; - BuildKDT(indices, m_pKDTStart, m_pKDTRoots); - BuildRNG(); m_workSpacePool.reset(new COMMON::WorkSpacePool(m_iMaxCheck, GetNumSamples())); m_workSpacePool->Init(m_iNumberOfThreads); - return ErrorCode::Success; - } - -#pragma region Build/Save kd-tree - template - bool Index::SaveKDT(std::string sKDTFilename, std::vector& newStart, std::vector& newRoot) const - { - std::cout << "Save KDT to " << sKDTFilename << std::endl; - FILE *fp = fopen(sKDTFilename.c_str(), "wb"); - if (fp == NULL) return false; - fwrite(&m_iKDTNumber, sizeof(int), 1, fp); - for (int i = 0; i < m_iKDTNumber; i++) - { - int treeNodeSize = newStart[i + 1] - newStart[i]; - fwrite(&treeNodeSize, sizeof(int), 1, fp); - if (treeNodeSize > 0) fwrite(&(newRoot[newStart[i]]), sizeof(KDTNode), treeNodeSize, fp); - } - fclose(fp); - std::cout << "Save KDT Finish!" << std::endl; - return true; - } - - template - void Index::BuildKDT(std::vector& indices, std::vector& newStart, std::vector& newRoot) - { - omp_set_num_threads(m_iNumberOfThreads); - newRoot.resize(m_iKDTNumber * indices.size()); - if (indices.size() > 0) - newStart.resize(m_iKDTNumber + 1, (int)(newRoot.size())); - else - { - newStart.resize(m_iKDTNumber + 1, -1); - return; - } -#pragma omp parallel for - for (int i = 0; i < m_iKDTNumber; i++) - { - Sleep(i * 100); std::srand(clock()); - - std::vector pindices(indices.begin(), indices.end()); - std::random_shuffle(pindices.begin(), pindices.end()); - - newStart[i] = i * (int)pindices.size(); - std::cout << "Start to build tree " << i + 1 << std::endl; - int iTreeSize = newStart[i]; - DivideTree(newRoot.data(), pindices, 0, (int)pindices.size() - 1, newStart[i], iTreeSize); - std::cout << i + 1 << " trees built, " << iTreeSize - newStart[i] << " " << pindices.size() << std::endl; - } - } - - template - void Index::DivideTree(KDTNode* pTree, std::vector& indices, int first, int last, - int index, int &iTreeSize) { - ChooseDivision(pTree[index], indices, first, last); - int i = Subdivide(pTree[index], indices, first, last); - if (i - 1 <= first) - { - pTree[index].left = -indices[first] - 1; - } - else - { - iTreeSize++; - pTree[index].left = iTreeSize; - DivideTree(pTree, indices, first, i - 1, iTreeSize, iTreeSize); - } - if (last == i) - { - pTree[index].right = -indices[last] - 1; - } - else - { - iTreeSize++; - pTree[index].right = iTreeSize; - DivideTree(pTree, indices, i, last, iTreeSize, iTreeSize); - } - } - - template - void Index::ChooseDivision(KDTNode& node, const std::vector& indices, int first, int last) - { - std::vector meanValues(m_iDataDimension, 0); - std::vector varianceValues(m_iDataDimension, 0); - int end = min(first + m_numSamplesKDTSplitConsideration, last); - int count = end - first + 1; - // calculate the mean of each dimension - for (int j = first; j <= end; j++) - { - T* v = (m_pSamples)[indices[j]]; - for (int k = 0; k < m_iDataDimension; k++) - { - meanValues[k] += v[k]; - } - } - for (int k = 0; k < m_iDataDimension; k++) - { - meanValues[k] /= count; - } - // calculate the variance of each dimension - for (int j = first; j <= end; j++) - { - T* v = (m_pSamples)[indices[j]]; - for (int k = 0; k < m_iDataDimension; k++) - { - float dist = v[k] - meanValues[k]; - varianceValues[k] += dist*dist; - } - } - // choose the split dimension as one of the dimension inside TOP_DIM maximum variance - node.split_dim = SelectDivisionDimension(varianceValues); - // determine the threshold - node.split_value = meanValues[node.split_dim]; - } - - template - int Index::SelectDivisionDimension(const std::vector& varianceValues) const - { - // Record the top maximum variances - std::vector topind(m_numTopDimensionKDTSplit); - int num = 0; - // order the variances - for (int i = 0; i < m_iDataDimension; i++) - { - if (num < m_numTopDimensionKDTSplit || varianceValues[i] > varianceValues[topind[num - 1]]) - { - if (num < m_numTopDimensionKDTSplit) - { - topind[num++] = i; - } - else - { - topind[num - 1] = i; - } - int j = num - 1; - // order the TOP_DIM variances - while (j > 0 && varianceValues[topind[j]] > varianceValues[topind[j - 1]]) - { - std::swap(topind[j], topind[j - 1]); - j--; - } - } - } - // randomly choose a dimension from TOP_DIM - return topind[COMMON::Utils::rand_int(num)]; - } - - template - int Index::Subdivide(const KDTNode& node, std::vector& indices, const int first, const int last) - { - int i = first; - int j = last; - // decide which child one point belongs - while (i <= j) - { - int ind = indices[i]; - float val = (m_pSamples)[ind][node.split_dim]; - if (val < node.split_value) - { - i++; - } - else - { - std::swap(indices[i], indices[j]); - j--; - } - } - // if all the points in the node are equal,equally split the node into 2 - if ((i == first) || (i == last + 1)) - { - i = (first + last + 1) / 2; - } - return i; - } -#pragma endregion - -#pragma region Build/Save neighborhood graph - template - bool Index::SaveRNG(std::string sGraphFilename) const - { - std::cout << "Save Graph To " << sGraphFilename << std::endl; - FILE *fp = fopen(sGraphFilename.c_str(), "wb"); - if (fp == NULL) return false; - fwrite(&m_iGraphSize, sizeof(int), 1, fp); - fwrite(&m_iNeighborhoodSize, sizeof(int), 1, fp); - - for (int i = 0; i < m_iGraphSize; i++) - { - fwrite((m_pNeighborhoodGraph)[i], sizeof(int), m_iNeighborhoodSize, fp); - } - fclose(fp); - std::cout << "Save Graph Finish!" << std::endl; - return true; - } - - template - void Index::PartitionByTptree(std::vector& indices, - const int first, - const int last, - std::vector> & leaves) - { - if (last - first <= m_iTPTLeafSize) - { - leaves.push_back(std::make_pair(first, last)); - } - else - { - std::vector Mean(m_iDataDimension, 0); - - int iIteration = 100; - int end = min(first + m_numSamplesTPTSplitConsideration, last); - int count = end - first + 1; - // calculate the mean of each dimension - for (int j = first; j <= end; j++) - { - T* v = (m_pSamples)[indices[j]]; - for (int k = 0; k < m_iDataDimension; k++) - { - Mean[k] += v[k]; - } - } - for (int k = 0; k < m_iDataDimension; k++) - { - Mean[k] /= count; - } - std::vector Variance; - Variance.reserve(m_iDataDimension); - for (int j = 0; j < m_iDataDimension; j++) - { - Variance.push_back(BasicResult(j, 0)); - } - // calculate the variance of each dimension - for (int j = first; j <= end; j++) - { - T* v = (m_pSamples)[indices[j]]; - for (int k = 0; k < m_iDataDimension; k++) - { - float dist = v[k] - Mean[k]; - Variance[k].Dist += dist*dist; - } - } - std::sort(Variance.begin(), Variance.end(),COMMON::Compare); - std::vector index(m_numTopDimensionTPTSplit); - std::vector weight(m_numTopDimensionTPTSplit), bestweight(m_numTopDimensionTPTSplit); - float bestvariance = Variance[m_iDataDimension - 1].Dist; - for (int i = 0; i < m_numTopDimensionTPTSplit; i++) - { - index[i] = Variance[m_iDataDimension - 1 - i].VID; - bestweight[i] = 0; - } - bestweight[0] = 1; - float bestmean = Mean[index[0]]; - - std::vector Val(count); - for (int i = 0; i < iIteration; i++) - { - float sumweight = 0; - for (int j = 0; j < m_numTopDimensionTPTSplit; j++) - { - weight[j] = float(rand() % 10000) / 5000.0f - 1.0f; - sumweight += weight[j] * weight[j]; - } - sumweight = sqrt(sumweight); - for (int j = 0; j < m_numTopDimensionTPTSplit; j++) - { - weight[j] /= sumweight; - } - float mean = 0; - for (int j = 0; j < count; j++) - { - Val[j] = 0; - for (int k = 0; k < m_numTopDimensionTPTSplit; k++) - { - Val[j] += weight[k] * (m_pSamples)[indices[first + j]][index[k]]; - } - mean += Val[j]; - } - mean /= count; - float var = 0; - for (int j = 0; j < count; j++) - { - float dist = Val[j] - mean; - var += dist * dist; - } - if (var > bestvariance) - { - bestvariance = var; - bestmean = mean; - for (int j = 0; j < m_numTopDimensionTPTSplit; j++) - { - bestweight[j] = weight[j]; - } - } - } - int i = first; - int j = last; - // decide which child one point belongs - while (i <= j) - { - float val = 0; - for (int k = 0; k < m_numTopDimensionTPTSplit; k++) - { - val += bestweight[k] * (m_pSamples)[indices[i]][index[k]]; - } - if (val < bestmean) - { - i++; - } - else - { - std::swap(indices[i], indices[j]); - j--; - } - } - // if all the points in the node are equal,equally split the node into 2 - if ((i == first) || (i == last + 1)) - { - i = (first + last + 1) / 2; - } - - Mean.clear(); - Variance.clear(); - Val.clear(); - index.clear(); - weight.clear(); - bestweight.clear(); - - PartitionByTptree(indices, first, i - 1, leaves); - PartitionByTptree(indices, i, last, leaves); - } - } - - template - void Index::RefineRNG() { - std::vector spaces(m_iNumberOfThreads); - for (int i = 0; i < m_iNumberOfThreads; i++) spaces[i].Initialize(m_iMaxCheckForRefineGraph, m_iGraphSize); - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < m_iGraphSize; i++) - { - RefineRNGNode(i, spaces[omp_get_thread_num()], false); - } - } - - template - void Index::BuildRNG() - { - std::cout << "build RNG graph!" << std::endl; - omp_set_num_threads(m_iNumberOfThreads); - - int graphScale = 16; - int cefScale = 4; - m_iNeighborhoodSize *= graphScale; - m_iGraphSize = m_iDataSize; - - m_pNeighborhoodGraph.Initialize(m_iGraphSize, m_iNeighborhoodSize); - if (m_iGraphSize < 1000) { - std::memset(m_pNeighborhoodGraph.GetData(), -1, m_iGraphSize * m_iNeighborhoodSize * sizeof(int)); - m_iNeighborhoodSize /= graphScale; - - COMMON::WorkSpace space; - space.Initialize(m_iMaxCheckForRefineGraph, m_iGraphSize); - for (int i = 0; i < m_iGraphSize; i++) - { - RefineRNGNode(i, space, true); - } - std::cout << "Build RNG Graph end!" << std::endl; - return; - } - - { - COMMON::Dataset NeighborhoodDists(m_iGraphSize, m_iNeighborhoodSize); - std::vector> TptreeDataIndices(m_iTPTNumber, std::vector(m_iGraphSize)); - std::vector>> TptreeLeafNodes(m_iTPTNumber, std::vector>()); - for (int i = 0; i < m_iGraphSize; i++) - { - for (int j = 0; j < m_iNeighborhoodSize; j++) - { - (m_pNeighborhoodGraph)[i][j] = -1; - (NeighborhoodDists)[i][j] = MaxDist; - } - TptreeDataIndices[0][i] = i; - } - for (int i = 1; i < m_iTPTNumber; i++) { - std::memcpy(TptreeDataIndices[i].data(), TptreeDataIndices[0].data(), sizeof(int) * m_iGraphSize); - } - - std::cout << "Parallel TpTree Partition begin " << std::endl; -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < m_iTPTNumber; i++) - { - Sleep(i * 100); std::srand(clock()); - std::random_shuffle(TptreeDataIndices[i].begin(), TptreeDataIndices[i].end()); - PartitionByTptree(TptreeDataIndices[i], 0, m_iGraphSize - 1, TptreeLeafNodes[i]); - std::cout << "Finish Getting Leaves for Tree " << i << std::endl; - } - std::cout << "Parallel TpTree Partition done" << std::endl; - - for (int i = 0; i < m_iTPTNumber; i++) - { -#pragma omp parallel for schedule(dynamic) - for (int j = 0; j < TptreeLeafNodes[i].size(); j++) - { - int start_index = TptreeLeafNodes[i][j].first; - int end_index = TptreeLeafNodes[i][j].second; - if (omp_get_thread_num() == 0) std::cout << "\rProcessing Tree " << i << ' ' << j * 100 / TptreeLeafNodes[i].size() << '%'; - for (int x = start_index; x < end_index; x++) - { - for (int y = x + 1; y <= end_index; y++) - { - int p1 = TptreeDataIndices[i][x]; - int p2 = TptreeDataIndices[i][y]; - float dist = m_fComputeDistance((m_pSamples)[p1], (m_pSamples)[p2], m_iDataDimension); - COMMON::Utils::AddNeighbor(p2, dist, (m_pNeighborhoodGraph)[p1], (NeighborhoodDists)[p1], m_iNeighborhoodSize); - COMMON::Utils::AddNeighbor(p1, dist, (m_pNeighborhoodGraph)[p2], (NeighborhoodDists)[p2], m_iNeighborhoodSize); } - } - } - TptreeDataIndices[i].clear(); - TptreeLeafNodes[i].clear(); - std::cout << std::endl; - } - TptreeDataIndices.clear(); - TptreeLeafNodes.clear(); - } - std::cout << "NNG acc:" << GraphAccuracyEstimation(100, false) << std::endl; - if (m_iMaxCheckForRefineGraph > 0) { - m_iCEF *= cefScale; - m_iMaxCheckForRefineGraph *= cefScale; - RefineRNG(); - std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(100, true) << std::endl; - - m_iCEF /= cefScale; - m_iMaxCheckForRefineGraph /= cefScale; - m_iNeighborhoodSize /= graphScale; - - //RefineRNG(); - //std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(100, true) << std::endl; - RefineRNG(); - std::cout << "Refine RNG, graph acc:" << GraphAccuracyEstimation(100, true) << std::endl; - } - std::cout << "Build RNG Graph end!" << std::endl; - } - - template - float Index::GraphAccuracyEstimation(int NSample, bool rng) { - int* correct = new int[NSample]; - -#pragma omp parallel for schedule(dynamic) - for (int i = 0; i < NSample; i++) - { - int x = COMMON::Utils::rand_int(m_iGraphSize); - //int x = i; - COMMON::QueryResultSet query((m_pSamples)[x], m_iCEF); - for (int y = 0; y < m_iGraphSize; y++) - { - if (y == x) continue; - float dist = m_fComputeDistance(query.GetTarget(), (m_pSamples)[y], m_iDataDimension); - query.AddPoint(y, dist); - } - query.SortResult(); - int * exact_rng = new int[m_iNeighborhoodSize]; - if (rng) { - RebuildRNGNodeNeighbors(exact_rng, query.GetResults(), m_iCEF); - } - else { - for (int j = 0; j < m_iNeighborhoodSize && j < m_iCEF; j++) { - exact_rng[j] = query.GetResult(j)->VID; - } - for (int j = m_iCEF; j < m_iNeighborhoodSize; j++) exact_rng[j] = -1; - } - correct[i] = 0; - for (int j = 0; j < m_iNeighborhoodSize; j++) { - if (exact_rng[j] == -1) { - correct[i] += m_iNeighborhoodSize - j; - break; - } - for (int k = 0; k < m_iNeighborhoodSize; k++) - if ((m_pNeighborhoodGraph)[x][k] == exact_rng[j]) { - correct[i]++; - break; - } - } - delete[] exact_rng; - } - float acc = 0; - for (int i = 0; i < NSample; i++) acc += float(correct[i]); - acc = acc / NSample / m_iNeighborhoodSize; - delete[] correct; - return acc; + m_pTrees.BuildTrees(this); + m_pGraph.BuildGraph(this); + + return ErrorCode::Success; } template @@ -882,138 +152,43 @@ namespace SPTAG { mkdir(folderPath.c_str()); } - tbb::concurrent_unordered_set deleted(m_deletedID.begin(), m_deletedID.end()); + + std::lock_guard lock(m_dataLock); + int newR = GetNumSamples(); + std::vector indices; - std::unordered_map old2new; - int newR = m_iDataSize; + std::vector reverseIndices(newR); for (int i = 0; i < newR; i++) { - if (deleted.find(i) == deleted.end()) { + if (m_deletedID.find(i) == m_deletedID.end()) { indices.push_back(i); - old2new[i] = i; + reverseIndices[i] = i; } else { - while (deleted.find(newR - 1) != deleted.end() && newR > i) newR--; + while (m_deletedID.find(newR - 1) != m_deletedID.end() && newR > i) newR--; if (newR == i) break; indices.push_back(newR - 1); - old2new[newR - 1] = i; + reverseIndices[newR - 1] = i; newR--; } } - old2new[-1] = -1; - std::cout << "Refine... from " << m_iDataSize << "->" << newR << std::endl; - std::ofstream vecOut(folderPath + m_sDataPointsFilename, std::ios::binary); - if (!vecOut.is_open()) return ErrorCode::FailedCreateFile; - vecOut.write((char*)&newR, sizeof(int)); - vecOut.write((char*)&m_iDataDimension, sizeof(int)); - for (int i = 0; i < newR; i++) { - vecOut.write((char*)(m_pSamples[indices[i]]), sizeof(T)*m_iDataDimension); - } - vecOut.close(); + std::cout << "Refine... from " << GetNumSamples() << "->" << newR << std::endl; - if (nullptr != m_pMetadata) - { - std::ofstream metaOut(folderPath + "metadata.bin_tmp", std::ios::binary); - std::ofstream metaIndexOut(folderPath + "metadataIndex.bin", std::ios::binary); - if (!metaOut.is_open() || !metaIndexOut.is_open()) return ErrorCode::FailedCreateFile; - metaIndexOut.write((char*)&newR, sizeof(int)); - std::uint64_t offset = 0; - for (int i = 0; i < newR; i++) { - metaIndexOut.write((char*)&offset, sizeof(std::uint64_t)); - ByteArray meta = m_pMetadata->GetMetadata(indices[i]); - metaOut.write((char*)meta.Data(), sizeof(uint8_t)*meta.Length()); - offset += meta.Length(); - } - metaOut.close(); - metaIndexOut.write((char*)&offset, sizeof(std::uint64_t)); - metaIndexOut.close(); - - SPTAG::MetadataSet::MetaCopy(folderPath + "metadata.bin_tmp", folderPath + "metadata.bin"); - } + if (false == m_pSamples.Refine(indices, folderPath + m_sDataPointsFilename)) return ErrorCode::FailedCreateFile; + if (nullptr != m_pMetadata && ErrorCode::Success != m_pMetadata->RefineMetadata(indices, folderPath)) return ErrorCode::FailedCreateFile; - std::ofstream graphOut(folderPath + m_sGraphFilename, std::ios::binary); - if (!graphOut.is_open()) return ErrorCode::FailedCreateFile; - graphOut.write((char*)&newR, sizeof(int)); - graphOut.write((char*)&m_iNeighborhoodSize, sizeof(int)); - - int *neighbors = new int[m_iNeighborhoodSize]; - COMMON::WorkSpace space; - space.Initialize(m_iMaxCheckForRefineGraph, m_iDataSize); - for (int i = 0; i < newR; i++) { - space.Reset(m_iMaxCheckForRefineGraph); - COMMON::QueryResultSet query((m_pSamples)[indices[i]], m_iCEF); - space.CheckAndSet(indices[i]); - for (int j = 0; j < m_iNeighborhoodSize; j++) { - int index = m_pNeighborhoodGraph[indices[i]][j]; - if (index < 0 || space.CheckAndSet(index)) continue; - space.m_NGQueue.insert(COMMON::HeapCell(index, m_fComputeDistance(query.GetTarget(), m_pSamples[index], m_iDataDimension))); - } - SearchIndex(query, space, deleted); - RebuildRNGNodeNeighbors(neighbors, query.GetResults(), m_iCEF); - for (int j = 0; j < m_iNeighborhoodSize; j++) - neighbors[j] = old2new[neighbors[j]]; - graphOut.write((char*)neighbors, sizeof(int) * m_iNeighborhoodSize); - } - delete[]neighbors; - graphOut.close(); - - std::vector newRoot; - std::vector newStart; - BuildKDT(indices, newStart, newRoot); - + m_pGraph.RefineGraph(this, indices, reverseIndices, folderPath + m_sGraphFilename); + + COMMON::KDTree newTrees(m_pTrees); + newTrees.BuildTrees(this, &indices); #pragma omp parallel for - for (int i = 0; i < m_iKDTNumber; i++) - { - for (int j = newStart[i]; j < newStart[i+1]; j++) { - if (newRoot[j].left < 0) - newRoot[j].left = -old2new[-newRoot[j].left - 1] - 1; - if (newRoot[j].right < 0) - newRoot[j].right = -old2new[-newRoot[j].right - 1] - 1; - } + for (int i = 0; i < newTrees.size(); i++) { + if (newTrees[i].left < 0) + newTrees[i].left = -reverseIndices[-newTrees[i].left - 1] - 1; + if (newTrees[i].right < 0) + newTrees[i].right = -reverseIndices[-newTrees[i].right - 1] - 1; } - SaveKDT(folderPath + m_sKDTFilename, newStart, newRoot); - return ErrorCode::Success; - } - - template - ErrorCode Index::MergeIndex(const char* p_indexFilePath1, const char* p_indexFilePath2) { - std::string folderPath1(p_indexFilePath1), folderPath2(p_indexFilePath2); - if (!folderPath1.empty() && *(folderPath1.rbegin()) != FolderSep) folderPath1 += FolderSep; - if (!folderPath2.empty() && *(folderPath2.rbegin()) != FolderSep) folderPath2 += FolderSep; - - Helper::IniReader p_configReader1, p_configReader2; - if (ErrorCode::Success != p_configReader1.LoadIniFile(folderPath1 + "/indexloader.ini")) - return ErrorCode::FailedOpenFile; - - if (ErrorCode::Success != p_configReader2.LoadIniFile(folderPath2 + "/indexloader.ini")) - return ErrorCode::FailedOpenFile; - - std::string empty(""); - if (!COMMON::DataUtils::MergeIndex(folderPath1 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), - folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), - folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty), - folderPath2 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), - folderPath2 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), - folderPath2 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty))) - return ErrorCode::Fail; - -#define DefineKDTParameter(VarName, VarType, DefaultValue, RepresentStr) \ - SetParameter(RepresentStr, \ - p_configReader1.GetParameter("Index", \ - RepresentStr, \ - std::string(#DefaultValue)).c_str()); \ - -#include "inc/Core/KDT/ParameterDefinitionList.h" -#undef DefineKDTParameter - - if (!LoadDataPoints(folderPath1 + p_configReader1.GetParameter("Index", "VectorFilePath", empty))) return ErrorCode::FailedOpenFile; - std::vector indices(m_iDataSize); - for (int j = 0; j < m_iDataSize; j++) indices[j] = j; - BuildKDT(indices, m_pKDTStart, m_pKDTRoots); - BuildRNG(); - - SaveKDT(folderPath1 + p_configReader1.GetParameter("Index", "TreeFilePath", empty), m_pKDTStart, m_pKDTRoots); - SaveRNG(folderPath1 + p_configReader1.GetParameter("Index", "GraphFilePath", empty)); + newTrees.SaveTrees(folderPath + m_sKDTFilename); return ErrorCode::Success; } @@ -1022,10 +197,12 @@ namespace SPTAG const T* ptr_v = (const T*)p_vectors; #pragma omp parallel for schedule(dynamic) for (int i = 0; i < p_vectorNum; i++) { - COMMON::QueryResultSet query(ptr_v + i * m_iDataDimension, m_iCEF); + COMMON::QueryResultSet query(ptr_v + i * GetFeatureDim(), m_pGraph.m_iCEF); SearchIndex(query); - for (int i = 0; i < m_iCEF; i++) { + + for (int i = 0; i < m_pGraph.m_iCEF; i++) { if (query.GetResult(i)->Dist < 1e-6) { + std::lock_guard lock(m_dataLock); m_deletedID.insert(query.GetResult(i)->VID); } } @@ -1034,230 +211,79 @@ namespace SPTAG } template - ErrorCode Index::AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension) { - if (m_pKDTRoots.size() == 0) { - return BuildIndex(p_vectors, p_vectorNum, p_dimension); - } - if (p_dimension != m_iDataDimension) return ErrorCode::FailedParseValue; - + ErrorCode Index::AddIndex(const void* p_vectors, int p_vectorNum, int p_dimension) + { int begin, end; { - std::lock_guard lock(m_dataAllocLock); + std::lock_guard lock(m_dataLock); + + if (GetNumSamples() == 0) + return BuildIndex(p_vectors, p_vectorNum, p_dimension); + + if (p_dimension != GetFeatureDim()) + return ErrorCode::FailedParseValue; + + begin = GetNumSamples(); + end = GetNumSamples() + p_vectorNum; m_pSamples.AddBatch((const T*)p_vectors, p_vectorNum); - m_pNeighborhoodGraph.AddBatch(p_vectorNum); + m_pGraph.AddBatch(p_vectorNum); - end = m_iDataSize + p_vectorNum; - if (m_pSamples.R() != end || m_pNeighborhoodGraph.R() != end) { + if (m_pSamples.R() != end || m_pGraph.R() != end) { std::cout << "Memory Error: Cannot alloc space for vectors" << std::endl; - m_pSamples.SetR(m_iDataSize); - m_pNeighborhoodGraph.SetR(m_iDataSize); + m_pSamples.SetR(begin); + m_pGraph.SetR(begin); return ErrorCode::Fail; } - begin = m_iDataSize; - m_iDataSize = end; - m_iGraphSize = end; - m_dataUpdateLock.resize(m_iDataSize); - } - if (DistCalcMethod::Cosine == m_iDistCalcMethod) - { - int base = COMMON::Utils::GetBase(); - for (int i = begin; i < end; i++) { - COMMON::Utils::Normalize((T*)m_pSamples[i], m_iDataDimension, base); + if (DistCalcMethod::Cosine == m_iDistCalcMethod) + { + int base = COMMON::Utils::GetBase(); + for (int i = begin; i < end; i++) { + COMMON::Utils::Normalize((T*)m_pSamples[i], GetFeatureDim(), base); + } } } - auto space = m_workSpacePool->Rent(); for (int node = begin; node < end; node++) { - RefineRNGNode(node, *(space.get()), true); + m_pGraph.RefineNode(this, node, true); } - m_workSpacePool->Return(space); std::cout << "Add " << p_vectorNum << " vectors" << std::endl; return ErrorCode::Success; } - template - void Index::RefineRNGNode(const int node, COMMON::WorkSpace &space, bool updateNeighbors) { - space.Reset(m_iMaxCheckForRefineGraph); - COMMON::QueryResultSet query((m_pSamples)[node], m_iCEF); - space.CheckAndSet(node); - for (int i = 0; i < m_iNeighborhoodSize; i++) { - int index = m_pNeighborhoodGraph[node][i]; - if (index < 0 || space.CheckAndSet(index)) continue; - space.m_NGQueue.insert(COMMON::HeapCell(index, m_fComputeDistance(query.GetTarget(), m_pSamples[index], m_iDataDimension))); - } - SearchIndex(query, space, m_deletedID); - RebuildRNGNodeNeighbors(m_pNeighborhoodGraph[node], query.GetResults(), m_iCEF); - - if (updateNeighbors) { - // update neighbors - for (int j = 0; j < m_iCEF; j++) - { - BasicResult* item = query.GetResult(j); - if (item->VID < 0) break; - - int insertID = node; - int* nodes = m_pNeighborhoodGraph[item->VID]; - std::lock_guard lock(m_dataUpdateLock[item->VID]); - for (int k = 0; k < m_iNeighborhoodSize; k++) - { - int tmpNode = nodes[k]; - if (tmpNode < 0) - { - bool good = true; - for (int t = 0; t < k; t++) { - if (m_fComputeDistance((m_pSamples)[insertID], (m_pSamples)[nodes[t]], m_iDataDimension) < item->Dist) { - good = false; - break; - } - } - if (good) { - nodes[k] = insertID; - } - break; - } - float tmpDist = m_fComputeDistance(m_pSamples[item->VID], m_pSamples[tmpNode], m_iDataDimension); - if (item->Dist < tmpDist || (item->Dist == tmpDist && insertID < tmpNode)) - { - bool good = true; - for (int t = 0; t < k; t++) { - if (m_fComputeDistance((m_pSamples)[insertID], (m_pSamples)[nodes[t]], m_iDataDimension) < item->Dist) { - good = false; - break; - } - } - if (good) { - nodes[k] = insertID; - insertID = tmpNode; - item->Dist = tmpDist; - } - else { - break; - } - } - } - } - } - } - - template - void Index::RebuildRNGNodeNeighbors(int* nodes, const BasicResult* queryResults, int numResults) { - int count = 0; - for (int j = 0; j < numResults && count < m_iNeighborhoodSize; j++) { - const BasicResult& item = queryResults[j]; - if (item.VID < 0) continue; - - bool good = true; - for (int k = 0; k < count; k++) { - if (m_fComputeDistance((m_pSamples)[nodes[k]], (m_pSamples)[item.VID], m_iDataDimension) <= item.Dist) { - good = false; - break; - } - } - if (good) nodes[count++] = item.VID; - } - for (int j = count; j < m_iNeighborhoodSize; j++) nodes[j] = -1; - } - - template - bool Index::SaveDataPoints(std::string sDataPointsFileName) - { - std::cout << "Save Data Points To " << sDataPointsFileName << std::endl; - - FILE * fp = fopen(sDataPointsFileName.c_str(), "wb"); - if (fp == NULL) return false; - - int R = m_pSamples.R(), C = m_pSamples.C(); - fwrite(&R, sizeof(int), 1, fp); - fwrite(&C, sizeof(int), 1, fp); - - // write point one by one in case for cache miss - for (int i = 0; i < R; i++) { - fwrite((m_pSamples)[i], sizeof(T), C, fp); - } - fclose(fp); - - std::cout << "Save Data Points (" << m_pSamples.R() << ", " << m_pSamples.C() << ") Finish!" << std::endl; - return true; - } - template ErrorCode - Index::SaveIndex(const std::string& p_folderPath) + Index::SaveIndex(const std::string& p_folderPath, std::ofstream& p_configout) { - std::string folderPath(p_folderPath); - if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) - { - folderPath += FolderSep; - } - - if (!direxists(folderPath.c_str())) - { - mkdir(folderPath.c_str()); - } - - std::string loaderFilePath = folderPath + "indexloader.ini"; - - std::ofstream loaderFile(loaderFilePath); - if (!loaderFile.is_open()) - { - return ErrorCode::FailedCreateFile; - } - m_sDataPointsFilename = "vectors.bin"; m_sKDTFilename = "tree.bin"; m_sGraphFilename = "graph.bin"; - std::string metadataFile = "metadata.bin"; - std::string metadataIndexFile = "metadataIndex.bin"; - - loaderFile << "[Index]" << std::endl; - loaderFile << "IndexAlgoType=" << Helper::Convert::ConvertToString(IndexAlgoType::KDT) << std::endl; - loaderFile << "ValueType=" << Helper::Convert::ConvertToString(GetEnumValueType()) << std::endl; - loaderFile << std::endl; #define DefineKDTParameter(VarName, VarType, DefaultValue, RepresentStr) \ - loaderFile << RepresentStr << "=" << GetParameter(RepresentStr) << std::endl; + p_configout << RepresentStr << "=" << GetParameter(RepresentStr) << std::endl; #include "inc/Core/KDT/ParameterDefinitionList.h" #undef DefineKDTParameter - loaderFile << std::endl; - - if (nullptr != m_pMetadata) - { - loaderFile << "[MetaData]" << std::endl; - loaderFile << "MetaDataFilePath=" << metadataFile << std::endl; - loaderFile << "MetaDataIndexPath=" << metadataIndexFile << std::endl; - loaderFile << std::endl; - } - loaderFile.close(); + p_configout << std::endl; if (m_deletedID.size() > 0) { - RefineIndex(folderPath); + RefineIndex(p_folderPath); } else { - if (!SaveDataPoints(folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; - if (!SaveKDT(folderPath + m_sKDTFilename, m_pKDTStart, m_pKDTRoots)) return ErrorCode::Fail; - if (!SaveRNG(folderPath + m_sGraphFilename)) return ErrorCode::Fail; - if (nullptr != m_pMetadata) - { - m_pMetadata->SaveMetadata(folderPath + metadataFile, folderPath + metadataIndexFile); - } + if (!m_pSamples.Save(p_folderPath + m_sDataPointsFilename)) return ErrorCode::Fail; + if (!m_pTrees.SaveTrees(p_folderPath + m_sKDTFilename)) return ErrorCode::Fail; + if (!m_pGraph.SaveGraph(p_folderPath + m_sGraphFilename)) return ErrorCode::Fail; } return ErrorCode::Success; } -#pragma endregion -#pragma endregion template ErrorCode Index::SetParameter(const char* p_param, const char* p_value) { - if (nullptr == p_param || nullptr == p_value) - { - return ErrorCode::Fail; - } + if (nullptr == p_param || nullptr == p_value) return ErrorCode::Fail; #define DefineKDTParameter(VarName, VarType, DefaultValue, RepresentStr) \ else if (SPTAG::Helper::StrUtils::StrEqualIgnoreCase(p_param, RepresentStr)) \ @@ -1282,10 +308,7 @@ namespace SPTAG std::string Index::GetParameter(const char* p_param) const { - if (nullptr == p_param) - { - return std::string(); - } + if (nullptr == p_param) return std::string(); #define DefineKDTParameter(VarName, VarType, DefaultValue, RepresentStr) \ else if (SPTAG::Helper::StrUtils::StrEqualIgnoreCase(p_param, RepresentStr)) \ diff --git a/AnnService/src/Core/MetadataSet.cpp b/AnnService/src/Core/MetadataSet.cpp index 405698ed..999c5b47 100644 --- a/AnnService/src/Core/MetadataSet.cpp +++ b/AnnService/src/Core/MetadataSet.cpp @@ -5,6 +5,31 @@ using namespace SPTAG; +ErrorCode +MetadataSet::RefineMetadata(std::vector& indices, const std::string& p_folderPath) +{ + std::ofstream metaOut(p_folderPath + "metadata.bin_tmp", std::ios::binary); + std::ofstream metaIndexOut(p_folderPath + "metadataIndex.bin", std::ios::binary); + if (!metaOut.is_open() || !metaIndexOut.is_open()) return ErrorCode::FailedCreateFile; + + int R = (int)indices.size(); + metaIndexOut.write((char*)&R, sizeof(int)); + std::uint64_t offset = 0; + for (int i = 0; i < R; i++) { + metaIndexOut.write((char*)&offset, sizeof(std::uint64_t)); + ByteArray meta = GetMetadata(indices[i]); + metaOut.write((char*)meta.Data(), sizeof(uint8_t)*meta.Length()); + offset += meta.Length(); + } + metaOut.close(); + metaIndexOut.write((char*)&offset, sizeof(std::uint64_t)); + metaIndexOut.close(); + + SPTAG::MetadataSet::MetaCopy(p_folderPath + "metadata.bin_tmp", p_folderPath + "metadata.bin"); + return ErrorCode::Success; +} + + ErrorCode MetadataSet::MetaCopy(const std::string& p_src, const std::string& p_dst) { @@ -60,7 +85,7 @@ FileMetadataSet::FileMetadataSet(const std::string& p_metafile, const std::strin return; } - fpidx.read((char *)&m_count, sizeof(int)); + fpidx.read((char *)&m_count, sizeof(m_count)); m_pOffsets.resize(m_count + 1); fpidx.read((char *)m_pOffsets.data(), sizeof(std::uint64_t) * (m_count + 1)); fpidx.close(); @@ -82,7 +107,7 @@ FileMetadataSet::GetMetadata(IndexType p_vectorID) const { std::uint64_t startoff = m_pOffsets[p_vectorID]; std::uint64_t bytes = m_pOffsets[p_vectorID + 1] - startoff; - if (p_vectorID < m_count) { + if (p_vectorID < (IndexType)m_count) { m_fp->seekg(startoff, std::ios_base::beg); ByteArray b = ByteArray::Alloc((SizeType)bytes); m_fp->read((char*)b.Data(), bytes); @@ -142,7 +167,7 @@ FileMetadataSet::SaveMetadata(const std::string& p_metaFile, const std::string& std::ofstream dst(p_metaindexFile, std::ios::binary); m_count = static_cast(m_pOffsets.size()) - 1; m_newdata.clear(); - dst.write((char*)&m_count, sizeof(int)); + dst.write((char*)&m_count, sizeof(m_count)); dst.write((char*)m_pOffsets.data(), sizeof(std::uint64_t) * m_pOffsets.size()); return ret; } diff --git a/AnnService/src/Core/VectorIndex.cpp b/AnnService/src/Core/VectorIndex.cpp index 341d30c7..af6cf601 100644 --- a/AnnService/src/Core/VectorIndex.cpp +++ b/AnnService/src/Core/VectorIndex.cpp @@ -1,4 +1,5 @@ #include "inc/Core/VectorIndex.h" +#include "inc/Core/Common/DataUtils.h" #include "inc/Helper/CommonHelper.h" #include "inc/Helper/StringConvert.h" #include "inc/Helper/SimpleIniReader.h" @@ -21,20 +22,6 @@ VectorIndex::~VectorIndex() } -void -VectorIndex::SetIndexName(const std::string& p_indexName) -{ - m_indexName = p_indexName; -} - - -const std::string& -VectorIndex::GetIndexName() const -{ - return m_indexName; -} - - std::string VectorIndex::GetParameter(const std::string& p_param) const { @@ -65,6 +52,92 @@ VectorIndex::GetMetadata(IndexType p_vectorID) const { } +ErrorCode +VectorIndex::LoadIndex(const std::string& p_folderPath) +{ + std::string folderPath(p_folderPath); + if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) + { + folderPath += FolderSep; + } + + Helper::IniReader p_configReader; + if (ErrorCode::Success != p_configReader.LoadIniFile(folderPath + "/indexloader.ini")) + { + return ErrorCode::FailedOpenFile; + } + + std::string metadataSection("MetaData"); + if (p_configReader.DoesSectionExist(metadataSection)) + { + std::string metadataFilePath = p_configReader.GetParameter(metadataSection, + "MetaDataFilePath", + std::string()); + std::string metadataIndexFilePath = p_configReader.GetParameter(metadataSection, + "MetaDataIndexPath", + std::string()); + + m_pMetadata.reset(new FileMetadataSet(folderPath + metadataFilePath, folderPath + metadataIndexFilePath)); + + if (!m_pMetadata->Available()) + { + std::cerr << "Error: Failed to load metadata." << std::endl; + return ErrorCode::Fail; + } + } + if (DistCalcMethod::Undefined == p_configReader.GetParameter("Index", "DistCalcMethod", DistCalcMethod::Undefined)) + { + std::cerr << "Error: Failed to load parameter DistCalcMethod." << std::endl; + return ErrorCode::Fail; + } + + return LoadIndex(folderPath, p_configReader); +} + + +ErrorCode VectorIndex::SaveIndex(const std::string& p_folderPath) +{ + std::string folderPath(p_folderPath); + if (!folderPath.empty() && *(folderPath.rbegin()) != FolderSep) + { + folderPath += FolderSep; + } + + if (!direxists(folderPath.c_str())) + { + mkdir(folderPath.c_str()); + } + + std::string loaderFilePath = folderPath + "indexloader.ini"; + + std::ofstream loaderFile(loaderFilePath); + if (!loaderFile.is_open()) + { + return ErrorCode::FailedCreateFile; + } + + if (nullptr != m_pMetadata) + { + std::string metadataFile = "metadata.bin"; + std::string metadataIndexFile = "metadataIndex.bin"; + loaderFile << "[MetaData]" << std::endl; + loaderFile << "MetaDataFilePath=" << metadataFile << std::endl; + loaderFile << "MetaDataIndexPath=" << metadataIndexFile << std::endl; + loaderFile << std::endl; + + m_pMetadata->SaveMetadata(folderPath + metadataFile, folderPath + metadataIndexFile); + } + + loaderFile << "[Index]" << std::endl; + loaderFile << "IndexAlgoType=" << Helper::Convert::ConvertToString(GetIndexAlgoType()) << std::endl; + loaderFile << "ValueType=" << Helper::Convert::ConvertToString(GetVectorValueType()) << std::endl; + loaderFile << std::endl; + + ErrorCode ret = SaveIndex(folderPath, loaderFile); + loaderFile.close(); + return ret; +} + ErrorCode VectorIndex::BuildIndex(std::shared_ptr p_vectorSet, std::shared_ptr p_metadataSet) @@ -192,3 +265,55 @@ VectorIndex::LoadIndex(const std::string& p_loaderFilePath, std::shared_ptr index = CreateInstance( + p_configReader1.GetParameter("Index", "IndexAlgoType", IndexAlgoType::Undefined), + p_configReader1.GetParameter("Index", "ValueType", VectorValueType::Undefined)); + if (index == nullptr) return ErrorCode::FailedParseValue; + + std::string empty(""); + if (!COMMON::DataUtils::MergeIndex(folderPath1 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), + folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), + folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty), + folderPath2 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), + folderPath2 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), + folderPath2 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty))) + return ErrorCode::Fail; + + for (const auto& iter : p_configReader1.GetParameters("Index")) + index->SetParameter(iter.first.c_str(), iter.second.c_str()); + + if (p_configReader1.DoesSectionExist("MetaData")) + { + for (const auto& iter : p_configReader1.GetParameters("MetaData")) + index->SetParameter(iter.first.c_str(), iter.second.c_str()); + index->SetMetadata(folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataFilePath", empty), + folderPath1 + p_configReader1.GetParameter("MetaData", "MetaDataIndexPath", empty)); + } + + std::ifstream vecIn(folderPath1 + p_configReader1.GetParameter("Index", "VectorFilePath", empty), std::ios::binary); + int R, C; + vecIn.read((char*)&R, sizeof(int)); + vecIn.read((char*)&C, sizeof(int)); + size_t size = R * C * GetValueTypeSize(index->GetVectorValueType()); + char* data = new char[size]; + vecIn.read(data, size); + vecIn.close(); + index->BuildIndex((void*)data, R, C); + index->SaveIndex(folderPath1); + return ErrorCode::Success; +} \ No newline at end of file diff --git a/AnnService/src/IndexBuilder/main.cpp b/AnnService/src/IndexBuilder/main.cpp index d370409d..c265d758 100644 --- a/AnnService/src/IndexBuilder/main.cpp +++ b/AnnService/src/IndexBuilder/main.cpp @@ -35,7 +35,7 @@ int main(int argc, char* argv[]) std::string paramName = param.substr(0, idx); std::string paramVal = param.substr(idx + 1); std::string sectionName; - idx = paramName.find("."); + idx = (int)paramName.find("."); if (idx >= 0) { sectionName = paramName.substr(0, idx); paramName = paramName.substr(idx + 1); diff --git a/PythonWrapper/PythonCore.vcxproj b/PythonWrapper/PythonCore.vcxproj index 4e23ba73..f614d325 100644 --- a/PythonWrapper/PythonCore.vcxproj +++ b/PythonWrapper/PythonCore.vcxproj @@ -113,7 +113,8 @@ - + + @@ -128,6 +129,7 @@ - + + \ No newline at end of file diff --git a/PythonWrapper/inc/CoreInterface.h b/PythonWrapper/inc/CoreInterface.h index 2a37e3c9..903cfe05 100644 --- a/PythonWrapper/inc/CoreInterface.h +++ b/PythonWrapper/inc/CoreInterface.h @@ -59,8 +59,6 @@ class AnnIndex bool Delete(ByteArray p_data, SizeType p_num); - bool Refine(const char* p_loaderFile); - static AnnIndex Load(const char* p_loaderFile); private: diff --git a/PythonWrapper/packages.config b/PythonWrapper/packages.config index d780ec4a..667ef75b 100644 --- a/PythonWrapper/packages.config +++ b/PythonWrapper/packages.config @@ -3,4 +3,6 @@ + + \ No newline at end of file diff --git a/PythonWrapper/src/CoreInterface.cpp b/PythonWrapper/src/CoreInterface.cpp index 45dc6504..fa14e415 100644 --- a/PythonWrapper/src/CoreInterface.cpp +++ b/PythonWrapper/src/CoreInterface.cpp @@ -208,9 +208,3 @@ AnnIndex::Delete(ByteArray p_data, SizeType p_num) } return false; } - -bool -AnnIndex::Refine(const char* p_loaderFile) -{ - return (SPTAG::ErrorCode::Success == m_index->RefineIndex(std::string(p_loaderFile))); -} \ No newline at end of file diff --git a/README.md b/README.md index 331ddae8..c066dd29 100644 --- a/README.md +++ b/README.md @@ -1,7 +1,7 @@ # SPTAG: A library for fast approximate nearest neighbor search -[![MIT licensed](https://img.shields.io/badge/license-MIT-yellow.svg)](https://github.com/Microsoft/nni/blob/master/LICENSE) -[![Build status](https://sysdnn.visualstudio.com/SPTAG/_apis/build/status/SPTAG-CI)](https://sysdnn.visualstudio.com/SPTAG/_build/latest?definitionId=2) +[![MIT licensed](https://img.shields.io/badge/license-MIT-yellow.svg)](https://github.com/Microsoft/SPTAG/blob/master/LICENSE) +[![Build status](https://sysdnn.visualstudio.com/SPTAG/_apis/build/status/SPTAG-GITHUB)](https://sysdnn.visualstudio.com/SPTAG/_build/latest?definitionId=2) ## **SPTAG** SPTAG (Space Partition Tree And Graph) is a library for large scale vector approximate nearest neighbor search scenerio, which is written in C++ and wrapped by Python.