| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,178 @@ | ||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| */ | ||
|
|
||
| #include "hamerly_kmeans.h" | ||
| #include "kmeans_general_functions.h" | ||
| #include <cmath> | ||
|
|
||
| /* Hamerly's algorithm that is a 'simplification' of Elkan's, in that it keeps | ||
| * the following bounds: | ||
| * - One upper bound per clustered record on the distance between the record | ||
| * and its closest center. It is always greater than or equal to the true | ||
| * distance between the record and its closest center. This is the same as in | ||
| * Elkan's algorithm. | ||
| * - *One* lower bound per clustered record on the distance between the record | ||
| * and its *second*-closest center. It is always less than or equal to the | ||
| * true distance between the record and its second closest center. This is | ||
| * different information than Elkan's algorithm -- his algorithm keeps k | ||
| * lower bounds for each record, for a total of (n*k) lower bounds. | ||
| * | ||
| * The basic ideas are: | ||
| * - when lower(x) <= upper(x), we need to recalculate the closest centers for | ||
| * the record x, and reset lower(x) and upper(x) to their boundary values | ||
| * - whenever a center moves | ||
| * - calculate the distance it moves 'd' | ||
| * - for each record x assigned to that center, update its upper bound | ||
| * - upper(x) = upper(x) + d | ||
| * - after each iteration | ||
| * - find the center that has moved the most (with distance 'd') | ||
| * - update the lower bound for all (?) records: | ||
| * - lower(x) = lower(x) - d | ||
| * | ||
| * Parameters: | ||
| * - threadId: the index of the thread that is running | ||
| * - maxIterations: a bound on the number of iterations to perform | ||
| * | ||
| * Return value: the number of iterations performed (always at least 1) | ||
| */ | ||
| // this version only updates center locations when necessary | ||
| int HamerlyKmeans::runThread(int threadId, int maxIterations) { | ||
| int iterations = 0; | ||
|
|
||
| int startNdx = start(threadId); | ||
| int endNdx = end(threadId); | ||
|
|
||
| while ((iterations < maxIterations) && ! converged) { | ||
| ++iterations; | ||
|
|
||
| // compute the inter-center distances, keeping only the closest distances | ||
| update_s(threadId); | ||
| synchronizeAllThreads(); | ||
|
|
||
| // loop over all records | ||
| for (int i = startNdx; i < endNdx; ++i) { | ||
| unsigned short closest = assignment[i]; | ||
|
|
||
| // if upper[i] is less than the greater of these two, then we can | ||
| // ignore record i | ||
| double upper_comparison_bound = std::max(s[closest], lower[i]); | ||
|
|
||
| // first check: if u(x) <= s(c(x)) or u(x) <= lower(x), then ignore | ||
| // x, because its closest center must still be closest | ||
| if (upper[i] <= upper_comparison_bound) { | ||
| continue; | ||
| } | ||
|
|
||
| // otherwise, compute the real distance between this record and its | ||
| // closest center, and update upper | ||
| double u2 = pointCenterDist2(i, closest); | ||
| upper[i] = sqrt(u2); | ||
|
|
||
| // if (u(x) <= s(c(x))) or (u(x) <= lower(x)), then ignore x | ||
| if (upper[i] <= upper_comparison_bound) { | ||
| continue; | ||
| } | ||
|
|
||
| // now update the lower bound by looking at all other centers | ||
| double l2 = std::numeric_limits<double>::max(); // the squared lower bound | ||
| for (int j = 0; j < k; ++j) { | ||
| if (j == closest) { continue; } | ||
|
|
||
| double dist2 = pointCenterDist2(i, j); | ||
|
|
||
| if (dist2 < u2) { | ||
| // another center is closer than the current assignment | ||
|
|
||
| // change the lower bound to be the current upper bound | ||
| // (since the current upper bound is the distance to the | ||
| // now-second-closest known center) | ||
| l2 = u2; | ||
|
|
||
| // adjust the upper bound and the current assignment | ||
| u2 = dist2; | ||
| closest = j; | ||
| } else if (dist2 < l2) { | ||
| // we must reduce the lower bound on the distance to the | ||
| // *second* closest center to x[i] | ||
| l2 = dist2; | ||
| } | ||
| } | ||
|
|
||
| // we have been dealing in squared distances; need to convert | ||
| lower[i] = sqrt(l2); | ||
|
|
||
| // if the assignment for i has changed, then adjust the counts and | ||
| // locations of each center's accumulated mass | ||
| if (assignment[i] != closest) { | ||
| upper[i] = sqrt(u2); | ||
| changeAssignment(i, closest, threadId); | ||
| } | ||
| } | ||
|
|
||
| verifyAssignment(iterations, startNdx, endNdx); | ||
|
|
||
| // ELKAN 4, 5, AND 6 | ||
| // calculate the new center locations | ||
| synchronizeAllThreads(); | ||
| if (threadId == 0) { | ||
| int furthestMovingCenter = move_centers(); | ||
| converged = (0.0 == centerMovement[furthestMovingCenter]); | ||
| } | ||
|
|
||
| synchronizeAllThreads(); | ||
|
|
||
| if (! converged) { | ||
| update_bounds(startNdx, endNdx); | ||
| } | ||
|
|
||
| synchronizeAllThreads(); | ||
| } | ||
|
|
||
| return iterations; | ||
| } | ||
|
|
||
|
|
||
| /* This method does the following: | ||
| * - finds the furthest-moving center | ||
| * - finds the distances moved by the two furthest-moving centers | ||
| * - updates the upper/lower bounds for each record | ||
| * | ||
| * Parameters: | ||
| * - startNdx: the first index of the dataset this thread is responsible for | ||
| * - endNdx: one past the last index of the dataset this thread is responsible for | ||
| */ | ||
| void HamerlyKmeans::update_bounds(int startNdx, int endNdx) { | ||
| double longest = centerMovement[0], secondLongest = (1 < k) ? centerMovement[1] : centerMovement[0]; | ||
| int furthestMovingCenter = 0; | ||
|
|
||
| if (longest < secondLongest) { | ||
| furthestMovingCenter = 1; | ||
| std::swap(longest, secondLongest); | ||
| } | ||
|
|
||
| for (int j = 2; j < k; ++j) { | ||
| if (longest < centerMovement[j]) { | ||
| secondLongest = longest; | ||
| longest = centerMovement[j]; | ||
| furthestMovingCenter = j; | ||
| } else if (secondLongest < centerMovement[j]) { | ||
| secondLongest = centerMovement[j]; | ||
| } | ||
| } | ||
|
|
||
| // update upper/lower bounds | ||
| for (int i = startNdx; i < endNdx; ++i) { | ||
| // the upper bound increases by the amount that its center moved | ||
| upper[i] += centerMovement[assignment[i]]; | ||
|
|
||
| // The lower bound decreases by the maximum amount that any center | ||
| // moved, unless the furthest-moving center is the one it's assigned | ||
| // to. In the latter case, the lower bound decreases by the amount | ||
| // of the second-furthest-moving center. | ||
| lower[i] -= (assignment[i] == furthestMovingCenter) ? secondLongest : longest; | ||
| } | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,29 @@ | ||
| #ifndef HAMERLY_KMEANS_H | ||
| #define HAMERLY_KMEANS_H | ||
|
|
||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| * | ||
| * HamerlyKmeans implements Hamerly's k-means algorithm that uses one lower | ||
| * bound per point. | ||
| */ | ||
|
|
||
| #include "triangle_inequality_base_kmeans.h" | ||
|
|
||
| class HamerlyKmeans : public TriangleInequalityBaseKmeans { | ||
| public: | ||
| HamerlyKmeans() { numLowerBounds = 1; } | ||
| virtual ~HamerlyKmeans() { free(); } | ||
| virtual std::string getName() const { return "hamerly"; } | ||
|
|
||
| protected: | ||
| // Update the upper and lower bounds for the given range of points. | ||
| void update_bounds(int startNdx, int endNdx); | ||
|
|
||
| virtual int runThread(int threadId, int maxIterations); | ||
| }; | ||
|
|
||
| #endif | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,165 @@ | ||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| */ | ||
|
|
||
| #include "kmeans.h" | ||
| #include "kmeans_general_functions.h" | ||
| #include <cassert> | ||
| #include <cmath> | ||
|
|
||
| Kmeans::Kmeans() : x(NULL), n(0), k(0), d(0), numThreads(0), converged(false), | ||
| clusterSize(NULL), centerMovement(NULL), assignment(NULL) { | ||
| #ifdef COUNT_DISTANCES | ||
| numDistances = 0; | ||
| #endif | ||
| } | ||
|
|
||
| void Kmeans::free() { | ||
| delete [] centerMovement; | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| delete [] clusterSize[t]; | ||
| } | ||
| delete [] clusterSize; | ||
| centerMovement = NULL; | ||
| clusterSize = NULL; | ||
| assignment = NULL; | ||
| n = k = d = numThreads = 0; | ||
| } | ||
|
|
||
|
|
||
|
|
||
| void Kmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) { | ||
| free(); | ||
|
|
||
| converged = false; | ||
| x = aX; | ||
| n = x->n; | ||
| d = x->d; | ||
| k = aK; | ||
|
|
||
| #ifdef USE_THREADS | ||
| numThreads = aNumThreads; | ||
| pthread_barrier_init(&barrier, NULL, numThreads); | ||
| #else | ||
| numThreads = 1; | ||
| #endif | ||
|
|
||
| assignment = initialAssignment; | ||
| centerMovement = new double[k]; | ||
| clusterSize = new int *[numThreads]; | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| clusterSize[t] = new int[k]; | ||
| std::fill(clusterSize[t], clusterSize[t] + k, 0); | ||
| for (int i = start(t); i < end(t); ++i) { | ||
| assert(assignment[i] < k); | ||
| ++clusterSize[t][assignment[i]]; | ||
| } | ||
| } | ||
|
|
||
|
|
||
| #ifdef COUNT_DISTANCES | ||
| numDistances = 0; | ||
| #endif | ||
| } | ||
|
|
||
| void Kmeans::changeAssignment(int xIndex, int closestCluster, int threadId) { | ||
| --clusterSize[threadId][assignment[xIndex]]; | ||
| ++clusterSize[threadId][closestCluster]; | ||
| assignment[xIndex] = closestCluster; | ||
| } | ||
|
|
||
| #ifdef USE_THREADS | ||
| struct ThreadInfo { | ||
| public: | ||
| ThreadInfo() : km(NULL), threadId(0), pthread_id(0) {} | ||
| Kmeans *km; | ||
| int threadId; | ||
| pthread_t pthread_id; | ||
| int numIterations; | ||
| int maxIterations; | ||
| }; | ||
| #endif | ||
|
|
||
| void *Kmeans::runner(void *args) { | ||
| #ifdef USE_THREADS | ||
| ThreadInfo *ti = (ThreadInfo *)args; | ||
| ti->numIterations = ti->km->runThread(ti->threadId, ti->maxIterations); | ||
| #endif | ||
| return NULL; | ||
| } | ||
|
|
||
| int Kmeans::run(int maxIterations) { | ||
| int iterations = 0; | ||
| #ifdef USE_THREADS | ||
| { | ||
| ThreadInfo *info = new ThreadInfo[numThreads]; | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| info[t].km = this; | ||
| info[t].threadId = t; | ||
| info[t].maxIterations = maxIterations; | ||
| pthread_create(&info[t].pthread_id, NULL, Kmeans::runner, &info[t]); | ||
| } | ||
| // wait for everything to finish... | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| pthread_join(info[t].pthread_id, NULL); | ||
| } | ||
| iterations = info[0].numIterations; | ||
| delete [] info; | ||
| } | ||
| #else | ||
| { | ||
| iterations = runThread(0, maxIterations); | ||
| } | ||
| #endif | ||
|
|
||
| return iterations; | ||
| } | ||
|
|
||
| double Kmeans::getSSE() const { | ||
| double sse = 0.0; | ||
| for (int i = 0; i < n; ++i) { | ||
| sse += pointCenterDist2(i, assignment[i]); | ||
| } | ||
| return sse; | ||
| } | ||
|
|
||
| void Kmeans::verifyAssignment(int iteration, int startNdx, int endNdx) const { | ||
| #ifdef VERIFY_ASSIGNMENTS | ||
| for (int i = startNdx; i < endNdx; ++i) { | ||
| // keep track of the squared distance and identity of the closest-seen | ||
| // cluster (so far) | ||
| int closest = assignment[i]; | ||
| double closest_dist2 = pointCenterDist2(i, closest); | ||
| double original_closest_dist2 = closest_dist2; | ||
| // look at all centers | ||
| for (int j = 0; j < k; ++j) { | ||
| if (j == closest) { | ||
| continue; | ||
| } | ||
| double d2 = pointCenterDist2(i, j); | ||
|
|
||
| // determine if we found a closer center | ||
| if (d2 < closest_dist2) { | ||
| closest = j; | ||
| closest_dist2 = d2; | ||
| } | ||
| } | ||
|
|
||
| // if we have found a discrepancy, then print out information and crash | ||
| // the program | ||
| if (closest != assignment[i]) { | ||
| std::cerr << "assignment error:" << std::endl; | ||
| std::cerr << "iteration = " << iteration << std::endl; | ||
| std::cerr << "point index = " << i << std::endl; | ||
| std::cerr << "closest center = " << closest << std::endl; | ||
| std::cerr << "closest center dist2 = " << closest_dist2 << std::endl; | ||
| std::cerr << "assigned center = " << assignment[i] << std::endl; | ||
| std::cerr << "assigned center dist2 = " << original_closest_dist2 << std::endl; | ||
| assert(false); | ||
| } | ||
| } | ||
| #endif | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,147 @@ | ||
| #ifndef KMEANS_H | ||
| #define KMEANS_H | ||
|
|
||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| * | ||
| * Kmeans is an abstract base class for algorithms which implement Lloyd's | ||
| * k-means algorithm. Subclasses provide functionality in the "runThread()" | ||
| * method. | ||
| */ | ||
|
|
||
| #include "kmeans_dataset.h" | ||
| #include <limits> | ||
| #include <string> | ||
| #ifdef USE_THREADS | ||
| #include <pthread.h> | ||
| #endif | ||
|
|
||
| class Kmeans { | ||
| public: | ||
| // Construct a K-means object to operate on the given dataset | ||
| Kmeans(); | ||
| virtual ~Kmeans() { free(); } | ||
|
|
||
| // This method kicks off the threads that do the clustering and run | ||
| // until convergence (or until reaching maxIterations). It returns the | ||
| // number of iterations performed. | ||
| int run(int aMaxIterations = std::numeric_limits<int>::max()); | ||
|
|
||
| // Get the cluster assignment for the given point index. | ||
| int getAssignment(int xIndex) const { return assignment[xIndex]; } | ||
|
|
||
| // Initialize the algorithm at the beginning of the run(), with the | ||
| // given data and initial assignment. The parameter initialAssignment | ||
| // will be modified by this algorithm and will at the end contain the | ||
| // final assignment of clusters. | ||
| virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads); | ||
|
|
||
| // Free all memory being used by the object. | ||
| virtual void free(); | ||
|
|
||
| // This method verifies that the current assignment is correct, by | ||
| // checking every point-center distance. For debugging. | ||
| virtual void verifyAssignment(int iteration, int startNdx, int endNdx) const; | ||
|
|
||
| // Compute the sum of squared errors for the data on the centers (not | ||
| // designed to be fast). | ||
| virtual double getSSE() const; | ||
|
|
||
| // Get the name of this clustering algorithm (to be overridden by | ||
| // subclasses). | ||
| virtual std::string getName() const = 0; | ||
|
|
||
| // Virtual methods for computing inner products (depending on the kernel | ||
| // being used, e.g.). For vanilla k-means these will be the standard dot | ||
| // product; for more exotic applications these will be other kernel | ||
| // functions. | ||
| virtual double pointPointInnerProduct(int x1, int x2) const = 0; | ||
| virtual double pointCenterInnerProduct(int xndx, unsigned short cndx) const = 0; | ||
| virtual double centerCenterInnerProduct(unsigned short c1, unsigned short c2) const = 0; | ||
|
|
||
| // Use the inner products to compute squared distances between a point | ||
| // and center. | ||
| virtual double pointCenterDist2(int x1, unsigned short cndx) const { | ||
| #ifdef COUNT_DISTANCES | ||
| ++numDistances; | ||
| #endif | ||
| return pointPointInnerProduct(x1, x1) - 2 * pointCenterInnerProduct(x1, cndx) + centerCenterInnerProduct(cndx, cndx); | ||
| } | ||
|
|
||
| // Use the inner products to compute squared distances between two | ||
| // centers. | ||
| virtual double centerCenterDist2(unsigned short c1, unsigned short c2) const { | ||
| #ifdef COUNT_DISTANCES | ||
| ++numDistances; | ||
| #endif | ||
| return centerCenterInnerProduct(c1, c1) - 2 * centerCenterInnerProduct(c1, c2) + centerCenterInnerProduct(c2, c2); | ||
| } | ||
|
|
||
| #ifdef COUNT_DISTANCES | ||
| #ifdef USE_THREADS | ||
| // Note: numDistances is NOT thread-safe, but it is not meant to be | ||
| // enabled in performant code. | ||
| #error Counting distances and using multiple threads is not supported. | ||
| #endif | ||
| mutable long long numDistances; | ||
| #endif | ||
|
|
||
| virtual Dataset const *getCenters() const { return NULL; } | ||
|
|
||
| protected: | ||
| // The dataset to cluster. | ||
| const Dataset *x; | ||
|
|
||
| // Local copies for convenience. | ||
| int n, k, d; | ||
|
|
||
| // Pthread primitives for multithreading. | ||
| int numThreads; | ||
| #ifdef USE_THREADS | ||
| pthread_barrier_t barrier; | ||
| #endif | ||
|
|
||
| // To communicate (to all threads) that we have converged. | ||
| bool converged; | ||
|
|
||
| // Keep track of how many points are in each cluster, divided over each | ||
| // thread. | ||
| int **clusterSize; | ||
|
|
||
| // centerMovement is computed in move_centers() and used to detect | ||
| // convergence (if max(centerMovement) == 0.0) and update point-center | ||
| // distance bounds (in subclasses that use them). | ||
| double *centerMovement; | ||
|
|
||
| // For each point in x, keep which cluster it is assigned to. By using a | ||
| // short, we assume a limited number of clusters (fewer than 2^16). | ||
| unsigned short *assignment; | ||
|
|
||
|
|
||
| // This is where each thread does its work. | ||
| virtual int runThread(int threadId, int maxIterations) = 0; | ||
|
|
||
| // Static entry method for pthread_create(). | ||
| static void *runner(void *args); | ||
|
|
||
| // Assign point at xIndex to cluster newCluster, working within thread threadId. | ||
| virtual void changeAssignment(int xIndex, int newCluster, int threadId); | ||
|
|
||
| // Over what range in [0, n) does this thread have ownership of the | ||
| // points? end() returns one past the last owned point. | ||
| int start(int threadId) const { return n * threadId / numThreads; } | ||
| int end(int threadId) const { return start(threadId + 1); } | ||
| int whichThread(int index) const { return index * numThreads / n; } | ||
|
|
||
| // Convenience method for causing all threads to synchronize. | ||
| void synchronizeAllThreads() { | ||
| #ifdef USE_THREADS | ||
| pthread_barrier_wait(&barrier); | ||
| #endif | ||
| } | ||
| }; | ||
|
|
||
| #endif | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,95 @@ | ||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| */ | ||
|
|
||
| #include "kmeans_dataset.h" | ||
| // #include <iostream> | ||
| #include <iomanip> | ||
| #include <cassert> | ||
| #include <cstring> | ||
|
|
||
| // print the dataset to standard output (cout), using formatting to keep the | ||
| // data in matrix format | ||
| void Dataset::print(std::ostream &out) const { | ||
| //std::ostream &out = std::cout; | ||
| out.precision(6); | ||
| int ndx = 0; | ||
| for (int i = 0; i < n; ++i) { | ||
| for (int j = 0; j < d; ++j) { | ||
| out << std::setw(13) << data[ndx++] << " "; | ||
| } | ||
| out << std::endl; | ||
| } | ||
| } | ||
|
|
||
| // returns a (modifiable) reference to the value in dimension "dim" from record | ||
| // "ndx" | ||
| double &Dataset::operator()(int ndx, int dim) { | ||
| # ifdef DEBUG | ||
| assert(ndx < n); | ||
| assert(dim < d); | ||
| # endif | ||
| return data[ndx * d + dim]; | ||
| } | ||
|
|
||
| // returns a (const) reference to the value in dimension "dim" from record "ndx" | ||
| const double &Dataset::operator()(int ndx, int dim) const { | ||
| # ifdef DEBUG | ||
| assert(ndx < n); | ||
| assert(dim < d); | ||
| # endif | ||
| return data[ndx * d + dim]; | ||
| } | ||
|
|
||
| // fill the entire dataset with value. Does NOT update sumDataSquared. | ||
| void Dataset::fill(double value) { | ||
| for (int i = 0; i < nd; ++i) { | ||
| data[i] = value; | ||
| } | ||
| } | ||
|
|
||
| // copy constructor -- makes a deep copy of everything in x | ||
| Dataset::Dataset(Dataset const &x) { | ||
| n = d = nd = 0; | ||
| data = sumDataSquared = NULL; | ||
| *this = x; | ||
| } | ||
|
|
||
| // operator= is the standard deep-copy assignment operator, which | ||
| // returns a const reference to *this. | ||
| Dataset const &Dataset::operator=(Dataset const &x) { | ||
| if (this != &x) { | ||
|
|
||
| // reallocate sumDataSquared and data as necessary | ||
| if (n != x.n) { | ||
| delete [] sumDataSquared; | ||
| sumDataSquared = x.sumDataSquared ? new double[x.n] : NULL; | ||
| } | ||
|
|
||
| if (nd != x.nd) { | ||
| delete [] data; | ||
| data = x.data ? new double[x.nd] : NULL; | ||
| } | ||
|
|
||
| // reflect the new sizes | ||
| n = x.n; | ||
| d = x.d; | ||
| nd = x.nd; | ||
|
|
||
| // copy data as appropriate | ||
| if (x.sumDataSquared) { | ||
| memcpy(sumDataSquared, x.sumDataSquared, x.n * sizeof(double)); | ||
| } | ||
|
|
||
| if (x.data) { | ||
| memcpy(data, x.data, x.nd * sizeof(double)); | ||
| } | ||
|
|
||
| } | ||
|
|
||
| // return a reference for chaining assignments | ||
| return *this; | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,85 @@ | ||
| #ifndef DATASET_H | ||
| #define DATASET_H | ||
|
|
||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| * | ||
| * A Dataset class represents a collection of multidimensional records, as is | ||
| * typical in metric machine learning. Every record has the same number of | ||
| * dimensions (values), and every value must be numeric. Undefined values are | ||
| * not allowed. | ||
| * | ||
| * This particular implementation keeps all the data in a 1-dimensional array, | ||
| * and also optionally keeps extra storage for the sum of the squared values of | ||
| * each record. However, the Dataset class does NOT automatically populate or | ||
| * update the sumDataSquared values. | ||
| */ | ||
|
|
||
| #include <cstddef> | ||
| #include <iostream> | ||
|
|
||
| class Dataset { | ||
| public: | ||
| // default constructor -- constructs a completely empty dataset with no | ||
| // records | ||
| Dataset() : n(0), d(0), nd(0), data(NULL), sumDataSquared(NULL) {} | ||
|
|
||
| // construct a dataset of a particular size, and determine whether to | ||
| // keep the sumDataSquared | ||
| Dataset(int aN, int aD, bool keepSDS = false) : n(aN), d(aD), nd(n * d), | ||
| data(new double[nd]), | ||
| sumDataSquared(keepSDS ? new double[n] : NULL) {} | ||
|
|
||
| // copy constructor -- makes a deep copy of everything in x | ||
| Dataset(Dataset const &x); | ||
|
|
||
| // destroys the dataset safely | ||
| ~Dataset() { | ||
| n = d = nd = 0; | ||
| double *dp = data, *sdsp = sumDataSquared; | ||
| data = sumDataSquared = NULL; | ||
| delete [] dp; | ||
| delete [] sdsp; | ||
| } | ||
|
|
||
| // operator= is the standard deep-copy assignment operator, which | ||
| // returns a const reference to *this. | ||
| Dataset const &operator=(Dataset const &x); | ||
|
|
||
| // allows modification of the record ndx and dimension dim | ||
| double &operator()(int ndx, int dim); | ||
|
|
||
| // allows const access to record ndx and dimension dim | ||
| const double &operator()(int ndx, int dim) const; | ||
|
|
||
| // fill the entire dataset with value. Does NOT update sumDataSquared. | ||
| void fill(double value); | ||
|
|
||
| // print the dataset to standard output (cout), using formatting to keep the | ||
| // data in matrix format | ||
| void print(std::ostream &out = std::cout) const; | ||
|
|
||
| // n represents the number of records | ||
| // d represents the dimension | ||
| // nd is a shortcut for the value n * d | ||
| int n, d, nd; | ||
|
|
||
| // data is an array of length n*d that stores all of the records in | ||
| // record-major (row-major) order. Thus data[0]...data[d-1] are the | ||
| // values associated with the first record. | ||
| double *data; | ||
|
|
||
| // sumDataSquared is an (optional) sum of squared values for every | ||
| // record. Thus, | ||
| // sumDataSquared[0] = data[0]^2 + data[1]^2 + ... + data[d-1]^2 | ||
| // sumDataSquared[1] = data[d]^2 + data[d+1]^2 + ... + data[2*d-1]^2 | ||
| // and so on. Note that this is the *intended* use of the sumDataSquared | ||
| // field, but that the Dataset class does NOT automatically populate or | ||
| // update the values in sumDataSquared. | ||
| double *sumDataSquared; | ||
| }; | ||
|
|
||
| #endif | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,330 @@ | ||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| */ | ||
|
|
||
| #include "kmeans_dataset.h" | ||
| #include "kmeans.h" | ||
| #include "kmeans_general_functions.h" | ||
| #include <cassert> | ||
| #include <cmath> | ||
| #include <algorithm> | ||
| #include <numeric> | ||
| #include <cstring> | ||
| #include <cstdio> | ||
| #include <unistd.h> | ||
|
|
||
| void addVectors(double *a, double const *b, int d) { | ||
| double const *end = a + d; | ||
| while (a < end) { | ||
| *(a++) += *(b++); | ||
| } | ||
| } | ||
|
|
||
| void subVectors(double *a, double const *b, int d) { | ||
| double const *end = a + d; | ||
| while (a < end) { | ||
| *(a++) -= *(b++); | ||
| } | ||
| } | ||
|
|
||
| double distance2silent(double const *a, double const *b, int d) { | ||
| double d2 = 0.0, diff; | ||
| double const *end = a + d; // one past the last valid entry in a | ||
| while (a < end) { | ||
| diff = *(a++) - *(b++); | ||
| d2 += diff * diff; | ||
| } | ||
| return d2; | ||
| } | ||
|
|
||
|
|
||
| void centerDataset(Dataset *x) { | ||
| double *xCentroid = new double[x->d]; | ||
|
|
||
| for (int d = 0; d < x->d; ++d) { | ||
| xCentroid[d] = 0.0; | ||
| } | ||
|
|
||
| for (int i = 0; i < x->n; ++i) { | ||
| addVectors(xCentroid, x->data + i * x->d, x->d); | ||
| } | ||
|
|
||
| // compute average (divide by n) | ||
| for (int d = 0; d < x->d; ++d) { | ||
| xCentroid[d] /= x->n; | ||
| } | ||
|
|
||
| // re-center the dataset | ||
| const double *xEnd = x->data + x->n * x->d; | ||
| for (double *xp = x->data; xp != xEnd; xp += x->d) { | ||
| subVectors(xp, xCentroid, x->d); | ||
| } | ||
|
|
||
| delete [] xCentroid; | ||
| } | ||
|
|
||
| Dataset *init_centers(Dataset const &x, unsigned short k) { | ||
| int *chosen_pts = new int[k]; | ||
| Dataset *c = new Dataset(k, x.d); | ||
| for (int i = 0; i < k; ++i) { | ||
| bool acceptable = true; | ||
| do { | ||
| acceptable = true; | ||
| chosen_pts[i] = rand() % x.n; | ||
| for (int j = 0; j < i; ++j) { | ||
| if (chosen_pts[i] == chosen_pts[j]) { | ||
| acceptable = false; | ||
| break; | ||
| } | ||
| } | ||
| } while (! acceptable); | ||
| double *cdp = c->data + i * x.d; | ||
| memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d); | ||
| if (c->sumDataSquared) { | ||
| c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0); | ||
| } | ||
| } | ||
|
|
||
| delete [] chosen_pts; | ||
|
|
||
| return c; | ||
| } | ||
|
|
||
|
|
||
| Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) { | ||
| int *chosen_pts = new int[k]; | ||
| std::pair<double, int> *dist2 = new std::pair<double, int>[x.n]; | ||
| double *distribution = new double[x.n]; | ||
|
|
||
| // initialize dist2 | ||
| for (int i = 0; i < x.n; ++i) { | ||
| dist2[i].first = std::numeric_limits<double>::max(); | ||
| dist2[i].second = i; | ||
| } | ||
|
|
||
| // choose the first point randomly | ||
| int ndx = 1; | ||
| chosen_pts[ndx - 1] = rand() % x.n; | ||
|
|
||
| while (ndx < k) { | ||
| double sum_distribution = 0.0; | ||
| // look for the point that is furthest from any center | ||
| for (int i = 0; i < x.n; ++i) { | ||
| int example = dist2[i].second; | ||
| double d2 = 0.0, diff; | ||
| for (int j = 0; j < x.d; ++j) { | ||
| diff = x(example,j) - x(chosen_pts[ndx - 1],j); | ||
| d2 += diff * diff; | ||
| } | ||
| if (d2 < dist2[i].first) { | ||
| dist2[i].first = d2; | ||
| } | ||
|
|
||
| sum_distribution += dist2[i].first; | ||
| } | ||
|
|
||
| // sort the examples by their distance from centers | ||
| sort(dist2, dist2 + x.n); | ||
|
|
||
| // turn distribution into a CDF | ||
| distribution[0] = dist2[0].first / sum_distribution; | ||
| for (int i = 1; i < x.n; ++i) { | ||
| distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution; | ||
| } | ||
|
|
||
| // choose a random interval according to the new distribution | ||
| double r = (double)rand() / (double)RAND_MAX; | ||
| double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r); | ||
| int distribution_ndx = (int)(new_center_ptr - distribution); | ||
| chosen_pts[ndx] = dist2[distribution_ndx].second; | ||
| /* | ||
| cout << "chose " << distribution_ndx << " which is actually " | ||
| << chosen_pts[ndx] << " with distance " | ||
| << dist2[distribution_ndx].first << std::endl; | ||
| */ | ||
|
|
||
| ++ndx; | ||
| } | ||
|
|
||
| Dataset *c = new Dataset(k, x.d); | ||
|
|
||
| for (int i = 0; i < k; ++i) { | ||
| double *cdp = c->data + i * x.d; | ||
| memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d); | ||
| if (c->sumDataSquared) { | ||
| c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0); | ||
| } | ||
| } | ||
|
|
||
| delete [] chosen_pts; | ||
| delete [] dist2; | ||
| delete [] distribution; | ||
|
|
||
| return c; | ||
| } | ||
|
|
||
|
|
||
| Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) { | ||
| int *chosen_pts = new int[k]; | ||
| std::pair<double, int> *dist2 = new std::pair<double, int>[x.n]; | ||
|
|
||
| // initialize dist2 | ||
| for (int i = 0; i < x.n; ++i) { | ||
| dist2[i].first = std::numeric_limits<double>::max(); | ||
| dist2[i].second = i; | ||
| } | ||
|
|
||
| // choose the first point randomly | ||
| int ndx = 1; | ||
| chosen_pts[ndx - 1] = rand() % x.n; | ||
|
|
||
| while (ndx < k) { | ||
| double sum_distribution = 0.0; | ||
| // look for the point that is furthest from any center | ||
| double max_dist = 0.0; | ||
| for (int i = 0; i < x.n; ++i) { | ||
| int example = dist2[i].second; | ||
| double d2 = 0.0, diff; | ||
| for (int j = 0; j < x.d; ++j) { | ||
| diff = x(example,j) - x(chosen_pts[ndx - 1],j); | ||
| d2 += diff * diff; | ||
| } | ||
| if (d2 < dist2[i].first) { | ||
| dist2[i].first = d2; | ||
| } | ||
|
|
||
| if (dist2[i].first > max_dist) { | ||
| max_dist = dist2[i].first; | ||
| } | ||
|
|
||
| sum_distribution += dist2[i].first; | ||
| } | ||
|
|
||
| bool unique = true; | ||
|
|
||
| do { | ||
| // choose a random interval according to the new distribution | ||
| double r = sum_distribution * (double)rand() / (double)RAND_MAX; | ||
| double sum_cdf = dist2[0].first; | ||
| int cdf_ndx = 0; | ||
| while (sum_cdf < r) { | ||
| sum_cdf += dist2[++cdf_ndx].first; | ||
| } | ||
| chosen_pts[ndx] = cdf_ndx; | ||
|
|
||
| for (int i = 0; i < ndx; ++i) { | ||
| unique = unique && (chosen_pts[ndx] != chosen_pts[i]); | ||
| } | ||
| } while (! unique); | ||
|
|
||
| ++ndx; | ||
| } | ||
|
|
||
| Dataset *c = new Dataset(k, x.d); | ||
| for (int i = 0; i < c->n; ++i) { | ||
| double *cdp = c->data + i * x.d; | ||
| memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d); | ||
| if (c->sumDataSquared) { | ||
| c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0); | ||
| } | ||
| } | ||
|
|
||
| delete [] chosen_pts; | ||
| delete [] dist2; | ||
|
|
||
| return c; | ||
| } | ||
|
|
||
|
|
||
| /** | ||
| * in MB | ||
| */ | ||
| double getMemoryUsage() { | ||
| char buf[30]; | ||
| snprintf(buf, 30, "/proc/%u/statm", (unsigned)getpid()); | ||
| FILE* pf = fopen(buf, "r"); | ||
| unsigned int totalProgramSizeInPages = 0; | ||
| unsigned int residentSetSizeInPages = 0; | ||
| if (pf) { | ||
| int numScanned = fscanf(pf, "%u %u" /* %u %u %u %u %u"*/, &totalProgramSizeInPages, &residentSetSizeInPages); | ||
| if (numScanned != 2) { | ||
| return 0.0; | ||
| } | ||
| } | ||
|
|
||
| fclose(pf); | ||
| pf = NULL; | ||
|
|
||
| double sizeInKilobytes = residentSetSizeInPages * 4.0; // assumes 4096 byte page | ||
| // getconf PAGESIZE | ||
|
|
||
| return sizeInKilobytes; | ||
| } | ||
|
|
||
|
|
||
| void assign(Dataset const &x, Dataset const &c, unsigned short *assignment) { | ||
| for (int i = 0; i < x.n; ++i) { | ||
| double shortestDist2 = std::numeric_limits<double>::max(); | ||
| int closest = 0; | ||
| for (int j = 0; j < c.n; ++j) { | ||
| double d2 = 0.0, *a = x.data + i * x.d, *b = c.data + j * x.d; | ||
| for (; a != x.data + (i + 1) * x.d; ++a, ++b) { | ||
| d2 += (*a - *b) * (*a - *b); | ||
| } | ||
| if (d2 < shortestDist2) { | ||
| shortestDist2 = d2; | ||
| closest = j; | ||
| } | ||
| } | ||
| assignment[i] = closest; | ||
| } | ||
| } | ||
|
|
||
| rusage get_time() { | ||
| rusage now; | ||
| getrusage(RUSAGE_SELF, &now); | ||
| return now; | ||
| } | ||
|
|
||
|
|
||
| double get_wall_time(){ | ||
| struct timeval time; | ||
| if (gettimeofday(&time,NULL)){ | ||
| // Handle error | ||
| return 0; | ||
| } | ||
| return (double)time.tv_sec + (double)time.tv_usec * .000001; | ||
| } | ||
|
|
||
| int timeval_subtract(timeval *result, timeval *x, timeval *y) { | ||
| /* Perform the carry for the later subtraction by updating y. */ | ||
| if (x->tv_usec < y->tv_usec) { | ||
| int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1; | ||
| y->tv_usec -= 1000000 * nsec; | ||
| y->tv_sec += nsec; | ||
| } | ||
| if (x->tv_usec - y->tv_usec > 1000000) { | ||
| int nsec = (x->tv_usec - y->tv_usec) / 1000000; | ||
| y->tv_usec += 1000000 * nsec; | ||
| y->tv_sec -= nsec; | ||
| } | ||
|
|
||
| /* Compute the time remaining to wait. tv_usec is certainly positive. */ | ||
| result->tv_sec = x->tv_sec - y->tv_sec; | ||
| result->tv_usec = x->tv_usec - y->tv_usec; | ||
|
|
||
| /* Return 1 if result | ||
| * is negative. */ | ||
| return x->tv_sec < y->tv_sec; | ||
| } | ||
|
|
||
| double elapsed_time(rusage *start) { | ||
| rusage now; | ||
| timeval diff; | ||
| getrusage(RUSAGE_SELF, &now); | ||
| timeval_subtract(&diff, &now.ru_utime, &start->ru_utime); | ||
|
|
||
| return (double)diff.tv_sec + (double)diff.tv_usec / 1e6; | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,91 @@ | ||
| #ifndef GENERAL_KMEANS_FUNCTIONS_H | ||
| #define GENERAL_KMEANS_FUNCTIONS_H | ||
|
|
||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| * | ||
| * Generally useful functions. | ||
| */ | ||
|
|
||
|
|
||
| #include <iostream> | ||
| #include <string> | ||
| #include <sys/resource.h> | ||
| #include <sys/time.h> | ||
| #include "kmeans_dataset.h" | ||
|
|
||
| /* Add together two vectors, and put the result in the first argument. | ||
| * Calculates a = a + b | ||
| * | ||
| * Parameters: | ||
| * a -- vector to add, and the result of the operation | ||
| * b -- vector to add to a | ||
| * d -- the dimension | ||
| * Return value: none | ||
| */ | ||
| void addVectors(double *a, double const *b, int d); | ||
|
|
||
| /* Subtract two vectors, and put the result in the first argument. Calculates | ||
| * a = a - b | ||
| * | ||
| * Parameters: | ||
| * a -- vector to subtract from, and the result of the operation | ||
| * b -- vector to subtract | ||
| * d -- the dimension | ||
| * Return value: none | ||
| */ | ||
| void subVectors(double *a, double const *b, int d); | ||
|
|
||
| /* Initialize the centers randomly. Choose random records from x as the initial | ||
| * values for the centers. Assumes that c uses the sumDataSquared field. | ||
| * | ||
| * Parameters: | ||
| * x -- records that are being clustered (n * d) | ||
| * c -- centers to be initialized. Should be pre-allocated with the number of | ||
| * centers desired, and dimension. | ||
| * Return value: none | ||
| */ | ||
| Dataset *init_centers(Dataset const &x, unsigned short k); | ||
|
|
||
| /* Initialize the centers randomly using K-means++. | ||
| * | ||
| * Parameters: | ||
| * x -- records that are being clustered (n * d) | ||
| * c -- centers to be initialized. Should be pre-allocated with the number of | ||
| * centers desired, and dimension. | ||
| * Return value: none | ||
| */ | ||
| Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k); | ||
| Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k); | ||
|
|
||
| /* Print an array (templated). Convenience function. | ||
| * | ||
| * Parameters: | ||
| * arr -- the array to print | ||
| * length -- the length of the array | ||
| * separator -- the string to put between each pair of printed elements | ||
| * Return value: none | ||
| */ | ||
| template <class T> | ||
| void printArray(T const *arr, int length, std::string separator) { | ||
| for (int i = 0; i < length; ++i) { | ||
| if (i > 0) { | ||
| std::cout << separator; | ||
| } | ||
| std::cout << arr[i]; | ||
| } | ||
| } | ||
|
|
||
| double getMemoryUsage(); | ||
| rusage get_time(); | ||
| double get_wall_time(); | ||
| int timeval_subtract(timeval *result, timeval *x, timeval *y); | ||
| double elapsed_time(rusage *start); | ||
|
|
||
| void centerDataset(Dataset *x); | ||
|
|
||
| void assign(Dataset const &x, Dataset const &c, unsigned short *assignment); | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,106 @@ | ||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| */ | ||
|
|
||
| #include "original_space_kmeans.h" | ||
| #include "kmeans_general_functions.h" | ||
| #include <cmath> | ||
| #include <cassert> | ||
| #include <numeric> | ||
|
|
||
| OriginalSpaceKmeans::OriginalSpaceKmeans() : centers(NULL), sumNewCenters(NULL) { } | ||
|
|
||
| void OriginalSpaceKmeans::free() { | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| delete sumNewCenters[t]; | ||
| } | ||
| Kmeans::free(); | ||
| delete centers; | ||
| delete [] sumNewCenters; | ||
| centers = NULL; | ||
| sumNewCenters = NULL; | ||
| } | ||
|
|
||
| /* This method moves the newCenters to their new locations, based on the | ||
| * sufficient statistics in sumNewCenters. It also computes the centerMovement | ||
| * and the center that moved the furthest. | ||
| * | ||
| * Parameters: none | ||
| * | ||
| * Return value: index of the furthest-moving centers | ||
| */ | ||
| int OriginalSpaceKmeans::move_centers() { | ||
| int furthestMovingCenter = 0; | ||
| for (int j = 0; j < k; ++j) { | ||
| centerMovement[j] = 0.0; | ||
| int totalClusterSize = 0; | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| totalClusterSize += clusterSize[t][j]; | ||
| } | ||
| if (totalClusterSize > 0) { | ||
| for (int dim = 0; dim < d; ++dim) { | ||
| double z = 0.0; | ||
| for (int t = 0; t < numThreads; ++t) { | ||
| z += (*sumNewCenters[t])(j,dim); | ||
| } | ||
| z /= totalClusterSize; | ||
| centerMovement[j] += (z - (*centers)(j, dim)) * (z - (*centers)(j, dim)); | ||
| (*centers)(j, dim) = z; | ||
| } | ||
| } | ||
| centerMovement[j] = sqrt(centerMovement[j]); | ||
|
|
||
| if (centerMovement[furthestMovingCenter] < centerMovement[j]) { | ||
| furthestMovingCenter = j; | ||
| } | ||
| } | ||
|
|
||
| #ifdef COUNT_DISTANCES | ||
| numDistances += k; | ||
| #endif | ||
|
|
||
| return furthestMovingCenter; | ||
| } | ||
|
|
||
| void OriginalSpaceKmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) { | ||
| Kmeans::initialize(aX, aK, initialAssignment, aNumThreads); | ||
|
|
||
| centers = new Dataset(k, d); | ||
| sumNewCenters = new Dataset *[numThreads]; | ||
| centers->fill(0.0); | ||
|
|
||
| for (int t = 0; t < numThreads; ++t) { | ||
| sumNewCenters[t] = new Dataset(k, d, false); | ||
| sumNewCenters[t]->fill(0.0); | ||
| for (int i = start(t); i < end(t); ++i) { | ||
| addVectors(sumNewCenters[t]->data + assignment[i] * d, x->data + i * d, d); | ||
| } | ||
| } | ||
|
|
||
| // put the centers at their initial locations, based on clusterSize and | ||
| // sumNewCenters | ||
| move_centers(); | ||
| } | ||
|
|
||
| void OriginalSpaceKmeans::changeAssignment(int xIndex, int closestCluster, int threadId) { | ||
| unsigned short oldAssignment = assignment[xIndex]; | ||
| Kmeans::changeAssignment(xIndex, closestCluster, threadId); | ||
| double *xp = x->data + xIndex * d; | ||
| subVectors(sumNewCenters[threadId]->data + oldAssignment * d, xp, d); | ||
| addVectors(sumNewCenters[threadId]->data + closestCluster * d, xp, d); | ||
| } | ||
|
|
||
| double OriginalSpaceKmeans::pointPointInnerProduct(int x1, int x2) const { | ||
| return std::inner_product(x->data + x1 * d, x->data + (x1 + 1) * d, x->data + x2 * d, 0.0); | ||
| } | ||
|
|
||
| double OriginalSpaceKmeans::pointCenterInnerProduct(int xndx, unsigned short cndx) const { | ||
| return std::inner_product(x->data + xndx * d, x->data + (xndx + 1) * d, centers->data + cndx * d, 0.0); | ||
| } | ||
|
|
||
| double OriginalSpaceKmeans::centerCenterInnerProduct(unsigned short c1, unsigned short c2) const { | ||
| return std::inner_product(centers->data + c1 * d, centers->data + (c1 + 1) * d, centers->data + c2 * d, 0.0); | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,54 @@ | ||
| #ifndef ORIGINAL_SPACE_KMEANS_H | ||
| #define ORIGINAL_SPACE_KMEANS_H | ||
|
|
||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| * | ||
| * OriginalSpaceKmeans is a base class for other algorithms that operate in the | ||
| * same space as the data being clustered (as opposed to kernelized k-means | ||
| * algorithms, which operate in kernel space). | ||
| */ | ||
|
|
||
| #include "kmeans.h" | ||
|
|
||
| /* Cluster with the cluster centers living in the original space (with the | ||
| * data). This is as opposed to a kernelized version of k-means, where the | ||
| * center points might not be explicitly represented. This is also an abstract | ||
| * class. | ||
| */ | ||
| class OriginalSpaceKmeans : public Kmeans { | ||
| public: | ||
| OriginalSpaceKmeans(); | ||
| virtual ~OriginalSpaceKmeans() { free(); } | ||
| virtual void free(); | ||
| virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads); | ||
|
|
||
| virtual double pointPointInnerProduct(int x1ndx, int x2ndx) const; | ||
| virtual double pointCenterInnerProduct(int xndx, unsigned short cndx) const; | ||
| virtual double centerCenterInnerProduct(unsigned short c1ndx, unsigned short c2ndx) const; | ||
|
|
||
| virtual Dataset const *getCenters() const { return centers; } | ||
|
|
||
| protected: | ||
| // Move the centers to the average of their current assigned points, | ||
| // compute the distance moved by each center, and return the index of | ||
| // the furthest-moving center. | ||
| int move_centers(); | ||
|
|
||
| virtual void changeAssignment(int xIndex, int closestCluster, int threadId); | ||
|
|
||
| // The set of centers we are operating on. | ||
| Dataset *centers; | ||
|
|
||
| // sumNewCenters and centerCount provide sufficient statistics to | ||
| // quickly calculate the changing locations of the centers. Whenever a | ||
| // point changes cluster membership, we subtract (add) it from (to) the | ||
| // row in sumNewCenters associated with its old (new) cluster. We also | ||
| // decrement (increment) centerCount for the old (new) cluster. | ||
| Dataset **sumNewCenters; | ||
|
|
||
| }; | ||
|
|
||
| #endif |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,79 @@ | ||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| */ | ||
|
|
||
| #include "triangle_inequality_base_kmeans.h" | ||
| #include "kmeans_general_functions.h" | ||
| #include <cassert> | ||
| #include <limits> | ||
| #include <cmath> | ||
|
|
||
| void TriangleInequalityBaseKmeans::free() { | ||
| OriginalSpaceKmeans::free(); | ||
| delete [] s; | ||
| delete [] upper; | ||
| delete [] lower; | ||
| s = NULL; | ||
| upper = NULL; | ||
| lower = NULL; | ||
| } | ||
|
|
||
| /* This function computes the inter-center distances, keeping only the closest | ||
| * distances, and updates "s". After this, s[j] will contain the distance | ||
| * between center j and its closest other center, divided by two. The division | ||
| * here saves repeated work later, since we always will need the distance / 2. | ||
| * | ||
| * Parameters: none | ||
| * | ||
| * Return value: none | ||
| */ | ||
| // TODO: parallelize this | ||
| void TriangleInequalityBaseKmeans::update_s(int threadId) { | ||
| // initialize | ||
| for (int c1 = 0; c1 < k; ++c1) { | ||
| if (c1 % numThreads == threadId) { | ||
| s[c1] = std::numeric_limits<double>::max(); | ||
| } | ||
| } | ||
| // compute inter-center squared distances between all pairs | ||
| for (int c1 = 0; c1 < k; ++c1) { | ||
| if (c1 % numThreads == threadId) { | ||
| for (int c2 = 0; c2 < k; ++c2) { | ||
| if (c2 == c1) { | ||
| continue; | ||
| } | ||
| double d2 = centerCenterDist2(c1, c2); | ||
| if (d2 < s[c1]) { s[c1] = d2; } | ||
| } | ||
| // take the root and divide by two | ||
| s[c1] = sqrt(s[c1]) / 2.0; | ||
| } | ||
| } | ||
| } | ||
|
|
||
|
|
||
| /* This function initializes the upper/lower bounds, assignment, centerCounts, | ||
| * and sumNewCenters. It sets the bounds to invalid values which will force the | ||
| * first iteration of k-means to set them correctly. NB: subclasses should set | ||
| * numLowerBounds appropriately before entering this function. | ||
| * | ||
| * Parameters: none | ||
| * | ||
| * Return value: none | ||
| */ | ||
| void TriangleInequalityBaseKmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) { | ||
| OriginalSpaceKmeans::initialize(aX, aK, initialAssignment, aNumThreads); | ||
|
|
||
| s = new double[k]; | ||
| upper = new double[n]; | ||
| lower = new double[n * numLowerBounds]; | ||
|
|
||
| // start with invalid bounds and assignments which will force the first | ||
| // iteration of k-means to do all its standard work | ||
| std::fill(s, s + k, 0.0); | ||
| std::fill(upper, upper + n, std::numeric_limits<double>::max()); | ||
| std::fill(lower, lower + n * numLowerBounds, 0.0); | ||
| } | ||
|
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,43 @@ | ||
| #ifndef TRIANGLE_INEQUALITY_BASE_KMEANS_H | ||
| #define TRIANGLE_INEQUALITY_BASE_KMEANS_H | ||
|
|
||
| /* Authors: Greg Hamerly and Jonathan Drake | ||
| * Feedback: hamerly@cs.baylor.edu | ||
| * See: http://cs.baylor.edu/~hamerly/software/kmeans.php | ||
| * Copyright 2014 | ||
| * | ||
| * This class is an abstract base class for several other algorithms that use | ||
| * upper & lower bounds to avoid distance calculations in k-means. | ||
| */ | ||
|
|
||
| #include "original_space_kmeans.h" | ||
|
|
||
| class TriangleInequalityBaseKmeans : public OriginalSpaceKmeans { | ||
| public: | ||
| TriangleInequalityBaseKmeans() : numLowerBounds(0), s(NULL), upper(NULL), lower(NULL) {} | ||
| virtual ~TriangleInequalityBaseKmeans() { free(); } | ||
|
|
||
| virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads); | ||
| virtual void free(); | ||
|
|
||
| protected: | ||
| void update_s(int threadId); | ||
|
|
||
| // The number of lower bounds being used by this algorithm. | ||
| int numLowerBounds; | ||
|
|
||
| // Half the distance between each center and its closest other center. | ||
| double *s; | ||
|
|
||
| // One upper bound for each point on the distance between that point and | ||
| // its assigned (closest) center. | ||
| double *upper; | ||
|
|
||
| // Lower bound(s) for each point on the distance between that point and | ||
| // the centers being tracked for lower bounds, which may be 1 to k. | ||
| // Actual size is n * numLowerBounds. | ||
| double *lower; | ||
| }; | ||
|
|
||
| #endif | ||
|
|