178 changes: 178 additions & 0 deletions contrib/kmeans/hamerly_kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,178 @@
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/

#include "hamerly_kmeans.h"
#include "kmeans_general_functions.h"
#include <cmath>

/* Hamerly's algorithm that is a 'simplification' of Elkan's, in that it keeps
* the following bounds:
* - One upper bound per clustered record on the distance between the record
* and its closest center. It is always greater than or equal to the true
* distance between the record and its closest center. This is the same as in
* Elkan's algorithm.
* - *One* lower bound per clustered record on the distance between the record
* and its *second*-closest center. It is always less than or equal to the
* true distance between the record and its second closest center. This is
* different information than Elkan's algorithm -- his algorithm keeps k
* lower bounds for each record, for a total of (n*k) lower bounds.
*
* The basic ideas are:
* - when lower(x) <= upper(x), we need to recalculate the closest centers for
* the record x, and reset lower(x) and upper(x) to their boundary values
* - whenever a center moves
* - calculate the distance it moves 'd'
* - for each record x assigned to that center, update its upper bound
* - upper(x) = upper(x) + d
* - after each iteration
* - find the center that has moved the most (with distance 'd')
* - update the lower bound for all (?) records:
* - lower(x) = lower(x) - d
*
* Parameters:
* - threadId: the index of the thread that is running
* - maxIterations: a bound on the number of iterations to perform
*
* Return value: the number of iterations performed (always at least 1)
*/
// this version only updates center locations when necessary
int HamerlyKmeans::runThread(int threadId, int maxIterations) {
int iterations = 0;

int startNdx = start(threadId);
int endNdx = end(threadId);

while ((iterations < maxIterations) && ! converged) {
++iterations;

// compute the inter-center distances, keeping only the closest distances
update_s(threadId);
synchronizeAllThreads();

// loop over all records
for (int i = startNdx; i < endNdx; ++i) {
unsigned short closest = assignment[i];

// if upper[i] is less than the greater of these two, then we can
// ignore record i
double upper_comparison_bound = std::max(s[closest], lower[i]);

// first check: if u(x) <= s(c(x)) or u(x) <= lower(x), then ignore
// x, because its closest center must still be closest
if (upper[i] <= upper_comparison_bound) {
continue;
}

// otherwise, compute the real distance between this record and its
// closest center, and update upper
double u2 = pointCenterDist2(i, closest);
upper[i] = sqrt(u2);

// if (u(x) <= s(c(x))) or (u(x) <= lower(x)), then ignore x
if (upper[i] <= upper_comparison_bound) {
continue;
}

// now update the lower bound by looking at all other centers
double l2 = std::numeric_limits<double>::max(); // the squared lower bound
for (int j = 0; j < k; ++j) {
if (j == closest) { continue; }

double dist2 = pointCenterDist2(i, j);

if (dist2 < u2) {
// another center is closer than the current assignment

// change the lower bound to be the current upper bound
// (since the current upper bound is the distance to the
// now-second-closest known center)
l2 = u2;

// adjust the upper bound and the current assignment
u2 = dist2;
closest = j;
} else if (dist2 < l2) {
// we must reduce the lower bound on the distance to the
// *second* closest center to x[i]
l2 = dist2;
}
}

// we have been dealing in squared distances; need to convert
lower[i] = sqrt(l2);

// if the assignment for i has changed, then adjust the counts and
// locations of each center's accumulated mass
if (assignment[i] != closest) {
upper[i] = sqrt(u2);
changeAssignment(i, closest, threadId);
}
}

verifyAssignment(iterations, startNdx, endNdx);

// ELKAN 4, 5, AND 6
// calculate the new center locations
synchronizeAllThreads();
if (threadId == 0) {
int furthestMovingCenter = move_centers();
converged = (0.0 == centerMovement[furthestMovingCenter]);
}

synchronizeAllThreads();

if (! converged) {
update_bounds(startNdx, endNdx);
}

synchronizeAllThreads();
}

return iterations;
}


/* This method does the following:
* - finds the furthest-moving center
* - finds the distances moved by the two furthest-moving centers
* - updates the upper/lower bounds for each record
*
* Parameters:
* - startNdx: the first index of the dataset this thread is responsible for
* - endNdx: one past the last index of the dataset this thread is responsible for
*/
void HamerlyKmeans::update_bounds(int startNdx, int endNdx) {
double longest = centerMovement[0], secondLongest = (1 < k) ? centerMovement[1] : centerMovement[0];
int furthestMovingCenter = 0;

if (longest < secondLongest) {
furthestMovingCenter = 1;
std::swap(longest, secondLongest);
}

for (int j = 2; j < k; ++j) {
if (longest < centerMovement[j]) {
secondLongest = longest;
longest = centerMovement[j];
furthestMovingCenter = j;
} else if (secondLongest < centerMovement[j]) {
secondLongest = centerMovement[j];
}
}

// update upper/lower bounds
for (int i = startNdx; i < endNdx; ++i) {
// the upper bound increases by the amount that its center moved
upper[i] += centerMovement[assignment[i]];

// The lower bound decreases by the maximum amount that any center
// moved, unless the furthest-moving center is the one it's assigned
// to. In the latter case, the lower bound decreases by the amount
// of the second-furthest-moving center.
lower[i] -= (assignment[i] == furthestMovingCenter) ? secondLongest : longest;
}
}

29 changes: 29 additions & 0 deletions contrib/kmeans/hamerly_kmeans.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,29 @@
#ifndef HAMERLY_KMEANS_H
#define HAMERLY_KMEANS_H

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*
* HamerlyKmeans implements Hamerly's k-means algorithm that uses one lower
* bound per point.
*/

#include "triangle_inequality_base_kmeans.h"

class HamerlyKmeans : public TriangleInequalityBaseKmeans {
public:
HamerlyKmeans() { numLowerBounds = 1; }
virtual ~HamerlyKmeans() { free(); }
virtual std::string getName() const { return "hamerly"; }

protected:
// Update the upper and lower bounds for the given range of points.
void update_bounds(int startNdx, int endNdx);

virtual int runThread(int threadId, int maxIterations);
};

#endif

165 changes: 165 additions & 0 deletions contrib/kmeans/kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,165 @@
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/

#include "kmeans.h"
#include "kmeans_general_functions.h"
#include <cassert>
#include <cmath>

Kmeans::Kmeans() : x(NULL), n(0), k(0), d(0), numThreads(0), converged(false),
clusterSize(NULL), centerMovement(NULL), assignment(NULL) {
#ifdef COUNT_DISTANCES
numDistances = 0;
#endif
}

void Kmeans::free() {
delete [] centerMovement;
for (int t = 0; t < numThreads; ++t) {
delete [] clusterSize[t];
}
delete [] clusterSize;
centerMovement = NULL;
clusterSize = NULL;
assignment = NULL;
n = k = d = numThreads = 0;
}



void Kmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
free();

converged = false;
x = aX;
n = x->n;
d = x->d;
k = aK;

#ifdef USE_THREADS
numThreads = aNumThreads;
pthread_barrier_init(&barrier, NULL, numThreads);
#else
numThreads = 1;
#endif

assignment = initialAssignment;
centerMovement = new double[k];
clusterSize = new int *[numThreads];
for (int t = 0; t < numThreads; ++t) {
clusterSize[t] = new int[k];
std::fill(clusterSize[t], clusterSize[t] + k, 0);
for (int i = start(t); i < end(t); ++i) {
assert(assignment[i] < k);
++clusterSize[t][assignment[i]];
}
}


#ifdef COUNT_DISTANCES
numDistances = 0;
#endif
}

void Kmeans::changeAssignment(int xIndex, int closestCluster, int threadId) {
--clusterSize[threadId][assignment[xIndex]];
++clusterSize[threadId][closestCluster];
assignment[xIndex] = closestCluster;
}

#ifdef USE_THREADS
struct ThreadInfo {
public:
ThreadInfo() : km(NULL), threadId(0), pthread_id(0) {}
Kmeans *km;
int threadId;
pthread_t pthread_id;
int numIterations;
int maxIterations;
};
#endif

void *Kmeans::runner(void *args) {
#ifdef USE_THREADS
ThreadInfo *ti = (ThreadInfo *)args;
ti->numIterations = ti->km->runThread(ti->threadId, ti->maxIterations);
#endif
return NULL;
}

int Kmeans::run(int maxIterations) {
int iterations = 0;
#ifdef USE_THREADS
{
ThreadInfo *info = new ThreadInfo[numThreads];
for (int t = 0; t < numThreads; ++t) {
info[t].km = this;
info[t].threadId = t;
info[t].maxIterations = maxIterations;
pthread_create(&info[t].pthread_id, NULL, Kmeans::runner, &info[t]);
}
// wait for everything to finish...
for (int t = 0; t < numThreads; ++t) {
pthread_join(info[t].pthread_id, NULL);
}
iterations = info[0].numIterations;
delete [] info;
}
#else
{
iterations = runThread(0, maxIterations);
}
#endif

return iterations;
}

double Kmeans::getSSE() const {
double sse = 0.0;
for (int i = 0; i < n; ++i) {
sse += pointCenterDist2(i, assignment[i]);
}
return sse;
}

void Kmeans::verifyAssignment(int iteration, int startNdx, int endNdx) const {
#ifdef VERIFY_ASSIGNMENTS
for (int i = startNdx; i < endNdx; ++i) {
// keep track of the squared distance and identity of the closest-seen
// cluster (so far)
int closest = assignment[i];
double closest_dist2 = pointCenterDist2(i, closest);
double original_closest_dist2 = closest_dist2;
// look at all centers
for (int j = 0; j < k; ++j) {
if (j == closest) {
continue;
}
double d2 = pointCenterDist2(i, j);

// determine if we found a closer center
if (d2 < closest_dist2) {
closest = j;
closest_dist2 = d2;
}
}

// if we have found a discrepancy, then print out information and crash
// the program
if (closest != assignment[i]) {
std::cerr << "assignment error:" << std::endl;
std::cerr << "iteration = " << iteration << std::endl;
std::cerr << "point index = " << i << std::endl;
std::cerr << "closest center = " << closest << std::endl;
std::cerr << "closest center dist2 = " << closest_dist2 << std::endl;
std::cerr << "assigned center = " << assignment[i] << std::endl;
std::cerr << "assigned center dist2 = " << original_closest_dist2 << std::endl;
assert(false);
}
}
#endif
}

147 changes: 147 additions & 0 deletions contrib/kmeans/kmeans.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,147 @@
#ifndef KMEANS_H
#define KMEANS_H

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*
* Kmeans is an abstract base class for algorithms which implement Lloyd's
* k-means algorithm. Subclasses provide functionality in the "runThread()"
* method.
*/

#include "kmeans_dataset.h"
#include <limits>
#include <string>
#ifdef USE_THREADS
#include <pthread.h>
#endif

class Kmeans {
public:
// Construct a K-means object to operate on the given dataset
Kmeans();
virtual ~Kmeans() { free(); }

// This method kicks off the threads that do the clustering and run
// until convergence (or until reaching maxIterations). It returns the
// number of iterations performed.
int run(int aMaxIterations = std::numeric_limits<int>::max());

// Get the cluster assignment for the given point index.
int getAssignment(int xIndex) const { return assignment[xIndex]; }

// Initialize the algorithm at the beginning of the run(), with the
// given data and initial assignment. The parameter initialAssignment
// will be modified by this algorithm and will at the end contain the
// final assignment of clusters.
virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);

// Free all memory being used by the object.
virtual void free();

// This method verifies that the current assignment is correct, by
// checking every point-center distance. For debugging.
virtual void verifyAssignment(int iteration, int startNdx, int endNdx) const;

// Compute the sum of squared errors for the data on the centers (not
// designed to be fast).
virtual double getSSE() const;

// Get the name of this clustering algorithm (to be overridden by
// subclasses).
virtual std::string getName() const = 0;

// Virtual methods for computing inner products (depending on the kernel
// being used, e.g.). For vanilla k-means these will be the standard dot
// product; for more exotic applications these will be other kernel
// functions.
virtual double pointPointInnerProduct(int x1, int x2) const = 0;
virtual double pointCenterInnerProduct(int xndx, unsigned short cndx) const = 0;
virtual double centerCenterInnerProduct(unsigned short c1, unsigned short c2) const = 0;

// Use the inner products to compute squared distances between a point
// and center.
virtual double pointCenterDist2(int x1, unsigned short cndx) const {
#ifdef COUNT_DISTANCES
++numDistances;
#endif
return pointPointInnerProduct(x1, x1) - 2 * pointCenterInnerProduct(x1, cndx) + centerCenterInnerProduct(cndx, cndx);
}

// Use the inner products to compute squared distances between two
// centers.
virtual double centerCenterDist2(unsigned short c1, unsigned short c2) const {
#ifdef COUNT_DISTANCES
++numDistances;
#endif
return centerCenterInnerProduct(c1, c1) - 2 * centerCenterInnerProduct(c1, c2) + centerCenterInnerProduct(c2, c2);
}

#ifdef COUNT_DISTANCES
#ifdef USE_THREADS
// Note: numDistances is NOT thread-safe, but it is not meant to be
// enabled in performant code.
#error Counting distances and using multiple threads is not supported.
#endif
mutable long long numDistances;
#endif

virtual Dataset const *getCenters() const { return NULL; }

protected:
// The dataset to cluster.
const Dataset *x;

// Local copies for convenience.
int n, k, d;

// Pthread primitives for multithreading.
int numThreads;
#ifdef USE_THREADS
pthread_barrier_t barrier;
#endif

// To communicate (to all threads) that we have converged.
bool converged;

// Keep track of how many points are in each cluster, divided over each
// thread.
int **clusterSize;

// centerMovement is computed in move_centers() and used to detect
// convergence (if max(centerMovement) == 0.0) and update point-center
// distance bounds (in subclasses that use them).
double *centerMovement;

// For each point in x, keep which cluster it is assigned to. By using a
// short, we assume a limited number of clusters (fewer than 2^16).
unsigned short *assignment;


// This is where each thread does its work.
virtual int runThread(int threadId, int maxIterations) = 0;

// Static entry method for pthread_create().
static void *runner(void *args);

// Assign point at xIndex to cluster newCluster, working within thread threadId.
virtual void changeAssignment(int xIndex, int newCluster, int threadId);

// Over what range in [0, n) does this thread have ownership of the
// points? end() returns one past the last owned point.
int start(int threadId) const { return n * threadId / numThreads; }
int end(int threadId) const { return start(threadId + 1); }
int whichThread(int index) const { return index * numThreads / n; }

// Convenience method for causing all threads to synchronize.
void synchronizeAllThreads() {
#ifdef USE_THREADS
pthread_barrier_wait(&barrier);
#endif
}
};

#endif

95 changes: 95 additions & 0 deletions contrib/kmeans/kmeans_dataset.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,95 @@
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/

#include "kmeans_dataset.h"
// #include <iostream>
#include <iomanip>
#include <cassert>
#include <cstring>

// print the dataset to standard output (cout), using formatting to keep the
// data in matrix format
void Dataset::print(std::ostream &out) const {
//std::ostream &out = std::cout;
out.precision(6);
int ndx = 0;
for (int i = 0; i < n; ++i) {
for (int j = 0; j < d; ++j) {
out << std::setw(13) << data[ndx++] << " ";
}
out << std::endl;
}
}

// returns a (modifiable) reference to the value in dimension "dim" from record
// "ndx"
double &Dataset::operator()(int ndx, int dim) {
# ifdef DEBUG
assert(ndx < n);
assert(dim < d);
# endif
return data[ndx * d + dim];
}

// returns a (const) reference to the value in dimension "dim" from record "ndx"
const double &Dataset::operator()(int ndx, int dim) const {
# ifdef DEBUG
assert(ndx < n);
assert(dim < d);
# endif
return data[ndx * d + dim];
}

// fill the entire dataset with value. Does NOT update sumDataSquared.
void Dataset::fill(double value) {
for (int i = 0; i < nd; ++i) {
data[i] = value;
}
}

// copy constructor -- makes a deep copy of everything in x
Dataset::Dataset(Dataset const &x) {
n = d = nd = 0;
data = sumDataSquared = NULL;
*this = x;
}

// operator= is the standard deep-copy assignment operator, which
// returns a const reference to *this.
Dataset const &Dataset::operator=(Dataset const &x) {
if (this != &x) {

// reallocate sumDataSquared and data as necessary
if (n != x.n) {
delete [] sumDataSquared;
sumDataSquared = x.sumDataSquared ? new double[x.n] : NULL;
}

if (nd != x.nd) {
delete [] data;
data = x.data ? new double[x.nd] : NULL;
}

// reflect the new sizes
n = x.n;
d = x.d;
nd = x.nd;

// copy data as appropriate
if (x.sumDataSquared) {
memcpy(sumDataSquared, x.sumDataSquared, x.n * sizeof(double));
}

if (x.data) {
memcpy(data, x.data, x.nd * sizeof(double));
}

}

// return a reference for chaining assignments
return *this;
}

85 changes: 85 additions & 0 deletions contrib/kmeans/kmeans_dataset.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,85 @@
#ifndef DATASET_H
#define DATASET_H

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*
* A Dataset class represents a collection of multidimensional records, as is
* typical in metric machine learning. Every record has the same number of
* dimensions (values), and every value must be numeric. Undefined values are
* not allowed.
*
* This particular implementation keeps all the data in a 1-dimensional array,
* and also optionally keeps extra storage for the sum of the squared values of
* each record. However, the Dataset class does NOT automatically populate or
* update the sumDataSquared values.
*/

#include <cstddef>
#include <iostream>

class Dataset {
public:
// default constructor -- constructs a completely empty dataset with no
// records
Dataset() : n(0), d(0), nd(0), data(NULL), sumDataSquared(NULL) {}

// construct a dataset of a particular size, and determine whether to
// keep the sumDataSquared
Dataset(int aN, int aD, bool keepSDS = false) : n(aN), d(aD), nd(n * d),
data(new double[nd]),
sumDataSquared(keepSDS ? new double[n] : NULL) {}

// copy constructor -- makes a deep copy of everything in x
Dataset(Dataset const &x);

// destroys the dataset safely
~Dataset() {
n = d = nd = 0;
double *dp = data, *sdsp = sumDataSquared;
data = sumDataSquared = NULL;
delete [] dp;
delete [] sdsp;
}

// operator= is the standard deep-copy assignment operator, which
// returns a const reference to *this.
Dataset const &operator=(Dataset const &x);

// allows modification of the record ndx and dimension dim
double &operator()(int ndx, int dim);

// allows const access to record ndx and dimension dim
const double &operator()(int ndx, int dim) const;

// fill the entire dataset with value. Does NOT update sumDataSquared.
void fill(double value);

// print the dataset to standard output (cout), using formatting to keep the
// data in matrix format
void print(std::ostream &out = std::cout) const;

// n represents the number of records
// d represents the dimension
// nd is a shortcut for the value n * d
int n, d, nd;

// data is an array of length n*d that stores all of the records in
// record-major (row-major) order. Thus data[0]...data[d-1] are the
// values associated with the first record.
double *data;

// sumDataSquared is an (optional) sum of squared values for every
// record. Thus,
// sumDataSquared[0] = data[0]^2 + data[1]^2 + ... + data[d-1]^2
// sumDataSquared[1] = data[d]^2 + data[d+1]^2 + ... + data[2*d-1]^2
// and so on. Note that this is the *intended* use of the sumDataSquared
// field, but that the Dataset class does NOT automatically populate or
// update the values in sumDataSquared.
double *sumDataSquared;
};

#endif

330 changes: 330 additions & 0 deletions contrib/kmeans/kmeans_general_functions.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,330 @@
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/

#include "kmeans_dataset.h"
#include "kmeans.h"
#include "kmeans_general_functions.h"
#include <cassert>
#include <cmath>
#include <algorithm>
#include <numeric>
#include <cstring>
#include <cstdio>
#include <unistd.h>

void addVectors(double *a, double const *b, int d) {
double const *end = a + d;
while (a < end) {
*(a++) += *(b++);
}
}

void subVectors(double *a, double const *b, int d) {
double const *end = a + d;
while (a < end) {
*(a++) -= *(b++);
}
}

double distance2silent(double const *a, double const *b, int d) {
double d2 = 0.0, diff;
double const *end = a + d; // one past the last valid entry in a
while (a < end) {
diff = *(a++) - *(b++);
d2 += diff * diff;
}
return d2;
}


void centerDataset(Dataset *x) {
double *xCentroid = new double[x->d];

for (int d = 0; d < x->d; ++d) {
xCentroid[d] = 0.0;
}

for (int i = 0; i < x->n; ++i) {
addVectors(xCentroid, x->data + i * x->d, x->d);
}

// compute average (divide by n)
for (int d = 0; d < x->d; ++d) {
xCentroid[d] /= x->n;
}

// re-center the dataset
const double *xEnd = x->data + x->n * x->d;
for (double *xp = x->data; xp != xEnd; xp += x->d) {
subVectors(xp, xCentroid, x->d);
}

delete [] xCentroid;
}

Dataset *init_centers(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < k; ++i) {
bool acceptable = true;
do {
acceptable = true;
chosen_pts[i] = rand() % x.n;
for (int j = 0; j < i; ++j) {
if (chosen_pts[i] == chosen_pts[j]) {
acceptable = false;
break;
}
}
} while (! acceptable);
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}

delete [] chosen_pts;

return c;
}


Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];
double *distribution = new double[x.n];

// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<double>::max();
dist2[i].second = i;
}

// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;

while (ndx < k) {
double sum_distribution = 0.0;
// look for the point that is furthest from any center
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
double d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}

sum_distribution += dist2[i].first;
}

// sort the examples by their distance from centers
sort(dist2, dist2 + x.n);

// turn distribution into a CDF
distribution[0] = dist2[0].first / sum_distribution;
for (int i = 1; i < x.n; ++i) {
distribution[i] = distribution[i - 1] + dist2[i].first / sum_distribution;
}

// choose a random interval according to the new distribution
double r = (double)rand() / (double)RAND_MAX;
double *new_center_ptr = std::lower_bound(distribution, distribution + x.n, r);
int distribution_ndx = (int)(new_center_ptr - distribution);
chosen_pts[ndx] = dist2[distribution_ndx].second;
/*
cout << "chose " << distribution_ndx << " which is actually "
<< chosen_pts[ndx] << " with distance "
<< dist2[distribution_ndx].first << std::endl;
*/

++ndx;
}

Dataset *c = new Dataset(k, x.d);

for (int i = 0; i < k; ++i) {
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}

delete [] chosen_pts;
delete [] dist2;
delete [] distribution;

return c;
}


Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
int *chosen_pts = new int[k];
std::pair<double, int> *dist2 = new std::pair<double, int>[x.n];

// initialize dist2
for (int i = 0; i < x.n; ++i) {
dist2[i].first = std::numeric_limits<double>::max();
dist2[i].second = i;
}

// choose the first point randomly
int ndx = 1;
chosen_pts[ndx - 1] = rand() % x.n;

while (ndx < k) {
double sum_distribution = 0.0;
// look for the point that is furthest from any center
double max_dist = 0.0;
for (int i = 0; i < x.n; ++i) {
int example = dist2[i].second;
double d2 = 0.0, diff;
for (int j = 0; j < x.d; ++j) {
diff = x(example,j) - x(chosen_pts[ndx - 1],j);
d2 += diff * diff;
}
if (d2 < dist2[i].first) {
dist2[i].first = d2;
}

if (dist2[i].first > max_dist) {
max_dist = dist2[i].first;
}

sum_distribution += dist2[i].first;
}

bool unique = true;

do {
// choose a random interval according to the new distribution
double r = sum_distribution * (double)rand() / (double)RAND_MAX;
double sum_cdf = dist2[0].first;
int cdf_ndx = 0;
while (sum_cdf < r) {
sum_cdf += dist2[++cdf_ndx].first;
}
chosen_pts[ndx] = cdf_ndx;

for (int i = 0; i < ndx; ++i) {
unique = unique && (chosen_pts[ndx] != chosen_pts[i]);
}
} while (! unique);

++ndx;
}

Dataset *c = new Dataset(k, x.d);
for (int i = 0; i < c->n; ++i) {
double *cdp = c->data + i * x.d;
memcpy(cdp, x.data + chosen_pts[i] * x.d, sizeof(double) * x.d);
if (c->sumDataSquared) {
c->sumDataSquared[i] = std::inner_product(cdp, cdp + x.d, cdp, 0.0);
}
}

delete [] chosen_pts;
delete [] dist2;

return c;
}


/**
* in MB
*/
double getMemoryUsage() {
char buf[30];
snprintf(buf, 30, "/proc/%u/statm", (unsigned)getpid());
FILE* pf = fopen(buf, "r");
unsigned int totalProgramSizeInPages = 0;
unsigned int residentSetSizeInPages = 0;
if (pf) {
int numScanned = fscanf(pf, "%u %u" /* %u %u %u %u %u"*/, &totalProgramSizeInPages, &residentSetSizeInPages);
if (numScanned != 2) {
return 0.0;
}
}

fclose(pf);
pf = NULL;

double sizeInKilobytes = residentSetSizeInPages * 4.0; // assumes 4096 byte page
// getconf PAGESIZE

return sizeInKilobytes;
}


void assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
for (int i = 0; i < x.n; ++i) {
double shortestDist2 = std::numeric_limits<double>::max();
int closest = 0;
for (int j = 0; j < c.n; ++j) {
double d2 = 0.0, *a = x.data + i * x.d, *b = c.data + j * x.d;
for (; a != x.data + (i + 1) * x.d; ++a, ++b) {
d2 += (*a - *b) * (*a - *b);
}
if (d2 < shortestDist2) {
shortestDist2 = d2;
closest = j;
}
}
assignment[i] = closest;
}
}

rusage get_time() {
rusage now;
getrusage(RUSAGE_SELF, &now);
return now;
}


double get_wall_time(){
struct timeval time;
if (gettimeofday(&time,NULL)){
// Handle error
return 0;
}
return (double)time.tv_sec + (double)time.tv_usec * .000001;
}

int timeval_subtract(timeval *result, timeval *x, timeval *y) {
/* Perform the carry for the later subtraction by updating y. */
if (x->tv_usec < y->tv_usec) {
int nsec = (y->tv_usec - x->tv_usec) / 1000000 + 1;
y->tv_usec -= 1000000 * nsec;
y->tv_sec += nsec;
}
if (x->tv_usec - y->tv_usec > 1000000) {
int nsec = (x->tv_usec - y->tv_usec) / 1000000;
y->tv_usec += 1000000 * nsec;
y->tv_sec -= nsec;
}

/* Compute the time remaining to wait. tv_usec is certainly positive. */
result->tv_sec = x->tv_sec - y->tv_sec;
result->tv_usec = x->tv_usec - y->tv_usec;

/* Return 1 if result
* is negative. */
return x->tv_sec < y->tv_sec;
}

double elapsed_time(rusage *start) {
rusage now;
timeval diff;
getrusage(RUSAGE_SELF, &now);
timeval_subtract(&diff, &now.ru_utime, &start->ru_utime);

return (double)diff.tv_sec + (double)diff.tv_usec / 1e6;
}
91 changes: 91 additions & 0 deletions contrib/kmeans/kmeans_general_functions.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,91 @@
#ifndef GENERAL_KMEANS_FUNCTIONS_H
#define GENERAL_KMEANS_FUNCTIONS_H

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*
* Generally useful functions.
*/


#include <iostream>
#include <string>
#include <sys/resource.h>
#include <sys/time.h>
#include "kmeans_dataset.h"

/* Add together two vectors, and put the result in the first argument.
* Calculates a = a + b
*
* Parameters:
* a -- vector to add, and the result of the operation
* b -- vector to add to a
* d -- the dimension
* Return value: none
*/
void addVectors(double *a, double const *b, int d);

/* Subtract two vectors, and put the result in the first argument. Calculates
* a = a - b
*
* Parameters:
* a -- vector to subtract from, and the result of the operation
* b -- vector to subtract
* d -- the dimension
* Return value: none
*/
void subVectors(double *a, double const *b, int d);

/* Initialize the centers randomly. Choose random records from x as the initial
* values for the centers. Assumes that c uses the sumDataSquared field.
*
* Parameters:
* x -- records that are being clustered (n * d)
* c -- centers to be initialized. Should be pre-allocated with the number of
* centers desired, and dimension.
* Return value: none
*/
Dataset *init_centers(Dataset const &x, unsigned short k);

/* Initialize the centers randomly using K-means++.
*
* Parameters:
* x -- records that are being clustered (n * d)
* c -- centers to be initialized. Should be pre-allocated with the number of
* centers desired, and dimension.
* Return value: none
*/
Dataset *init_centers_kmeanspp(Dataset const &x, unsigned short k);
Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k);

/* Print an array (templated). Convenience function.
*
* Parameters:
* arr -- the array to print
* length -- the length of the array
* separator -- the string to put between each pair of printed elements
* Return value: none
*/
template <class T>
void printArray(T const *arr, int length, std::string separator) {
for (int i = 0; i < length; ++i) {
if (i > 0) {
std::cout << separator;
}
std::cout << arr[i];
}
}

double getMemoryUsage();
rusage get_time();
double get_wall_time();
int timeval_subtract(timeval *result, timeval *x, timeval *y);
double elapsed_time(rusage *start);

void centerDataset(Dataset *x);

void assign(Dataset const &x, Dataset const &c, unsigned short *assignment);

#endif
106 changes: 106 additions & 0 deletions contrib/kmeans/original_space_kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,106 @@
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/

#include "original_space_kmeans.h"
#include "kmeans_general_functions.h"
#include <cmath>
#include <cassert>
#include <numeric>

OriginalSpaceKmeans::OriginalSpaceKmeans() : centers(NULL), sumNewCenters(NULL) { }

void OriginalSpaceKmeans::free() {
for (int t = 0; t < numThreads; ++t) {
delete sumNewCenters[t];
}
Kmeans::free();
delete centers;
delete [] sumNewCenters;
centers = NULL;
sumNewCenters = NULL;
}

/* This method moves the newCenters to their new locations, based on the
* sufficient statistics in sumNewCenters. It also computes the centerMovement
* and the center that moved the furthest.
*
* Parameters: none
*
* Return value: index of the furthest-moving centers
*/
int OriginalSpaceKmeans::move_centers() {
int furthestMovingCenter = 0;
for (int j = 0; j < k; ++j) {
centerMovement[j] = 0.0;
int totalClusterSize = 0;
for (int t = 0; t < numThreads; ++t) {
totalClusterSize += clusterSize[t][j];
}
if (totalClusterSize > 0) {
for (int dim = 0; dim < d; ++dim) {
double z = 0.0;
for (int t = 0; t < numThreads; ++t) {
z += (*sumNewCenters[t])(j,dim);
}
z /= totalClusterSize;
centerMovement[j] += (z - (*centers)(j, dim)) * (z - (*centers)(j, dim));
(*centers)(j, dim) = z;
}
}
centerMovement[j] = sqrt(centerMovement[j]);

if (centerMovement[furthestMovingCenter] < centerMovement[j]) {
furthestMovingCenter = j;
}
}

#ifdef COUNT_DISTANCES
numDistances += k;
#endif

return furthestMovingCenter;
}

void OriginalSpaceKmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
Kmeans::initialize(aX, aK, initialAssignment, aNumThreads);

centers = new Dataset(k, d);
sumNewCenters = new Dataset *[numThreads];
centers->fill(0.0);

for (int t = 0; t < numThreads; ++t) {
sumNewCenters[t] = new Dataset(k, d, false);
sumNewCenters[t]->fill(0.0);
for (int i = start(t); i < end(t); ++i) {
addVectors(sumNewCenters[t]->data + assignment[i] * d, x->data + i * d, d);
}
}

// put the centers at their initial locations, based on clusterSize and
// sumNewCenters
move_centers();
}

void OriginalSpaceKmeans::changeAssignment(int xIndex, int closestCluster, int threadId) {
unsigned short oldAssignment = assignment[xIndex];
Kmeans::changeAssignment(xIndex, closestCluster, threadId);
double *xp = x->data + xIndex * d;
subVectors(sumNewCenters[threadId]->data + oldAssignment * d, xp, d);
addVectors(sumNewCenters[threadId]->data + closestCluster * d, xp, d);
}

double OriginalSpaceKmeans::pointPointInnerProduct(int x1, int x2) const {
return std::inner_product(x->data + x1 * d, x->data + (x1 + 1) * d, x->data + x2 * d, 0.0);
}

double OriginalSpaceKmeans::pointCenterInnerProduct(int xndx, unsigned short cndx) const {
return std::inner_product(x->data + xndx * d, x->data + (xndx + 1) * d, centers->data + cndx * d, 0.0);
}

double OriginalSpaceKmeans::centerCenterInnerProduct(unsigned short c1, unsigned short c2) const {
return std::inner_product(centers->data + c1 * d, centers->data + (c1 + 1) * d, centers->data + c2 * d, 0.0);
}

54 changes: 54 additions & 0 deletions contrib/kmeans/original_space_kmeans.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,54 @@
#ifndef ORIGINAL_SPACE_KMEANS_H
#define ORIGINAL_SPACE_KMEANS_H

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*
* OriginalSpaceKmeans is a base class for other algorithms that operate in the
* same space as the data being clustered (as opposed to kernelized k-means
* algorithms, which operate in kernel space).
*/

#include "kmeans.h"

/* Cluster with the cluster centers living in the original space (with the
* data). This is as opposed to a kernelized version of k-means, where the
* center points might not be explicitly represented. This is also an abstract
* class.
*/
class OriginalSpaceKmeans : public Kmeans {
public:
OriginalSpaceKmeans();
virtual ~OriginalSpaceKmeans() { free(); }
virtual void free();
virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);

virtual double pointPointInnerProduct(int x1ndx, int x2ndx) const;
virtual double pointCenterInnerProduct(int xndx, unsigned short cndx) const;
virtual double centerCenterInnerProduct(unsigned short c1ndx, unsigned short c2ndx) const;

virtual Dataset const *getCenters() const { return centers; }

protected:
// Move the centers to the average of their current assigned points,
// compute the distance moved by each center, and return the index of
// the furthest-moving center.
int move_centers();

virtual void changeAssignment(int xIndex, int closestCluster, int threadId);

// The set of centers we are operating on.
Dataset *centers;

// sumNewCenters and centerCount provide sufficient statistics to
// quickly calculate the changing locations of the centers. Whenever a
// point changes cluster membership, we subtract (add) it from (to) the
// row in sumNewCenters associated with its old (new) cluster. We also
// decrement (increment) centerCount for the old (new) cluster.
Dataset **sumNewCenters;

};

#endif
1,001 changes: 1,001 additions & 0 deletions contrib/kmeans/smallDataset.txt

Large diffs are not rendered by default.

79 changes: 79 additions & 0 deletions contrib/kmeans/triangle_inequality_base_kmeans.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,79 @@
/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*/

#include "triangle_inequality_base_kmeans.h"
#include "kmeans_general_functions.h"
#include <cassert>
#include <limits>
#include <cmath>

void TriangleInequalityBaseKmeans::free() {
OriginalSpaceKmeans::free();
delete [] s;
delete [] upper;
delete [] lower;
s = NULL;
upper = NULL;
lower = NULL;
}

/* This function computes the inter-center distances, keeping only the closest
* distances, and updates "s". After this, s[j] will contain the distance
* between center j and its closest other center, divided by two. The division
* here saves repeated work later, since we always will need the distance / 2.
*
* Parameters: none
*
* Return value: none
*/
// TODO: parallelize this
void TriangleInequalityBaseKmeans::update_s(int threadId) {
// initialize
for (int c1 = 0; c1 < k; ++c1) {
if (c1 % numThreads == threadId) {
s[c1] = std::numeric_limits<double>::max();
}
}
// compute inter-center squared distances between all pairs
for (int c1 = 0; c1 < k; ++c1) {
if (c1 % numThreads == threadId) {
for (int c2 = 0; c2 < k; ++c2) {
if (c2 == c1) {
continue;
}
double d2 = centerCenterDist2(c1, c2);
if (d2 < s[c1]) { s[c1] = d2; }
}
// take the root and divide by two
s[c1] = sqrt(s[c1]) / 2.0;
}
}
}


/* This function initializes the upper/lower bounds, assignment, centerCounts,
* and sumNewCenters. It sets the bounds to invalid values which will force the
* first iteration of k-means to set them correctly. NB: subclasses should set
* numLowerBounds appropriately before entering this function.
*
* Parameters: none
*
* Return value: none
*/
void TriangleInequalityBaseKmeans::initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads) {
OriginalSpaceKmeans::initialize(aX, aK, initialAssignment, aNumThreads);

s = new double[k];
upper = new double[n];
lower = new double[n * numLowerBounds];

// start with invalid bounds and assignments which will force the first
// iteration of k-means to do all its standard work
std::fill(s, s + k, 0.0);
std::fill(upper, upper + n, std::numeric_limits<double>::max());
std::fill(lower, lower + n * numLowerBounds, 0.0);
}

43 changes: 43 additions & 0 deletions contrib/kmeans/triangle_inequality_base_kmeans.h
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
#ifndef TRIANGLE_INEQUALITY_BASE_KMEANS_H
#define TRIANGLE_INEQUALITY_BASE_KMEANS_H

/* Authors: Greg Hamerly and Jonathan Drake
* Feedback: hamerly@cs.baylor.edu
* See: http://cs.baylor.edu/~hamerly/software/kmeans.php
* Copyright 2014
*
* This class is an abstract base class for several other algorithms that use
* upper & lower bounds to avoid distance calculations in k-means.
*/

#include "original_space_kmeans.h"

class TriangleInequalityBaseKmeans : public OriginalSpaceKmeans {
public:
TriangleInequalityBaseKmeans() : numLowerBounds(0), s(NULL), upper(NULL), lower(NULL) {}
virtual ~TriangleInequalityBaseKmeans() { free(); }

virtual void initialize(Dataset const *aX, unsigned short aK, unsigned short *initialAssignment, int aNumThreads);
virtual void free();

protected:
void update_s(int threadId);

// The number of lower bounds being used by this algorithm.
int numLowerBounds;

// Half the distance between each center and its closest other center.
double *s;

// One upper bound for each point on the distance between that point and
// its assigned (closest) center.
double *upper;

// Lower bound(s) for each point on the distance between that point and
// the centers being tracked for lower bounds, which may be 1 to k.
// Actual size is n * numLowerBounds.
double *lower;
};

#endif

13 changes: 10 additions & 3 deletions src/src.pro
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,8 @@ INCLUDEPATH += ../qwt/src \
../contrib/qtsolutions/qwtcurve \
../contrib/lmfit \
../contrib/levmar \
../contrib/boost
../contrib/boost \
../contrib/kmeans

DEFINES += QXT_STATIC

Expand Down Expand Up @@ -752,7 +753,10 @@ HEADERS += ../contrib/qtsolutions/codeeditor/codeeditor.h ../contrib/qtsolutions
../contrib/qzip/zipwriter.h ../contrib/lmfit/lmcurve.h ../contrib/lmfit/lmcurve_tyd.h \
../contrib/lmfit/lmmin.h ../contrib/lmfit/lmstruct.h \
../contrib/levmar/compiler.h ../contrib/levmar/levmar.h ../contrib/levmar/lm.h ../contrib/levmar/misc.h \
../contrib/boost/GeometricTools_BSplineCurve.h
../contrib/boost/GeometricTools_BSplineCurve.h \
../contrib/kmeans/kmeans_dataset.h ../contrib/kmeans/kmeans_general_functions.h ../contrib/kmeans/hamerly_kmeans.h \
../contrib/kmeans/kmeans.h ../contrib/kmeans/original_space_kmeans.h ../contrib/kmeans/triangle_inequality_base_kmeans.h


# Train View
HEADERS += Train/AddDeviceWizard.h Train/CalibrationData.h Train/ComputrainerController.h Train/Computrainer.h Train/DeviceConfiguration.h \
Expand Down Expand Up @@ -859,7 +863,10 @@ SOURCES += ../contrib/qtsolutions/codeeditor/codeeditor.cpp ../contrib/qtsolutio
../contrib/levmar/Axb.c ../contrib/levmar/lm_core.c ../contrib/levmar/lmbc_core.c \
../contrib/levmar/lmblec_core.c ../contrib/levmar/lmbleic_core.c ../contrib/levmar/lmlec.c ../contrib/levmar/misc.c \
../contrib/levmar/Axb_core.c ../contrib/levmar/lm.c ../contrib/levmar/lmbc.c ../contrib/levmar/lmblec.c ../contrib/levmar/lmbleic.c \
../contrib/levmar/lmlec_core.c ../contrib/levmar/misc_core.c
../contrib/levmar/lmlec_core.c ../contrib/levmar/misc_core.c \
../contrib/kmeans/kmeans_dataset.cpp ../contrib/kmeans/kmeans_general_functions.cpp ../contrib/kmeans/hamerly_kmeans.cpp \
../contrib/kmeans/kmeans.cpp ../contrib/kmeans/original_space_kmeans.cpp ../contrib/kmeans/triangle_inequality_base_kmeans.cpp


## Train View Components
SOURCES += Train/AddDeviceWizard.cpp Train/CalibrationData.cpp Train/ComputrainerController.cpp Train/Computrainer.cpp Train/DeviceConfiguration.cpp \
Expand Down