Skip to content

Commit

Permalink
DataFilter - kmeans()
Browse files Browse the repository at this point in the history
.. kmeans(centers|assignments, k, dim1, dim2 .. dimn)

   perform a k-means clustering on data with multiple dimensions
   and return either the cluster centers or the per-point assignments.

   the return values are ordered so they can be displayed
   easily in an overview table e.g.

   values {
       kmeans(centers, 3, metrics(TSS), metrics(IF));
   }

.. will look at how we might plot these in charts with either
   color coding of points or perhaps voronoi diagrams.
  • Loading branch information
liversedge committed Sep 28, 2021
1 parent 4c6c8e6 commit 4c72088
Show file tree
Hide file tree
Showing 6 changed files with 259 additions and 4 deletions.
2 changes: 1 addition & 1 deletion contrib/kmeans/kmeans_general_functions.cpp
Expand Up @@ -237,7 +237,7 @@ Dataset *init_centers_kmeanspp_v2(Dataset const &x, unsigned short k) {
}


void assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
void kmeans_assign(Dataset const &x, Dataset const &c, unsigned short *assignment) {
for (int i = 0; i < x.n; ++i) {
double shortestDist2 = std::numeric_limits<double>::max();
int closest = 0;
Expand Down
2 changes: 1 addition & 1 deletion contrib/kmeans/kmeans_general_functions.h
Expand Up @@ -78,6 +78,6 @@ void printArray(T const *arr, int length, std::string separator) {

void centerDataset(Dataset *x);

void assign(Dataset const &x, Dataset const &c, unsigned short *assignment);
void kmeans_assign(Dataset const &x, Dataset const &c, unsigned short *assignment);

#endif
46 changes: 46 additions & 0 deletions src/Core/DataFilter.cpp
Expand Up @@ -37,6 +37,7 @@
#include "lmcurve.h"
#include "LTMTrend.h" // for LR when copying CP chart filtering mechanism
#include "WPrime.h" // for LR when copying CP chart filtering mechanism
#include "FastKmeans.h" // for kmeans(...)

#ifdef GC_HAVE_SAMPLERATE
// we have libsamplerate
Expand Down Expand Up @@ -379,6 +380,9 @@ static struct {
{ "pdfgamma", 3 }, // pdfgamma(a,b, x) as above for the gamma distribution
{ "cdfgamma", 3 }, // cdfgamma(a,b, x) as above for the gamma distribution

{ "kmeans", 0 }, // kmeans(centers|assignments, k, dim1, dim2, dim3 .. dimn) - return the centers or cluster assignment
// from a k-means clustering of the data with n dimensions (commonly just 2: x and y)


// add new ones above this line
{ "", -1 }
Expand Down Expand Up @@ -2023,6 +2027,21 @@ void Leaf::validateFilter(Context *context, DataFilterRuntime *df, Leaf *leaf)
}
}
}
} else if (leaf->function == "kmeans") {

    // kmeans(centers|assignments, k, dim1, dim2 .. dimn)
    // needs the selector symbol, k and at least two dimensions
    if (leaf->fparms.count() < 4 || leaf->fparms[0]->type != Leaf::Symbol) {
        leaf->inerror = true;
        DataFiltererrors << QString(tr("kmeans(centers|assignments, k, dim1, dim2, dimn)"));
    } else {
        QString symbol=*(leaf->fparms[0]->lvalue.n);
        if (symbol != "centers" && symbol != "assignments") {
            leaf->inerror = true;
            // QString::arg() replaces numbered place markers (%1), not printf-style %s
            DataFiltererrors << QString(tr("kmeans(centers|assignments, k, dim1, dim2, dimn) - %1 unknown")).arg(symbol);
        } else {
            // the selector is a bare symbol, so only k and the dimensions are validated
            for(int i=1; i<leaf->fparms.count(); i++) validateFilter(context, df, leaf->fparms[i]);
        }
    }

} else if (leaf->function == "metrics" || leaf->function == "metricstrings" ||
leaf->function == "aggmetrics" || leaf->function == "aggmetricstrings") {

Expand Down Expand Up @@ -4613,6 +4632,33 @@ Result Leaf::eval(DataFilterRuntime *df, Leaf *leaf, const Result &x, long it, R
return returning;
}

if (leaf->function == "kmeans") {
    // kmeans(centers|assignments, k, dim1, dim2, dim3)

    // returned unchanged when the clustering does not run successfully
    Result returning(0);

    // the first parameter is a symbol selecting what to return, anything
    // other than "centers" is treated as a request for the assignments
    QString symbol = *(leaf->fparms[0]->lvalue.n);
    bool wantcenters = (symbol == "centers");

    // get k
    int k = eval(df, leaf->fparms[1],x, it, m, p, c, s, d).number();

    // automatic storage, so the wrapper and the buffers it owns are released
    // on every path out of this branch (a heap allocated instance was
    // previously never deleted and leaked on each evaluation)
    FastKmeans kmeans;

    // loop through the dimensions
    for(int i=2; i<leaf->fparms.count(); i++)
        kmeans.addDimension(eval(df, leaf->fparms[i],x, it, m, p, c, s, d).asNumeric());

    // calculate
    if (kmeans.run(k)) {
        if (wantcenters) returning = kmeans.centers();
        else returning = kmeans.assignments();
    }

    return returning;
}

if (leaf->function == "metrics" || leaf->function == "metricstrings" ||
leaf->function == "aggmetrics" || leaf->function == "aggmetricstrings") {

Expand Down
141 changes: 141 additions & 0 deletions src/Metrics/FastKmeans.cpp
@@ -0,0 +1,141 @@
/*
* Copyright (c) 2021 Mark Liversedge (liversedge@gmail.com)
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "FastKmeans.h"

#include "kmeans_general_functions.h"

// start with nothing allocated; length_ and k_ stay at -1 until a
// dimension has been added and run() has been called respectively
FastKmeans::FastKmeans()
    : kmeans(NULL),
      data(NULL),
      assignments_(NULL),
      centers_(NULL),
      length_(-1),
      k_(-1)
{
}
// release whatever run() allocated; deleting a null pointer is a no-op
// so members that were never set need no guard
FastKmeans::~FastKmeans()
{
    delete data;
    delete kmeans;
    delete centers_;
    delete [] assignments_;
}

// Add one dimension (one value per point) to the data to be clustered.
//
// A copy of data is stored. All dimensions are kept at the length of the
// longest one seen so far, shorter ones are padded with zeroes, but really
// the caller should make sure they match.
void
FastKmeans::addDimension(QVector<double> &data)
{
    // take a copy
    dimension.append(data);
    const int index = dimension.count() - 1;
    const int incoming = data.length();

    // the first dimension sets the length, no resizing needed
    if (index == 0) {
        length_ = incoming;
        return;
    }

    if (incoming > length_) {

        // if longer, we need to grow and zero-pad everyone else
        for(int i=0; i<index; i++) {
            dimension[i].resize(incoming);
            for(int j=length_; j<incoming; j++)
                dimension[i][j]=0;
        }
        length_ = incoming;

    } else if (incoming < length_) {

        // if shorter, we need to grow ours and zero-pad the tail
        // (the padding runs from the incoming length up to length_; the
        // bounds were previously reversed so the loop never executed)
        dimension[index].resize(length_);
        for(int j=incoming; j<length_; j++)
            dimension[index][j]=0;
    }
}

// find centers and assignments for k clusters
bool
FastKmeans::run(int k)
{
// no data, or dimensions
if (k <2 || length_ <= 0 || dimension.count() <= 0) return false;

// set number if clusters we looked for
k_ = k;

// if we have old data, delete it
if (data) delete data;
if (kmeans) delete kmeans;
if (centers_) delete centers_;
if (assignments_) delete [] assignments_;

// lets get a new one
kmeans = new HamerlyKmeans();
data = new Dataset(length(), dim());

// now fill the data set with our data
int index=0;
for(int i=0; i<length(); i++)
for(int j=0; j<dim(); j++)
data->data[index++] = dimension[j][i];

// initialise centers
centers_ = init_centers_kmeanspp_v2(*data, k_);

// initialise assignments
assignments_ = new unsigned short[length()];

// setup
kmeans_assign(*data, *centers_, assignments_);
kmeans->initialize(data, k, assignments_, 1);

// run the algorithm, max out at 10,000 iterations
// it returns true or false if it succeeded
return kmeans->run(10000);
}

// get centers (k x dimensions), empty if run() has never created the
// algorithm
QVector<double>
FastKmeans::centers()
{
    QVector<double> result;

    if (kmeans != NULL) {

        Dataset const *found = kmeans->getCenters();

        // the algorithm stores one center after another (d1,d2,d3,d1,d2,d3 ..)
        // we hand them back grouped by dimension (d1,d1,d1,d2,d2,d2 ..)
        for(int column=0; column<dim(); column++) {
            for(int cluster=0; cluster<k(); cluster++) {
                result.append(found->data[(cluster * dim()) + column]);
            }
        }
    }

    return result;
}

// get assignments - one cluster index per point, empty if run() has
// never created the algorithm
QVector<double>
FastKmeans::assignments()
{
    QVector<double> result;

    if (kmeans != NULL) {

        // refresh the assignments against the centers the algorithm ended with
        Dataset const *found = kmeans->getCenters();
        kmeans_assign(*data, *found, assignments_);

        // hand them back as doubles (datafilter likes these)
        for (int point = 0; point < data->n; point++) {
            result.append(assignments_[point]);
        }
    }

    return result;
}

68 changes: 68 additions & 0 deletions src/Metrics/FastKmeans.h
@@ -0,0 +1,68 @@
/*
* Copyright (c) 2021 Mark Liversedge (liversedge@gmail.com)
*
* This program is free software; you can redistribute it and/or modify it
* under the terms of the GNU General Public License as published by the Free
* Software Foundation; either version 2 of the License, or (at your option)
* any later version.
*
* This program is distributed in the hope that it will be useful, but WITHOUT
* ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
* FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for
* more details.
*
* You should have received a copy of the GNU General Public License along
* with this program; if not, write to the Free Software Foundation, Inc., 51
* Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA
*/

#include "hamerly_kmeans.h"
#include "kmeans_dataset.h"
#include <QVector>
#include <QList>

#ifndef _GC_FastKmeans_h
#define _GC_FastKmeans_h 1

// Wrapper around the Hamerly k-means implementation: add the dimensions
// of the data one at a time, run for k clusters and then read back the
// centers or the per-point cluster assignments as QVector<double>.
class FastKmeans
{
    public:

        // since we are a wrapper around algorithms we need
        // to initialise and cleanup subordinates, especially
        // since they typically use non-QT containers
        FastKmeans();
        ~FastKmeans();

        // all dimensions are resized to the largest
        // and filled with zeroes, but really the caller
        // should make sure they match
        void addDimension(QVector<double> &data);

        // find centers and assignments for k clusters, returns false
        // when there is nothing to cluster or the algorithm fails
        bool run(int k);

        // get centers (k x dimensions), grouped by dimension - the k values
        // for the first dimension, then the k values for the second etc
        QVector<double> centers();

        // get assignments - n indexes in datafilter order (use in overview tables etc)
        QVector<double> assignments();

        int length() const { return length_; }          // number of points, -1 before any dimension is added
        int dim() const { return dimension.count(); }   // number of dimensions to a point
        int k() const { return k_; }                    // number of clusters used, -1 before run

    private:

        // we own raw pointers that are freed in the destructor, so a
        // copy would free them twice - declared but never defined to
        // make any attempt to copy fail at compile or link time
        FastKmeans(const FastKmeans &);
        FastKmeans &operator=(const FastKmeans &);

        HamerlyKmeans *kmeans;          // the algorithm we use
        Dataset *data;                  // the points handed to the algorithm, rebuilt by run
        unsigned short *assignments_;   // updated with the cluster assignments
        Dataset *centers_;              // initial centers chosen when run starts

        QList<QVector<double> > dimension;  // our copy of each dimension added

        int length_;    // updated as we add dimensions, but really should be the same
        int k_;         // updated when we run
};

#endif
4 changes: 2 additions & 2 deletions src/src.pro
Expand Up @@ -741,7 +741,7 @@ HEADERS += Gui/AboutDialog.h Gui/AddIntervalDialog.h Gui/AnalysisSidebar.h Gui/C
HEADERS += Metrics/Banister.h Metrics/CPSolver.h Metrics/Estimator.h Metrics/ExtendedCriticalPower.h Metrics/HrZones.h Metrics/PaceZones.h \
Metrics/PDModel.h Metrics/PMCData.h Metrics/PowerProfile.h Metrics/RideMetadata.h Metrics/RideMetric.h Metrics/SpecialFields.h \
Metrics/Statistic.h Metrics/UserMetricParser.h Metrics/UserMetricSettings.h Metrics/VDOTCalculator.h Metrics/WPrime.h Metrics/Zones.h \
Metrics/BlinnSolver.h
Metrics/BlinnSolver.h Metrics/FastKmeans.h

## Planning and Compliance
HEADERS += Planning/PlanningWindow.h
Expand Down Expand Up @@ -850,7 +850,7 @@ SOURCES += Metrics/aBikeScore.cpp Metrics/aCoggan.cpp Metrics/AerobicDecoupling.
Metrics/SwimMetrics.cpp Metrics/SpecialFields.cpp Metrics/Statistic.cpp Metrics/SustainMetric.cpp Metrics/SwimScore.cpp \
Metrics/TimeInZone.cpp Metrics/TRIMPPoints.cpp Metrics/UserMetric.cpp Metrics/UserMetricParser.cpp Metrics/VDOTCalculator.cpp \
Metrics/VDOT.cpp Metrics/WattsPerKilogram.cpp Metrics/WPrime.cpp Metrics/Zones.cpp Metrics/HrvMetrics.cpp Metrics/BlinnSolver.cpp \
Metrics/RowMetrics.cpp
Metrics/RowMetrics.cpp Metrics/FastKmeans.cpp

## Planning and Compliance
SOURCES += Planning/PlanningWindow.cpp
Expand Down

0 comments on commit 4c72088

Please sign in to comment.