Permalink
Browse files

Rough draft of Clustering package

  • Loading branch information...
0 parents commit 641ba8f67da06670b0243226780d76609e6a2cc8 @johnmyleswhite johnmyleswhite committed Nov 24, 2012
Showing with 233 additions and 0 deletions.
  1. +22 −0 LICENSE.md
  2. +25 −0 README.md
  3. 0 REQUIRE
  4. +21 −0 examples/dp_means.jl
  5. +12 −0 examples/k_means.jl
  6. +9 −0 src/Clustering.jl
  7. +74 −0 src/dp_means.jl
  8. +62 −0 src/k_means.jl
  9. +8 −0 src/types.jl
  10. 0 test/01.jl
@@ -0,0 +1,22 @@
+Clustering.jl is licensed under the MIT License:
+
+> Copyright (c) 2012: John Myles White and other contributors.
+>
+> Permission is hereby granted, free of charge, to any person obtaining
+> a copy of this software and associated documentation files (the
+> "Software"), to deal in the Software without restriction, including
+> without limitation the rights to use, copy, modify, merge, publish,
+> distribute, sublicense, and/or sell copies of the Software, and to
+> permit persons to whom the Software is furnished to do so, subject to
+> the following conditions:
+>
+> The above copyright notice and this permission notice shall be
+> included in all copies or substantial portions of the Software.
+>
+> THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND,
+> EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF
+> MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND
+> NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE
+> LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION
+> OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION
+> WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE.
@@ -0,0 +1,25 @@
+# Installation
+
+    using Pkg
+    Pkg.add("Clustering")
+
+# Functionality
+
+* k_means
+* dp_means
+
+# Examples
+
+    using Clustering
+
+ srand(1)
+
+ n = 100
+
+    x = vcat(hcat(randn(n), randn(n)),
+             hcat(randn(n) .+ 10, randn(n) .+ 10))
+ y = vcat(zeros(n), ones(n))
+
+ k_means(x, 2)
+ dp_means(x, 3.0)
No changes.
@@ -0,0 +1,21 @@
using Clustering

# Draw `n` points from an equal mixture of four unit-variance, axis-aligned
# 2-D Gaussians centered at (0,0), (5,5), (10,0) and (15,-5).
# Returns an n-by-2 matrix with one point per row.
function generate_data(n::Integer)
    data = zeros(n, 2)
    mu_x = [0.0, 5.0, 10.0, 15.0]
    mu_y = [0.0, 5.0, 0.0, -5.0]

    for i in 1:n
        # rand(1:4) replaces the long-removed randi(4); randn() replaces
        # the allocating randn(1)[1].
        component = rand(1:4)
        data[i, 1] = randn() + mu_x[component]
        data[i, 2] = randn() + mu_y[component]
    end

    data
end

# Default sample size for the demo.
generate_data() = generate_data(1000)

data = generate_data()
dp_means(data, 0.5)
@@ -0,0 +1,12 @@
using Clustering
using Random

# Fixed seed so the demo is reproducible (srand was renamed Random.seed!).
Random.seed!(1)

n = 100

# Two well-separated Gaussian blobs: one at the origin, one at (10, 10).
# Scalar + array arithmetic is an error in current Julia, so broadcast
# the offset with `.+`.
x = vcat(hcat(randn(n), randn(n)),
         hcat(randn(n) .+ 10, randn(n) .+ 10))

# Ground-truth labels; unused by k_means, kept for eyeballing the result.
y = vcat(zeros(n), ones(n))

k_means(x, 2)
@@ -0,0 +1,9 @@
# Package entry point: loads the result type and the two clustering
# algorithms, and exports their public names.
module Clustering
    # mean and norm moved out of Base into stdlib modules.
    using Statistics: mean
    using LinearAlgebra: norm

    export k_means, dp_means

    # `include` with paths relative to this file replaces the removed
    # `load("Clustering/src/...")` calls; `using Base` is implicit.
    include("types.jl")
    include("k_means.jl")
    include("dp_means.jl")
end
@@ -0,0 +1,74 @@
# DP-means clustering (Kulis & Jordan, 2012) for 2-column data matrices.
# A point whose squared Euclidean distance to every existing center
# exceeds `lambda` spawns a new cluster.
#
# Arguments:
#   data           - n-by-2 matrix; column 1 is x, column 2 is y.
#   lambda         - squared-distance penalty for opening a new cluster.
#   max_iterations - hard cap on full sweeps over the data.
#   tolerance      - stop when the drop in within-cluster sum of squares
#                    between sweeps falls below this value.
#
# Returns a Dict with keys :centers (a Dict of :x/:y center coordinate
# vectors), :assignments, :k, :iterations, :converged and :ss.
function dp_means(data, lambda, max_iterations, tolerance)
    n = size(data, 1)
    k = 1

    # Integer labels (the original used a Float64 vector, which cannot
    # index the center vectors in current Julia).
    assignments = ones(Int, n)

    col_means = mean(data, dims = 1)
    mu_x = [col_means[1]]
    mu_y = [col_means[2]]

    converged = false
    iteration = 0

    ss_old = Inf
    ss_new = Inf

    while !converged && iteration < max_iterations
        iteration += 1

        # Assignment step: attach each point to its nearest center, or
        # open a new cluster when even the nearest is beyond lambda.
        for i in 1:n
            distances = [(data[i, 1] - mu_x[j])^2 + (data[i, 2] - mu_y[j])^2
                         for j in 1:k]

            if minimum(distances) > lambda
                k += 1
                assignments[i] = k
                # push! replaces the removed [vector, scalar] concatenation.
                push!(mu_x, data[i, 1])
                push!(mu_y, data[i, 2])
            else
                # argmin = index of the first minimum, matching the old
                # find(distances .== min(distances))[1].
                assignments[i] = argmin(distances)
            end
        end

        # Update step: move each non-empty cluster to its members' centroid.
        # NOTE(review): clusters emptied by reassignment keep their stale
        # centers and still count toward k — preserved from the draft.
        for j in 1:k
            indices = findall(assignments .== j)
            if !isempty(indices)
                mu_x[j] = mean(data[indices, 1])
                mu_y[j] = mean(data[indices, 2])
            end
        end

        ss_old = ss_new
        ss_new = 0.0   # Float64 accumulator keeps the loop type-stable.

        for i in 1:n
            ss_new += (data[i, 1] - mu_x[assignments[i]])^2 +
                      (data[i, 2] - mu_y[assignments[i]])^2
        end

        ss_change = ss_old - ss_new

        # First sweep always has ss_change == Inf, so at least two run.
        if !isnan(ss_change) && ss_change < tolerance
            converged = true
        end
    end

    centers = Dict()
    centers[:x] = mu_x
    centers[:y] = mu_y

    results = Dict()
    results[:centers] = centers
    results[:assignments] = assignments
    results[:k] = k
    results[:iterations] = iteration
    results[:converged] = converged
    results[:ss] = ss_new

    results
end

# Defaults preserved from the original draft. NOTE(review): 10e-3 is 0.01,
# possibly a typo for 1e-3 — kept verbatim for backward compatibility.
dp_means(data, lambda) = dp_means(data, lambda, 1000, 10e-3)
@@ -0,0 +1,62 @@
# Lloyd's algorithm for k-means clustering.
#
# Arguments:
#   x - n-by-p data matrix, one observation per row.
#   k - number of clusters.
#
# Alternates center updates and nearest-center reassignment until the
# drop in residual sum of squares (see rss) falls below 10e-6 or 1000
# sweeps elapse. Returns a KMeansOutput carrying assignments, centers,
# iteration count and the final RSS.
function k_means(x, k)
    n = size(x, 1)
    p = size(x, 2)

    # Uniform random initial assignment. The original
    # int((k - 1) * rand(n)) + 1 relied on the removed int() and gave the
    # endpoint labels only half the probability of interior ones.
    y = rand(1:k, n)

    centers = zeros(k, p)

    iter = 0
    max_iter = 1000

    previous_rss = Inf
    current_rss = Inf
    delta_rss = Inf

    while delta_rss > 10e-6 && iter < max_iter
        # Update step: each center moves to the mean of its members.
        for j in 1:k
            indices = findall(y .== j)
            # Guard: an empty cluster keeps its previous center instead of
            # turning into NaNs (mean over an empty slice).
            if !isempty(indices)
                centers[j, :] = mean(x[indices, :], dims = 1)
            end
        end

        # Assignment step: move each point to its closest center.
        for i in 1:n
            distances = [norm(x[i, :] - centers[j, :]) for j in 1:k]
            y[i] = argmin(distances)
        end

        previous_rss = current_rss
        current_rss = rss(x, y, centers)
        delta_rss = previous_rss - current_rss

        iter += 1
    end

    results = KMeansOutput()
    results.assignments = y
    results.centers = centers
    results.iterations = iter
    results.rss = current_rss

    results
end
+
# Residual sum of squares: for every row of `x`, the squared Euclidean
# distance to the center of its assigned cluster, summed over all rows.
# `y[i]` is the cluster index of row i; `centers` holds one center per row.
function rss(x, y, centers)
    total = 0.0
    for row in 1:size(x, 1)
        deviation = x[row, :] - centers[y[row], :]
        total += norm(deviation)^2
    end
    total
end
@@ -0,0 +1,8 @@
# Result container for k_means. `mutable struct` replaces the pre-0.6
# `type` keyword (a syntax error in current Julia); it must stay mutable
# because k_means assigns the fields one by one after construction.
# Fields are intentionally untyped: the placeholder constructor seeds
# them with the integer 0 before k_means overwrites them.
mutable struct KMeansOutput
    assignments   # cluster label for each observation
    centers       # k-by-p matrix of cluster centers
    iterations    # number of sweeps performed
    rss           # final residual sum of squares
end

# Zero-initialized placeholder, filled in by k_means.
KMeansOutput() = KMeansOutput(0, 0, 0, 0)
No changes.

0 comments on commit 641ba8f

Please sign in to comment.