Permalink
Browse files

Add k-medoids implem and silhouettes

  • Loading branch information...
1 parent 737dca8 commit 7efd434a6b521404a69c89c8d5b027d69aefe1e1 @lendle lendle committed Feb 5, 2014
Showing with 266 additions and 2 deletions.
  1. +35 −2 README.md
  2. +6 −0 src/Clustering.jl
  3. +176 −0 src/kmedoids.jl
  4. +25 −0 src/sil.jl
  5. +12 −0 test/kmedoids.jl
  6. +12 −0 test/sil.jl
View
@@ -13,15 +13,18 @@ Pkg.add("Clustering")
Currently working algorithms:
-* Kmeans
+* K-means
* Affinity Propagation
+* K-medoids
To be available:
-* K medoids
* DP means
* ISO Data
+# Performance evaluation
+
+For partitioning methods, silhouette widths can be calculated.
# Documentation
@@ -129,3 +132,33 @@ type AffinityPropagationResult
end
```
+## K medoids
+
+K-medoids is a partitioning algorithm like k-means, but cluster centers are observations
+in the data set(,the medoids,) instead of cluster means, and any distance metric can
+be used, not necessarily the Euclidean distance.
+
+The input of the algorithm is a dissimilarity matrix `dist`, which should be symmetric, and
+the number of clusters, `k`.
+
+Interfaces:
+
+```julia
+result = kmedoids(dist, k)
+```
+
+The method returns a `KmedoidsResult`:
+
+```julia
+type KmedoidsResult
+ medoids::Vector{Int} #indexes of medoids
+ assignments::Vector{Int} #cluster assignments
+end
+```
+
+## Silhouette widths
+
+[Silhouettes](http://en.wikipedia.org/wiki/Silhouette_%28clustering%29) are useful for evaluating
+the performance of a partitioning clustering algorithm or to help choose the number of clusters.
+The `silhouettes` function takes cluster assignments and a dissimilarity matrix and returns
+a vector containing silhouettes for each observation.
View
@@ -10,9 +10,15 @@ module Clustering
export AffinityPropagationOpts
export affinity_propagation
+ export kmedoids
+
+ export silhouettes
+
include("utils.jl")
include("seeding.jl")
include("kmeans.jl")
include("affprop.jl")
include("dbscan.jl")
+ include("kmedoids.jl")
+ include("sil.jl")
end
View
@@ -0,0 +1,176 @@
+type KmedoidsResult
+ medoids::Vector{Int} #indexes of medoids
+ assignments::Vector{Int} #cluster assignments
+end
+
+
+##################################################################
+## **Build** phase based on http://www.cs.umb.edu/cs738/pam1.pdf
+##################################################################
+
+function update_DE!(D, E, S, U, dist)
+ for i in S
+ D[i] = i
+ end
+ for i in collect(U)
+ (D[i], min_index) = findmin(dist[S, i])
+ #(E[i], _) = findmin(dist[[S[1:min_index-1], S[min_index+1:end]], i])
+
+ end
+end
+
+function update_SU!(D, E, S, U, dist)
+ best_g = -Inf
+ best_i = 0
+ i = 0
+ cU = collect(U) #faster than iterating over a set
+ for i in cU
+ g_i = 0.0
+ for j in cU
+ if i != j
+ #doing this in 2 steps is faster than in max(D[j]-dist[j,i]) for some reason
+ d = D[j] - dist[j,i]
+ g_i += max(d, 0.0)
+ end
+ end
+
+ if g_i > best_g
+ best_g = g_i
+ best_i = i
+ end
+ end
+
+ push!(S, i)
+ delete!(U, i)
+end
+
+function pam_build!(D, E, S, U, dist, k)
+ for i in 1:k-1
+ update_SU!(D, E, S, U, dist)
+ update_DE!(D, E, S, U, dist)
+ end
+end
+
+function pam_build(dist, k)
+ n = size(dist, 1)
+
+ best_sumdist = Inf
+ best_p = 0
+ for p in 1:n
+ sumdist = sum(view(dist, :, p))
+ if sumdist < best_sumdist
+ best_sumdist = sumdist
+ best_p = p
+ end
+ end
+ S = [best_p]
+ U = IntSet((1:best_p-1)..., (best_p+1:n)...)
+
+ D = dist[collect(S), :][:]
+ E = Inf*ones(n);
+
+ pam_build!(D, E, S, U, dist, k)
+
+ return sort!(S)
+end
+##################################################################
+## End build phase functions
+##################################################################
+
+
+##################################################################
+## **Swap** phase based on steps 2 to 5 in
+## [K-medois](http://en.wikipedia.org/wiki/K-medoids)
+## wikipedia page and
+## [Data Mining and Algorithms in R](http://bit.ly/1inWWWh)
+## wikibook
+## The version in the notes I used for the build phase is probably
+## more memory-efficient and possibly faster
+##################################################################
+
+function assign_to_medoids(medoid_indeces, dist)
+ n = size(dist, 1)
+ medoid_assignments = (Int => Vector{Int})[]
+
+ for i in 1:n
+ if !in(i, medoid_indeces)
+ min_dist = Inf
+ min_medoid = 0
+ for m in medoid_indeces
+ if dist[i,m] < min_dist
+ min_dist = dist[i,m]
+ min_medoid = m
+ end
+ end
+ if haskey(medoid_assignments, min_medoid)
+ push!(medoid_assignments[min_medoid], i)
+ else
+ medoid_assignments[min_medoid] = [i]
+ end
+ end
+ end
+ return medoid_assignments
+end
+
+function medoid_score(m, neighbors, dist)
+ score = 0.0
+ for n in neighbors
+ score += dist[n, m]
+ end
+ return score
+end
+
+function best_medoid(cluster, dist)
+ best = 0.0
+ best_score = Inf
+ for medoid in cluster
+ #can just pass cluster instead of neighbors because adding dist[m,m] does nothing
+ score = medoid_score(medoid, cluster, dist)
+ if score < best_score
+ best_score=score
+ best = medoid
+ end
+ end
+ return best
+end
+
+function update_medoids(medoids_and_neighbors, dist)
+ medoids = Array(Int, length(medoids_and_neighbors))
+ for (i,(m, neighbors)) in enumerate(medoids_and_neighbors)
+ medoids[i] = best_medoid([m, neighbors], dist)
+ end
+ return sort!(medoids)
+end
+
+function pam_swap(medoids, dist)
+ old_medoids = Int[]
+
+ medoids_and_neighbors = [Int => [Int]]
+ while medoids != old_medoids
+ medoids_and_neighbors = assign_to_medoids(medoids, dist)
+ old_medoids = medoids
+ medoids = update_medoids(medoids_and_neighbors, dist)
+ end
+
+ return medoids_and_neighbors
+end
+##################################################################
+## end swap phase functions
+##################################################################
+
+function kmedoids{R <: FloatingPoint}(dist::Matrix{R}, k::Int)
+ n = size(dist, 1)
+ @assert 2 < k < n
+ @assert issym(dist)
+
+ medoids = pam_build(dist, k)
+ medoids_and_neighbors = pam_swap(medoids, dist)
+
+ cluster_membership = zeros(Int, size(dist, 1))
+ for (i, (medoid, neighbors)) in enumerate(medoids_and_neighbors)
+ cluster_membership[[medoid, neighbors]] = i
+ medoids[i] = medoid
+ end
+ KmedoidsResult(medoids, cluster_membership)
+end
+
View
@@ -0,0 +1,25 @@
+
+function silhouettes{T<:FloatingPoint}(assignments::Vector{Int}, dist::Matrix{T})
+ n = size(dist, 1)
+ sils = Array(Float64, n)
+ k = size(unique(assignments), 1)
+
+ for i = 1:n
+ scores = zeros(Float64, k)
+ counts = zeros(Int, k)
+
+ for j in 1:n
+ m = assignments[j]
+ scores[m] += dist[j,i]
+ counts[m] += 1
+ end
+ scores ./= counts
+
+ a = scores[assignments[i]]
+ splice!(scores, assignments[i])
+ b = minimum(scores)
+ sils[i] = (b-a) / max(a,b)
+ end
+ sils
+end
+
View
@@ -0,0 +1,12 @@
+using Base.Test
+
+using Distance
+using Clustering
+
+
+x = rand(100, 500)
+dist = pairwise(Euclidean(), x)
+
+@test isa(kmedoids(dist, 3), Clustering.KmedoidsResult)
+
+@test_throws kmedoids(dist, 500)
View
@@ -0,0 +1,12 @@
+using Base.Test
+
+using Distance
+using Clustering
+
+
+x = rand(100, 500)
+dist = pairwise(Euclidean(), x)
+
+assignments = mod(rand(Int, 500), 4) .+ 1
+
+@test isa(silhouettes(assignments, dist), Vector)

0 comments on commit 7efd434

Please sign in to comment.