V-measure (#126)

JuliaStats · Aug 20, 2018 · 7c76cac · 7c76cac
1 parent f153e4c
commit 7c76cac
Show file tree

Hide file tree

Showing 7 changed files with 152 additions and 2 deletions.
diff --git a/README.md b/README.md
@@ -35,6 +35,7 @@ Pkg.add("Clustering")
 - Silhouettes
 - Variation of Information
 - Rand index
+- V-Measure
 
 ## Resources
 

diff --git a/doc/source/validate.rst b/doc/source/validate.rst
@@ -3,8 +3,9 @@ Clustering Validation
 
 This package provides a variety of ways to validate or evaluate clustering results:
 
-.. toctree:: 
+.. toctree::
 
 	silhouette.rst
 	varinfo.rst
 	randindex.rst
+	vmeasure.rst
diff --git a/doc/source/vmeasure.rst b/doc/source/vmeasure.rst
@@ -0,0 +1,36 @@
+V-measure
+=============
+
+The V-measure is defined as the weighted harmonic mean of homogeneity :math:`h` and completeness :math:`c` of the clustering. Both measures can be expressed in terms of the mutual information and entropy measures of information theory.
+
+.. math::
+
+	V_\beta = (1+\beta)\frac{h \cdot c}{\beta \cdot h + c}
+
+Homogeneity :math:`h` is maximized when each cluster contains elements of as few different classes as possible. Completeness :math:`c` is maximized when all elements of each class are assigned to the same cluster.
+
+**References:**
+
+    Andrew Rosenberg and Julia Hirschberg, 2007. "V-Measure: A conditional entropy-based external cluster evaluation measure"
+
+The metric is implemented by the ``vmeasure`` function:
+
+.. function:: vmeasure(assign1, assign2; β = 1.0)
+
+	Compute V-measure value between two clustering assignments.
+
+	:param assign1: the vector of assignments for the first clustering.
+	:param assign2: the vector of assignments for the second clustering.
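+	:param β: the weight used in the harmonic mean of homogeneity and completeness (``β > 1`` weights completeness more strongly, ``β < 1`` weights homogeneity more strongly).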
+
+	:return: a V-measure value.
+
+.. function:: vmeasure(R, assign)
+
+    This method takes ``R``, an instance of ``ClusteringResult``, and the corresponding assignment vector ``assign`` as input, and computes V-measure value (see above).
+
+.. function:: vmeasure(R1, R2)
+
+    This method takes ``R1`` and ``R2`` (both are instances of ``ClusteringResult``) and computes V-measure value (see above).
+
+	It is equivalent to ``vmeasure(assignments(R1), assignments(R2))``.
diff --git a/src/Clustering.jl b/src/Clustering.jl
@@ -51,6 +51,9 @@ module Clustering
     # randindex
     randindex,
 
+    # V-measure
+    vmeasure,
+
     # hclust
     Hclust, hclust, cutree,
 
@@ -72,6 +75,7 @@ module Clustering
     include("silhouette.jl")
     include("randindex.jl")
     include("varinfo.jl")
+    include("vmeasure.jl")
 
     include("hclust.jl")
 

diff --git a/src/vmeasure.jl b/src/vmeasure.jl
@@ -0,0 +1,48 @@
+# V-measure of a contingency table.
+#
+# `A[i, j]` is the number of points that fall into row class `i` and column
+# cluster `j`. `β` is the nonnegative weight of the harmonic mean; an
+# `ArgumentError` is thrown when it is negative.
+#
+# Returns zero for an empty table and whenever the weighted harmonic mean is
+# undefined (`β*h + c == 0`), instead of propagating `NaN` from `0/0`.
+function _vmeasure(A::AbstractMatrix{<:Integer}; β::Real)
+    (β >= 0) || throw(ArgumentError("β should be nonnegative"))
+
+    N = sum(A)
+    iszero(N) && return 0.0
+
+    # NOTE(review): `entropy` is presumably StatsBase's `-Σ p*log(p)` applied to the
+    # raw counts; dividing by N and adding log(N) then gives the entropy of the
+    # normalized distribution -- confirm against the imported definition.
+    entA = entropy(A)
+    entArows = entropy(sum(A, dims=2))
+    entAcols = entropy(sum(A, dims=1))
+
+    # conditional entropies: joint entropy minus the entropy of one marginal
+    hck = (entA - entAcols)/N
+    hkc = (entA - entArows)/N
+    # marginal entropies of the rows and of the columns
+    hc = entArows/N + log(N)
+    hk = entAcols/N + log(N)
+
+    # Homogeneity; clamped because rounding may push the ratio slightly outside [0, 1]
+    h = iszero(hc) ? 1.0 : clamp(1.0 - hck/hc, 0.0, 1.0)
+    # Completeness
+    c = iszero(hk) ? 1.0 : clamp(1.0 - hkc/hk, 0.0, 1.0)
+
+    # V-measure: the weighted harmonic mean of h and c. When both terms of the
+    # denominator vanish the ratio would be 0/0, so the score is reported as zero.
+    denom = β*h + c
+    iszero(denom) && return zero(denom)
+    V_β = (1 + β)*h*c/denom
+    return V_β
+end
+
+# Label vector behind either accepted argument kind, selected by dispatch.
+_vmeasure_labels(assign::AbstractVector{<:Integer}) = assign
+_vmeasure_labels(R::ClusteringResult) = assignments(R)
+
+"""
+    vmeasure(assign1, assign2; β = 1.0)
+
+V-measure between two clustering assignments.
+
+`assign1` and `assign2` can be either `ClusteringResult` objects or
+assignments vectors (`AbstractVector{<:Integer}`). The labels do not have to
+start at 1: the contingency table spans the smallest to the largest label
+found in each assignment.
+
+The `β` parameter defines trade-off between _homogeneity_ and _completeness_:
+ * if `β` is greater than 1, _completeness_ is weighted more strongly,
+ * if `β` is less than 1, _homogeneity_ is weighted more strongly.
+
+An `ArgumentError` is thrown if `β` is negative.
+
+*Ref:* Andrew Rosenberg and Julia Hirschberg, 2007. "V-Measure: A conditional entropy-based external cluster evaluation measure"
+"""
+function vmeasure(assign1::Union{AbstractVector{<:Integer}, ClusteringResult},
+                  assign2::Union{AbstractVector{<:Integer}, ClusteringResult};
+                  β::Real = 1.0)
+    labels1 = _vmeasure_labels(assign1)
+    labels2 = _vmeasure_labels(assign2)
+    # Use the actual label span rather than `1:maximum(...)`, so that labels below 1
+    # (e.g. 0-based labels) are tabulated too. For labels >= 1 this only drops or
+    # keeps all-zero rows/columns, which presumably contribute nothing to the
+    # entropies -- confirm against `entropy`.
+    lo1, hi1 = extrema(labels1)
+    lo2, hi2 = extrema(labels2)
+    return _vmeasure(counts(labels1, labels2, (lo1:hi1, lo2:hi2)), β=β)
+end
diff --git a/test/runtests.jl b/test/runtests.jl
@@ -15,7 +15,8 @@ tests = ["seeding",
          "varinfo",
          "randindex",
          "hclust",
-         "mcl"]
+         "mcl",
+         "vmeasure"]
 
 println("Runing tests:")
 for t in tests

diff --git a/test/vmeasure.jl b/test/vmeasure.jl
@@ -0,0 +1,59 @@
+using Test
+using Random  # provides `Random.seed!`, so the file also runs on its own
+using Clustering
+
+@testset "V-measure" begin
+    @testset "reproducing fig.2" begin
+        # Tests are taken from the fig. 2 of the referenced paper:
+        # V-Measure: A conditional entropy-based external cluster evaluation measure,
+        # Andrew Rosenberg and Julia Hirschberg
+
+        # identical assignments must score exactly 1
+        clus = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3]
+        v = vmeasure(clus, clus)
+        @test v == 1.0
+
+        clas = [1, 1, 1, 2, 3, 3, 3, 3, 1, 2, 2, 2, 2, 1, 3]
+        v = vmeasure(clas, clus)
+        @test v ≈ 0.14 atol=1e-2
+
+        clas = [1, 1, 1, 2, 2, 3, 3, 3, 1, 1, 2, 2, 2, 3, 3]
+        v = vmeasure(clas, clus)
+        @test v ≈ 0.39 atol=1e-2
+
+        clus = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 4, 5, 5, 6, 6]
+        clas = [1, 1, 1, 2, 2, 3, 3, 3, 1, 1, 2, 2, 2, 3, 3, 1, 2, 3, 1, 2, 3]
+        v = vmeasure(clas, clus)
+        @test v ≈ 0.30 atol=1e-2
+
+        clus = [1, 1, 1, 1, 1, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 4, 5, 6, 7, 8, 9]
+        v = vmeasure(clas, clus)
+        @test v ≈ 0.41 atol=1e-2
+
+        # a negative β is rejected
+        @test_throws ArgumentError vmeasure(clas, clus, β = -1.0)
+    end
+
+    @testset "comparing 2 k-means clusterings" begin
+        Random.seed!(34568)
+        m = 3
+        n = 1000
+        k = 10
+        x = rand(m, n)
+
+        # non-weighted
+        r1 = kmeans(x, k; maxiter=50)
+        r2 = kmeans(x, k; maxiter=50)
+        v = vmeasure(r1, r2)
+        @test 0.5 < v < 1.0
+        @test_broken v ≈ 0.75 atol=1e-2 # FIXME why 0.75?
+    end
+
+    @testset "comparing 2 random label assignments" begin
+        Random.seed!(34568)
+        k = 10
+        n = 10000
+
+        a1 = rand(1:k, n)
+        a2 = rand(1:k, n)
+        v = vmeasure(a1, a2)
+        @test v ≈ 0.0 atol=1e-2 # should be close to zero
+    end
+end