Merge pull request #21 from rawls238/master

Add Jaccard + Rogers-Tanimoto Distances Fixes #10
JuliaStats · Dec 3, 2015 · 3dec24c · 3dec24c
2 parents 81cc6e4 + 6a310a4
commit 3dec24c
Show file tree

Hide file tree

Showing 4 changed files with 71 additions and 12 deletions.
diff --git a/README.md b/README.md
@@ -14,7 +14,9 @@ This package also provides optimized functions to compute column-wise and pairwi
 
 * Euclidean distance
 * Squared Euclidean distance
-* Cityblock distance 
+* Cityblock distance
+* Jaccard distance
+* Rogers-Tanimoto distance
 * Chebyshev distance
 * Minkowski distance
 * Hamming distance
@@ -47,7 +49,7 @@ Here, dist is an instance of a distance type. For example, the type for Euclidea
 
 ```julia
 r = evaluate(Euclidean(), x, y)
-``` 
+```
 
 Common distances also come with convenient functions for distance evaluation. For example, you may also compute Euclidean distance between two vectors as below
 
@@ -103,36 +105,38 @@ Please pay attention to the difference, the functions for inplace computation ar
 
 ## Distance type hierarchy
 
-The distances are organized into a type hierarchy. 
+The distances are organized into a type hierarchy.
 
 At the top of this hierarchy is an abstract class **PreMetric**, which is defined to be a function ``d`` that satisfies
 
 	d(x, x) == 0  for all x
 	d(x, y) >= 0  for all x, y
-	
+
 **SemiMetric** is an abstract type that refines **PreMetric**. Formally, a *semi-metric* is a *pre-metric* that is also symmetric, as
 
 	d(x, y) == d(y, x)  for all x, y
-	
+
 **Metric** is an abstract type that further refines **SemiMetric**. Formally, a *metric* is a *semi-metric* that also satisfies the triangle inequality, as
 
 	d(x, z) <= d(x, y) + d(y, z)  for all x, y, z
-	
+
 This type system has practical significance. For example, when computing pairwise distances between a set of vectors, you may only perform computation for half of the pairs, and derive the values immediately for the remaining half by leveraging the symmetry of *semi-metrics*.
 
 Each distance corresponds to a distance type. The type name and the corresponding mathematical definitions of the distances are listed in the following table.
 
-| type name            |  convenient syntax   | math definition     | 
+| type name            |  convenient syntax   | math definition     |
 | -------------------- | -------------------- | --------------------|
 |  Euclidean           |  euclidean(x, y)     | sqrt(sum((x - y) .^ 2)) |
 |  SqEuclidean         |  sqeuclidean(x, y)   | sum((x - y).^2) |
 |  Cityblock           |  cityblock(x, y)     | sum(abs(x - y)) |
 |  Chebyshev           |  chebyshev(x, y)     | max(abs(x - y)) |
 |  Minkowski           |  minkowski(x, y, p)  | sum(abs(x - y).^p) ^ (1/p) |
 |  Hamming             |  hamming(x, y)       | sum(x .!= y) |
+|  Rogers-Tanimoto     |  rogerstanimoto(x, y)| 2(sum(x&!y) + sum(!x&y)) / (2(sum(x&!y) + sum(!x&y)) + sum(x&y) + sum(!x&!y)) |
+|  Jaccard             |  jaccard(x, y)       | 1 - sum(min(x, y)) / sum(max(x, y)) |
 |  CosineDist          |  cosine_dist(x, y)   | 1 - dot(x, y) / (norm(x) * norm(y)) |
 |  CorrDist            |  corr_dist(x, y)     | cosine_dist(x - mean(x), y - mean(y)) |
-|  ChiSqDist           |  chisq_dist(x, y)    | sum((x - y).^2 / (x + y)) | 
+|  ChiSqDist           |  chisq_dist(x, y)    | sum((x - y).^2 / (x + y)) |
 |  KLDivergence        |  kl_divergence(x, y) | sum(x .* log(x ./ y)) |
 |  JSDivergence        |  js_divergence(x, y) | KL(x, m) / 2 + KL(y, m) / 2 with m = (x + y) / 2 |
 |  SpanNormDist        |  spannorm_dist(x, y) | max(x - y) - min(x - y ) |
@@ -145,7 +149,7 @@ Each distance corresponds to a distance type. The type name and the correspondin
 |  WeightedCityblock   |  cityblock(x, y, w)      | sum(abs(x - y) .* w)  |
 |  WeightedMinkowski   |  minkowski(x, y, w, p)   | sum(abs(x - y).^p .* w) ^ (1/p)  |
 |  WeightedHamming     |  hamming(x, y, w)        | sum((x .!= y) .* w)  |
-  
+
 **Note:** The formulas above are using *Julia*'s functions. These formulas are mainly for conveying the math concepts in a concise way. The actual implementation may use a faster way.
 
 

diff --git a/src/Distances.jl b/src/Distances.jl
@@ -24,6 +24,8 @@ export
     Cityblock,
     Chebyshev,
     Minkowski,
+    Jaccard,
+    RogersTanimoto,
 
     Hamming,
     CosineDist,
@@ -47,6 +49,8 @@ export
     euclidean,
     sqeuclidean,
     cityblock,
+    jaccard,
+    rogerstanimoto,
     chebyshev,
     minkowski,
     mahalanobis,

diff --git a/src/metrics.jl b/src/metrics.jl
@@ -10,6 +10,8 @@ type Euclidean <: Metric end
 type SqEuclidean <: SemiMetric end
 type Chebyshev <: Metric end
 type Cityblock <: Metric end
+# Jaccard distance: 1 - sum(min(x, y)) / sum(max(x, y)).
+type Jaccard <: Metric end
+# Rogers-Tanimoto distance, built from the counts of agreeing and disagreeing
+# positions of two arrays (the `rogerstanimoto` helper accepts Bool arrays).
+type RogersTanimoto <: Metric end
 
 immutable Minkowski{T <: Real} <: Metric
     p::T
@@ -26,7 +28,7 @@ type JSDivergence <: SemiMetric end
 
 type SpanNormDist <: SemiMetric end
 
-typealias UnionMetrics @compat(Union{Euclidean, SqEuclidean, Chebyshev, Cityblock, Minkowski, Hamming, CosineDist, CorrDist, ChiSqDist, KLDivergence, JSDivergence, SpanNormDist})
+typealias UnionMetrics @compat(Union{Euclidean, SqEuclidean, Chebyshev, Cityblock, Minkowski, Hamming, Jaccard, RogersTanimoto, CosineDist, CorrDist, ChiSqDist, KLDivergence, JSDivergence, SpanNormDist})
 
 ###########################################################
 #
@@ -155,7 +157,7 @@ js_divergence(a::AbstractArray, b::AbstractArray) = evaluate(JSDivergence(), a,
 
 # SpanNormDist
 function eval_start(::SpanNormDist, a::AbstractArray, b::AbstractArray)
-    a[1] - b[1], a[1]- b[1]
+    a[1] - b[1], a[1] - b[1]
 end
 @compat @inline eval_op(::SpanNormDist, ai, bi)  = ai - bi
 @compat @inline function eval_reduce(::SpanNormDist, s1, s2)
@@ -175,6 +177,46 @@ function result_type{T1, T2}(dist::SpanNormDist, ::AbstractArray{T1}, ::Abstract
 end
 
 
+# Jaccard
+
+# Starting accumulator for Jaccard: a pair of zero running sums
+# (first slot collects the minima, second slot collects the maxima).
+@compat @inline function eval_start(::Jaccard, a::AbstractArray, b::AbstractArray)
+    return 0, 0
+end
+# Per-element contribution for Jaccard: the smaller of the two values goes to
+# the first slot, the larger one to the second slot.
+@compat @inline function eval_op(::Jaccard, s1, s2)
+    lo = min(s1, s2)
+    hi = max(s1, s2)
+    return lo, hi
+end
+# Merge two Jaccard partial results by adding them slot by slot.
+@compat @inline function eval_reduce(::Jaccard, s1, s2)
+    return s1[1] + s2[1], s1[2] + s2[2]
+end
+# Turn the accumulated pair into the final value: one minus the ratio of the
+# first slot to the second slot.
+@compat @inline function eval_end(::Jaccard, a)
+    ratio = a[1] / a[2]
+    return 1 - ratio
+end
+# Convenience wrapper: evaluate the Jaccard distance between `a` and `b`.
+function jaccard(a::AbstractArray, b::AbstractArray)
+    return evaluate(Jaccard(), a, b)
+end
+
+# RogersTanimoto
+
+# Starting accumulator for Rogers-Tanimoto: four zero tallies, one per
+# combination of the two element values (tt, tf, ft, ff).
+@compat @inline function eval_start(::RogersTanimoto, a::AbstractArray, b::AbstractArray)
+    return 0, 0, 0, 0
+end
+# Classify one pair of elements into the four indicators
+# (true/true, true/false, false/true, false/false).
+@compat @inline function eval_op(::RogersTanimoto, s1, s2)
+    if s1
+        # first element is true: only the tt / tf indicators can be set
+        return s2, !s2, false, false
+    else
+        # first element is false: only the ft / ff indicators can be set
+        return false, false, s2, !s2
+    end
+end
+# Merge two (tt, tf, ft, ff) tallies by adding them slot by slot.
+# `s1` and `s2` are 4-element partial results; the returned tuple has the
+# same layout.
+@compat @inline function eval_reduce(::RogersTanimoto, s1, s2)
+    a = s1[1] + s2[1]
+    b = s1[2] + s2[2]
+    c = s1[3] + s2[3]
+    # The false/false tally must fold in s2[4]; adding s1[4] to itself would
+    # leave this slot stuck at its initial value and inflate the distance.
+    d = s1[4] + s2[4]
+    a, b, c, d
+end
+# Turn the (tt, tf, ft, ff) tallies into the final value: twice the number of
+# disagreeing positions divided by that same quantity plus the agreeing ones.
+@compat @inline function eval_end(::RogersTanimoto, a)
+    disagree = 2(a[2] + a[3])
+    agree = a[1] + a[4]
+    return disagree / (agree + disagree)
+end
+# Convenience wrapper: evaluate the Rogers-Tanimoto distance between two
+# Bool arrays `a` and `b`.
+function rogerstanimoto{T <: Bool}(a::AbstractArray{T}, b::AbstractArray{T})
+    return evaluate(RogersTanimoto(), a, b)
+end
+
 ###########################################################
 #
 #  Special method
@@ -227,6 +269,7 @@ function pairwise!(r::AbstractMatrix, dist::Euclidean, a::AbstractMatrix, b::Abs
     end
     r
 end
+
 function pairwise!(r::AbstractMatrix, dist::Euclidean, a::AbstractMatrix)
     m, n = get_pairwise_dims(r, a)
     At_mul_B!(r, a, a)
@@ -245,6 +288,7 @@ function pairwise!(r::AbstractMatrix, dist::Euclidean, a::AbstractMatrix)
 end
 
 # CosineDist
+
 function pairwise!(r::AbstractMatrix, dist::CosineDist, a::AbstractMatrix, b::AbstractMatrix)
     m, na, nb = get_pairwise_dims(r, a, b)
     At_mul_B!(r, a, b)

diff --git a/test/test_dists.jl b/test/test_dists.jl
@@ -32,6 +32,10 @@ b = 2
 @test hamming(a, a) == 0
 @test hamming(a, b) == 1
 
+bt = [true, false, true]
+bf = [false, true, true]
+@test rogerstanimoto(bt, bt) == 0
+@test rogerstanimoto(bt, bf) == 4./5
 
 
 p = rand(12)
@@ -47,6 +51,9 @@ for (x, y) in (([4., 5., 6., 7.], [3., 9., 8., 1.]),
     @test euclidean(x, x) == 0.
     @test euclidean(x, y) == sqrt(57.)
 
+    @test jaccard(x, x) == 0
+    @test jaccard(x, y) == 13./28
+
     @test cityblock(x, x) == 0.
     @test cityblock(x, y) == 13.
 
@@ -56,11 +63,11 @@ for (x, y) in (([4., 5., 6., 7.], [3., 9., 8., 1.]),
     @test minkowski(x, x, 2) == 0.
     @test minkowski(x, y, 2) == sqrt(57.)
 
-
     @test_approx_eq_eps cosine_dist(x, x) 0.0 1.0e-12
     @test_throws DimensionMismatch cosine_dist(1.:2, 1.:3)
     @test_approx_eq_eps cosine_dist(x, y) (1.0 - 112. / sqrt(19530.)) 1.0e-12
 
+
     @test_approx_eq_eps corr_dist(x, x) 0. 1.0e-12
     @test_approx_eq corr_dist(x, y) cosine_dist(x .- mean(x), vec(y) .- mean(y))