Changed core function to approximate string matching

JabRef · Dec 5, 2015 · 3e8a2d9 · 3e8a2d9
1 parent 6bc0b53
commit 3e8a2d9
Show file tree

Hide file tree

Showing 2 changed files with 60 additions and 86 deletions.
diff --git a/src/main/java/net/sf/jabref/bibtex/DuplicateCheck.java b/src/main/java/net/sf/jabref/bibtex/DuplicateCheck.java
@@ -22,6 +22,9 @@
 import java.util.HashMap;
 import java.util.HashSet;
 
+import org.apache.commons.logging.Log;
+import org.apache.commons.logging.LogFactory;
+
 /**
  * This class contains utility method for duplicate checking of entries.
  */
@@ -47,6 +50,9 @@ public class DuplicateCheck {
     // Extra weighting of those fields that are most likely to provide correct duplicate detection:
     private static final HashMap<String, Double> fieldWeights = new HashMap<>();
 
+    private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class);
+
+
     static {
         DuplicateCheck.fieldWeights.put("author", 2.5);
         DuplicateCheck.fieldWeights.put("editor", 2.5);
@@ -133,7 +139,7 @@ private static int compareSingleField(String field, BibtexEntry one, BibtexEntry
             // Harmonise case:
             String auth1 = AuthorList.fixAuthor_lastNameOnlyCommas(s1, false).replaceAll(" and ", " ").toLowerCase();
             String auth2 = AuthorList.fixAuthor_lastNameOnlyCommas(s2, false).replaceAll(" and ", " ").toLowerCase();
-            double similarity = DuplicateCheck.correlateByWords(auth1, auth2, false);
+            double similarity = DuplicateCheck.correlateByWords(auth1, auth2);
             if (similarity > 0.8) {
                 return EQUAL;
             }
@@ -154,15 +160,15 @@ private static int compareSingleField(String field, BibtexEntry one, BibtexEntry
             // and without dots:
             s1 = s1.replaceAll("\\.", "").toLowerCase();
             s2 = s2.replaceAll("\\.", "").toLowerCase();
-            double similarity = DuplicateCheck.correlateByWords(s1, s2, true);
+            double similarity = DuplicateCheck.correlateByWords(s1, s2);
             if (similarity > 0.8) {
                 return EQUAL;
             }
             return NOT_EQUAL;
         } else {
             s1 = s1.toLowerCase();
             s2 = s2.toLowerCase();
-            double similarity = DuplicateCheck.correlateByWords(s1, s2, false);
+            double similarity = DuplicateCheck.correlateByWords(s1, s2);
             if (similarity > 0.8) {
                 return EQUAL;
             }
@@ -219,13 +225,13 @@ public static BibtexEntry containsDuplicate(BibtexDatabase database, BibtexEntry
      *                 harmonize their length. If false, use interpolation to harmonize the strings.
      * @return a value in the interval [0, 1] indicating the degree of match.
      */
-    static double correlateByWords(String s1, String s2, boolean truncate) {
+    static double correlateByWords(String s1, String s2) {
         String[] w1 = s1.split("\\s");
         String[] w2 = s2.split("\\s");
         int n = Math.min(w1.length, w2.length);
         int misses = 0;
         for (int i = 0; i < n; i++) {
-            double corr = DuplicateCheck.correlateStrings(w1[i], w2[i], truncate);
+            double corr = similarity(w1[i], w2[i]);
             if (corr < 0.75) {
                 misses++;
             }
@@ -234,88 +240,59 @@ static double correlateByWords(String s1, String s2, boolean truncate) {
         return 1 - missRate;
     }
 
-    private static double correlateStrings(String s1, String s2, boolean truncate) {
-        int minLength = Math.min(s1.length(), s2.length());
-        if (truncate && (minLength == 1)) {
-            return s1.charAt(0) == s2.charAt(0) ? 1.0 : 0.0;
-        } else if ((s1.length() == 1) && (s2.length() == 1)) {
-            return s1.equals(s2) ? 1.0 : 0.0;
-        } else if (minLength == 0) {
-            return s1.isEmpty() && s2.isEmpty() ? 1.0 : 0;
-        }
 
-        // Convert strings to numbers and harmonize length in a method dependent on truncate:
-        if (truncate) {
-            // Harmonize length by truncation:
-            if (s1.length() > minLength) {
-                s1 = s1.substring(0, minLength);
-            }
-            if (s2.length() > minLength) {
-                s2 = s2.substring(0, minLength);
-            }
-        }
-        double[] n1 = DuplicateCheck.numberizeString(s1);
-        double[] n2 = DuplicateCheck.numberizeString(s2);
-        // If truncation is disabled, harmonize length by interpolation:
-        if (!truncate) {
-            if (n1.length < n2.length) {
-                n1 = DuplicateCheck.stretchArray(n1, n2.length);
-            } else if (n2.length < n1.length) {
-                n2 = DuplicateCheck.stretchArray(n2, n1.length);
-            }
+    /**
+     * Calculates the similarity (a number within 0 and 1) between two strings.
+     * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
+     */
+    private static double similarity(String s1, String s2) {
+        String longer = s1, shorter = s2;
+        if (s1.length() < s2.length()) { // longer should always have greater length
+            longer = s2;
+            shorter = s1;
         }
-        return DuplicateCheck.corrCoef(n1, n2);
-    }
+        int longerLength = longer.length();
+        if (longerLength == 0) {
+            return 1.0;
+            /* both strings are zero length */ }
+        double sim = (longerLength - editDistance(longer, shorter)) / (double) longerLength;
+        LOGGER.debug("Longer string: " + longer + " Shorter string: " + shorter + " Similarity: " + sim);
+        return sim;
 
-    private static double corrCoef(double[] n1, double[] n2) {
-        // Calculate mean values:
-        double mean1 = 0;
-        double mean2 = 0;
-        for (int i = 0; i < n1.length; i++) {
-            mean1 += n1[i];
-            mean2 += n2[i];
-        }
-        mean1 /= n1.length;
-        mean2 /= n2.length;
-        double sigma1 = 0;
-        double sigma2 = 0;
-        // Calculate correlation coefficient:
-        double corr = 0;
-        for (int i = 0; i < n1.length; i++) {
-            sigma1 += (n1[i] - mean1) * (n1[i] - mean1);
-            sigma2 += (n2[i] - mean2) * (n2[i] - mean2);
-            corr += (n1[i] - mean1) * (n2[i] - mean2);
-        }
-        sigma1 = Math.sqrt(sigma1);
-        sigma2 = Math.sqrt(sigma2);
-        if ((sigma1 > 0) && (sigma2 > 0)) {
-            return corr / (sigma1 * sigma2);
-        }
-        return 0;
     }
 
-    private static double[] numberizeString(String s) {
-        double[] res = new double[s.length()];
-        for (int i = 0; i < s.length(); i++) {
-            res[i] = s.charAt(i);
-        }
-        return res;
-    }
+    /*
+    * Levenshtein Edit Distance
+    * http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
+    */
+    private static int editDistance(String s1, String s2) {
+        s1 = s1.toLowerCase();
+        s2 = s2.toLowerCase();
 
-    private static double[] stretchArray(double[] array, int length) {
-        if ((length <= array.length) || (array.length == 0)) {
-            return array;
-        }
-        double multip = (double) array.length / (double) length;
-        double[] newArray = new double[length];
-        for (int i = 0; i < newArray.length; i++) {
-            double index = i * multip;
-            int baseInd = (int) Math.floor(index);
-            double dist = index - Math.floor(index);
-            newArray[i] = (dist * array[Math.min(array.length - 1, baseInd + 1)])
-                    + ((1.0 - dist) * array[baseInd]);
+        int[] costs = new int[s2.length() + 1];
+        for (int i = 0; i <= s1.length(); i++) {
+            int lastValue = i;
+            for (int j = 0; j <= s2.length(); j++) {
+                if (i == 0) {
+                    costs[j] = j;
+                } else {
+                    if (j > 0) {
+                        int newValue = costs[j - 1];
+                        if (s1.charAt(i - 1) != s2.charAt(j - 1)) {
+                            newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
+                        }
+                        costs[j - 1] = lastValue;
+                        lastValue = newValue;
+                    }
+                }
+            }
+            if (i > 0) {
+                costs[s2.length()] = lastValue;
+            }
         }
-        return newArray;
+        LOGGER.debug("String 1: " + s1 + " String 2: " + s2 + " Distance: " + costs[s2.length()]);
+        return costs[s2.length()];
     }
 
+
 }
diff --git a/src/test/java/net/sf/jabref/bibtex/DuplicateCheckTest.java b/src/test/java/net/sf/jabref/bibtex/DuplicateCheckTest.java
@@ -7,7 +7,6 @@
 import net.sf.jabref.model.entry.BibtexEntryTypes;
 import org.junit.Assert;
 import org.junit.Before;
-import org.junit.Ignore;
 import org.junit.Test;
 
 import static org.junit.Assert.assertEquals;
@@ -27,7 +26,6 @@ public void setUp() {
     }
 
     @Test
-    @Ignore
     public void testDuplicateDetection() {
         BibtexEntry one = new BibtexEntry(IdGenerator.next(), BibtexEntryTypes.ARTICLE);
 
@@ -37,7 +35,6 @@ public void testDuplicateDetection() {
         two.setField("author", "Billy Bob");
         Assert.assertTrue(DuplicateCheck.isDuplicate(one, two));
 
-        //TODO algorithm thinks bob and joyce is the same with high accuracy
         two.setField("author", "James Joyce");
         Assert.assertFalse(DuplicateCheck.isDuplicate(one, two));
 
@@ -72,9 +69,9 @@ public void testWordCorrelation() {
         String d2 = "Characterization of Calunus finmarchicus habitat in the North Sea";
         String d3 = "Characterization of Calanus glacialissss habitat in the South Sea";
 
-        assertEquals(1.0, (DuplicateCheck.correlateByWords(d1, d2, false)), 0.01);
-        assertEquals(0.88, (DuplicateCheck.correlateByWords(d1, d3, false)), 0.01);
-        assertEquals(0.88, (DuplicateCheck.correlateByWords(d2, d3, false)), 0.01);
+        assertEquals(1.0, (DuplicateCheck.correlateByWords(d1, d2)), 0.01);
+        assertEquals(0.78, (DuplicateCheck.correlateByWords(d1, d3)), 0.01);
+        assertEquals(0.78, (DuplicateCheck.correlateByWords(d2, d3)), 0.01);
     }
 
 }