Skip to content

Commit

Permalink
Changed core function to approximate string matching
Browse files Browse the repository at this point in the history
  • Loading branch information
oscargus committed Dec 5, 2015
1 parent 6bc0b53 commit 3e8a2d9
Show file tree
Hide file tree
Showing 2 changed files with 60 additions and 86 deletions.
137 changes: 57 additions & 80 deletions src/main/java/net/sf/jabref/bibtex/DuplicateCheck.java
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,9 @@
import java.util.HashMap;
import java.util.HashSet;

import org.apache.commons.logging.Log;
import org.apache.commons.logging.LogFactory;

/**
* This class contains utility methods for checking whether entries are duplicates.
*/
Expand All @@ -47,6 +50,9 @@ public class DuplicateCheck {
// Extra weighting of those fields that are most likely to provide correct duplicate detection:
private static final HashMap<String, Double> fieldWeights = new HashMap<>();

private static final Log LOGGER = LogFactory.getLog(DuplicateCheck.class);


static {
DuplicateCheck.fieldWeights.put("author", 2.5);
DuplicateCheck.fieldWeights.put("editor", 2.5);
Expand Down Expand Up @@ -133,7 +139,7 @@ private static int compareSingleField(String field, BibtexEntry one, BibtexEntry
// Harmonise case:
String auth1 = AuthorList.fixAuthor_lastNameOnlyCommas(s1, false).replaceAll(" and ", " ").toLowerCase();
String auth2 = AuthorList.fixAuthor_lastNameOnlyCommas(s2, false).replaceAll(" and ", " ").toLowerCase();
double similarity = DuplicateCheck.correlateByWords(auth1, auth2, false);
double similarity = DuplicateCheck.correlateByWords(auth1, auth2);
if (similarity > 0.8) {
return EQUAL;
}
Expand All @@ -154,15 +160,15 @@ private static int compareSingleField(String field, BibtexEntry one, BibtexEntry
// and without dots:
s1 = s1.replaceAll("\\.", "").toLowerCase();
s2 = s2.replaceAll("\\.", "").toLowerCase();
double similarity = DuplicateCheck.correlateByWords(s1, s2, true);
double similarity = DuplicateCheck.correlateByWords(s1, s2);
if (similarity > 0.8) {
return EQUAL;
}
return NOT_EQUAL;
} else {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();
double similarity = DuplicateCheck.correlateByWords(s1, s2, false);
double similarity = DuplicateCheck.correlateByWords(s1, s2);
if (similarity > 0.8) {
return EQUAL;
}
Expand Down Expand Up @@ -219,13 +225,13 @@ public static BibtexEntry containsDuplicate(BibtexDatabase database, BibtexEntry
* harmonize their length. If false, use interpolation to harmonize the strings.
* @return a value in the interval [0, 1] indicating the degree of match.
*/
static double correlateByWords(String s1, String s2, boolean truncate) {
static double correlateByWords(String s1, String s2) {
String[] w1 = s1.split("\\s");
String[] w2 = s2.split("\\s");
int n = Math.min(w1.length, w2.length);
int misses = 0;
for (int i = 0; i < n; i++) {
double corr = DuplicateCheck.correlateStrings(w1[i], w2[i], truncate);
double corr = similarity(w1[i], w2[i]);
if (corr < 0.75) {
misses++;
}
Expand All @@ -234,88 +240,59 @@ static double correlateByWords(String s1, String s2, boolean truncate) {
return 1 - missRate;
}

private static double correlateStrings(String s1, String s2, boolean truncate) {
int minLength = Math.min(s1.length(), s2.length());
if (truncate && (minLength == 1)) {
return s1.charAt(0) == s2.charAt(0) ? 1.0 : 0.0;
} else if ((s1.length() == 1) && (s2.length() == 1)) {
return s1.equals(s2) ? 1.0 : 0.0;
} else if (minLength == 0) {
return s1.isEmpty() && s2.isEmpty() ? 1.0 : 0;
}

// Convert strings to numbers and harmonize length in a method dependent on truncate:
if (truncate) {
// Harmonize length by truncation:
if (s1.length() > minLength) {
s1 = s1.substring(0, minLength);
}
if (s2.length() > minLength) {
s2 = s2.substring(0, minLength);
}
}
double[] n1 = DuplicateCheck.numberizeString(s1);
double[] n2 = DuplicateCheck.numberizeString(s2);
// If truncation is disabled, harmonize length by interpolation:
if (!truncate) {
if (n1.length < n2.length) {
n1 = DuplicateCheck.stretchArray(n1, n2.length);
} else if (n2.length < n1.length) {
n2 = DuplicateCheck.stretchArray(n2, n1.length);
}
/**
* Calculates the similarity (a number between 0 and 1, inclusive) between two strings,
* based on their edit distance relative to the length of the longer string.
* http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
*/
private static double similarity(String s1, String s2) {
String longer = s1, shorter = s2;
if (s1.length() < s2.length()) { // longer should always have greater length
longer = s2;
shorter = s1;
}
return DuplicateCheck.corrCoef(n1, n2);
}
int longerLength = longer.length();
if (longerLength == 0) {
return 1.0;
/* both strings are zero length */ }
double sim = (longerLength - editDistance(longer, shorter)) / (double) longerLength;
LOGGER.debug("Longer string: " + longer + " Shorter string: " + shorter + " Similarity: " + sim);
return sim;

/**
 * Computes the correlation coefficient of two number sequences: the sum of the
 * products of the deviations from the respective means, divided by the product
 * of the square roots of the summed squared deviations.
 *
 * Both loops run over {@code n1.length} elements, so {@code n2} must hold at
 * least that many values.
 *
 * @param n1 the first sequence; its length drives the iteration
 * @param n2 the second sequence
 * @return the correlation coefficient, or 0 when either sequence has no spread
 *         (for example when it is constant or empty)
 */
private static double corrCoef(double[] n1, double[] n2) {
    final int count = n1.length;

    // First pass: accumulate the sums, then turn them into mean values.
    double avg1 = 0;
    double avg2 = 0;
    for (int k = 0; k < count; k++) {
        avg1 += n1[k];
        avg2 += n2[k];
    }
    avg1 /= n1.length;
    avg2 /= n2.length;

    // Second pass: squared deviations per sequence and their cross products.
    double squares1 = 0;
    double squares2 = 0;
    double cross = 0;
    for (int k = 0; k < count; k++) {
        double dev1 = n1[k] - avg1;
        double dev2 = n2[k] - avg2;
        squares1 += dev1 * dev1;
        squares2 += dev2 * dev2;
        cross += dev1 * dev2;
    }

    double spread1 = Math.sqrt(squares1);
    double spread2 = Math.sqrt(squares2);

    // Without spread in both sequences the quotient is undefined; report "no correlation".
    if (!(spread1 > 0) || !(spread2 > 0)) {
        return 0;
    }
    return cross / (spread1 * spread2);
}

/**
 * Converts a string into an array of numbers, one per character, where each
 * number is the numeric value of the corresponding {@code char}.
 *
 * @param s the string to convert
 * @return an array of the same length as {@code s} holding the character values
 */
private static double[] numberizeString(String s) {
    final char[] symbols = s.toCharArray();
    final double[] values = new double[symbols.length];
    int pos = 0;
    for (char symbol : symbols) {
        // Implicit widening: the char's numeric value becomes a double.
        values[pos++] = symbol;
    }
    return values;
}
/*
* Levenshtein Edit Distance
* http://stackoverflow.com/questions/955110/similarity-string-comparison-in-java
*/
private static int editDistance(String s1, String s2) {
s1 = s1.toLowerCase();
s2 = s2.toLowerCase();

private static double[] stretchArray(double[] array, int length) {
if ((length <= array.length) || (array.length == 0)) {
return array;
}
double multip = (double) array.length / (double) length;
double[] newArray = new double[length];
for (int i = 0; i < newArray.length; i++) {
double index = i * multip;
int baseInd = (int) Math.floor(index);
double dist = index - Math.floor(index);
newArray[i] = (dist * array[Math.min(array.length - 1, baseInd + 1)])
+ ((1.0 - dist) * array[baseInd]);
int[] costs = new int[s2.length() + 1];
for (int i = 0; i <= s1.length(); i++) {
int lastValue = i;
for (int j = 0; j <= s2.length(); j++) {
if (i == 0) {
costs[j] = j;
} else {
if (j > 0) {
int newValue = costs[j - 1];
if (s1.charAt(i - 1) != s2.charAt(j - 1)) {
newValue = Math.min(Math.min(newValue, lastValue), costs[j]) + 1;
}
costs[j - 1] = lastValue;
lastValue = newValue;
}
}
}
if (i > 0) {
costs[s2.length()] = lastValue;
}
}
return newArray;
LOGGER.debug("String 1: " + s1 + " String 2: " + s2 + " Distance: " + costs[s2.length()]);
return costs[s2.length()];
}


}
9 changes: 3 additions & 6 deletions src/test/java/net/sf/jabref/bibtex/DuplicateCheckTest.java
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,6 @@
import net.sf.jabref.model.entry.BibtexEntryTypes;
import org.junit.Assert;
import org.junit.Before;
import org.junit.Ignore;
import org.junit.Test;

import static org.junit.Assert.assertEquals;
Expand All @@ -27,7 +26,6 @@ public void setUp() {
}

@Test
@Ignore
public void testDuplicateDetection() {
BibtexEntry one = new BibtexEntry(IdGenerator.next(), BibtexEntryTypes.ARTICLE);

Expand All @@ -37,7 +35,6 @@ public void testDuplicateDetection() {
two.setField("author", "Billy Bob");
Assert.assertTrue(DuplicateCheck.isDuplicate(one, two));

//TODO algorithm thinks bob and joyce is the same with high accuracy
two.setField("author", "James Joyce");
Assert.assertFalse(DuplicateCheck.isDuplicate(one, two));

Expand Down Expand Up @@ -72,9 +69,9 @@ public void testWordCorrelation() {
String d2 = "Characterization of Calunus finmarchicus habitat in the North Sea";
String d3 = "Characterization of Calanus glacialissss habitat in the South Sea";

assertEquals(1.0, (DuplicateCheck.correlateByWords(d1, d2, false)), 0.01);
assertEquals(0.88, (DuplicateCheck.correlateByWords(d1, d3, false)), 0.01);
assertEquals(0.88, (DuplicateCheck.correlateByWords(d2, d3, false)), 0.01);
assertEquals(1.0, (DuplicateCheck.correlateByWords(d1, d2)), 0.01);
assertEquals(0.78, (DuplicateCheck.correlateByWords(d1, d3)), 0.01);
assertEquals(0.78, (DuplicateCheck.correlateByWords(d2, d3)), 0.01);
}

}

0 comments on commit 3e8a2d9

Please sign in to comment.