Permalink
Browse files

Dice's Coefficient

  • Loading branch information...
1 parent c879f11 commit 9c0e3c51706ddde64a0f7f3ff54b277cc7be81c4 @seejohnrun seejohnrun committed Mar 23, 2012
Showing with 103 additions and 2 deletions.
  1. +5 −0 README.md
  2. +69 −0 lib/natural/distance/dice_coefficient.js
  3. +2 −1 lib/natural/index.js
  4. +1 −1 package.json
  5. +26 −0 spec/dice_coefficient_spec.js
View
@@ -73,6 +73,11 @@ Output:
2
0
+And Dice's coefficient:
+
+ var natural = require('natural');
+ console.log(natural.DiceCoefficient('thing', 'things'));
+
Stemmers
--------
@@ -0,0 +1,69 @@
+/*
+Copyright (c) 2011, John Crepezzi, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
/**
 * Get all of the adjacent letter pairs (bigrams) for a string.
 *
 * @param {string} str - The text to split into pairs.
 * @returns {string[]} The overlapping two-character substrings of `str`,
 *   in order. Empty when `str` has fewer than two characters.
 */
var letterPairs = function (str) {
  // Grow the array instead of pre-sizing it with `new Array(str.length - 1)`:
  // for an empty string that length is -1, which throws a RangeError.
  var pairs = [];
  for (var i = 0; i + 1 < str.length; i++) {
    pairs.push(str.substring(i, i + 2));
  }
  return pairs;
};
+
/**
 * Collect the letter pairs of every word in a string.
 *
 * The string is split on runs of whitespace and each word is paired
 * separately, so no pair spans a word boundary.
 *
 * @param {string} str - The text to break into words and pairs.
 * @returns {string[]} Every word's pairs, concatenated in word order.
 */
var wordLetterPairs = function (str) {
  var collected = [];
  str.split(/\s+/).forEach(function (word) {
    var wordPairs = letterPairs(word);
    for (var k = 0; k < wordPairs.length; k++) {
      collected.push(wordPairs[k]);
    }
  });
  return collected;
};
+
/**
 * Normalize a string before comparison: lower-case it, then strip leading
 * and trailing whitespace. Whitespace inside the string is left untouched.
 *
 * @param {string} str - The raw input.
 * @returns {string} The lower-cased, trimmed string.
 */
var sanitize = function (str) {
  var lowered = str.toLowerCase();
  var trimmed = lowered.replace(/^\s+|\s+$/g, '');
  return trimmed;
};
+
/**
 * Compare two strings with Dice's coefficient over their letter pairs.
 *
 * Both inputs are sanitized (lower-cased, trimmed) and broken into per-word
 * letter pairs. The score is `2 * shared / (count1 + count2)`, where `shared`
 * counts pairs common to both inputs, each occurrence matched at most once.
 *
 * @param {string} str1 - First string.
 * @param {string} str2 - Second string.
 * @returns {number} A similarity score from 0 (no shared pairs) to 1
 *   (identical sets of pairs).
 */
var compare = function (str1, str2) {
  var clean1 = sanitize(str1);
  var clean2 = sanitize(str2);
  var pairs1 = wordLetterPairs(clean1);
  var pairs2 = wordLetterPairs(clean2);
  var totalPairs = pairs1.length + pairs2.length;
  var i, pair;

  // When neither input yields a letter pair (e.g. single-character strings)
  // the formula would be 0 / 0 = NaN. Fall back to exact equality instead.
  if (totalPairs === 0) {
    return clean1 === clean2 ? 1 : 0;
  }

  // Count how many times each pair occurs in the second input. The object has
  // no prototype, so an arbitrary two-character key cannot hit an inherited
  // property.
  var remaining = Object.create(null);
  for (i = 0; i < pairs2.length; i++) {
    pair = pairs2[i];
    remaining[pair] = (remaining[pair] || 0) + 1;
  }

  // Each pair of the first input consumes one matching occurrence from the
  // second, so repeated pairs are only matched as often as both inputs
  // contain them.
  var intersection = 0;
  for (i = 0; i < pairs1.length; i++) {
    pair = pairs1[i];
    if (remaining[pair] > 0) {
      remaining[pair]--;
      intersection++;
    }
  }

  return 2 * intersection / totalPairs;
};
+
+module.exports = compare;
View
@@ -40,4 +40,5 @@ exports.SentenceAnalyzer = require('./analyzers/sentence_analyzer');
exports.stopwords = require('./util/stopwords').words;
exports.NGrams = require('./ngrams/ngrams');
exports.JaroWinklerDistance = require('./distance/jaro-winkler_distance');
-exports.LevenshteinDistance = require('./distance/levenshtein_distance');
+exports.LevenshteinDistance = require('./distance/levenshtein_distance');
+exports.DiceCoefficient = require('./distance/dice_coefficient');
View
@@ -1,6 +1,6 @@
{
"name": "natural",
- "description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance) facilities for node.",
+ "description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.",
"version": "0.1.4",
"homepage": "https://github.com/NaturalNode/natural",
"engines": {
@@ -0,0 +1,26 @@
+var dice = require('lib/natural/distance/dice_coefficient');
+
// Behavioral spec for the Dice's coefficient string comparison.
describe('dice', function () {

  it('should handle exact matches', function () {
    expect(dice('john', 'john')).toBe(1);
  });

  it('should handle total mis-matches', function () {
    expect(dice('john', 'matt')).toBe(0);
  });

  // Example from http://en.wikipedia.org/wiki/Dice's_coefficient
  // Pairs {ni, ig, gh, ht} vs {na, ac, ch, ht} share only "ht": 2 * 1 / 8.
  it('should handle a typical case', function () {
    expect(dice('night', 'nacht')).toBe(0.25);
  });

  it('should sanitize case', function () {
    expect(dice('night', 'NIGHT')).toBe(1);
  });

  it('should sanitize spacing', function () {
    // The inputs differ only in leading, trailing and repeated inner
    // whitespace, so this fails unless spacing is actually normalized.
    expect(dice('  the   space  ', 'the space')).toBe(1);
  });

});

0 comments on commit 9c0e3c5

Please sign in to comment.