From dd4c205f7a7bd03f26d08e6796416538923fd819 Mon Sep 17 00:00:00 2001 From: Chris Umbel Date: Sun, 12 Feb 2012 21:13:49 -0500 Subject: [PATCH] added X handling --- lib/natural/phonetics/double_metaphone.js | 61 ++++++++++++++--------- spec/double_metaphone_spec.js | 24 ++++++++- 2 files changed, 59 insertions(+), 26 deletions(-) diff --git a/lib/natural/phonetics/double_metaphone.js b/lib/natural/phonetics/double_metaphone.js index 1e592cf52..5d0c952d5 100644 --- a/lib/natural/phonetics/double_metaphone.js +++ b/lib/natural/phonetics/double_metaphone.js @@ -54,6 +54,30 @@ function process(token, spyCallback) { add(encoded || c); } + function handleD() { + if(token.substring(pos, pos + 2) == 'DG') { + if(['I', 'E', 'Y'].indexOf(token[pos + 2]) > -1) { + add('J'); + pos += 2; + } else { + add('TK'); + pos++; + } + } else if(token.substring(pos, pos + 2) == 'DT') { + add('T'); + pos++; + } else + addCompressedDouble('D', 'T'); + } + + function handleH() { + // keep if starts a word or is surrounded by vowels + if((pos == 0 || isVowel(token[pos - 1])) && isVowel(token[pos + 1])) { + add('H'); + pos++; + } + } + function handleL() { if(token[pos + 1] == 'L') { if(pos == token.length - 3 && ( @@ -90,14 +114,6 @@ function process(token, spyCallback) { } } - function handleH() { - // keep if starts a word or is surrounded by vowels - if((pos == 0 || isVowel(token[pos - 1])) && isVowel(token[pos + 1])) { - add('H'); - pos++; - } - } - function handleR() { if(pos == token.length - 1 && !slavoGermanic && token.substring(pos - 2, pos) == 'IE' @@ -107,28 +123,22 @@ function process(token, spyCallback) { addCompressedDouble('R'); } - function handleD() { - if(token.substring(pos, pos + 2) == 'DG') { - if(['I', 'E', 'Y'].indexOf(token[pos + 2]) > -1) { - add('J'); - pos += 2; - } else { - add('TK'); - pos++; - } - } else if(token.substring(pos, pos + 2) == 'DT') { - add('T'); - pos++; - } else - addCompressedDouble('D', 'T'); + function handleX() { + if(pos == 0) { + add('S'); + } else if(!(pos == token.length - 1 + && (['IAU', 'EAU', 'IEU'].indexOf(token.substring(pos - 3, pos)) > -1 + || ['AU', 'OU'].indexOf(token.substring(pos - 2, pos)) > -1))) { + add('KS'); + } } function handleZ() { if(token[pos + 1] == 'H') { add('J'); pos++; - } else if(['ZO', 'ZI', 'ZA'].indexOf(token.substring(pos + 1, pos + 3)) > -1 || - (slavoGermanic && pos > 0 && token[pos - 1] != 'T')) { + } else if(['ZO', 'ZI', 'ZA'].indexOf(token.substring(pos + 1, pos + 3)) > -1 + || (slavoGermanic && pos > 0 && token[pos - 1] != 'T')) { addSecondary('S', 'TS'); pos++; } else @@ -186,6 +196,9 @@ function process(token, spyCallback) { case 'V': addCompressedDouble('V', 'F'); break; + case 'X': + handleX(); + break; case 'Z': handleZ(); break; diff --git a/spec/double_metaphone_spec.js b/spec/double_metaphone_spec.js index 096fad332..f444a0b92 100644 --- a/spec/double_metaphone_spec.js +++ b/spec/double_metaphone_spec.js @@ -249,7 +249,7 @@ describe('double metaphone', function() { expect(encodings[1]).toMatch(/^R.*/); }); - it('should ignore trailing french Rs', function() { + it('should ignore trailing French Rs', function() { var encodings = doubleMetaphone.process('papier'); expect(encodings[0]).toMatch(/.*[^R]$/); expect(encodings[1]).toMatch(/.*R$/); @@ -274,6 +274,26 @@ describe('double metaphone', function() { }); }); + describe('X', function() { + it('should encode X as S at start', function() { + var encodings = doubleMetaphone.process('xenophobia'); + expect(encodings[0]).toMatch(/^S.*/); + expect(encodings[1]).toMatch(/^S.*/); + }); + + it('should encode X as KS at end for non-French words', function() { + var encodings = doubleMetaphone.process('box'); + expect(encodings[0]).toMatch(/.*KS$/); + expect(encodings[1]).toMatch(/.*KS$/); + }); + + it('should skip X end for French words', function() { + var encodings = doubleMetaphone.process('lemieux'); + expect(encodings[0]).toNotMatch(/.*KS$/); + expect(encodings[1]).toNotMatch(/.*KS$/); + }); + }); + describe('Z', function() { it('should encode Z to S', function() { var encodings = doubleMetaphone.process('zookeeper'); @@ -281,7 +301,7 @@ describe('double metaphone', function() { expect(encodings[1]).toMatch(/^S.*$/); }); - it('should encode chinese ZH to J', function() { + it('should encode Chinese ZH to J', function() { var encodings = doubleMetaphone.process('zheng'); expect(encodings[0]).toMatch(/^J.*$/); expect(encodings[1]).toMatch(/^J.*$/);