From 18c4affab762d868eb3af6483c86535b780adb5c Mon Sep 17 00:00:00 2001
From: Chris Umbel
Date: Mon, 13 Feb 2012 19:54:21 -0500
Subject: [PATCH] T handling

handleT now distinguishes four cases:

  * T followed by ION adds 'XN' (spec: 'nation').
  * subMatch(1, 3, ['IA', 'CH']) adds 'X' (spec: 'thatch').
  * TH / TTH adds 'T' when followed by OM or AM, or when the token starts
    with 'VAN ', 'VON ' or 'SCH' (spec: 'thomas'); otherwise it adds
    primary '0' with secondary 'T' (spec: 'this').
  * Anything else goes through addCompressedDouble('T'), and a directly
    following D is skipped (spec: 'matta', 'countdown').

The TTH lookahead is taken relative to pos. A fixed token.substring(1, 2)
is at most one character long and can never equal 'TH'.

---
 lib/natural/phonetics/double_metaphone.js | 20 ++++++++++++-
 spec/double_metaphone_spec.js             | 35 ++++++++++++++++++++++-
 2 files changed, 53 insertions(+), 2 deletions(-)

diff --git a/lib/natural/phonetics/double_metaphone.js b/lib/natural/phonetics/double_metaphone.js
index 52d695bcb..0e22c8725 100644
--- a/lib/natural/phonetics/double_metaphone.js
+++ b/lib/natural/phonetics/double_metaphone.js
@@ -123,8 +123,26 @@ function process(token, spyCallback) {
     }
 
     function handleT() {
-        if(token.substring(pos + 1, pos + 4) == 'ION' || subMatch(1, 3, ['IA', 'CH'])) {
+        if(token.substring(pos + 1, pos + 4) == 'ION') { // TION, e.g. 'nation'
+            add('XN');
+            pos += 3; // step over 'ION'
+        } else if(subMatch(1, 3, ['IA', 'CH'])) {
             add('X');
+            pos += 2;
+        } else if(token[pos + 1] == 'H'
+            || token.substring(pos + 1, pos + 3) == 'TH') { // TH or TTH, relative to pos
+            if(subMatch(2, 4, ['OM', 'AM'])
+                || ['VAN ', 'VON '].indexOf(token.substring(0, 4)) > -1
+                || token.substring(0, 3) == 'SCH') {
+                add('T'); // hard TH, e.g. 'thomas'
+            } else
+                addSecondary('0', 'T'); // soft TH, e.g. 'this'
+            pos++;
+        } else {
+            addCompressedDouble('T');
+
+            if(token[pos + 1] == 'D') // TD codes as a single T, e.g. 'countdown'
+                pos++;
         }
     }
 
diff --git a/spec/double_metaphone_spec.js b/spec/double_metaphone_spec.js
index 3de726489..00bbd8cdf 100644
--- a/spec/double_metaphone_spec.js
+++ b/spec/double_metaphone_spec.js
@@ -257,9 +257,42 @@ describe('double metaphone', function() {
     });
 
     describe('T', function() {
-        it('should encode T', function() {
+        it('should encode TION to XN', function() {
             var encodings = doubleMetaphone.process('nation');
             expect(encodings[0]).toMatch(/.*XN$/);
+            expect(encodings[1]).toMatch(/.*XN$/);
+        });
+
+        it('should encode CH sounds to X', function() {
+            var encodings = doubleMetaphone.process('thatch');
+            expect(encodings[0]).toMatch(/.*X$/);
+            expect(encodings[1]).toMatch(/.*X$/);
+        });
+
+        it('should encode hard TH to T', function() {
+            var encodings = doubleMetaphone.process('thomas');
+            expect(encodings[0]).toMatch(/^T.*/);
+            expect(encodings[1]).toMatch(/^T.*/);
+        });
+
+        it('should encode soft TH to 0,T', function() {
+            var encodings = doubleMetaphone.process('this');
+            expect(encodings[0]).toMatch(/^0.*/);
+            expect(encodings[1]).toMatch(/^T.*/);
+        });
+
+        it('should encode TT to T', function() {
+            var encodings = doubleMetaphone.process('matta');
+            expect(encodings[0]).toMatch(/[^T]T/);
+            expect(encodings[1]).toMatch(/[^T]T/);
+        });
+
+        it('should encode TD to T', function() {
+            var encodings = doubleMetaphone.process('countdown');
+            expect(encodings[0]).toContain('T');
+            expect(encodings[0]).toNotContain('D');
+            expect(encodings[1]).toContain('T');
+            expect(encodings[1]).toNotContain('D');
         });
     });
 