From 6eedbb16a2da6c72f739a7e96aeb23b970ed93b9 Mon Sep 17 00:00:00 2001
From: Mike Amaral
Date: Thu, 30 Apr 2015 11:28:09 -0400
Subject: [PATCH] Ensure we lowercase all tokens before testing against the stopwords, and reduce the number of conditionals per function call.

---
 lib/natural/stemmers/stemmer.js | 18 ++++++++++++++----
 1 file changed, 14 insertions(+), 4 deletions(-)

diff --git a/lib/natural/stemmers/stemmer.js b/lib/natural/stemmers/stemmer.js
index f1c2d1777..12f4cca59 100644
--- a/lib/natural/stemmers/stemmer.js
+++ b/lib/natural/stemmers/stemmer.js
@@ -40,11 +40,21 @@ module.exports = function() {
 
     stemmer.tokenizeAndStem = function(text, keepStops) {
         var stemmedTokens = [];
-
-        new Tokenizer().tokenize(text).forEach(function(token) {
-            if(keepStops || stopwords.words.indexOf(token) == -1)
+        var lowercaseText = text.toLowerCase();
+        var tokens = new Tokenizer().tokenize(lowercaseText);
+
+        if (keepStops) {
+            tokens.forEach(function(token) {
                 stemmedTokens.push(stemmer.stem(token));
-        });
+            });
+        }
+
+        else {
+            tokens.forEach(function(token) {
+                if (stopwords.words.indexOf(token) == -1)
+                    stemmedTokens.push(stemmer.stem(token));
+            });
+        }
 
         return stemmedTokens;
     };