Skip to content

Commit

Permalink
Ensure we lowercase all tokens before testing against the stopwords, …
Browse files Browse the repository at this point in the history
…and reduce the number of conditionals per function call.
  • Loading branch information
Mike Amaral committed Apr 30, 2015
1 parent a29689b commit 6eedbb1
Showing 1 changed file with 14 additions and 4 deletions.
18 changes: 14 additions & 4 deletions lib/natural/stemmers/stemmer.js
Expand Up @@ -40,11 +40,21 @@ module.exports = function() {

/**
 * Tokenize a string, optionally drop stopwords, and stem what remains.
 *
 * The input is lowercased before tokenizing so that the stopword lookup
 * (an exact-match `indexOf` against `stopwords.words`) is not defeated by
 * capitalised input such as "The".
 *
 * @param {string} text - Raw text to process.
 * @param {boolean} [keepStops] - When truthy, stopwords are stemmed and
 *     returned like any other token; when falsy they are removed.
 * @returns {Array} The result of `stemmer.stem` for each retained token,
 *     in tokenizer order.
 */
stemmer.tokenizeAndStem = function(text, keepStops) {
    var tokens = new Tokenizer().tokenize(text.toLowerCase());

    // Decide once per call, not once per token, whether stopwords are dropped.
    if (!keepStops) {
        tokens = tokens.filter(function(token) {
            return stopwords.words.indexOf(token) === -1;
        });
    }

    // Wrap the call so `stem` receives only the token (not index/array)
    // and keeps `stemmer` as its receiver.
    return tokens.map(function(token) {
        return stemmer.stem(token);
    });
};
Expand Down

0 comments on commit 6eedbb1

Please sign in to comment.