Browse files

implemented #132, possible fix for #125 as well.

  • Loading branch information...
1 parent 614aff6 commit 443692b069172fcd863f3d4c2f08f493532483f2 Ken Koch committed Mar 7, 2014
View
2 lib/natural/stemmers/stemmer_es.js
@@ -36,7 +36,7 @@ module.exports = function() {
new Tokenizer().tokenize(text).forEach(function(token) {
if (keepStops || stopwords.words.indexOf(token) == -1) {
var resultToken = token.toLowerCase();
- if (resultToken.match(new RegExp('[а-záéíóúüñ0-9]+', 'gi'))) {
+ if (resultToken.match(new RegExp('[a-záéíóúüñ0-9]+', 'gi'))) {
resultToken = stemmer.stem(resultToken);
}
stemmedTokens.push(resultToken);
View
2 lib/natural/tokenizers/aggressive_tokenizer_es.js
@@ -32,5 +32,5 @@ module.exports = AggressiveTokenizer;
AggressiveTokenizer.prototype.tokenize = function(text) {
// break a string up into an array of tokens by anything non-word
- return this.trim(text.split(/\W+/));
+ return this.trim(text.split(/[^a-zA-Zá-úÁ-ÚñÑüÜ]+/));
};
View
36 spec/aggressive_tokenizer_es_spec.js
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2011, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Tokenizer = require('../lib/natural/tokenizers/aggressive_tokenizer_es'),
+ tokenizer = new Tokenizer();
+
+describe('aggressive_tokenizer_es', function() {
+ it('should tokenize strings', function() {
+ expect(tokenizer.tokenize('hola yo me llamo eduardo y esudié ingeniería')).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']);
+ });
+
+ it('should tokenize strings via attached string method', function() {
+ tokenizer.attach();
+ expect('hola yo me llamo eduardo y esudié ingeniería'.tokenize()).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']);
+ });
+
+});

0 comments on commit 443692b

Please sign in to comment.