Permalink
Browse files

Merge pull request #77 from dav009/master

Added Stemmer for Spanish
  • Loading branch information...
2 parents f703081 + 9b38de2 commit c82c0969000ea2963c6ca33d42be0ad6c8c23161 @chrisumbel chrisumbel committed Nov 20, 2012
View
@@ -27,10 +27,12 @@ exports.SoundExDM = require('./phonetics/dm_soundex');
exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerFa = require('./stemmers/porter_stemmer_fa');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
+exports.PorterStemmerEs = require('./stemmers/porter_stemmer_es');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
exports.AggressiveTokenizerFa = require('./tokenizers/aggressive_tokenizer_fa');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
+exports.AggressiveTokenizerEs = require('./tokenizers/aggressive_tokenizer_es');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
@@ -0,0 +1,222 @@
+/*
+Copyright (c) 2012, David Przybilla, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Stemmer = require('./stemmer_es');
+
+var PorterStemmer = new Stemmer();
+module.exports = PorterStemmer;
+
+
+function isVowel(letter){
+ return (letter == 'a' || letter == 'e' || letter == 'i' || letter == 'o' || letter == 'u' || letter == 'á' || letter == 'é' ||
+ letter == 'í' || letter == 'ó' || letter == 'ú');
+};
+
+function getNextVowelPos(token,start){
+ length=token.length
+ for (var i = start; i < length; i++)
+ if (isVowel(token[i])) return i;
+ return length;
+};
+
+function getNextConsonantPos(token,start){
+ length=token.length
+ for (var i = start; i < length; i++)
+ if (!isVowel(token[i])) return i;
+ return length;
+};
+
+
+function endsin(token, suffix) {
+ if (token.length < suffix.length) return false;
+ return (token.slice(-suffix.length) == suffix);
+};
+
+function endsinArr(token, suffixes) {
+ for(var i=0;i<suffixes.length;i++){
+ if (endsin(token, suffixes[i])) return suffixes[i];
+ }
+ return '';
+};
+
+function removeAccent(token) {
+ var str=token.replace(/á/gi,'a');
+ str=str.replace(/é/gi,'e');
+ str=str.replace(/í/gi,'i');
+ str=str.replace(/ó/gi,'o');
+ str=str.replace(/ú/gi,'u');
+ return str;
+};
+
+
+// perform full stemming algorithm on a single word
+PorterStemmer.stem = function(token) {
+ token = token.toLowerCase();
+
+ if (token.length<3){
+ return token;
+ }
+
+ var r1,r2,rv,len= token.length;
+ //looking for regions after vowels
+
+ for(var i=0; i< token.length-1 && r1==len;i++){
+ if(isVowel(token[i]) && !isVowel(token[i+1]) ){
+ r1=i+2;
+ }
+
+ }
+
+ for(var i=r1; i< token.length-1 && r2==len;i++){
+ if(isVowel(token[i]) && !isVowel(token[i+1])){
+ r2=i+2;
+ }
+ }
+
+ if (len > 3) {
+ if(isVowel(token[1])) {
+ // If the second letter is a consonant, RV is the region after the next following vowel
+ rv = getNextVowelPos(token, 2) +1;
+ } else if (isVowel(token[0]) && isVowel(token[1])) {
+ // or if the first two letters are vowels, RV is the region after the next consonant
+ rv = getNextConsonantPos(token, 2) + 1;
+ } else {
+ //otherwise (consonant-vowel case) RV is the region after the third letter. But RV is the end of the word if these positions cannot be found.
+ rv = 3;
+ }
+ }
+
+ var r1_txt = token.substring(r1-1);
+ var r2_txt = token.substring(r2-1);
+ var rv_txt = token.substring(rv-1);
+
+
+ var token_orig = token;
+
+ // Step 0: Attached pronoun
+ var pronoun_suf = new Array('me', 'se', 'sela', 'selo', 'selas', 'selos', 'la', 'le', 'lo', 'las', 'les', 'los', 'nos');
+ var pronoun_suf_pre1 = new Array('éndo', 'ándo', 'ár', 'ér', 'ír');
+ var pronoun_suf_pre2 = new Array('ando', 'iendo', 'ar', 'er', 'ir');
+ var suf = endsinArr(token, pronoun_suf);
+
+
+ if (suf!='') {
+
+ var pre_suff = endsinArr(rv_txt.slice(0,-suf.length),pronoun_suf_pre1);
+
+ if (pre_suff != '') {
+
+ token = removeAccent(token.slice(0,-suf.length));
+ } else {
+ var pre_suff = endsinArr(rv_txt.slice(0,-suf.length),pronoun_suf_pre2);
+
+ if (pre_suff != '' ||
+ (endsin(token, 'yendo' ) &&
+ (token.slice(-suf.length-6,1) == 'u'))) {
+ token = token.slice(0,-suf.length);
+ }
+ }
+ }
+
+ if (token != token_orig) {
+ r1_txt = token.substring(r1-1);
+ r2_txt = token.substring(r2-1);
+ rv_txt = token.substring(rv-1);
+ }
+ var token_after0 = token;
+
+ if ((suf = endsinArr(r2_txt, new Array('anza', 'anzas', 'ico', 'ica', 'icos', 'icas', 'ismo', 'ismos', 'able', 'ables', 'ible', 'ibles', 'ista', 'istas', 'oso', 'osa', 'osos', 'osas', 'amiento', 'amientos', 'imiento', 'imientos'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(r2_txt, new Array('icadora', 'icador', 'icación', 'icadoras', 'icadores', 'icaciones', 'icante', 'icantes', 'icancia', 'icancias', 'adora', 'ador', 'ación', 'adoras', 'adores', 'aciones', 'ante', 'antes', 'ancia', 'ancias'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(r2_txt, new Array('logía', 'logías'))) != '') {
+ token = token.slice(0, -suf.length)+ 'log';
+ } else if ((suf =endsinArr(r2_txt, new Array('ución', 'uciones'))) != '') {
+ token = token.slice(0, -suf.length) + 'u';
+ } else if ((suf = endsinArr(r2_txt, new Array('encia', 'encias'))) != '') {
+ token = token.slice(0, -suf.length)+ 'ente';
+ } else if ((suf = endsinArr(r2_txt, new Array('ativamente', 'ivamente', 'osamente', 'icamente', 'adamente'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(r1_txt, new Array('amente'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(r2_txt, new Array('antemente', 'ablemente', 'iblemente', 'mente'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(r2_txt, new Array('abilidad', 'abilidades', 'icidad', 'icidades', 'ividad', 'ividades', 'idad', 'idades'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(r2_txt, new Array('ativa', 'ativo', 'ativas', 'ativos', 'iva', 'ivo', 'ivas', 'ivos'))) != '') {
+ token = token.slice(0, -suf.length);
+ }
+
+ if (token != token_after0) {
+ r1_txt = token.substring(r1-1);
+ r2_txt = token.substring(r2-1);
+ rv_txt = token.substring(rv-1);
+ }
+ var token_after1 = token;
+
+ if (token_after0 == token_after1) {
+ // Do step 2a if no ending was removed by step 1.
+ if ((suf = endsinArr(rv_txt, new Array('ya', 'ye', 'yan', 'yen', 'yeron', 'yendo', 'yo', '', 'yas', 'yes', 'yais', 'yamos'))) != '' && (token.substring(suf.length-1,1) == 'u')) {
+ token = token.slice(0, -suf.length);
+ }
+
+ if (token != token_after1) {
+ r1_txt = token.substring(r1-1);
+ r2_txt = token.substring(r2-1);
+ rv_txt = token.substring(rv-1);
+ }
+ var token_after2a = token;
+
+ // Do Step 2b if step 2a was done, but failed to remove a suffix.
+ if (token_after2a == token_after1) {
+
+ if ((suf = endsinArr(rv_txt,new Array('en', 'es', 'éis', 'emos'))) != '') {
+ token = token.slice(0,-suf.length);
+ if (endsin(token, 'gu')) {
+ token = token.slice(0,-1);
+ }
+ } else if ((suf = endsinArr(rv_txt, new Array('arían', 'arías', 'arán', 'arás', 'aríais', 'aría', 'aréis', 'aríamos', 'aremos', 'ará', 'aré', 'erían', 'erías', 'erán', 'erás', 'eríais', 'ería', 'eréis', 'eríamos', 'eremos', 'erá', 'eré', 'irían', 'irías', 'irán', 'irás', 'iríais', 'iría', 'iréis', 'iríamos', 'iremos', 'irá', 'iré', 'aba', 'ada', 'ida', 'ía', 'ara', 'iera', 'ad', 'ed', 'id', 'ase', 'iese', 'aste', 'iste', 'an', 'aban', 'ían', 'aran', 'ieran', 'asen', 'iesen', 'aron', 'ieron', 'ado', 'ido', 'ando', 'iendo', '', 'ar', 'er', 'ir', 'as', 'abas', 'adas', 'idas', 'ías', 'aras', 'ieras', 'ases', 'ieses', 'ís', 'áis', 'abais', 'íais', 'arais', 'ierais', ' aseis', 'ieseis', 'asteis', 'isteis', 'ados', 'idos', 'amos', 'ábamos', 'íamos', 'imos', 'áramos', 'iéramos', 'iésemos', 'ásemos'))) != '') {
+
+ token = token.slice(0, -suf.length);
+
+ }
+ }
+ }
+
+ // Always do step 3.
+ r1_txt = token.substring(r1-1);
+ r2_txt = token.substring(r2-1);
+ rv_txt = token.substring(rv-1);
+
+ if ((suf = endsinArr(rv_txt, new Array('os', 'a', 'o', 'á', 'í', 'ó'))) != '') {
+ token = token.slice(0, -suf.length);
+ } else if ((suf = endsinArr(rv_txt ,new Array('e','é'))) != '') {
+ token = token.slice(0,-1);
+ rv_txt = token.substring(rv-1);
+ if (endsin(rv_txt,'u') && endsin(token,'gu')) {
+ token = token.slice(0,-1);
+ }
+ }
+
+ return removeAccent(token);
+
+};
@@ -0,0 +1,58 @@
+/*
+Copyright (c) 2012, David Przybilla, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var stopwords = require('../util/stopwords_es');
+var Tokenizer = require('../tokenizers/aggressive_tokenizer_es');
+
+module.exports = function() {
+ var stemmer = this;
+
+ stemmer.stem = function(token) {
+ return token;
+ };
+
+ stemmer.tokenizeAndStem = function(text, keepStops) {
+ var stemmedTokens = [];
+
+ new Tokenizer().tokenize(text).forEach(function(token) {
+ if (keepStops || stopwords.words.indexOf(token) == -1) {
+ var resultToken = token.toLowerCase();
+ if (resultToken.match(new RegExp('[а-záéíóúüñ0-9]+', 'gi'))) {
+ resultToken = stemmer.stem(resultToken);
+ }
+ stemmedTokens.push(resultToken);
+ }
+ });
+
+ return stemmedTokens;
+ };
+
+ stemmer.attach = function() {
+ String.prototype.stem = function() {
+ return stemmer.stem(this);
+ };
+
+ String.prototype.tokenizeAndStem = function(keepStops) {
+ return stemmer.tokenizeAndStem(this, keepStops);
+ };
+ };
+}
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2011, Chris Umbel,David Przybilla
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Tokenizer = require('./tokenizer'),
+ util = require('util');
+
+var AggressiveTokenizer = function() {
+ Tokenizer.call(this);
+};
+util.inherits(AggressiveTokenizer, Tokenizer);
+
+module.exports = AggressiveTokenizer;
+
+AggressiveTokenizer.prototype.tokenize = function(text) {
+ // break a string up into an array of tokens by anything non-word
+ return this.trim(text.split(/\W+/));
+};
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2011, David Przybilla, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// a list of commonly used words that have little meaning and can be excluded
+// from analysis.
+var words = [
+ 'a','un','el','ella','y','sobre','de','la','que','en',
+ 'los','del','se','las','por','un','para','con','no',
+ 'una','su','al','lo','como','más','pero','sus','le',
+ 'ya','o','porque','cuando','muy','sin','sobre','también',
+ 'me','hasta','donde','quien','desde','nos','durante','uno',
+ 'ni','contra','ese','eso','','qué','otro','él','cual',
+ 'poco','mi','','te','ti','',
+ '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];
+
+// tell the world about the noise words.
+exports.words = words;

0 comments on commit c82c096

Please sign in to comment.