
Merge pull request #80 from leofenu/italian_stemmer

Added Italian Porter stemmer
2 parents 970b5bf + 31afbae, commit 7b8a7c2d0792b747db19d5375397abbbf9cb27f5, @chrisumbel committed Dec 7, 2012
@@ -28,11 +28,13 @@ exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerFa = require('./stemmers/porter_stemmer_fa');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.PorterStemmerEs = require('./stemmers/porter_stemmer_es');
+exports.PorterStemmerIt = require('./stemmers/porter_stemmer_it');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
exports.AggressiveTokenizerFa = require('./tokenizers/aggressive_tokenizer_fa');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizerEs = require('./tokenizers/aggressive_tokenizer_es');
+exports.AggressiveTokenizerIt = require('./tokenizers/aggressive_tokenizer_it');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
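With these two exports in place, the Italian stemmer and tokenizer become reachable from the library's top level alongside the existing Spanish, Russian and Persian ones. A minimal usage sketch, assuming the exports above sit in the package's main module so that require('natural') resolves to it (the sample words are illustrative, not taken from the commit):

    var natural = require('natural');

    // stem a single Italian word
    console.log(natural.PorterStemmerIt.stem('abbandonata'));

    // split Italian text into tokens on runs of non-word characters
    var tokenizer = new natural.AggressiveTokenizerIt();
    console.log(tokenizer.tokenize('la capra campa sopra la panca'));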
@@ -0,0 +1,233 @@
+/*
+Copyright (c) 2012, Leonardo Fenu, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Stemmer = require('./stemmer_it');
+
+var PorterStemmer = new Stemmer();
+module.exports = PorterStemmer;
+
+
+function isVowel(letter){
+ return (letter == 'a' || letter == 'e' || letter == 'i' || letter == 'o' || letter == 'u' || letter == 'à' ||
+ letter == 'è' || letter == 'ì' || letter == 'ò' || letter == 'ù');
+};
+
+function getNextVowelPos(token,start){
+ start = start + 1;
+ var length = token.length;
+ for (var i = start; i < length; i++) {
+ if (isVowel(token[i])) {
+ return i;
+ }
+ }
+ return length;
+};
+
+function getNextConsonantPos(token,start){
+ var length = token.length;
+ for (var i = start; i < length; i++)
+ if (!isVowel(token[i])) return i;
+ return length;
+};
+
+
+function endsin(token, suffix) {
+ if (token.length < suffix.length) return false;
+ return (token.slice(-suffix.length) == suffix);
+};
+
+function endsinArr(token, suffixes) {
+ for(var i=0;i<suffixes.length;i++){
+ if (endsin(token, suffixes[i])) return suffixes[i];
+ }
+ return '';
+};
+
+function replaceAcute(token) {
+ var str=token.replace(/á/gi,'à');
+ str=str.replace(/é/gi,'è');
+ str=str.replace(/í/gi,'ì');
+ str=str.replace(/ó/gi,'ò');
+ str=str.replace(/ú/gi,'ù');
+ return str;
+};
+
+function vowelMarking(token) {
+ function replacer(match, p1, p2, p3){
+ return p1+p2.toUpperCase()+p3;
+ };
+ var str = token.replace(/([aeiou])(i|u)([aeiou])/g, replacer);
+ return str;
+}
+
+
+// perform full stemming algorithm on a single word
+PorterStemmer.stem = function(token) {
+
+ token = token.toLowerCase();
+ token = replaceAcute(token);
+ token = token.replace(/qu/g,'qU');
+ token = vowelMarking(token);
+
+ if (token.length<3){
+ return token;
+ }
+
+ var len = token.length, r1 = len, r2 = len, rv = len;
+ // R1 is the region after the first non-vowel following a vowel,
+ for(var i=0; i < token.length-1 && r1==len;i++){
+ if(isVowel(token[i]) && !isVowel(token[i+1]) ){
+ r1=i+2;
+ }
+ }
+ // Or is the null region at the end of the word if there is no such non-vowel.
+
+ // R2 is the region after the first non-vowel following a vowel in R1
+ for(var i=r1; i< token.length-1 && r2==len;i++){
+ if(isVowel(token[i]) && !isVowel(token[i+1])){
+ r2=i+2;
+ }
+ }
+
+ // Or is the null region at the end of the word if there is no such non-vowel.
+
+ // RV is determined as follows:
+
+ if (len > 3) {
+ if(!isVowel(token[1])) {
+ // If the second letter is a consonant, RV is the region after the next following vowel
+ rv = getNextVowelPos(token, 1) +1;
+ } else if (isVowel(token[0]) && isVowel(token[1])) {
+ // or if the first two letters are vowels, RV is the region after the next consonant
+ rv = getNextConsonantPos(token, 2) + 1;
+ } else {
+ //otherwise (consonant-vowel case) RV is the region after the third letter. But RV is the end of the word if these positions cannot be found.
+ rv = 3;
+ }
+ }
+
+ var r1_txt = token.substring(r1);
+ var r2_txt = token.substring(r2);
+ var rv_txt = token.substring(rv);
+
+ var token_orig = token;
+
+ // Step 0: Attached pronoun
+
+ var pronoun_suf = new Array('glieli','glielo','gliene','gliela','gliele','sene','tene','cela','cele','celi','celo','cene','vela','vele','veli','velo','vene','mela','mele','meli','melo','mene','tela','tele','teli','telo','gli','ci', 'la','le','li','lo','mi','ne','si','ti','vi');
+ var pronoun_suf_pre1 = new Array('ando','endo');
+ var pronoun_suf_pre2 = new Array('ar', 'er', 'ir');
+ var suf = endsinArr(token, pronoun_suf);
+
+ if (suf!='') {
+ var pre_suff1 = endsinArr(rv_txt.slice(0,-suf.length),pronoun_suf_pre1);
+ var pre_suff2 = endsinArr(rv_txt.slice(0,-suf.length),pronoun_suf_pre2);
+
+ if (pre_suff1 != '') {
+ token = token.slice(0,-suf.length);
+ }
+ if (pre_suff2 != '') {
+ token = token.slice(0, -suf.length)+ 'e';
+ }
+ }
+
+ if (token != token_orig) {
+ r1_txt = token.substring(r1);
+ r2_txt = token.substring(r2);
+ rv_txt = token.substring(rv);
+ }
+
+ var token_after0 = token;
+
+ // Step 1: Standard suffix removal
+
+ if ((suf = endsinArr(r2_txt, new Array('ativamente','abilamente','ivamente','osamente','icamente'))) != '') {
+ token = token.slice(0, -suf.length); // delete
+ } else if ((suf = endsinArr(r2_txt, new Array('icazione','icazioni','icatore','icatori','azione','azioni','atore','atori'))) != '') {
+ token = token.slice(0, -suf.length); // delete
+ } else if ((suf = endsinArr(r2_txt, new Array('logia','logie'))) != '') {
+ token = token.slice(0, -suf.length)+ 'log'; // replace with log
+ } else if ((suf =endsinArr(r2_txt, new Array('uzione','uzioni','usione','usioni'))) != '') {
+ token = token.slice(0, -suf.length) + 'u'; // replace with u
+ } else if ((suf = endsinArr(r2_txt, new Array('enza','enze'))) != '') {
+ token = token.slice(0, -suf.length)+ 'ente'; // replace with ente
+ } else if ((suf = endsinArr(rv_txt, new Array('amento', 'amenti', 'imento', 'imenti'))) != '') {
+ token = token.slice(0, -suf.length); // delete
+ } else if ((suf = endsinArr(r1_txt, new Array('amente'))) != '') {
+ token = token.slice(0, -suf.length); // delete
+ } else if ((suf = endsinArr(r2_txt, new Array('atrice','atrici','abile','abili','ibile','ibili','mente','ante','anti','anza','anze','iche','ichi','ismo','ismi','ista','iste','isti','istà','istè','istì','ico','ici','ica','ice','oso','osi','osa','ose'))) != '') {
+ token = token.slice(0, -suf.length); // delete
+ } else if ((suf = endsinArr(r2_txt, new Array('abilità', 'icità', 'ività', 'ità'))) != '') {
+ token = token.slice(0, -suf.length); // delete
+ } else if ((suf = endsinArr(r2_txt, new Array('icativa','icativo','icativi','icative','ativa','ativo','ativi','ative','iva','ivo','ivi','ive'))) != '') {
+ token = token.slice(0, -suf.length);
+ }
+
+
+ if (token != token_after0) {
+ r1_txt = token.substring(r1);
+ r2_txt = token.substring(r2);
+ rv_txt = token.substring(rv);
+ }
+
+
+ var token_after1 = token;
+
+ // Step 2: Verb suffixes
+
+ if (token_after0 == token_after1) {
+ if ((suf = endsinArr(rv_txt, new Array('erebbero','irebbero','assero','assimo','eranno','erebbe','eremmo','ereste','eresti','essero','iranno','irebbe','iremmo','ireste','iresti','iscano','iscono','issero','arono','avamo','avano','avate','eremo','erete','erono','evamo','evano','evate','iremo','irete','irono','ivamo','ivano','ivate','ammo','ando','asse','assi','emmo','enda','ende','endi','endo','erai','Yamo','iamo','immo','irai','irei','isca','isce','isci','isco','erei','uti','uto','ita','ite','iti','ito','iva','ivi','ivo','ono','uta','ute','ano','are','ata','ate','ati','ato','ava','avi','avo','erà','ere','erò','ete','eva','evi','evo','irà','ire','irò','ar','ir'))) != '') {
+ token = token.slice(0, -suf.length);
+ }
+ }
+
+
+ r1_txt = token.substring(r1);
+ r2_txt = token.substring(r2);
+ rv_txt = token.substring(rv);
+
+ // Always do step 3.
+
+ if ((suf = endsinArr(rv_txt, new Array('ia', 'ie', 'ii', 'io', 'ià', 'iè', 'iì', 'iò', 'a','e','i','o','à','è','ì','ò'))) != '') {
+ token = token.slice(0, -suf.length);
+ }
+
+ r1_txt = token.substring(r1);
+ r2_txt = token.substring(r2);
+ rv_txt = token.substring(rv);
+
+ if ((suf =endsinArr(rv_txt, new Array('ch'))) != '') {
+ token = token.slice(0, -suf.length) + 'c'; // replace with c
+ } else if ((suf =endsinArr(rv_txt, new Array('gh'))) != '') {
+ token = token.slice(0, -suf.length) + 'g'; // replace with g
+ }
+
+
+ r1_txt = token.substring(r1);
+ r2_txt = token.substring(r2);
+ rv_txt = token.substring(rv);
+
+ return token.toLowerCase();
+
+};
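The require path in the first hunk suggests this file lives at stemmers/porter_stemmer_it.js inside the library source. The R1/R2/RV comments follow the usual Snowball region definitions; hand-tracing them for the word abbandonata (an illustrative example, not output captured from the commit): the first non-vowel following a vowel is the b at index 1, so R1 is "bandonata"; within R1 the first non-vowel following a vowel is the n at index 4, so R2 is "donata"; and because the second letter is a consonant, RV starts after the next vowel, giving "ndonata". A quick sketch for exercising the stemmer in isolation, assuming a checkout where the file sits under lib/natural/stemmers/:

    var stemmer = require('./lib/natural/stemmers/porter_stemmer_it');

    // step 2 strips the 'ata'/'ate'/'ati' endings, leaving the common stem
    ['abbandonata', 'abbandonate', 'abbandonati'].forEach(function(word) {
      console.log(word + ' -> ' + stemmer.stem(word));
    });

Hand-traced through the steps above, all three forms should come out as abbandon.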
@@ -0,0 +1,36 @@
+var stopwords = require('../util/stopwords_it');
+var Tokenizer = require('../tokenizers/aggressive_tokenizer_it');
+
+module.exports = function() {
+ var stemmer = this;
+
+ stemmer.stem = function(token) {
+ return token;
+ };
+
+ stemmer.tokenizeAndStem = function(text, keepStops) {
+ var stemmedTokens = [];
+
+ new Tokenizer().tokenize(text).forEach(function(token) {
+ if (keepStops || stopwords.words.indexOf(token) == -1) {
+ var resultToken = token.toLowerCase();
+ if (resultToken.match(/[a-zàèìòù0-9]/gi)) {
+ resultToken = stemmer.stem(resultToken);
+ }
+ stemmedTokens.push(resultToken);
+ }
+ });
+
+ return stemmedTokens;
+ };
+
+ stemmer.attach = function() {
+ String.prototype.stem = function() {
+ return stemmer.stem(this);
+ };
+
+ String.prototype.tokenizeAndStem = function(keepStops) {
+ return stemmer.tokenizeAndStem(this, keepStops);
+ };
+ };
+};
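This is the base stemmer that porter_stemmer_it.js instantiates via require('./stemmer_it'): it supplies a stopword-aware tokenizeAndStem() and an attach() helper that patches String.prototype, while stem() itself is a placeholder the Italian Porter stemmer overrides. A rough sketch of how those methods surface on the exported stemmer, under the same lib/natural path assumption as above (the sample sentence is illustrative):

    var stemmer = require('./lib/natural/stemmers/porter_stemmer_it');

    // stopwords such as 'la' are dropped unless keepStops is passed as true
    console.log(stemmer.tokenizeAndStem('la capra campa sopra la panca'));

    // attach() adds stem() and tokenizeAndStem() to String.prototype
    stemmer.attach();
    console.log('la capra campa sopra la panca'.tokenizeAndStem());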
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2011, Chris Umbel, David Przybilla
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Tokenizer = require('./tokenizer'),
+ util = require('util');
+
+var AggressiveTokenizer = function() {
+ Tokenizer.call(this);
+};
+util.inherits(AggressiveTokenizer, Tokenizer);
+
+module.exports = AggressiveTokenizer;
+
+AggressiveTokenizer.prototype.tokenize = function(text) {
+ // break a string up into an array of tokens by anything non-word
+ return this.trim(text.split(/\W+/));
+};
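Like the other aggressive tokenizers in the library, this one simply splits on runs of non-word characters and trims the result. One consequence worth noting is that JavaScript's \W class treats only [A-Za-z0-9_] as word characters, so accented Italian vowels act as separators; a word like perché comes back as perch with the accented tail dropped. A small sketch, again assuming the lib/natural layout:

    var AggressiveTokenizerIt = require('./lib/natural/tokenizers/aggressive_tokenizer_it');

    var tokenizer = new AggressiveTokenizerIt();
    console.log(tokenizer.tokenize('Sopra la panca la capra campa'));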
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2011, David Przybilla, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// a list of commonly used words that have little meaning and can be excluded
+// from analysis.
+var words = [
+ 'ad','al','allo','ai','agli','all','agl','alla','alle','con','col','coi','da','dal','dallo',
+ 'dai','dagli','dall','dagl','dalla','dalle','di','del','dello','dei','degli','dell','degl',
+ 'della','delle','in','nel','nello','nei','negli','nell','negl','nella','nelle','su','sul',
+ 'sullo','sui','sugli','sull','sugl','sulla','sulle','per','tra','contro','io','tu','lui',
+ 'lei','noi','voi','loro','mio','mia','miei','mie','tuo','tua','tuoi','tue','suo','sua','suoi',
+ 'sue','nostro','nostra','nostri','nostre','vostro','vostra','vostri','vostre','mi','ti','ci',
+ 'vi','lo','la','li','le','gli','ne','il','un','uno','una','ma','ed','se','perché','anche','come',
+ 'dov','dove','che','chi','cui','non','più','quale','quanto','quanti','quanta','quante','quello',
+ 'quelli','quella','quelle','questo','questi','questa','queste','si','tutto','tutti','a','c','e',
+ 'i','l','o','ho','hai','ha','abbiamo','avete','hanno','abbia','abbiate','abbiano','avrò','avrai',
+ 'avrà','avremo','avrete','avranno','avrei','avresti','avrebbe','avremmo','avreste','avrebbero',
+ 'avevo','avevi','aveva','avevamo','avevate','avevano','ebbi','avesti','ebbe','avemmo','aveste',
+ 'ebbero','avessi','avesse','avessimo','avessero','avendo','avuto','avuta','avuti','avute','sono',
+ 'sei','è','siamo','siete','sia','siate','siano','sarò','sarai','sarà','saremo','sarete','saranno',
+ 'sarei','saresti','sarebbe','saremmo','sareste','sarebbero','ero','eri','era','eravamo','eravate',
+ 'erano','fui','fosti','fu','fummo','foste','furono','fossi','fosse','fossimo','fossero','essendo',
+ 'faccio','fai','facciamo','fanno','faccia','facciate','facciano','farò','farai','farà','faremo',
+ 'farete','faranno','farei','faresti','farebbe','faremmo','fareste','farebbero','facevo','facevi',
+ 'faceva','facevamo','facevate','facevano','feci','facesti','fece','facemmo','faceste','fecero',
+ 'facessi','facesse','facessimo','facessero','facendo','sto','stai','sta','stiamo','stanno','stia',
+ 'stiate','stiano','starò','starai','starà','staremo','starete','staranno','starei','staresti',
+ 'starebbe','staremmo','stareste','starebbero','stavo','stavi','stava','stavamo','stavate','stavano',
+ 'stetti','stesti','stette','stemmo','steste','stettero','stessi','stesse','stessimo','stessero','stando',
+ '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];
+
+// tell the world about the noise words.
+exports.words = words;