Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Italian Porter stemmer #80

Merged
merged 1 commit into from Dec 7, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/natural/index.js
Expand Up @@ -28,11 +28,13 @@ exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerFa = require('./stemmers/porter_stemmer_fa');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.PorterStemmerEs = require('./stemmers/porter_stemmer_es');
exports.PorterStemmerIt = require('./stemmers/porter_stemmer_it');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
exports.AggressiveTokenizerFa = require('./tokenizers/aggressive_tokenizer_fa');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizerEs = require('./tokenizers/aggressive_tokenizer_es');
exports.AggressiveTokenizerIt = require('./tokenizers/aggressive_tokenizer_it');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
Expand Down
233 changes: 233 additions & 0 deletions lib/natural/stemmers/porter_stemmer_it.js
@@ -0,0 +1,233 @@
/*
Copyright (c) 2012, Leonardo Fenu, Chris Umbel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var Stemmer = require('./stemmer_it');

var PorterStemmer = new Stemmer();
module.exports = PorterStemmer;


// True when `letter` is an Italian vowel, including the grave-accented
// forms (à è ì ò ù). Anything else — consonants, uppercase letters,
// undefined — is treated as a non-vowel.
function isVowel(letter) {
  switch (letter) {
    case 'a': case 'e': case 'i': case 'o': case 'u':
    case 'à': case 'è': case 'ì': case 'ò': case 'ù':
      return true;
    default:
      return false;
  }
}

// Return the index of the first vowel strictly after position `start`,
// or token.length when no vowel follows. The vowel set matches isVowel
// (plain vowels plus grave-accented forms), inlined here as a lookup string.
function getNextVowelPos(token, start) {
  var vowels = 'aeiouàèìòù';
  for (var i = start + 1; i < token.length; i++) {
    if (vowels.indexOf(token[i]) !== -1) {
      return i;
    }
  }
  return token.length;
}

// Return the index of the first non-vowel at or after position `start`
// (note: unlike getNextVowelPos, `start` itself is included), or
// token.length when every remaining letter is a vowel.
function getNextConsonantPos(token, start) {
  // was: `length=token.length` — leaked `length` as an implicit global;
  // declare it locally instead.
  var length = token.length;
  for (var i = start; i < length; i++) {
    if (!isVowel(token[i])) {
      return i;
    }
  }
  return length;
}


// True when `token` ends with `suffix`. A suffix longer than the token
// can never match; otherwise compare the token's tail slice.
function endsin(token, suffix) {
  if (suffix.length > token.length) {
    return false;
  }
  return suffix === token.slice(-suffix.length);
}

// Return the first entry of `suffixes` that `token` ends with, or the
// empty string when none matches. Order matters: callers list longer
// suffixes first so the longest match wins. The ends-with check is
// inlined here (same logic as endsin).
function endsinArr(token, suffixes) {
  for (var idx = 0; idx < suffixes.length; idx++) {
    var candidate = suffixes[idx];
    if (candidate.length <= token.length &&
        token.slice(-candidate.length) === candidate) {
      return candidate;
    }
  }
  return '';
}

// Normalise acute-accented vowels (á é í ó ú, either case) to the
// grave-accented lowercase forms (à è ì ò ù) the stemmer works with.
function replaceAcute(token) {
  return token
    .replace(/á/gi, 'à')
    .replace(/é/gi, 'è')
    .replace(/í/gi, 'ì')
    .replace(/ó/gi, 'ò')
    .replace(/ú/gi, 'ù');
}

// Snowball pre-step: an i or u between two vowels is acting as a
// consonant, so mark it by upper-casing it (undone by the final
// toLowerCase in stem()).
function vowelMarking(token) {
  // p1/p3 are the surrounding vowels, p2 is the i/u to mark.
  function replacer(match, p1, p2, p3) {
    return p1 + p2.toUpperCase() + p3;
  }
  // was: result assigned to `str` without `var`, leaking an implicit
  // global — return the replacement directly instead.
  return token.replace(/([aeiou])(i|u)([aeiou])/g, replacer);
}


// perform full stemming algorithm on a single word
// Perform the full Snowball Italian stemming algorithm on a single word.
// Pipeline: pre-processing (lower-case, acute→grave accent normalisation,
// qu / intervocalic i-u consonant marking), then step 0 (attached
// pronouns), step 1 (standard suffixes), step 2 (verb suffixes) and
// step 3 (residual final vowel and -ch/-gh cleanup).
PorterStemmer.stem = function(token) {

  token = token.toLowerCase();
  token = replaceAcute(token);
  // The u in "qu" behaves as a consonant: mark it by upper-casing.
  token = token.replace(/qu/g, 'qU');
  token = vowelMarking(token);

  if (token.length < 3) {
    return token;
  }

  // was: `var r1 = r2 = rv = len = token.length;` — only r1 was actually
  // declared; r2, rv and len leaked as implicit globals. Declare each
  // variable explicitly.
  var len = token.length;
  var r1 = len;
  var r2 = len;
  var rv = len;

  // R1 is the region after the first non-vowel following a vowel,
  // or the null region at the end of the word if there is no such non-vowel.
  for (var i = 0; i < token.length - 1 && r1 == len; i++) {
    if (isVowel(token[i]) && !isVowel(token[i + 1])) {
      r1 = i + 2;
    }
  }

  // R2 is the region after the first non-vowel following a vowel in R1,
  // or the null region at the end of the word if there is no such non-vowel.
  for (var i = r1; i < token.length - 1 && r2 == len; i++) {
    if (isVowel(token[i]) && !isVowel(token[i + 1])) {
      r2 = i + 2;
    }
  }

  if (len > 3) {
    if (!isVowel(token[1])) {
      // If the second letter is a consonant, RV is the region after the
      // next following vowel.
      rv = getNextVowelPos(token, 1) + 1;
    } else if (isVowel(token[0]) && isVowel(token[1])) {
      // If the first two letters are vowels, RV is the region after the
      // next consonant.
      rv = getNextConsonantPos(token, 2) + 1;
    } else {
      // Otherwise (consonant-vowel case) RV is the region after the third
      // letter; RV is the end of the word if these positions cannot be found.
      rv = 3;
    }
  }

  var r1_txt = token.substring(r1);
  var r2_txt = token.substring(r2);
  var rv_txt = token.substring(rv);

  var token_orig = token;

  // Step 0: attached pronoun removal. Longer pronouns are listed first so
  // endsinArr finds the longest match.
  var pronoun_suf = ['glieli','glielo','gliene','gliela','gliele','sene','tene','cela','cele','celi','celo','cene','vela','vele','veli','velo','vene','mela','mele','meli','melo','mene','tela','tele','teli','telo','gli','ci','la','le','li','lo','mi','ne','si','ti','vi'];
  var pronoun_suf_pre1 = ['ando','endo'];
  var pronoun_suf_pre2 = ['ar','er','ir'];
  var suf = endsinArr(token, pronoun_suf);

  if (suf != '') {
    // The pronoun is removed only when preceded (within RV) by
    // -ando/-endo (plain delete) or -ar/-er/-ir (delete and restore 'e').
    var pre_suff1 = endsinArr(rv_txt.slice(0, -suf.length), pronoun_suf_pre1);
    var pre_suff2 = endsinArr(rv_txt.slice(0, -suf.length), pronoun_suf_pre2);

    if (pre_suff1 != '') {
      token = token.slice(0, -suf.length);
    }
    if (pre_suff2 != '') {
      token = token.slice(0, -suf.length) + 'e';
    }
  }

  if (token != token_orig) {
    r1_txt = token.substring(r1);
    r2_txt = token.substring(r2);
    rv_txt = token.substring(rv);
  }

  var token_after0 = token;

  // Step 1: standard suffix removal. The if/else chain ordering makes the
  // longest applicable suffix win.
  if ((suf = endsinArr(r2_txt, ['ativamente','abilamente','ivamente','osamente','icamente'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['icazione','icazioni','icatore','icatori','azione','azioni','atore','atori'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['logia','logie'])) != '') {
    token = token.slice(0, -suf.length) + 'log'; // replace with log
  } else if ((suf = endsinArr(r2_txt, ['uzione','uzioni','usione','usioni'])) != '') {
    token = token.slice(0, -suf.length) + 'u'; // replace with u
  } else if ((suf = endsinArr(r2_txt, ['enza','enze'])) != '') {
    token = token.slice(0, -suf.length) + 'ente'; // replace with ente
  } else if ((suf = endsinArr(rv_txt, ['amento','amenti','imento','imenti'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r1_txt, ['amente'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['atrice','atrici','abile','abili','ibile','ibili','mente','ante','anti','anza','anze','iche','ichi','ismo','ismi','ista','iste','isti','istà','istè','istì','ico','ici','ica','ice','oso','osi','osa','ose'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['abilità','icità','ività','ità'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['icativa','icativo','icativi','icative','ativa','ativo','ativi','ative','iva','ivo','ivi','ive'])) != '') {
    token = token.slice(0, -suf.length); // delete
  }

  if (token != token_after0) {
    r1_txt = token.substring(r1);
    r2_txt = token.substring(r2);
    rv_txt = token.substring(rv);
  }

  var token_after1 = token;

  // Step 2: verb suffix removal — applied only when step 1 removed nothing.
  if (token_after0 == token_after1) {
    if ((suf = endsinArr(rv_txt, ['erebbero','irebbero','assero','assimo','eranno','erebbe','eremmo','ereste','eresti','essero','iranno','irebbe','iremmo','ireste','iresti','iscano','iscono','issero','arono','avamo','avano','avate','eremo','erete','erono','evamo','evano','evate','iremo','irete','irono','ivamo','ivano','ivate','ammo','ando','asse','assi','emmo','enda','ende','endi','endo','erai','Yamo','iamo','immo','irai','irei','isca','isce','isci','isco','erei','uti','uto','ita','ite','iti','ito','iva','ivi','ivo','ono','uta','ute','ano','are','ata','ate','ati','ato','ava','avi','avo','erà','ere','erò','ete','eva','evi','evo','irà','ire','irò','ar','ir'])) != '') {
      token = token.slice(0, -suf.length);
    }
  }

  r1_txt = token.substring(r1);
  r2_txt = token.substring(r2);
  rv_txt = token.substring(rv);

  // Step 3a (always performed): delete a final vowel, optionally preceded
  // by i, within RV.
  if ((suf = endsinArr(rv_txt, ['ia','ie','ii','io','ià','iè','iì','iò','a','e','i','o','à','è','ì','ò'])) != '') {
    token = token.slice(0, -suf.length);
  }

  r1_txt = token.substring(r1);
  r2_txt = token.substring(r2);
  rv_txt = token.substring(rv);

  // Step 3b: replace a final ch/gh (within RV) with c/g.
  if ((suf = endsinArr(rv_txt, ['ch'])) != '') {
    token = token.slice(0, -suf.length) + 'c'; // replace with c
  } else if ((suf = endsinArr(rv_txt, ['gh'])) != '') {
    token = token.slice(0, -suf.length) + 'g'; // replace with g
  }

  // Undo the qU / intervocalic-i-u marking introduced in pre-processing.
  return token.toLowerCase();
};
36 changes: 36 additions & 0 deletions lib/natural/stemmers/stemmer_it.js
@@ -0,0 +1,36 @@
var stopwords = require('../util/stopwords_it');
var Tokenizer = require('../tokenizers/aggressive_tokenizer_it');

module.exports = function() {
var stemmer = this;

stemmer.stem = function(token) {
return token;
};

stemmer.tokenizeAndStem = function(text, keepStops) {
var stemmedTokens = [];

new Tokenizer().tokenize(text).forEach(function(token) {
if (keepStops || stopwords.words.indexOf(token) == -1) {
var resultToken = token.toLowerCase();
if (resultToken.match(/[a-zàèìòù0-9]/gi)) {
resultToken = stemmer.stem(resultToken);
}
stemmedTokens.push(resultToken);
}
});

return stemmedTokens;
};

stemmer.attach = function() {
String.prototype.stem = function() {
return stemmer.stem(this);
};

String.prototype.tokenizeAndStem = function(keepStops) {
return stemmer.tokenizeAndStem(this, keepStops);
};
};
}
36 changes: 36 additions & 0 deletions lib/natural/tokenizers/aggressive_tokenizer_it.js
@@ -0,0 +1,36 @@
/*
Copyright (c) 2011, Chris Umbel, David Przybilla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var Tokenizer = require('./tokenizer'),
util = require('util');

// Aggressive tokenizer for Italian text: splits on runs of non-word
// characters, keeping accented vowels inside tokens.
var AggressiveTokenizer = function() {
  Tokenizer.call(this);
};
util.inherits(AggressiveTokenizer, Tokenizer);

module.exports = AggressiveTokenizer;

AggressiveTokenizer.prototype.tokenize = function(text) {
  // break a string up into an array of tokens by anything non-word.
  // was: text.split(/\W+/) — \W matches accented Italian letters
  // (à è ì ò ù, á é í ó ú), which split words like "perché" apart.
  // Split on anything outside word chars plus accented vowels instead
  // (the /i flag also keeps their uppercase forms).
  return this.trim(text.split(/[^a-zA-Z0-9_àèìòùáéíóú]+/i));
};
52 changes: 52 additions & 0 deletions lib/natural/util/stopwords_it.js
@@ -0,0 +1,52 @@
/*
Copyright (c) 2011, David Przybilla, Chris Umbel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// a list of commonly used words that have little meaning and can be excluded
// from analysis.
var words = [
'ad','al','allo','ai','agli','all','agl','alla','alle','con','col','coi','da','dal','dallo',
'dai','dagli','dall','dagl','dalla','dalle','di','del','dello','dei','degli','dell','degl',
'della','delle','in','nel','nello','nei','negli','nell','negl','nella','nelle','su','sul',
'sullo','sui','sugli','sull','sugl','sulla','sulle','per','tra','contro','io','tu','lui',
'lei','noi','voi','loro','mio','mia','miei','mie','tuo','tua','tuoi','tue','suo','sua','suoi',
'sue','nostro','nostra','nostri','nostre','vostro','vostra','vostri','vostre','mi','ti','ci',
'vi','lo','la','li','le','gli','ne','il','un','uno','una','ma','ed','se','perché','anche','come',
'dov','dove','che','chi','cui','non','più','quale','quanto','quanti','quanta','quante','quello',
'quelli','quella','quelle','questo','questi','questa','queste','si','tutto','tutti','a','c','e',
'i','l','o','ho','hai','ha','abbiamo','avete','hanno','abbia','abbiate','abbiano','avrò','avrai',
'avrà','avremo','avrete','avranno','avrei','avresti','avrebbe','avremmo','avreste','avrebbero',
'avevo','avevi','aveva','avevamo','avevate','avevano','ebbi','avesti','ebbe','avemmo','aveste',
'ebbero','avessi','avesse','avessimo','avessero','avendo','avuto','avuta','avuti','avute','sono',
'sei','è','siamo','siete','sia','siate','siano','sarò','sarai','sarà','saremo','sarete','saranno',
'sarei','saresti','sarebbe','saremmo','sareste','sarebbero','ero','eri','era','eravamo','eravate',
'erano','fui','fosti','fu','fummo','foste','furono','fossi','fosse','fossimo','fossero','essendo',
'faccio','fai','facciamo','fanno','faccia','facciate','facciano','farò','farai','farà','faremo',
'farete','faranno','farei','faresti','farebbe','faremmo','fareste','farebbero','facevo','facevi',
'faceva','facevamo','facevate','facevano','feci','facesti','fece','facemmo','faceste','fecero',
'facessi','facesse','facessimo','facessero','facendo','sto','stai','sta','stiamo','stanno','stia',
'stiate','stiano','starò','starai','starà','staremo','starete','staranno','starei','staresti',
'starebbe','staremmo','stareste','starebbero','stavo','stavi','stava','stavamo','stavate','stavano',
'stetti','stesti','stette','stemmo','steste','stettero','stessi','stesse','stessimo','stessero','stando',
'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];

// tell the world about the noise words.
exports.words = words;