Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Added Italian Porter stemmer

  • Loading branch information...
commit 31afbae2480fd3d0dbde03adf5f32a2da4c0a1f4 1 parent 970b5bf
@genialeo genialeo authored
View
2  lib/natural/index.js
@@ -28,11 +28,13 @@ exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerFa = require('./stemmers/porter_stemmer_fa');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.PorterStemmerEs = require('./stemmers/porter_stemmer_es');
+exports.PorterStemmerIt = require('./stemmers/porter_stemmer_it');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
exports.AggressiveTokenizerFa = require('./tokenizers/aggressive_tokenizer_fa');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizerEs = require('./tokenizers/aggressive_tokenizer_es');
+exports.AggressiveTokenizerIt = require('./tokenizers/aggressive_tokenizer_it');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
View
233 lib/natural/stemmers/porter_stemmer_it.js
@@ -0,0 +1,233 @@
+/*
+Copyright (c) 2012, Leonardo Fenu, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Stemmer = require('./stemmer_it');
+
+var PorterStemmer = new Stemmer();
+module.exports = PorterStemmer;
+
+
/**
 * True when `letter` is a single Italian vowel: one of the five plain
 * vowels or their grave-accented forms.
 * @param {string} letter - a single character
 * @returns {boolean}
 */
function isVowel(letter) {
  return (letter == 'a' || letter == 'e' || letter == 'i' || letter == 'o' ||
          letter == 'u' || letter == 'à' || letter == 'è' || letter == 'ì' ||
          letter == 'ò' || letter == 'ù');
}

/**
 * Index of the first vowel strictly after position `start`.
 * @param {string} token - word being scanned
 * @param {number} start - search begins at start + 1
 * @returns {number} index of the next vowel, or token.length if none
 */
function getNextVowelPos(token, start) {
  var length = token.length;
  for (var i = start + 1; i < length; i++) {
    if (isVowel(token[i])) {
      return i;
    }
  }
  return length;
}

/**
 * Index of the first consonant at or after position `start`.
 * @param {string} token - word being scanned
 * @param {number} start - search begins at start (inclusive)
 * @returns {number} index of the next consonant, or token.length if none
 */
function getNextConsonantPos(token, start) {
  // Fix: the original assigned `length` without `var`, leaking an
  // implicit global that could be clobbered across calls.
  var length = token.length;
  for (var i = start; i < length; i++) {
    if (!isVowel(token[i])) {
      return i;
    }
  }
  return length;
}
+
+
/**
 * Whether `token` ends with `suffix`.
 * Kept as a slice comparison to preserve the original contract,
 * including its quirk: an empty suffix only "matches" an empty token,
 * because slice(-0) yields the whole string.
 * @param {string} token
 * @param {string} suffix
 * @returns {boolean}
 */
function endsin(token, suffix) {
  var overhang = token.length - suffix.length;
  if (overhang < 0) {
    return false;
  }
  return token.slice(-suffix.length) == suffix;
}

/**
 * First entry of `suffixes` that `token` ends with, or '' when none
 * match. Order matters: callers list longer suffixes first so the
 * longest candidate wins.
 * @param {string} token
 * @param {string[]} suffixes
 * @returns {string}
 */
function endsinArr(token, suffixes) {
  for (var idx = 0; idx < suffixes.length; idx++) {
    var candidate = suffixes[idx];
    if (endsin(token, candidate)) {
      return candidate;
    }
  }
  return '';
}
+
/**
 * Normalize acute-accented vowels to their grave-accented forms
 * (á→à, é→è, í→ì, ó→ò, ú→ù). Matching is case-insensitive and the
 * replacement is always the lower-case grave vowel, as in the original
 * chained replaces (the stemmer lower-cases tokens first anyway).
 * @param {string} token
 * @returns {string} token with acute accents normalized
 */
function replaceAcute(token) {
  var graveFor = { 'á': 'à', 'é': 'è', 'í': 'ì', 'ó': 'ò', 'ú': 'ù' };
  return token.replace(/[áéíóú]/gi, function(ch) {
    return graveFor[ch.toLowerCase()];
  });
}
+
+function vowelMarking(token) {
+ function replacer(match, p1, p2, p3){
+ return p1+p2.toUpperCase()+p3;
+ };
+ str=token.replace(/([aeiou])(i|u)([aeiou])/g, replacer);
+ return str;
+}
+
+
+// perform full stemming algorithm on a single word
/**
 * Stem a single Italian word with the Snowball (Porter-style)
 * algorithm: pre-process (lower-case, normalize acute accents, mark
 * qu/i/u), compute the R1/R2/RV regions, then apply step 0 (attached
 * pronouns), step 1 (standard suffixes), step 2 (verb suffixes, only
 * if step 1 removed nothing) and step 3 (final-vowel and ch/gh
 * cleanup).
 * @param {string} token - the word to stem
 * @returns {string} the lower-cased stem
 */
PorterStemmer.stem = function(token) {
  token = token.toLowerCase();
  token = replaceAcute(token);
  token = token.replace(/qu/g, 'qU'); // the u of "qu" acts as a consonant
  token = vowelMarking(token);

  if (token.length < 3) {
    return token;
  }

  // Fix: the original chained assignment (var r1 = r2 = rv = len = ...)
  // declared only r1 and leaked r2/rv/len as implicit globals.
  var len = token.length;
  var r1 = len;
  var r2 = len;
  var rv = len;

  // R1: region after the first non-vowel following a vowel, or the
  // null region at the end of the word if there is no such non-vowel.
  for (var i = 0; i < token.length - 1 && r1 == len; i++) {
    if (isVowel(token[i]) && !isVowel(token[i + 1])) {
      r1 = i + 2;
    }
  }

  // R2: region after the first non-vowel following a vowel in R1,
  // or the null region at the end of the word.
  for (var i = r1; i < token.length - 1 && r2 == len; i++) {
    if (isVowel(token[i]) && !isVowel(token[i + 1])) {
      r2 = i + 2;
    }
  }

  // RV:
  if (len > 3) {
    if (!isVowel(token[1])) {
      // second letter is a consonant: region after the next vowel
      rv = getNextVowelPos(token, 1) + 1;
    } else if (isVowel(token[0]) && isVowel(token[1])) {
      // first two letters are vowels: region after the next consonant
      rv = getNextConsonantPos(token, 2) + 1;
    } else {
      // consonant-vowel case: region after the third letter
      rv = 3;
    }
  }

  var r1_txt = token.substring(r1);
  var r2_txt = token.substring(r2);
  var rv_txt = token.substring(rv);

  var token_orig = token;

  // Step 0: attached pronoun removal — the pronoun is stripped only
  // when preceded by -ando/-endo (delete) or -ar/-er/-ir (replace
  // with 'e').
  var pronoun_suf = ['glieli','glielo','gliene','gliela','gliele','sene','tene','cela','cele','celi','celo','cene','vela','vele','veli','velo','vene','mela','mele','meli','melo','mene','tela','tele','teli','telo','gli','ci','la','le','li','lo','mi','ne','si','ti','vi'];
  var pronoun_suf_pre1 = ['ando','endo'];
  var pronoun_suf_pre2 = ['ar','er','ir'];
  var suf = endsinArr(token, pronoun_suf);

  if (suf != '') {
    var pre_suff1 = endsinArr(rv_txt.slice(0, -suf.length), pronoun_suf_pre1);
    var pre_suff2 = endsinArr(rv_txt.slice(0, -suf.length), pronoun_suf_pre2);

    if (pre_suff1 != '') {
      token = token.slice(0, -suf.length); // delete the pronoun
    }
    if (pre_suff2 != '') {
      token = token.slice(0, -suf.length) + 'e'; // delete and restore 'e'
    }
  }

  if (token != token_orig) {
    r1_txt = token.substring(r1);
    r2_txt = token.substring(r2);
    rv_txt = token.substring(rv);
  }

  var token_after0 = token;

  // Step 1: standard suffix removal — groups tried longest-first.
  if ((suf = endsinArr(r2_txt, ['ativamente','abilamente','ivamente','osamente','icamente'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['icazione','icazioni','icatore','icatori','azione','azioni','atore','atori'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['logia','logie'])) != '') {
    token = token.slice(0, -suf.length) + 'log'; // replace with log
  } else if ((suf = endsinArr(r2_txt, ['uzione','uzioni','usione','usioni'])) != '') {
    token = token.slice(0, -suf.length) + 'u'; // replace with u
  } else if ((suf = endsinArr(r2_txt, ['enza','enze'])) != '') {
    token = token.slice(0, -suf.length) + 'ente'; // replace with ente
  } else if ((suf = endsinArr(rv_txt, ['amento','amenti','imento','imenti'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r1_txt, ['amente'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['atrice','atrici','abile','abili','ibile','ibili','mente','ante','anti','anza','anze','iche','ichi','ismo','ismi','ista','iste','isti','istà','istè','istì','ico','ici','ica','ice','oso','osi','osa','ose'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['abilità','icità','ività','ità'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['icativa','icativo','icativi','icative','ativa','ativo','ativi','ative','iva','ivo','ivi','ive'])) != '') {
    token = token.slice(0, -suf.length); // delete
  }

  if (token != token_after0) {
    r1_txt = token.substring(r1);
    r2_txt = token.substring(r2);
    rv_txt = token.substring(rv);
  }

  var token_after1 = token;

  // Step 2: verb suffixes — only if step 1 removed nothing.
  // NOTE(review): 'Yamo' looks unreachable — vowelMarking() upper-cases
  // to 'I'/'U', never 'Y'; kept as-is to avoid changing behavior.
  if (token_after0 == token_after1) {
    if ((suf = endsinArr(rv_txt, ['erebbero','irebbero','assero','assimo','eranno','erebbe','eremmo','ereste','eresti','essero','iranno','irebbe','iremmo','ireste','iresti','iscano','iscono','issero','arono','avamo','avano','avate','eremo','erete','erono','evamo','evano','evate','iremo','irete','irono','ivamo','ivano','ivate','ammo','ando','asse','assi','emmo','enda','ende','endi','endo','erai','Yamo','iamo','immo','irai','irei','isca','isce','isci','isco','erei','uti','uto','ita','ite','iti','ito','iva','ivi','ivo','ono','uta','ute','ano','are','ata','ate','ati','ato','ava','avi','avo','erà','ere','erò','ete','eva','evi','evo','irà','ire','irò','ar','ir'])) != '') {
      token = token.slice(0, -suf.length);
    }
  }

  rv_txt = token.substring(rv);

  // Step 3a (always applied): drop a final vowel, or i + vowel.
  // (The original list also contained empty-string entries; they could
  // never match a non-empty token and have been removed.)
  if ((suf = endsinArr(rv_txt, ['ia','ie','ii','io','a','e','i','o','à','è','ì','ò'])) != '') {
    token = token.slice(0, -suf.length);
  }

  rv_txt = token.substring(rv);

  // Step 3b: soften a final ch/gh left behind by suffix removal.
  if ((suf = endsinArr(rv_txt, ['ch'])) != '') {
    token = token.slice(0, -suf.length) + 'c'; // replace with c
  } else if ((suf = endsinArr(rv_txt, ['gh'])) != '') {
    token = token.slice(0, -suf.length) + 'g'; // replace with g
  }

  // (Dead trailing recomputation of r1_txt/r2_txt/rv_txt removed.)
  return token.toLowerCase();
};
View
36 lib/natural/stemmers/stemmer_it.js
@@ -0,0 +1,36 @@
+var stopwords = require('../util/stopwords_it');
+var Tokenizer = require('../tokenizers/aggressive_tokenizer_it');
+
+module.exports = function() {
+ var stemmer = this;
+
+ stemmer.stem = function(token) {
+ return token;
+ };
+
+ stemmer.tokenizeAndStem = function(text, keepStops) {
+ var stemmedTokens = [];
+
+ new Tokenizer().tokenize(text).forEach(function(token) {
+ if (keepStops || stopwords.words.indexOf(token) == -1) {
+ var resultToken = token.toLowerCase();
+ if (resultToken.match(/[a-zàèìòù0-9]/gi)) {
+ resultToken = stemmer.stem(resultToken);
+ }
+ stemmedTokens.push(resultToken);
+ }
+ });
+
+ return stemmedTokens;
+ };
+
+ stemmer.attach = function() {
+ String.prototype.stem = function() {
+ return stemmer.stem(this);
+ };
+
+ String.prototype.tokenizeAndStem = function(keepStops) {
+ return stemmer.tokenizeAndStem(this, keepStops);
+ };
+ };
+}
View
36 lib/natural/tokenizers/aggressive_tokenizer_it.js
@@ -0,0 +1,36 @@
+/*
+Copyright (c) 2011, Chris Umbel, David Przybilla
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Tokenizer = require('./tokenizer'),
+ util = require('util');
+
// Aggressive tokenizer for Italian: splits text on runs of characters
// that cannot appear inside an Italian word.
var AggressiveTokenizer = function() {
  Tokenizer.call(this);
};
util.inherits(AggressiveTokenizer, Tokenizer);

module.exports = AggressiveTokenizer;

/**
 * Break a string up into an array of word tokens.
 * Fix: the original split on /\W+/, which treats accented letters as
 * separators ('perché' -> ['perch', '']) and so mangles Italian words.
 * Split instead on anything outside word characters plus the grave-
 * and acute-accented vowels the stemmer expects.
 * @param {string} text
 * @returns {string[]} tokens with empty entries trimmed by the parent
 */
AggressiveTokenizer.prototype.tokenize = function(text) {
  return this.trim(text.split(/[^a-zA-Z0-9_àèìòùÀÈÌÒÙáéíóúÁÉÍÓÚ]+/));
};
View
52 lib/natural/util/stopwords_it.js
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2011, David Przybilla, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
// Italian stopwords: articles, simple and articulated prepositions,
// pronouns, possessives, common conjunctions, and conjugated forms of
// avere, essere, fare and stare. All entries are lower-case; the
// trailing single digits and '_' filter out numeric/placeholder
// tokens. These words carry little meaning and can be excluded from
// analysis.
var words = [
    'ad','al','allo','ai','agli','all','agl','alla','alle','con','col','coi','da','dal','dallo',
    'dai','dagli','dall','dagl','dalla','dalle','di','del','dello','dei','degli','dell','degl',
    'della','delle','in','nel','nello','nei','negli','nell','negl','nella','nelle','su','sul',
    'sullo','sui','sugli','sull','sugl','sulla','sulle','per','tra','contro','io','tu','lui',
    'lei','noi','voi','loro','mio','mia','miei','mie','tuo','tua','tuoi','tue','suo','sua','suoi',
    'sue','nostro','nostra','nostri','nostre','vostro','vostra','vostri','vostre','mi','ti','ci',
    'vi','lo','la','li','le','gli','ne','il','un','uno','una','ma','ed','se','perché','anche','come',
    'dov','dove','che','chi','cui','non','più','quale','quanto','quanti','quanta','quante','quello',
    'quelli','quella','quelle','questo','questi','questa','queste','si','tutto','tutti','a','c','e',
    'i','l','o','ho','hai','ha','abbiamo','avete','hanno','abbia','abbiate','abbiano','avrò','avrai',
    'avrà','avremo','avrete','avranno','avrei','avresti','avrebbe','avremmo','avreste','avrebbero',
    'avevo','avevi','aveva','avevamo','avevate','avevano','ebbi','avesti','ebbe','avemmo','aveste',
    'ebbero','avessi','avesse','avessimo','avessero','avendo','avuto','avuta','avuti','avute','sono',
    'sei','è','siamo','siete','sia','siate','siano','sarò','sarai','sarà','saremo','sarete','saranno',
    'sarei','saresti','sarebbe','saremmo','sareste','sarebbero','ero','eri','era','eravamo','eravate',
    'erano','fui','fosti','fu','fummo','foste','furono','fossi','fosse','fossimo','fossero','essendo',
    'faccio','fai','facciamo','fanno','faccia','facciate','facciano','farò','farai','farà','faremo',
    'farete','faranno','farei','faresti','farebbe','faremmo','fareste','farebbero','facevo','facevi',
    'faceva','facevamo','facevate','facevano','feci','facesti','fece','facemmo','faceste','fecero',
    'facessi','facesse','facessimo','facessero','facendo','sto','stai','sta','stiamo','stanno','stia',
    'stiate','stiano','starò','starai','starà','staremo','starete','staranno','starei','staresti',
    'starebbe','staremmo','stareste','starebbero','stavo','stavi','stava','stavamo','stavate','stavano',
    'stetti','stesti','stette','stemmo','steste','stettero','stessi','stesse','stessimo','stessero','stando',
    '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];

// tell the world about the noise words.
exports.words = words;
View
42 spec/porter_stemmer_it_spec.js
@@ -0,0 +1,42 @@
+/*
+Copyright (c) 2012, Leonardo Fenu, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
// Jasmine spec for the Italian Porter stemmer.
var stemmer = require('lib/natural/stemmers/porter_stemmer_it');
var fs = require('fs');
stemmer.attach(); // adds String.prototype.stem / tokenizeAndStem

describe('porter_stemmer_it', function() {
  it('should perform stem', function() {
    // snowball_it.txt holds "word -> expectedStem" pairs, one per line,
    // from the Snowball project's Italian vocabulary.
    fs.readFileSync('spec/test_data/snowball_it.txt').toString().split('\n').forEach(
      function (line) {
        if (line) {
          var fields = line.split(' -> ');
          var stemmed = stemmer.stem(fields[0]);
          expect(stemmed).toEqual(fields[1]);
        }
      }
    );
  }),
  it('should tokenize and stem attached', function() {
    // NOTE(review): these expectations assume the stopword 'la' is
    // dropped by tokenizeAndStem; a reported run kept 'la' in the
    // output — confirm the tokenizer/stopword interaction before
    // relying on this spec.
    expect('SOPRA la panca la capra CAMPA'.tokenizeAndStem()).toEqual([ 'sopr', 'panc', 'capr', 'camp' ]);
    expect('SOTTO la panca la capra CREPA'.tokenizeAndStem()).toEqual([ 'sott', 'panc', 'capr', 'crep' ]);
  });
});
View
35,482 spec/test_data/snowball_it.txt
35,482 additions, 0 deletions not shown

3 comments on commit 31afbae

@chrisumbel
Owner

I didn't poke around but I get some test failures:

1) should tokenize and stem attached
Message:
Expected [ 'sopr', 'la', 'panc', 'la', 'capr', 'camp' ] to equal [ 'sopr', 'panc', 'capr', 'camp' ].

2) should tokenize and stem attached
Message:
Expected [ 'sotto', 'la', 'panc', 'la', 'capr', 'crep' ] to equal [ 'sott', 'panc', 'capr', 'crep' ].

@genialeo
@chrisumbel
Owner

Ah, I got it fixed. It was the test that was goofed, not the stemmer. I'll explain in the commit.

Thanks again for your contribution! I'm very excited about it!

Please sign in to comment.
Something went wrong with that request. Please try again.