Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Added Italian Porter stemmer #80

Merged
merged 1 commit into from Dec 7, 2012
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Jump to
Jump to file
Failed to load files.
Diff view
Diff view
2 changes: 2 additions & 0 deletions lib/natural/index.js
Expand Up @@ -28,11 +28,13 @@ exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerFa = require('./stemmers/porter_stemmer_fa');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.PorterStemmerEs = require('./stemmers/porter_stemmer_es');
exports.PorterStemmerIt = require('./stemmers/porter_stemmer_it');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
exports.AggressiveTokenizerFa = require('./tokenizers/aggressive_tokenizer_fa');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizerEs = require('./tokenizers/aggressive_tokenizer_es');
exports.AggressiveTokenizerIt = require('./tokenizers/aggressive_tokenizer_it');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
Expand Down
233 changes: 233 additions & 0 deletions lib/natural/stemmers/porter_stemmer_it.js
@@ -0,0 +1,233 @@
/*
Copyright (c) 2012, Leonardo Fenu, Chris Umbel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var Stemmer = require('./stemmer_it');

var PorterStemmer = new Stemmer();
module.exports = PorterStemmer;


// True when `letter` is an Italian vowel, including the grave-accented
// forms (à è ì ò ù). Anything else — consonants, uppercase letters,
// undefined — is treated as a non-vowel.
function isVowel(letter) {
  switch (letter) {
    case 'a': case 'e': case 'i': case 'o': case 'u':
    case 'à': case 'è': case 'ì': case 'ò': case 'ù':
      return true;
    default:
      return false;
  }
}

// Return the index of the first vowel strictly after position `start`,
// or token.length when no vowel follows. The vowel set matches isVowel
// (plain vowels plus grave-accented forms), inlined here as a lookup string.
function getNextVowelPos(token, start) {
  var vowels = 'aeiouàèìòù';
  for (var i = start + 1; i < token.length; i++) {
    if (vowels.indexOf(token[i]) !== -1) {
      return i;
    }
  }
  return token.length;
}

// Return the index of the first non-vowel at or after position `start`
// (note: unlike getNextVowelPos, `start` itself is included), or
// token.length when every remaining letter is a vowel.
function getNextConsonantPos(token, start) {
  // was: `length=token.length` — leaked `length` as an implicit global;
  // declare it locally instead.
  var length = token.length;
  for (var i = start; i < length; i++) {
    if (!isVowel(token[i])) {
      return i;
    }
  }
  return length;
}


// True when `token` ends with `suffix`. A suffix longer than the token
// can never match; otherwise compare the token's tail slice.
function endsin(token, suffix) {
  if (suffix.length > token.length) {
    return false;
  }
  return suffix === token.slice(-suffix.length);
}

// Return the first entry of `suffixes` that `token` ends with, or the
// empty string when none matches. Order matters: callers list longer
// suffixes first so the longest match wins. The ends-with check is
// inlined here (same logic as endsin).
function endsinArr(token, suffixes) {
  for (var idx = 0; idx < suffixes.length; idx++) {
    var candidate = suffixes[idx];
    if (candidate.length <= token.length &&
        token.slice(-candidate.length) === candidate) {
      return candidate;
    }
  }
  return '';
}

// Normalise acute-accented vowels (á é í ó ú, either case) to the
// grave-accented lowercase forms (à è ì ò ù) the stemmer works with.
function replaceAcute(token) {
  return token
    .replace(/á/gi, 'à')
    .replace(/é/gi, 'è')
    .replace(/í/gi, 'ì')
    .replace(/ó/gi, 'ò')
    .replace(/ú/gi, 'ù');
}

// Snowball pre-step: an i or u between two vowels is acting as a
// consonant, so mark it by upper-casing it (undone by the final
// toLowerCase in stem()).
function vowelMarking(token) {
  // p1/p3 are the surrounding vowels, p2 is the i/u to mark.
  function replacer(match, p1, p2, p3) {
    return p1 + p2.toUpperCase() + p3;
  }
  // was: result assigned to `str` without `var`, leaking an implicit
  // global — return the replacement directly instead.
  return token.replace(/([aeiou])(i|u)([aeiou])/g, replacer);
}


// perform full stemming algorithm on a single word
// Perform the full Snowball Italian stemming algorithm on a single word.
// Pipeline: pre-processing (lower-case, acute→grave accent normalisation,
// qu / intervocalic i-u consonant marking), then step 0 (attached
// pronouns), step 1 (standard suffixes), step 2 (verb suffixes) and
// step 3 (residual final vowel and -ch/-gh cleanup).
PorterStemmer.stem = function(token) {

  token = token.toLowerCase();
  token = replaceAcute(token);
  // The u in "qu" behaves as a consonant: mark it by upper-casing.
  token = token.replace(/qu/g, 'qU');
  token = vowelMarking(token);

  if (token.length < 3) {
    return token;
  }

  // was: `var r1 = r2 = rv = len = token.length;` — only r1 was actually
  // declared; r2, rv and len leaked as implicit globals. Declare each
  // variable explicitly.
  var len = token.length;
  var r1 = len;
  var r2 = len;
  var rv = len;

  // R1 is the region after the first non-vowel following a vowel,
  // or the null region at the end of the word if there is no such non-vowel.
  for (var i = 0; i < token.length - 1 && r1 == len; i++) {
    if (isVowel(token[i]) && !isVowel(token[i + 1])) {
      r1 = i + 2;
    }
  }

  // R2 is the region after the first non-vowel following a vowel in R1,
  // or the null region at the end of the word if there is no such non-vowel.
  for (var i = r1; i < token.length - 1 && r2 == len; i++) {
    if (isVowel(token[i]) && !isVowel(token[i + 1])) {
      r2 = i + 2;
    }
  }

  if (len > 3) {
    if (!isVowel(token[1])) {
      // If the second letter is a consonant, RV is the region after the
      // next following vowel.
      rv = getNextVowelPos(token, 1) + 1;
    } else if (isVowel(token[0]) && isVowel(token[1])) {
      // If the first two letters are vowels, RV is the region after the
      // next consonant.
      rv = getNextConsonantPos(token, 2) + 1;
    } else {
      // Otherwise (consonant-vowel case) RV is the region after the third
      // letter; RV is the end of the word if these positions cannot be found.
      rv = 3;
    }
  }

  var r1_txt = token.substring(r1);
  var r2_txt = token.substring(r2);
  var rv_txt = token.substring(rv);

  var token_orig = token;

  // Step 0: attached pronoun removal. Longer pronouns are listed first so
  // endsinArr finds the longest match.
  var pronoun_suf = ['glieli','glielo','gliene','gliela','gliele','sene','tene','cela','cele','celi','celo','cene','vela','vele','veli','velo','vene','mela','mele','meli','melo','mene','tela','tele','teli','telo','gli','ci','la','le','li','lo','mi','ne','si','ti','vi'];
  var pronoun_suf_pre1 = ['ando','endo'];
  var pronoun_suf_pre2 = ['ar','er','ir'];
  var suf = endsinArr(token, pronoun_suf);

  if (suf != '') {
    // The pronoun is removed only when preceded (within RV) by
    // -ando/-endo (plain delete) or -ar/-er/-ir (delete and restore 'e').
    var pre_suff1 = endsinArr(rv_txt.slice(0, -suf.length), pronoun_suf_pre1);
    var pre_suff2 = endsinArr(rv_txt.slice(0, -suf.length), pronoun_suf_pre2);

    if (pre_suff1 != '') {
      token = token.slice(0, -suf.length);
    }
    if (pre_suff2 != '') {
      token = token.slice(0, -suf.length) + 'e';
    }
  }

  if (token != token_orig) {
    r1_txt = token.substring(r1);
    r2_txt = token.substring(r2);
    rv_txt = token.substring(rv);
  }

  var token_after0 = token;

  // Step 1: standard suffix removal. The if/else chain ordering makes the
  // longest applicable suffix win.
  if ((suf = endsinArr(r2_txt, ['ativamente','abilamente','ivamente','osamente','icamente'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['icazione','icazioni','icatore','icatori','azione','azioni','atore','atori'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['logia','logie'])) != '') {
    token = token.slice(0, -suf.length) + 'log'; // replace with log
  } else if ((suf = endsinArr(r2_txt, ['uzione','uzioni','usione','usioni'])) != '') {
    token = token.slice(0, -suf.length) + 'u'; // replace with u
  } else if ((suf = endsinArr(r2_txt, ['enza','enze'])) != '') {
    token = token.slice(0, -suf.length) + 'ente'; // replace with ente
  } else if ((suf = endsinArr(rv_txt, ['amento','amenti','imento','imenti'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r1_txt, ['amente'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['atrice','atrici','abile','abili','ibile','ibili','mente','ante','anti','anza','anze','iche','ichi','ismo','ismi','ista','iste','isti','istà','istè','istì','ico','ici','ica','ice','oso','osi','osa','ose'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['abilità','icità','ività','ità'])) != '') {
    token = token.slice(0, -suf.length); // delete
  } else if ((suf = endsinArr(r2_txt, ['icativa','icativo','icativi','icative','ativa','ativo','ativi','ative','iva','ivo','ivi','ive'])) != '') {
    token = token.slice(0, -suf.length); // delete
  }

  if (token != token_after0) {
    r1_txt = token.substring(r1);
    r2_txt = token.substring(r2);
    rv_txt = token.substring(rv);
  }

  var token_after1 = token;

  // Step 2: verb suffix removal — applied only when step 1 removed nothing.
  if (token_after0 == token_after1) {
    if ((suf = endsinArr(rv_txt, ['erebbero','irebbero','assero','assimo','eranno','erebbe','eremmo','ereste','eresti','essero','iranno','irebbe','iremmo','ireste','iresti','iscano','iscono','issero','arono','avamo','avano','avate','eremo','erete','erono','evamo','evano','evate','iremo','irete','irono','ivamo','ivano','ivate','ammo','ando','asse','assi','emmo','enda','ende','endi','endo','erai','Yamo','iamo','immo','irai','irei','isca','isce','isci','isco','erei','uti','uto','ita','ite','iti','ito','iva','ivi','ivo','ono','uta','ute','ano','are','ata','ate','ati','ato','ava','avi','avo','erà','ere','erò','ete','eva','evi','evo','irà','ire','irò','ar','ir'])) != '') {
      token = token.slice(0, -suf.length);
    }
  }

  r1_txt = token.substring(r1);
  r2_txt = token.substring(r2);
  rv_txt = token.substring(rv);

  // Step 3a (always performed): delete a final vowel, optionally preceded
  // by i, within RV.
  if ((suf = endsinArr(rv_txt, ['ia','ie','ii','io','ià','iè','iì','iò','a','e','i','o','à','è','ì','ò'])) != '') {
    token = token.slice(0, -suf.length);
  }

  r1_txt = token.substring(r1);
  r2_txt = token.substring(r2);
  rv_txt = token.substring(rv);

  // Step 3b: replace a final ch/gh (within RV) with c/g.
  if ((suf = endsinArr(rv_txt, ['ch'])) != '') {
    token = token.slice(0, -suf.length) + 'c'; // replace with c
  } else if ((suf = endsinArr(rv_txt, ['gh'])) != '') {
    token = token.slice(0, -suf.length) + 'g'; // replace with g
  }

  // Undo the qU / intervocalic-i-u marking introduced in pre-processing.
  return token.toLowerCase();
};
36 changes: 36 additions & 0 deletions lib/natural/stemmers/stemmer_it.js
@@ -0,0 +1,36 @@
var stopwords = require('../util/stopwords_it');
var Tokenizer = require('../tokenizers/aggressive_tokenizer_it');

module.exports = function() {
var stemmer = this;

stemmer.stem = function(token) {
return token;
};

stemmer.tokenizeAndStem = function(text, keepStops) {
var stemmedTokens = [];

new Tokenizer().tokenize(text).forEach(function(token) {
if (keepStops || stopwords.words.indexOf(token) == -1) {
var resultToken = token.toLowerCase();
if (resultToken.match(/[a-zàèìòù0-9]/gi)) {
resultToken = stemmer.stem(resultToken);
}
stemmedTokens.push(resultToken);
}
});

return stemmedTokens;
};

stemmer.attach = function() {
String.prototype.stem = function() {
return stemmer.stem(this);
};

String.prototype.tokenizeAndStem = function(keepStops) {
return stemmer.tokenizeAndStem(this, keepStops);
};
};
}
36 changes: 36 additions & 0 deletions lib/natural/tokenizers/aggressive_tokenizer_it.js
@@ -0,0 +1,36 @@
/*
Copyright (c) 2011, Chris Umbel, David Przybilla

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var Tokenizer = require('./tokenizer'),
util = require('util');

// Aggressive tokenizer for Italian text: splits on runs of non-word
// characters, keeping accented vowels inside tokens.
var AggressiveTokenizer = function() {
  Tokenizer.call(this);
};
util.inherits(AggressiveTokenizer, Tokenizer);

module.exports = AggressiveTokenizer;

AggressiveTokenizer.prototype.tokenize = function(text) {
  // break a string up into an array of tokens by anything non-word.
  // was: text.split(/\W+/) — \W matches accented Italian letters
  // (à è ì ò ù, á é í ó ú), which split words like "perché" apart.
  // Split on anything outside word chars plus accented vowels instead
  // (the /i flag also keeps their uppercase forms).
  return this.trim(text.split(/[^a-zA-Z0-9_àèìòùáéíóú]+/i));
};
52 changes: 52 additions & 0 deletions lib/natural/util/stopwords_it.js
@@ -0,0 +1,52 @@
/*
Copyright (c) 2011, David Przybilla, Chris Umbel

Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:

The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.

THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// a list of commonly used words that have little meaning and can be excluded
// from analysis.
var words = [
'ad','al','allo','ai','agli','all','agl','alla','alle','con','col','coi','da','dal','dallo',
'dai','dagli','dall','dagl','dalla','dalle','di','del','dello','dei','degli','dell','degl',
'della','delle','in','nel','nello','nei','negli','nell','negl','nella','nelle','su','sul',
'sullo','sui','sugli','sull','sugl','sulla','sulle','per','tra','contro','io','tu','lui',
'lei','noi','voi','loro','mio','mia','miei','mie','tuo','tua','tuoi','tue','suo','sua','suoi',
'sue','nostro','nostra','nostri','nostre','vostro','vostra','vostri','vostre','mi','ti','ci',
'vi','lo','la','li','le','gli','ne','il','un','uno','una','ma','ed','se','perché','anche','come',
'dov','dove','che','chi','cui','non','più','quale','quanto','quanti','quanta','quante','quello',
'quelli','quella','quelle','questo','questi','questa','queste','si','tutto','tutti','a','c','e',
'i','l','o','ho','hai','ha','abbiamo','avete','hanno','abbia','abbiate','abbiano','avrò','avrai',
'avrà','avremo','avrete','avranno','avrei','avresti','avrebbe','avremmo','avreste','avrebbero',
'avevo','avevi','aveva','avevamo','avevate','avevano','ebbi','avesti','ebbe','avemmo','aveste',
'ebbero','avessi','avesse','avessimo','avessero','avendo','avuto','avuta','avuti','avute','sono',
'sei','è','siamo','siete','sia','siate','siano','sarò','sarai','sarà','saremo','sarete','saranno',
'sarei','saresti','sarebbe','saremmo','sareste','sarebbero','ero','eri','era','eravamo','eravate',
'erano','fui','fosti','fu','fummo','foste','furono','fossi','fosse','fossimo','fossero','essendo',
'faccio','fai','facciamo','fanno','faccia','facciate','facciano','farò','farai','farà','faremo',
'farete','faranno','farei','faresti','farebbe','faremmo','fareste','farebbero','facevo','facevi',
'faceva','facevamo','facevate','facevano','feci','facesti','fece','facemmo','faceste','fecero',
'facessi','facesse','facessimo','facessero','facendo','sto','stai','sta','stiamo','stanno','stia',
'stiate','stiano','starò','starai','starà','staremo','starete','staranno','starei','staresti',
'starebbe','staremmo','stareste','starebbero','stavo','stavi','stava','stavamo','stavate','stavano',
'stetti','stesti','stette','stemmo','steste','stettero','stessi','stesse','stessimo','stessero','stando',
'1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];

// tell the world about the noise words.
exports.words = words;