Skip to content

Commit

Permalink
Merge pull request #41 from Nordberg/master
Browse files Browse the repository at this point in the history
Natural for Russian language
  • Loading branch information
chrisumbel committed May 19, 2012
2 parents ee33618 + 5f162b3 commit d1ec5dc
Show file tree
Hide file tree
Showing 6 changed files with 362 additions and 0 deletions.
2 changes: 2 additions & 0 deletions lib/natural/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,9 @@ exports.SoundEx = require('./phonetics/soundex');
exports.Metaphone = require('./phonetics/metaphone');
exports.DoubleMetaphone = require('./phonetics/double_metaphone');
exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
Expand Down
151 changes: 151 additions & 0 deletions lib/natural/stemmers/porter_stemmer_ru.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,151 @@
/*
Copyright (c) 2012, Polyakov Vladimir, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var Stemmer = require('./stemmer_ru');

// The Russian Porter stemmer is an instance of the shared Russian stemmer
// base required above; its stem() method is replaced further down in this file.
var PorterStemmer = new Stemmer();
module.exports = PorterStemmer;

// Applies the first pattern that matches the token.
//
// token    - string to examine.
// patterns - ordered array of [RegExp, replacement] pairs.
//
// Returns the token with the first matching pattern replaced, or null when
// none of the patterns match (an empty pattern list also yields null).
function attemptReplacePatterns(token, patterns) {
    for (var i = 0; i < patterns.length; i++) {
        var pattern = patterns[i][0];
        // test() on a global (or sticky) RegExp resumes from lastIndex, so a
        // pattern object that was used before could miss a match. Always
        // start the search from the beginning of the token.
        pattern.lastIndex = 0;
        if (pattern.test(token)) {
            return token.replace(pattern, patterns[i][1]);
        }
    }
    return null;
}

// Strips a perfective gerund ending from the token.
//
// The first group (в, вши, вшись) is only removed when it follows "а" or "я",
// and that vowel is kept, in the same way verb() and participle() keep it.
// The second group is removed unconditionally.
//
// Returns the shortened token, or null when no ending matches.
function perfectiveGerund(token) {
    return attemptReplacePatterns(token, [
        [/([ая])(в|вши|вшись)$/g, '$1'],
        [/(ив|ивши|ившись|ывши|ывшись|ыв)$/g, '']
    ]);
}

// Strips an adjective ending and then, if present, a participle ending that
// preceded it.
//
// Returns null when the token has no adjective ending. Otherwise returns the
// token without the adjective ending, further shortened by participle() when
// that matches. An empty string is a valid result (the whole token consisted
// of endings) and is returned as such rather than being treated as "no match".
function adjectival(token) {
    var result = adjective(token);
    if (result == null) {
        return result;
    }
    var participleResult = participle(result);
    return participleResult != null ? participleResult : result;
}

// Removes a trailing adjective ending from the token.
// Returns the shortened token, or null when the token has no such ending.
function adjective(token) {
    var adjectiveEndings = [
        [/(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$/g, '']
    ];
    return attemptReplacePatterns(token, adjectiveEndings);
}

// Removes a trailing participle ending from the token.
// Endings of the first pattern are only removed after "а" or "я", and that
// vowel is kept; endings of the second pattern are removed unconditionally.
// Returns the shortened token, or null when neither pattern matches.
function participle(token) {
    var participleEndings = [
        [/([ая])(ем|нн|вш|ющ|щ)$/g, '$1'],
        [/(ивш|ывш|ующ)$/g, '']
    ];
    return attemptReplacePatterns(token, participleEndings);
}

// Removes a trailing reflexive ending ("ся" or "сь") from the token.
// Returns the shortened token, or null when the token has no such ending.
function reflexive(token) {
    var reflexiveEndings = [
        [/(ся|сь)$/g, '']
    ];
    return attemptReplacePatterns(token, reflexiveEndings);
}

// Strips a verb ending from the token.
//
// Endings of the first pattern are only removed after "а" or "я", and that
// vowel is kept. Endings of the second pattern are removed unconditionally;
// the list includes "уют" so that e.g. "рисуют" is reduced like "рисует".
//
// Returns the shortened token, or null when neither pattern matches.
function verb(token) {
    return attemptReplacePatterns(token, [
        [/([ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)$/g, '$1'],
        [/(ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)$/g, '']
    ]);
}

// Removes a trailing noun ending from the token.
// Returns the shortened token, or null when the token has no such ending.
function noun(token) {
    var nounEndings = [
        [/(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$/g, '']
    ];
    return attemptReplacePatterns(token, nounEndings);
}

// Removes a trailing superlative ending ("ейш" or "ейше") from the token.
// Returns the shortened token, or null when the token has no such ending.
function superlative(token) {
    var superlativeEndings = [
        [/(ейш|ейше)$/g, '']
    ];
    return attemptReplacePatterns(token, superlativeEndings);
}

// Removes a trailing derivational ending ("ост" or "ость") from the token.
// Returns the shortened token, or null when the token has no such ending.
function derivational(token) {
    var derivationalEndings = [
        [/(ост|ость)$/g, '']
    ];
    return attemptReplacePatterns(token, derivationalEndings);
}

// perform full stemming algorithm on a single word
//
// The token is lower-cased and "ё" is folded into "е". Everything up to and
// including the first vowel (the head) is kept untouched; endings are only
// stripped from the remainder (RV). A token without any vowel is returned
// lower-cased and otherwise unchanged.
PorterStemmer.stem = function(token) {
    // Fold into the Cyrillic letter "е" so the Cyrillic vowel class and ending
    // patterns still apply; a Latin "e" would never match any of them.
    token = token.toLowerCase().replace(/ё/g, 'е');

    // No "g" flag here: exec() then always searches from index 0, so the
    // same expression can be reused for RV without resetting lastIndex.
    var vowelsRegexp = /^(.*?[аеиоюяуыэ])(.*)$/;
    var match = vowelsRegexp.exec(token);
    if (!match) {
        return token;
    }
    var head = match[1];
    var RV = match[2];
    // Same split applied to RV: R2[2] is what follows the first vowel of RV.
    var R2 = vowelsRegexp.exec(RV);

    // Either a perfective gerund ending is removed, or an optional reflexive
    // ending followed by the first of adjectival / verb / noun that matches.
    var result = perfectiveGerund(RV);
    if (result === null) {
        var resultReflexive = reflexive(RV) || RV;
        result = adjectival(resultReflexive);
        if (result === null) {
            result = verb(resultReflexive);
        }
        if (result === null) {
            result = noun(resultReflexive);
        }
        if (result === null) {
            result = resultReflexive;
        }
    }

    // Drop a trailing "и".
    result = result.replace(/и$/g, '');

    // Remove a derivational ending, but only when the R2 part of the original
    // region carried one. Fall back to the current result when it no longer
    // ends that way, instead of propagating null to the calls below.
    var derivationalResult = result;
    if (R2 && R2[2] && derivational(R2[2]) != null) {
        var withoutDerivational = derivational(result);
        if (withoutDerivational != null) {
            derivationalResult = withoutDerivational;
        }
    }

    var superlativeResult = superlative(derivationalResult) || derivationalResult;

    // Undouble a trailing "нн" only; doubled "н" inside the stem is kept.
    superlativeResult = superlativeResult.replace(/нн$/g, 'н');
    superlativeResult = superlativeResult.replace(/ь$/g, '');
    return head + superlativeResult;
};
58 changes: 58 additions & 0 deletions lib/natural/stemmers/stemmer_ru.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,58 @@
/*
Copyright (c) 2012, Polyakov Vladimir, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var stopwords = require('../util/stopwords_ru');
var Tokenizer = require('../tokenizers/aggressive_tokenizer_ru');

module.exports = function() {
var stemmer = this;

stemmer.stem = function(token) {
return token;
};

stemmer.tokenizeAndStem = function(text, keepStops) {
var stemmedTokens = [];

new Tokenizer().tokenize(text).forEach(function(token) {
if (keepStops || stopwords.words.indexOf(token) == -1) {
var resultToken = token.toLowerCase();
if (resultToken.match(new RegExp('[а-яё0-9]+', 'gi'))) {
resultToken = stemmer.stem(resultToken);
}
stemmedTokens.push(resultToken);
}
});

return stemmedTokens;
};

stemmer.attach = function() {
String.prototype.stem = function() {
return stemmer.stem(this);
};

String.prototype.tokenizeAndStem = function(keepStops) {
return stemmer.tokenizeAndStem(this, keepStops);
};
};
}
47 changes: 47 additions & 0 deletions lib/natural/tokenizers/aggressive_tokenizer_ru.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,47 @@
/*
Copyright (c) 2011, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

var Tokenizer = require('./tokenizer'),
util = require('util');

// Tokenizer for Russian text; inherits from the base Tokenizer.
// Declared with `var` so the constructor stays local to this module instead
// of leaking as an implicit global (which also throws in strict mode).
var AggressiveTokenizer = function() {
    Tokenizer.call(this);
};
util.inherits(AggressiveTokenizer, Tokenizer);

module.exports = AggressiveTokenizer;

// Returns a new array without the elements that loosely equal the empty
// string; the input array itself is not modified.
AggressiveTokenizer.prototype.clearEmptyString = function(array) {
    var isNotEmpty = function(item) {
        return !(item == '');
    };
    return array.filter(isNotEmpty);
};

// Replaces every guillemet («, »), exclamation mark and question mark in the
// text with a single space and returns the resulting string.
AggressiveTokenizer.prototype.clearText = function(text) {
    var punctuation = new RegExp('«|»|!|\\?', 'g');
    var cleaned = text.replace(punctuation, ' ');
    return cleaned;
};

// Breaks a string up into an array of non-empty tokens.
//
// After clearText() has blanked out «, », ! and ?, the text is split on a
// single "-" or on any run of: | $ backspace ( ) [ ] whitespace ' . , : "
// The separator set is written as one explicit character class. "]" is
// included so a closing bracket is stripped just like the opening one.
AggressiveTokenizer.prototype.tokenize = function(text) {
    var separators = /-|[|$\x08()\[\] \s\xA0'.,:"]+/g;
    var pieces = this.clearText(text).split(separators);
    return this.clearEmptyString(pieces);
};
41 changes: 41 additions & 0 deletions lib/natural/util/stopwords_ru.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,41 @@
/*
Copyright (c) 2011, Polyakov Vladimir, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

// a list of commonly used words that have little meaning and can be excluded
// from analysis.
// Each entry is listed once; membership is what consumers test for.
// NOTE(review): multi-word entries ('может быть', 'все еще', 'с кем',
// 'хотел бы') can only match if a tokenizer yields them as a single token.
// NOTE(review): 'c' (after 'видел') and 'o' (after 'н') appear to be Latin
// look-alikes; the Cyrillic 'с' and 'о' are present as well, so both are kept.
var words = [
    'о', 'после', 'все', 'также', 'и', 'другие', 'как', 'во', 'быть',
    'потому', 'был', 'до', 'являюсь', 'между', 'но', 'от', 'иди', 'могу',
    'подойди', 'мог', 'делал', 'делаю', 'каждый', 'для', 'откуда', 'иметь', 'имел',
    'он', 'имеет', 'её', 'здесь', 'его', 'если', 'в', 'оно', 'за',
    'делать', 'много', 'я', 'может быть', 'более', 'самый', 'должен',
    'мой', 'никогда', 'сейчас', 'из', 'на', 'только', 'или', 'другой', 'другая',
    'другое', 'наше', 'вне', 'конец', 'сказал', 'сказала', 'видел', 'c',
    'немного', 'все еще', 'так', 'затем', 'тот', 'их', 'там', 'этот', 'они', 'те',
    'через', 'тоже', 'под', 'над', 'очень', 'путь', 'мы', 'хорошо',
    'что', 'где', 'который', 'пока', 'кто', 'с кем', 'хотел бы', 'ты', 'твои',
    'а', 'б', 'г', 'д', 'е', 'ё', 'ж', 'з', 'й', 'к', 'л', 'м', 'н',
    'o', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь',
    'э', 'ю', '$', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];

// tell the world about the noise words: the list is exposed as the `words`
// property of this module.
exports.words = words;
63 changes: 63 additions & 0 deletions spec/porter_stemmer_ru_spec.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,63 @@
/*
Copyright (c) 2011, Polyakov Vladimir, Chris Umbel
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/
var stemmer = require('lib/natural/stemmers/porter_stemmer_ru');

// Input word forms passed to stemmer.stem() by the spec below; the entry at
// index i is expected to stem to testResult[i].
var test = [
'в', 'вавиловка', 'вагнера', 'вагон', 'вагона', 'вагоне', 'вагонов', 'вагоном', 'вагоны',
'важная', 'важнее', 'важнейшие', 'важнейшими', 'важничал', 'важно', 'важного', 'важное',
'важной', 'важном', 'важному', 'важности', 'важностию', 'важность', 'важностью', 'важную',
'важны', 'важные', 'важный', 'важным', 'важных', 'вазах', 'вазы', 'вакса', 'вакханка', 'вал',
'валандался', 'валентина', 'валериановых', 'валерию', 'валетами', 'вали', 'валил', 'валился',
'валится', 'валов', 'вальдшнепа', 'вальс', 'вальса', 'вальсе', 'вальсишку', 'вальтера', 'валяется',
'валялась', 'валялись', 'валялось', 'валялся', 'валять', 'валяются', 'вам', 'вами', 'п', 'па', 'пава',
'павел', 'павильон', 'павильонам', 'павла', 'павлиний', 'павлиньи', 'павлиньим', 'павлович', 'павловна',
'павловне', 'павловной', 'павловну', 'павловны', 'павловцы', 'павлыч', 'павлыча', 'пагубная', 'падает',
'падай', 'падал', 'падала', 'падаль', 'падать', 'падаю', 'падают', 'падающего', 'падающие', 'падеж', 'падение',
'падением', 'падении', 'падений', 'падения', 'паденье', 'паденья', 'падет', 'падут', 'падучая', 'падчерицей',
'падчерицы', 'падшая', 'падшей', 'падшему', 'падший', 'падшим', 'падших', 'падшую', 'паек', 'пазухи', 'пазуху',
'пай', 'пакет', 'пакетом', 'пакеты', 'пакостей', 'пакостно', 'пал'];

// Expected stems, index-aligned with the `test` array above: the spec asserts
// that stemmer.stem(test[i]) is testResult[i].
var testResult = [
'в', 'вавиловк', 'вагнер', 'вагон', 'вагон', 'вагон', 'вагон', 'вагон', 'вагон', 'важн', 'важн', 'важн',
'важн', 'важнича', 'важн', 'важн', 'важн', 'важн', 'важн', 'важн', 'важност', 'важност', 'важност',
'важност', 'важн', 'важн', 'важн', 'важн', 'важн', 'важн', 'ваз', 'ваз', 'вакс', 'вакханк', 'вал',
'валанда', 'валентин', 'валерианов', 'валер', 'валет', 'вал', 'вал', 'вал', 'вал', 'вал', 'вальдшнеп',
'вальс', 'вальс', 'вальс', 'вальсишк', 'вальтер', 'валя', 'валя', 'валя', 'валя', 'валя', 'валя', 'валя',
'вам', 'вам', 'п', 'па', 'пав', 'павел', 'павильон', 'павильон', 'павл', 'павлин', 'павлин', 'павлин',
'павлович', 'павловн', 'павловн', 'павловн', 'павловн', 'павловн', 'павловц', 'павлыч', 'павлыч', 'пагубн',
'пада', 'пада', 'пада', 'пада', 'падал', 'пада', 'пада', 'пада', 'пада', 'пада', 'падеж', 'паден', 'паден',
'паден', 'паден', 'паден', 'паден', 'паден', 'падет', 'падут', 'падуч', 'падчериц', 'падчериц', 'падш', 'падш',
'падш', 'падш', 'падш', 'падш', 'падш', 'паек', 'пазух', 'пазух', 'па', 'пакет', 'пакет', 'пакет', 'пакост',
'пакостн', 'пал'];

// Spec for the Russian Porter stemmer: checks stem() against the fixture
// arrays and the String.prototype helpers installed by attach().
describe('porter_stemmer', function() {
    it('should perform stem', function() {
        for (var i = 0; i < test.length; i++) {
            expect(stemmer.stem(test[i])).toBe(testResult[i]);
        }
    });

    // attach() patches String.prototype, so the helpers are called on plain
    // string literals here; upper-case input must give the same stems.
    it('should tokenize and stem attached', function() {
        stemmer.attach();
        expect('мама мыла раму'.tokenizeAndStem()).toEqual(['мам', 'мыл', 'рам']);
        expect('МАМА МЫЛА РАМУ'.tokenizeAndStem()).toEqual(['мам', 'мыл', 'рам']);
    });
});

0 comments on commit d1ec5dc

Please sign in to comment.