Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

Merge pull request #41 from Nordberg/master

Natural for Russian language
  • Loading branch information...
commit d1ec5dc572e5bf55abce00f4d42213eb5fab6d7a 2 parents ee33618 + 5f162b3
@chrisumbel chrisumbel authored
View
2  lib/natural/index.js
@@ -24,7 +24,9 @@ exports.SoundEx = require('./phonetics/soundex');
exports.Metaphone = require('./phonetics/metaphone');
exports.DoubleMetaphone = require('./phonetics/double_metaphone');
exports.PorterStemmer = require('./stemmers/porter_stemmer');
+exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
+exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
View
151 lib/natural/stemmers/porter_stemmer_ru.js
@@ -0,0 +1,151 @@
+/*
+Copyright (c) 2012, Polyakov Vladimir, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Stemmer = require('./stemmer_ru');
+
+var PorterStemmer = new Stemmer();
+module.exports = PorterStemmer;
+
+// Applies the first matching rule from an ordered list of
+// [regexp, replacement] pairs.
+//
+// token:    the string to rewrite.
+// patterns: array of two-element arrays; element 0 is the regexp to look
+//           for, element 1 is the replacement handed to String#replace.
+// returns:  the rewritten string for the first pattern that matches, or
+//           null when no pattern matches.
+function attemptReplacePatterns(token, patterns) {
+    for (var index = 0; index < patterns.length; index++) {
+        var pattern = patterns[index][0];
+        if (pattern.test(token)) {
+            // First match wins; later patterns are never consulted.
+            return token.replace(pattern, patterns[index][1]);
+        }
+    }
+    return null;
+};
+
+// Removes a perfective gerund ending from the end of token.
+// Returns the shortened string, or null when neither pattern matches.
+function perfectiveGerund(token) {
+    var gerundPatterns = [
+        [/[ая]в(ши|шись)$/g, ''],
+        [/(ив|ивши|ившись|ывши|ывшись|ыв)$/g, '']
+    ];
+    return attemptReplacePatterns(token, gerundPatterns);
+};
+
+// Removes an adjective ending and then, if one follows, a participle ending.
+// Returns null when token has no adjective ending at all.
+function adjectival(token) {
+    var withoutAdjective = adjective(token);
+    if (withoutAdjective == null) {
+        return withoutAdjective;
+    }
+    // Keep the adjective-only result when the participle step finds nothing
+    // (or would leave an empty string).
+    return participle(withoutAdjective) || withoutAdjective;
+};
+
+// Removes one adjective ending from the end of token.
+// Returns the shortened string, or null when no ending matches.
+function adjective(token) {
+    var adjectiveEnding = /(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$/g;
+    return attemptReplacePatterns(token, [[adjectiveEnding, '']]);
+};
+
+// Removes a participle ending from the end of token.
+// For the first pattern the preceding "а"/"я" is kept ('$1'); the second
+// pattern drops the whole ending. Returns null when neither matches.
+function participle(token) {
+    var participlePatterns = [
+        [/([ая])(ем|нн|вш|ющ|щ)$/g, '$1'],
+        [/(ивш|ывш|ующ)$/g, '']
+    ];
+    return attemptReplacePatterns(token, participlePatterns);
+};
+
+// Removes a trailing reflexive ending ("ся" or "сь") from token.
+// Returns the shortened string, or null when token has no such ending.
+function reflexive(token) {
+    var reflexivePatterns = [[/(ся|сь)$/g, '']];
+    return attemptReplacePatterns(token, reflexivePatterns);
+};
+
+// Removes a verb ending from the end of token.
+// Endings in the first pattern must follow "а"/"я", which is kept ('$1');
+// endings in the second pattern are dropped outright.
+// Returns null when neither pattern matches.
+function verb(token) {
+    var verbPatterns = [
+        [/([ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)$/g, '$1'],
+        [/(ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|ит|ыт|ены|ить|ыть|ишь|ую|ю)$/g, '']
+    ];
+    return attemptReplacePatterns(token, verbPatterns);
+};
+
+// Removes one noun ending from the end of token.
+// Returns the shortened string, or null when no ending matches.
+function noun(token) {
+    // The alternation must open with "(а": without the opening parenthesis
+    // the closing one is unmatched and the regexp literal does not parse.
+    var result = attemptReplacePatterns(token, [
+        [/(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$/g, '']
+    ]);
+    return result;
+};
+
+// Removes a trailing superlative ending ("ейш" or "ейше") from token.
+// Returns the shortened string, or null when token has no such ending.
+function superlative (token) {
+    var superlativePatterns = [[/(ейш|ейше)$/g, '']];
+    return attemptReplacePatterns(token, superlativePatterns);
+};
+
+// Removes a trailing derivational ending ("ост" or "ость") from token.
+// Returns the shortened string, or null when token has no such ending.
+function derivational (token) {
+    var derivationalPatterns = [[/(ост|ость)$/g, '']];
+    return attemptReplacePatterns(token, derivationalPatterns);
+};
+
+// perform full stemming algorithm on a single word
+//
+// token:   the word to stem; it is lower-cased before processing.
+// returns: the stem, or the lower-cased token unchanged when it contains
+//          no vowel.
+PorterStemmer.stem = function(token) {
+    // Fold "ё" into Cyrillic "е": every pattern below is written with
+    // Cyrillic letters, so a Latin "e" would never match any of them.
+    token = token.toLowerCase().replace(/ё/g, 'е');
+
+    // Non-global on purpose, so exec() keeps no lastIndex state between the
+    // two calls below.
+    var vowelsRegexp = /^(.*?[аеиоюяуыэ])(.*)$/;
+    var parts = vowelsRegexp.exec(token);
+    if (!parts) {
+        return token;
+    }
+    // head is everything up to and including the first vowel and is never
+    // modified; RV is the remainder, where endings are removed.
+    var head = parts[1];
+    var RV = parts[2];
+    // The same split applied to RV; R2[2] is what follows RV's first vowel.
+    var R2 = vowelsRegexp.exec(RV);
+
+    // Try a perfective gerund ending first. Otherwise drop a reflexive
+    // ending and then the first of adjectival / verb / noun that applies.
+    var result = perfectiveGerund(RV);
+    if (result === null) {
+        var resultReflexive = reflexive(RV) || RV;
+        result = adjectival(resultReflexive);
+        if (result === null) {
+            result = verb(resultReflexive);
+        }
+        if (result === null) {
+            result = noun(resultReflexive);
+        }
+        if (result === null) {
+            result = resultReflexive;
+        }
+    }
+
+    result = result.replace(/и$/g, '');
+
+    // Remove a derivational ending only when one is also present in R2[2].
+    var derivationalResult = result;
+    if (R2 && R2[2] && derivational(R2[2]) !== null) {
+        var withoutDerivational = derivational(result);
+        // Guard against null so the replace() calls below cannot throw.
+        if (withoutDerivational !== null) {
+            derivationalResult = withoutDerivational;
+        }
+    }
+
+    var superlativeResult = superlative(derivationalResult) || derivationalResult;
+
+    // Collapse double "н" and drop a trailing soft sign.
+    superlativeResult = superlativeResult.replace(/(н)н/g, '$1');
+    superlativeResult = superlativeResult.replace(/ь$/g, '');
+    return head + superlativeResult;
+};
View
58 lib/natural/stemmers/stemmer_ru.js
@@ -0,0 +1,58 @@
+/*
+Copyright (c) 2012, Polyakov Vladimir, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var stopwords = require('../util/stopwords_ru');
+var Tokenizer = require('../tokenizers/aggressive_tokenizer_ru');
+
+// Base constructor for Russian stemmers. It installs stem(),
+// tokenizeAndStem() and attach() on the instance being constructed.
+module.exports = function() {
+    var stemmer = this;
+
+    // Default stemming is the identity; porter_stemmer_ru replaces this
+    // method on its instance.
+    stemmer.stem = function(token) {
+        return token;
+    };
+
+    // Tokenizes text, lower-cases every token, drops stop words unless
+    // keepStops is truthy, and stems tokens containing a Cyrillic letter
+    // or a digit.
+    //
+    // text:      the string to process.
+    // keepStops: when truthy, stop words are kept in the output.
+    // returns:   array of processed tokens in their original order.
+    stemmer.tokenizeAndStem = function(text, keepStops) {
+        var stemmedTokens = [];
+
+        new Tokenizer().tokenize(text).forEach(function(token) {
+            // Lower-case before the stop word lookup: the lookup is an
+            // exact, case-sensitive string comparison.
+            var resultToken = token.toLowerCase();
+            if (keepStops || stopwords.words.indexOf(resultToken) == -1) {
+                if (/[а-яё0-9]/.test(resultToken)) {
+                    resultToken = stemmer.stem(resultToken);
+                }
+                stemmedTokens.push(resultToken);
+            }
+        });
+
+        return stemmedTokens;
+    };
+
+    // Adds stem() and tokenizeAndStem() to String.prototype; both delegate
+    // to this stemmer.
+    stemmer.attach = function() {
+        String.prototype.stem = function() {
+            return stemmer.stem(this);
+        };
+
+        String.prototype.tokenizeAndStem = function(keepStops) {
+            return stemmer.tokenizeAndStem(this, keepStops);
+        };
+    };
+}
View
47 lib/natural/tokenizers/aggressive_tokenizer_ru.js
@@ -0,0 +1,47 @@
+/*
+Copyright (c) 2011, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Tokenizer = require('./tokenizer'),
+ util = require('util');
+
+// Tokenizer for Russian text; inherits from the base Tokenizer.
+// Declared with var so the constructor stays local to this module instead
+// of becoming an implicit global.
+var AggressiveTokenizer = function() {
+    Tokenizer.call(this);
+};
+util.inherits(AggressiveTokenizer, Tokenizer);
+
+module.exports = AggressiveTokenizer;
+
+// Returns a new array holding the elements of array that do not compare
+// equal (==) to the empty string.
+AggressiveTokenizer.prototype.clearEmptyString = function(array) {
+    function isNotEmpty(item) {
+        return !(item == '');
+    }
+    return array.filter(isNotEmpty);
+};
+
+// Replaces every «, », ! and ? in text with a single space.
+AggressiveTokenizer.prototype.clearText = function(text) {
+    var punctuation = new RegExp('«|»|!|\\?', 'g');
+    return text.replace(punctuation, ' ');
+};
+
+// Breaks a string up into an array of tokens.
+// The text is first passed through clearText, then split on "-" or on runs
+// of the separator characters in the character class below; empty strings
+// left by the split are removed.
+AggressiveTokenizer.prototype.tokenize = function(text) {
+    // NOTE(review): the character class opens at the first "[", so "|", "$",
+    // "(" , ")" and "[" are literal members and "\b" is a backspace there -
+    // confirm this is the intended separator set.
+    var separators = /-|[|$|\b|\(|\)|[ \s\xA0'\.,:"]+/gi;
+    var pieces = this.clearText(text).split(separators);
+    return this.clearEmptyString(pieces);
+};
View
41 lib/natural/util/stopwords_ru.js
@@ -0,0 +1,41 @@
+/*
+Copyright (c) 2011, Polyakov Vladimir, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// a list of commonly used words that have little meaning and can be excluded
+// from analysis. Besides words, the list holds single letters, digits, '$'
+// and '_'. Some entries appear more than once.
+// NOTE(review): a few entries contain a space (e.g. 'может быть'); the
+// stemmer compares one token at a time, so these presumably never match -
+// confirm against the tokenizer.
+// NOTE(review): the single letters 'c' and 'o' look like Latin characters
+// rather than Cyrillic "с" / "о" - verify the code points.
+var words = [
+ 'о', 'после', 'все', 'также', 'и', 'другие', 'все', 'как', 'во', 'быть',
+ 'потому', 'был', 'до', 'являюсь', 'между', 'все', 'но', 'от', 'иди', 'могу',
+ 'подойди', 'мог', 'делал', 'делаю', 'каждый', 'для', 'откуда', 'иметь', 'имел',
+ 'он', 'имеет', 'её', 'здесь', 'его', 'как', 'если', 'в', 'оно', 'за',
+ 'делать', 'много', 'я', 'может быть', 'более', 'самый', 'должен',
+ 'мой', 'никогда', 'сейчас', 'из', 'на', 'только', 'или', 'другой', 'другая',
+ 'другое', 'наше', 'вне', 'конец', 'сказал', 'сказала', 'также', 'видел', 'c',
+ 'немного', 'все еще', 'так', 'затем', 'тот', 'их', 'там', 'этот', 'они', 'те',
+ 'через', 'тоже', 'под', 'над', 'очень', 'был', 'путь', 'мы', 'хорошо',
+ 'что', 'где', 'который', 'пока', 'кто', 'с кем', 'хотел бы', 'ты', 'твои',
+ 'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н',
+ 'o', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь',
+ 'э', 'ю', 'я','$', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_'];
+
+// tell the world about the noise words.
+exports.words = words;
View
63 spec/porter_stemmer_ru_spec.js
@@ -0,0 +1,63 @@
+/*
+Copyright (c) 2011, Polyakov Vladimir, Chris Umbel
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+var stemmer = require('lib/natural/stemmers/porter_stemmer_ru');
+
+var test = [
+ 'в', 'вавиловка', 'вагнера', 'вагон', 'вагона', 'вагоне', 'вагонов', 'вагоном', 'вагоны',
+ 'важная', 'важнее', 'важнейшие', 'важнейшими', 'важничал', 'важно', 'важного', 'важное',
+ 'важной', 'важном', 'важному', 'важности', 'важностию', 'важность', 'важностью', 'важную',
+ 'важны', 'важные', 'важный', 'важным', 'важных', 'вазах', 'вазы', 'вакса', 'вакханка', 'вал',
+ 'валандался', 'валентина', 'валериановых', 'валерию', 'валетами', 'вали', 'валил', 'валился',
+ 'валится', 'валов', 'вальдшнепа', 'вальс', 'вальса', 'вальсе', 'вальсишку', 'вальтера', 'валяется',
+ 'валялась', 'валялись', 'валялось', 'валялся', 'валять', 'валяются', 'вам', 'вами', 'п', 'па', 'пава',
+ 'павел', 'павильон', 'павильонам', 'павла', 'павлиний', 'павлиньи', 'павлиньим', 'павлович', 'павловна',
+ 'павловне', 'павловной', 'павловну', 'павловны', 'павловцы', 'павлыч', 'павлыча', 'пагубная', 'падает',
+ 'падай', 'падал', 'падала', 'падаль', 'падать', 'падаю', 'падают', 'падающего', 'падающие', 'падеж', 'падение',
+ 'падением', 'падении', 'падений', 'падения', 'паденье', 'паденья', 'падет', 'падут', 'падучая', 'падчерицей',
+ 'падчерицы', 'падшая', 'падшей', 'падшему', 'падший', 'падшим', 'падших', 'падшую', 'паек', 'пазухи', 'пазуху',
+ 'пай', 'пакет', 'пакетом', 'пакеты', 'пакостей', 'пакостно', 'пал'];
+
+var testResult = [
+ 'в', 'вавиловк', 'вагнер', 'вагон', 'вагон', 'вагон', 'вагон', 'вагон', 'вагон', 'важн', 'важн', 'важн',
+ 'важн', 'важнича', 'важн', 'важн', 'важн', 'важн', 'важн', 'важн', 'важност', 'важност', 'важност',
+ 'важност', 'важн', 'важн', 'важн', 'важн', 'важн', 'важн', 'ваз', 'ваз', 'вакс', 'вакханк', 'вал',
+ 'валанда', 'валентин', 'валерианов', 'валер', 'валет', 'вал', 'вал', 'вал', 'вал', 'вал', 'вальдшнеп',
+ 'вальс', 'вальс', 'вальс', 'вальсишк', 'вальтер', 'валя', 'валя', 'валя', 'валя', 'валя', 'валя', 'валя',
+ 'вам', 'вам', 'п', 'па', 'пав', 'павел', 'павильон', 'павильон', 'павл', 'павлин', 'павлин', 'павлин',
+ 'павлович', 'павловн', 'павловн', 'павловн', 'павловн', 'павловн', 'павловц', 'павлыч', 'павлыч', 'пагубн',
+ 'пада', 'пада', 'пада', 'пада', 'падал', 'пада', 'пада', 'пада', 'пада', 'пада', 'падеж', 'паден', 'паден',
+ 'паден', 'паден', 'паден', 'паден', 'паден', 'падет', 'падут', 'падуч', 'падчериц', 'падчериц', 'падш', 'падш',
+ 'падш', 'падш', 'падш', 'падш', 'падш', 'паек', 'пазух', 'пазух', 'па', 'пакет', 'пакет', 'пакет', 'пакост',
+ 'пакостн', 'пал'];
+
+// Specs for the Russian Porter stemmer: word-by-word stemming against the
+// fixture arrays above, and the String.prototype helpers added by attach().
+describe('porter_stemmer', function() {
+    it('should perform stem', function() {
+        // test[i] is the input word, testResult[i] its expected stem.
+        for (var i = 0; i < test.length; i++) {
+            expect(stemmer.stem(test[i])).toBe(testResult[i]);
+        }
+    });
+
+    it('should tokenize and stem attached', function() {
+        stemmer.attach();
+        expect('мама мыла раму'.tokenizeAndStem()).toEqual(['мам', 'мыл', 'рам']);
+        expect('МАМА МЫЛА РАМУ'.tokenizeAndStem()).toEqual(['мам', 'мыл', 'рам']);
+    });
+});
Please sign in to comment.
Something went wrong with that request. Please try again.