-
Notifications
You must be signed in to change notification settings - Fork 862
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Merge pull request #41 from Nordberg/master
Natural for Russian language
- Loading branch information
Showing
6 changed files
with
362 additions
and
0 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,151 @@ | ||
/* | ||
Copyright (c) 2012, Polyakov Vladimir, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
var Stemmer = require('./stemmer_ru'); | ||
|
||
var PorterStemmer = new Stemmer(); | ||
module.exports = PorterStemmer; | ||
|
||
// Apply the first matching rule from `patterns` to `token`.
//
// token    - the string to rewrite.
// patterns - array of [regexp, replacement] pairs, tried in order.
// returns  - the token with the first matching rule applied, or null when
//            no rule matches (an empty string is a valid, non-null result).
function attemptReplacePatterns(token, patterns) {
  for (var idx = 0; idx < patterns.length; idx++) {
    var rule = patterns[idx];
    if (rule[0].test(token)) {
      // Only the first matching rule is applied; later rules are ignored.
      return token.replace(rule[0], rule[1]);
    }
  }
  return null;
}
|
||
// Strip a perfective-gerund ending from `token`.
//
// Rule 1: "в", "вши" or "вшись" directly after "а" or "я"; the ending is
//         removed and the preceding "а"/"я" is kept (same convention as the
//         first rule of verb() and participle()).
// Rule 2: "ив", "ивши", "ившись", "ыв", "ывши" or "ывшись", removed entirely.
//
// returns - the shortened token, or null when no ending matches.
function perfectiveGerund(token) {
  return attemptReplacePatterns(token, [
    [/([ая])(в|вши|вшись)$/g, '$1'],
    [/(ив|ивши|ившись|ывши|ывшись|ыв)$/g, '']
  ]);
}
|
||
// Strip an adjectival ending: an adjective ending, then (if one is left
// exposed) a participle ending in front of it.
//
// returns - the shortened token, or the value adjective() returned (null)
//           when no adjective ending matches.
function adjectival(token) {
  var withoutAdjective = adjective(token);
  if (withoutAdjective == null) {
    return withoutAdjective;
  }
  // A falsy participle() result (null or '') keeps the adjective-only form.
  return participle(withoutAdjective) || withoutAdjective;
}
|
||
// Remove a trailing adjective ending from `token`.
//
// returns - the token without the ending, or null when none matches.
function adjective(token) {
  var rules = [
    [/(ее|ие|ые|ое|ими|ыми|ей|ий|ый|ой|ем|им|ым|ом|его|ого|ему|ому|их|ых|ую|юю|ая|яя|ою|ею)$/g, '']
  ];
  return attemptReplacePatterns(token, rules);
}
|
||
// Remove a trailing participle ending from `token`.
//
// The first rule only fires after "а" or "я" and keeps that letter; the
// second rule removes its ending unconditionally.
//
// returns - the shortened token, or null when no ending matches.
function participle(token) {
  var afterAOrYa = [/([ая])(ем|нн|вш|ющ|щ)$/g, '$1'];
  var unconditional = [/(ивш|ывш|ующ)$/g, ''];
  return attemptReplacePatterns(token, [afterAOrYa, unconditional]);
}
|
||
// Remove a trailing reflexive ending ("ся" or "сь") from `token`.
//
// returns - the token without the ending, or null when none matches.
function reflexive(token) {
  var rules = [
    [/(ся|сь)$/g, '']
  ];
  return attemptReplacePatterns(token, rules);
}
|
||
// Remove a trailing verb ending from `token`.
//
// The first rule only fires after "а" or "я" and keeps that letter; the
// second rule removes its ending unconditionally. The second rule includes
// "уют", which the Snowball Russian algorithm lists next to "ует".
//
// returns - the shortened token, or null when no ending matches.
function verb(token) {
  return attemptReplacePatterns(token, [
    [/([ая])(ла|на|ете|йте|ли|й|л|ем|н|ло|но|ет|ют|ны|ть|ешь|нно)$/g, '$1'],
    [/(ила|ыла|ена|ейте|уйте|ите|или|ыли|ей|уй|ил|ыл|им|ым|ен|ило|ыло|ено|ят|ует|уют|ит|ыт|ены|ить|ыть|ишь|ую|ю)$/g, '']
  ]);
}
|
||
// Remove a trailing noun ending from `token`.
//
// returns - the token without the ending, or null when none matches.
function noun(token) {
  var rules = [
    [/(а|ев|ов|ие|ье|е|иями|ями|ами|еи|ии|и|ией|ей|ой|ий|й|иям|ям|ием|ем|ам|ом|о|у|ах|иях|ях|ы|ь|ию|ью|ю|ия|ья|я)$/g, '']
  ];
  return attemptReplacePatterns(token, rules);
}
|
||
// Remove a trailing superlative ending ("ейш" or "ейше") from `token`.
//
// returns - the token without the ending, or null when none matches.
function superlative(token) {
  var rules = [
    [/(ейш|ейше)$/g, '']
  ];
  return attemptReplacePatterns(token, rules);
}
|
||
// Remove a trailing derivational ending ("ост" or "ость") from `token`.
//
// returns - the token without the ending, or null when none matches.
function derivational(token) {
  var rules = [
    [/(ост|ость)$/g, '']
  ];
  return attemptReplacePatterns(token, rules);
}
|
||
// Perform the full stemming algorithm on a single word.
//
// The word is lower-cased and "ё" is folded to "е". Everything up to and
// including the first vowel is kept untouched as `head`; endings are only
// stripped from the remainder (RV). A word without a vowel is returned
// unchanged apart from the case/ё folding.
//
// token   - the word to stem.
// returns - the stemmed word.
PorterStemmer.stem = function(token) {
  // '\u0435' is CYRILLIC SMALL LETTER IE. It is spelled as an escape so it
  // cannot be mistaken for the look-alike Latin "e", which is not part of
  // the vowel class below and would shift where RV starts.
  token = token.toLowerCase().replace(/ё/g, '\u0435');

  // Deliberately not global: a non-global regexp keeps no lastIndex state,
  // so it can be exec()'d twice without a manual reset.
  var vowelRegexp = /^(.*?[аеиоуыэюя])(.*)$/;
  var RV = vowelRegexp.exec(token);
  if (!RV || RV.length < 3) {
    return token;
  }
  var head = RV[1];
  RV = RV[2];

  // The same split applied inside RV; R2[2] is whatever follows RV's first
  // vowel. R2 is null when RV contains no vowel.
  var R2 = vowelRegexp.exec(RV);

  // Try a perfective gerund first; otherwise drop a reflexive ending and
  // try adjectival, verb and noun endings in that order. The checks are
  // against null on purpose: an empty string is a successful strip.
  var result = perfectiveGerund(RV);
  if (result === null) {
    var resultReflexive = reflexive(RV) || RV;
    result = adjectival(resultReflexive);
    if (result === null) {
      result = verb(resultReflexive);
    }
    if (result === null) {
      result = noun(resultReflexive);
    }
    if (result === null) {
      result = resultReflexive;
    }
  }

  // Drop a trailing "и".
  result = result.replace(/и$/g, '');

  // Strip a derivational ending from the result, but only when the region
  // after RV's first vowel itself ends in one. If the stripped result no
  // longer carries the ending, keep it as it is instead of passing null on.
  var derivationalResult = result;
  if (R2 && R2[2] && derivational(R2[2]) !== null) {
    var withoutDerivational = derivational(result);
    if (withoutDerivational !== null) {
      derivationalResult = withoutDerivational;
    }
  }

  // Strip a superlative ending, collapse every "нн" into "н", and drop a
  // trailing soft sign.
  var superlativeResult = superlative(derivationalResult) || derivationalResult;
  superlativeResult = superlativeResult.replace(/(н)н/g, '$1');
  superlativeResult = superlativeResult.replace(/ь$/g, '');
  return head + superlativeResult;
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,58 @@ | ||
/* | ||
Copyright (c) 2012, Polyakov Vladimir, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
var stopwords = require('../util/stopwords_ru'); | ||
var Tokenizer = require('../tokenizers/aggressive_tokenizer_ru'); | ||
|
||
module.exports = function() { | ||
var stemmer = this; | ||
|
||
stemmer.stem = function(token) { | ||
return token; | ||
}; | ||
|
||
stemmer.tokenizeAndStem = function(text, keepStops) { | ||
var stemmedTokens = []; | ||
|
||
new Tokenizer().tokenize(text).forEach(function(token) { | ||
if (keepStops || stopwords.words.indexOf(token) == -1) { | ||
var resultToken = token.toLowerCase(); | ||
if (resultToken.match(new RegExp('[а-яё0-9]+', 'gi'))) { | ||
resultToken = stemmer.stem(resultToken); | ||
} | ||
stemmedTokens.push(resultToken); | ||
} | ||
}); | ||
|
||
return stemmedTokens; | ||
}; | ||
|
||
stemmer.attach = function() { | ||
String.prototype.stem = function() { | ||
return stemmer.stem(this); | ||
}; | ||
|
||
String.prototype.tokenizeAndStem = function(keepStops) { | ||
return stemmer.tokenizeAndStem(this, keepStops); | ||
}; | ||
}; | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,47 @@ | ||
/* | ||
Copyright (c) 2011, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
var Tokenizer = require('./tokenizer'), | ||
util = require('util'); | ||
|
||
// Tokenizer constructor; chains to the base Tokenizer constructor and
// inherits its prototype. Declared with `var` so the name stays local to
// this module instead of becoming an implicit global (which is also a
// ReferenceError in strict mode).
var AggressiveTokenizer = function() {
  Tokenizer.call(this);
};
util.inherits(AggressiveTokenizer, Tokenizer);
|
||
module.exports = AggressiveTokenizer; | ||
|
||
// Return a copy of `array` without the entries that compare loosely equal
// to the empty string.
AggressiveTokenizer.prototype.clearEmptyString = function(array) {
  function isNotEmpty(item) {
    return !(item == '');
  }
  return array.filter(isNotEmpty);
};
|
||
// Replace every guillemet («, »), exclamation mark and question mark in
// `text` with a single space and return the result.
AggressiveTokenizer.prototype.clearText = function(text) {
  var punctuation = new RegExp('«|»|!|\\?', 'g');
  return text.replace(punctuation, ' ');
};
|
||
// Break a string up into an array of tokens.
//
// The text is first passed through clearText(), then split on the delimiter
// pattern below, and finally empty pieces are removed with
// clearEmptyString().
//
// NOTE(review): the first unescaped "[" in the pattern opens a character
// class that runs up to the closing "]". The pattern therefore means
// "a hyphen, OR one or more of: | $ backspace ( ) [ space whitespace NBSP
// ' . , : double-quote". Inside a class "\b" is a backspace, not a word
// boundary, and "$" is a literal dollar sign. Confirm whether that is the
// intended delimiter set before changing it.
AggressiveTokenizer.prototype.tokenize = function(text) {
  var delimiters = /-|[|$|\b|\(|\)|[ \s\xA0'\.,:"]+/gi;
  var cleaned = this.clearText(text);
  var pieces = cleaned.split(delimiters);
  return this.clearEmptyString(pieces);
};
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,41 @@ | ||
/* | ||
Copyright (c) 2011, Polyakov Vladimir, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
|
||
// a list of commonly used words that have little meaning and can be excluded | ||
// from analysis. | ||
var words = [ | ||
'о', 'после', 'все', 'также', 'и', 'другие', 'все', 'как', 'во', 'быть', | ||
'потому', 'был', 'до', 'являюсь', 'между', 'все', 'но', 'от', 'иди', 'могу', | ||
'подойди', 'мог', 'делал', 'делаю', 'каждый', 'для', 'откуда', 'иметь', 'имел', | ||
'он', 'имеет', 'её', 'здесь', 'его', 'как', 'если', 'в', 'оно', 'за', | ||
'делать', 'много', 'я', 'может быть', 'более', 'самый', 'должен', | ||
'мой', 'никогда', 'сейчас', 'из', 'на', 'только', 'или', 'другой', 'другая', | ||
'другое', 'наше', 'вне', 'конец', 'сказал', 'сказала', 'также', 'видел', 'c', | ||
'немного', 'все еще', 'так', 'затем', 'тот', 'их', 'там', 'этот', 'они', 'те', | ||
'через', 'тоже', 'под', 'над', 'очень', 'был', 'путь', 'мы', 'хорошо', | ||
'что', 'где', 'который', 'пока', 'кто', 'с кем', 'хотел бы', 'ты', 'твои', | ||
'а', 'б', 'в', 'г', 'д', 'е', 'ё', 'ж', 'з', 'и', 'й', 'к', 'л', 'м', 'н', | ||
'o', 'п', 'р', 'с', 'т', 'у', 'ф', 'х', 'ц', 'ч', 'ш', 'щ', 'ъ', 'ы', 'ь', | ||
'э', 'ю', 'я','$', '1', '2', '3', '4', '5', '6', '7', '8', '9', '0', '_']; | ||
|
||
// tell the world about the noise words. | ||
exports.words = words; |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -0,0 +1,63 @@ | ||
/* | ||
Copyright (c) 2011, Polyakov Vladimir, Chris Umbel | ||
Permission is hereby granted, free of charge, to any person obtaining a copy | ||
of this software and associated documentation files (the "Software"), to deal | ||
in the Software without restriction, including without limitation the rights | ||
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | ||
copies of the Software, and to permit persons to whom the Software is | ||
furnished to do so, subject to the following conditions: | ||
The above copyright notice and this permission notice shall be included in | ||
all copies or substantial portions of the Software. | ||
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | ||
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | ||
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | ||
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | ||
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | ||
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN | ||
THE SOFTWARE. | ||
*/ | ||
var stemmer = require('lib/natural/stemmers/porter_stemmer_ru'); | ||
|
||
var test = [ | ||
'в', 'вавиловка', 'вагнера', 'вагон', 'вагона', 'вагоне', 'вагонов', 'вагоном', 'вагоны', | ||
'важная', 'важнее', 'важнейшие', 'важнейшими', 'важничал', 'важно', 'важного', 'важное', | ||
'важной', 'важном', 'важному', 'важности', 'важностию', 'важность', 'важностью', 'важную', | ||
'важны', 'важные', 'важный', 'важным', 'важных', 'вазах', 'вазы', 'вакса', 'вакханка', 'вал', | ||
'валандался', 'валентина', 'валериановых', 'валерию', 'валетами', 'вали', 'валил', 'валился', | ||
'валится', 'валов', 'вальдшнепа', 'вальс', 'вальса', 'вальсе', 'вальсишку', 'вальтера', 'валяется', | ||
'валялась', 'валялись', 'валялось', 'валялся', 'валять', 'валяются', 'вам', 'вами', 'п', 'па', 'пава', | ||
'павел', 'павильон', 'павильонам', 'павла', 'павлиний', 'павлиньи', 'павлиньим', 'павлович', 'павловна', | ||
'павловне', 'павловной', 'павловну', 'павловны', 'павловцы', 'павлыч', 'павлыча', 'пагубная', 'падает', | ||
'падай', 'падал', 'падала', 'падаль', 'падать', 'падаю', 'падают', 'падающего', 'падающие', 'падеж', 'падение', | ||
'падением', 'падении', 'падений', 'падения', 'паденье', 'паденья', 'падет', 'падут', 'падучая', 'падчерицей', | ||
'падчерицы', 'падшая', 'падшей', 'падшему', 'падший', 'падшим', 'падших', 'падшую', 'паек', 'пазухи', 'пазуху', | ||
'пай', 'пакет', 'пакетом', 'пакеты', 'пакостей', 'пакостно', 'пал']; | ||
|
||
var testResult = [ | ||
'в', 'вавиловк', 'вагнер', 'вагон', 'вагон', 'вагон', 'вагон', 'вагон', 'вагон', 'важн', 'важн', 'важн', | ||
'важн', 'важнича', 'важн', 'важн', 'важн', 'важн', 'важн', 'важн', 'важност', 'важност', 'важност', | ||
'важност', 'важн', 'важн', 'важн', 'важн', 'важн', 'важн', 'ваз', 'ваз', 'вакс', 'вакханк', 'вал', | ||
'валанда', 'валентин', 'валерианов', 'валер', 'валет', 'вал', 'вал', 'вал', 'вал', 'вал', 'вальдшнеп', | ||
'вальс', 'вальс', 'вальс', 'вальсишк', 'вальтер', 'валя', 'валя', 'валя', 'валя', 'валя', 'валя', 'валя', | ||
'вам', 'вам', 'п', 'па', 'пав', 'павел', 'павильон', 'павильон', 'павл', 'павлин', 'павлин', 'павлин', | ||
'павлович', 'павловн', 'павловн', 'павловн', 'павловн', 'павловн', 'павловц', 'павлыч', 'павлыч', 'пагубн', | ||
'пада', 'пада', 'пада', 'пада', 'падал', 'пада', 'пада', 'пада', 'пада', 'пада', 'падеж', 'паден', 'паден', | ||
'паден', 'паден', 'паден', 'паден', 'паден', 'падет', 'падут', 'падуч', 'падчериц', 'падчериц', 'падш', 'падш', | ||
'падш', 'падш', 'падш', 'падш', 'падш', 'паек', 'пазух', 'пазух', 'па', 'пакет', 'пакет', 'пакет', 'пакост', | ||
'пакостн', 'пал']; | ||
|
||
// Specs for the Russian Porter stemmer.
describe('porter_stemmer', function() {
  it('should perform stem', function() {
    // The fixtures are paired by index; a length mismatch would otherwise
    // either go unnoticed or surface as a confusing `undefined` expectation.
    expect(testResult.length).toBe(test.length);
    for (var i = 0; i < test.length; i++) {
      expect(stemmer.stem(test[i])).toBe(testResult[i]);
    }
  });

  it('should tokenize and stem attached', function() {
    // attach() installs tokenizeAndStem() on String.prototype.
    stemmer.attach();
    expect('мама мыла раму'.tokenizeAndStem()).toEqual(['мам', 'мыл', 'рам']);
    expect('МАМА МЫЛА РАМУ'.tokenizeAndStem()).toEqual(['мам', 'мыл', 'рам']);
  });
});