Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.

Download ZIP
Browse files

Merge branch 'master' of https://github.com/NaturalNode/natural into inflectors-ja
  • Loading branch information...
commit 483f1ae12b420bcb0f39fc5cbc37875ad5e7afc6 2 parents 3b6c708 + c7554b6
@gmarty gmarty authored
View
2  README.md
@@ -144,7 +144,7 @@ You can train the classifier on sample text. It will use reasonable defaults to
tokenize and stem the text.
classifier.addDocument('i am long qqqq', 'buy');
- classifier.addDocument('buy the q's', 'buy');
+ classifier.addDocument('buy the q\'s', 'buy');
classifier.addDocument('short gold', 'sell');
classifier.addDocument('sell gold', 'sell');
View
14 lib/natural/classifiers/classifier.js
@@ -35,6 +35,11 @@ function addDocument(text, classification) {
if(typeof text === 'string')
text = this.stemmer.tokenizeAndStem(text);
+ if(text.length === 0) {
+ // ignore empty documents
+ return;
+ }
+
this.docs.push({
label: classification,
text: text
@@ -87,13 +92,18 @@ function restore(classifier, stemmer) {
function save(filename, callback) {
var data = JSON.stringify(this);
var fs = require('fs');
- fs.writeFile(filename, data, encoding='utf8', callback);
+ var classifier = this;
+ fs.writeFile(filename, data, 'utf8', function(err) {
+ if(callback) {
+ callback(err, err ? null : classifier);
+ }
+ });
}
function load(filename, callback) {
var fs = require('fs');
- fs.readFile(filename, encoding='utf8', function(err, data) {
+ fs.readFile(filename, 'utf8', function(err, data) {
var classifier;
if(!err) {
View
3  lib/natural/index.js
@@ -25,9 +25,11 @@ exports.Metaphone = require('./phonetics/metaphone');
exports.DoubleMetaphone = require('./phonetics/double_metaphone');
exports.SoundExDM = require('./phonetics/dm_soundex');
exports.PorterStemmer = require('./stemmers/porter_stemmer');
+exports.PorterStemmerFa = require('./stemmers/porter_stemmer_fa');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
+exports.AggressiveTokenizerFa = require('./tokenizers/aggressive_tokenizer_fa');
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
@@ -51,4 +53,3 @@ exports.DiceCoefficient = require('./distance/dice_coefficient');
exports.normalize_ja = require('./normalizers/normalizer_ja').normalize_ja;
exports.removeDiacritics = require('./normalizers/remove_diacritics');
exports.transliterate_ja = require('./transliterators/ja');
-
View
4 lib/natural/normalizers/normalizer_ja.js
@@ -138,8 +138,8 @@ var conversionTables = {
'': ']',
'': '{',
'': '}',
- '': '',
- '': '',
+ '': '',
+ '': '',
'': '@',
'': '*',
'': '\\',
View
2  lib/natural/phonetics/phonetic.js
@@ -21,7 +21,7 @@ THE SOFTWARE.
*/
var stopwords = require('../util/stopwords');
-var Tokenizer = new require('../tokenizers/aggressive_tokenizer')
+var Tokenizer = require('../tokenizers/aggressive_tokenizer')
tokenizer = new Tokenizer();
module.exports = function() {
View
33 lib/natural/stemmers/porter_stemmer_fa.js
@@ -0,0 +1,33 @@
+/*
+Copyright (c) 2011, Chris Umbel
+Farsi Porter Stemmer by Fardin Koochaki <me@fardinak.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Stemmer = require('./stemmer_fa');
+
+var PorterStemmer = new Stemmer();
+module.exports = PorterStemmer;
+
+// disabled stemming for Farsi
+// Farsi stemming will be supported soon
+PorterStemmer.stem = function(token) {
+ return token;
+};
View
54 lib/natural/stemmers/stemmer_fa.js
@@ -0,0 +1,54 @@
+/*
+Copyright (c) 2011, Chris Umbel
+Farsi Stemmer by Fardin Koochaki <me@fardinak.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var stopwords = require('../util/stopwords_fa');
+var Tokenizer = require('../tokenizers/aggressive_tokenizer_fa');
+
+module.exports = function() {
+ var stemmer = this;
+
+ stemmer.stem = function(token) {
+ return token;
+ };
+
+ stemmer.tokenizeAndStem = function(text, keepStops) {
+ var stemmedTokens = [];
+
+ new Tokenizer().tokenize(text).forEach(function(token) {
+ if(keepStops || stopwords.words.indexOf(token) == -1)
+ stemmedTokens.push(stemmer.stem(token));
+ });
+
+ return stemmedTokens;
+ };
+
+ stemmer.attach = function() {
+ String.prototype.stem = function() {
+ return stemmer.stem(this);
+ };
+
+ String.prototype.tokenizeAndStem = function(keepStops) {
+ return stemmer.tokenizeAndStem(this, keepStops);
+ };
+ };
+}
View
48 lib/natural/tokenizers/aggressive_tokenizer_fa.js
@@ -0,0 +1,48 @@
+/*
+Copyright (c) 2011, Chris Umbel
+Farsi Aggressive Tokenizer by Fardin Koochaki <me@fardinak.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+var Tokenizer = require('./tokenizer'),
+ util = require('util');
+
+AggressiveTokenizer = function() {
+ Tokenizer.call(this);
+};
+util.inherits(AggressiveTokenizer, Tokenizer);
+
+module.exports = AggressiveTokenizer;
+
+AggressiveTokenizer.prototype.clearEmptyString = function(array) {
+ return array.filter(function(a) {
+ return a != '';
+ });
+};
+
+AggressiveTokenizer.prototype.clearText = function(text) {
+ return text.replace(new RegExp('\.\:\+\-\=\(\)\"\'\!\?\،\,\؛\;', 'g'), ' ');
+};
+
+AggressiveTokenizer.prototype.tokenize = function(text) {
+ // break a string up into an array of tokens by anything non-word
+ text = this.clearText(text);
+ return this.clearEmptyString(text.split(/\s+/));
+};
View
38 lib/natural/util/stopwords_fa.js
@@ -0,0 +1,38 @@
+/*
+Copyright (c) 2011, Chris Umbel
+Farsi Stop Words by Fardin Koochaki <me@fardinak.com>
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+// a list of commonly used words that have little meaning and can be excluded
+// from analysis.
+var words = [
+ // Words
+ 'از', 'با', 'یه', 'برای', 'و', 'باید', 'شاید',
+
+ // Symbols
+ '؟', '!', '٪', '.', '،', '؛', ':', ';', ',',
+
+ // Numbers
+ '۱', '۲', '۳', '۴', '۵', '۶', '۷', '۸', '۹', '۰'
+];
+
+// tell the world about the noise words.
+exports.words = words;
View
3  lib/natural/util/utils.js
@@ -25,6 +25,7 @@
* Generate a replacing function given a table of patterns. Inspired by:
* http://code.google.com/p/jslibs/wiki/JavascriptTips#String_converter
* The order of elements is significant. Longer elements should be listed first.
+ * @see Speed test http://jsperf.com/build-a-regexp-table
*
* @param {Object.<string, string>} translationTable The translation table of key value.
* @return {function(string): string} A translating function.
@@ -50,7 +51,7 @@ function replacer(translationTable) {
for (key in translationTable) {
// Escaping regexp special chars.
- key = key.replace(/(\^|\$|\*|\+|\?|\.|\(|\)|\[|\]|\{|\}|\||\\)/g, '\\\$1');
+ key = key.replace(/(\^|\$|\*|\+|\?|\.|\(|\)|\[|\]|\{|\}|\||\\|\/)/g, '\\\$1');
pattern.push(key);
}
View
2  package.json
@@ -1,7 +1,7 @@
{
"name": "natural",
"description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet, jaro-winkler, Levenshtein distance, Dice's Coefficient) facilities for node.",
- "version": "0.1.16",
+ "version": "0.1.17",
"homepage": "https://github.com/NaturalNode/natural",
"engines": {
"node": ">=0.4.10"
View
2  spec/normalizer_ja_spec.js
@@ -55,7 +55,7 @@ describe('normalize_ja', function() {
it('should transform halfwidth punctuation signs to fullwidth', function() {
// Taken from http://unicode.org/cldr/trac/browser/trunk/common/main/ja.xml
expect(normalize_ja('‾ __ -- ‐ — ― 〜 ・ ・ ,, 、、 ;; :: !! ?? .. ‥ … 。。 '\ ‘ ’ "" “ ” (( )) [[ ]] {{ }} 〈 〉 《 》 「「 」」 『 』 【 】 〔 〕 ‖ § ¶ @@ ++ ^^ $$ ** // \\\ && ## %% ‰ † ‡ ′ ″ 〃 ※'))
- .toEqual('‾ __ ─- ‐ — ― 〜 ・ ・ ,, 、、 ;; :: !! ?? .. ‥ … 。。 '\ ‘ ’ "" “ ” (( )) [[ ]] {{ }} 〈 〉 《 》 「「 」」 『 』 【 】 〔 〕 ‖ § ¶ @@ ++ ^^ $$ ** // \\ && ## %% ‰ † ‡ ′ ″ 〃 ※');
+ .toEqual('‾ __ ─- ‐ — ― 〜 ・ ・ ,, 、、 ;; :: !! ?? .. ‥ … 。。 '\ ‘ ’ "" “ ” (( )) [[ ]] {{ }} 〈 〉 《 》 「「 」」 『 』 【 】 〔 〕 ‖ § ¶ @@ ++ ^^ $$ ** // \\ && ## %% ‰ † ‡ ′ ″ 〃 ※');
});
it('should replace repeat characters', function() {
Please sign in to comment.
Something went wrong with that request. Please try again.