Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with
or
.
Download ZIP

Loading…

Noun Inflector for Japanese #72

Merged
merged 8 commits into from

2 participants

@gmarty

A noun inflector is of limited use for Japanese, since a noun can be either singular or plural depending on the context. However, some words are clearly plural forms, and these are the forms addressed here. As usual, it is fully tested.

This pull request also contains:

  • Better regexp for detecting Japanese script
  • Various fixes
@chrisumbel chrisumbel merged commit 4aaa939 into NaturalNode:master
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
View
2  lib/natural/index.js
@@ -40,6 +40,8 @@ exports.TokenizerJa = require('./tokenizers/tokenizer_ja');
exports.BayesClassifier = require('./classifiers/bayes_classifier');
exports.LogisticRegressionClassifier = require('./classifiers/logistic_regression_classifier');
exports.NounInflector = require('./inflectors/noun_inflector');
+exports.NounInflectorFr = require('./inflectors/fr/noun_inflector');
+exports.NounInflectorJa = require('./inflectors/ja/noun_inflector');
exports.PresentVerbInflector = require('./inflectors/present_verb_inflector');
exports.CountInflector = require('./inflectors/count_inflector');
exports.WordNet = require('./wordnet/wordnet');
View
4 lib/natural/inflectors/fr/noun_inflector.js
@@ -198,7 +198,7 @@ var NounInflector = function() {
this.addIrregular('putto', 'putti');
this.addIrregular('targui', 'touareg'); // touareg -> touaregs is also OK.
- // Singularize
+ // Pluralize
this.pluralForms.regularForms.push([/^(av|b|c|carnav|cérémoni|chac|corr|emment|emmenth|festiv|fut|gavi|gra|narv|p|récit|rég|rit|rorqu|st)al$/i, '$1als']);
this.pluralForms.regularForms.push([/^(aspir|b|cor|ém|ferm|gemm|soupir|trav|vant|vent|vitr)ail$/i, '$1aux']);
this.pluralForms.regularForms.push([/^(bij|caill|ch|gen|hib|jouj|p|rip|chouch)ou$/i, '$1oux']);
@@ -209,7 +209,7 @@ var NounInflector = function() {
this.pluralForms.regularForms.push([/(s|x)$/i, '$1']);
this.pluralForms.regularForms.push([/(.*)$/i, '$1s']);
- // Pluralize
+ // Singularize
this.singularForms.regularForms.push([/^(aspir|b|cor|ém|ferm|gemm|soupir|trav|vant|vent|vitr)aux$/i, '$1ail']);
this.singularForms.regularForms.push([/^(aloy|b|bouc|boy|burg|conoy|coy|cr|esquim|ét|fabli|flé|flûti|glu|gr|gru|hoy|joy|kérab|matéri|nobli|noy|pré|sen|sén|t|touch|tuss|tuy|v|ypré)aux$/i, '$1au']);
this.singularForms.regularForms.push([/^(bij|caill|ch|gen|hib|jouj|p|rip|chouch)oux$/i, '$1ou']);
View
131 lib/natural/inflectors/ja/noun_inflector.js
@@ -0,0 +1,131 @@
+/*
+ Copyright (c) 2012, Guillaume Marty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/**
+ * A noun inflector for Japanese.
+ * Compiled from several sources including:
+ * \@see http://answers.yahoo.com/question/index?qid=20080528201740AASBWy6
+ * \@see http://www.excite.co.jp/dictionary/english_japanese/
+ *
+ * This script assumes input is normalized using normalizer_ja().
+ * Pluralizing Japanese nouns is of very limited use.
+ * Japanese does not usually distinguish plural from singular, so even a word
+ * that looks singular might actually be plural.
+ *
+ * Singularization of nouns ending in -tachi or -ra is achieved using a
+ * comprehensive blacklist of exceptions, while nouns ending in -domo or -gata
+ * use a whitelist because there are too many exceptions.
+ *
+ * \@todo Singularize nouns ending in -ら, but there are too many exceptions.
+ * \@todo Expand the list of common plurals ending in -domo and -gata.
+ */
+
+var SingularPluralInflector = require('../singular_plural_inflector'),
+ util = require('util'),
+ FormSet = require('../form_set');
+
+function attach() {
+ var inflector = this;
+
+ String.prototype.singularizeNoun = function() {
+ return inflector.singularize(this);
+ };
+
+ String.prototype.pluralizeNoun = function() {
+ return inflector.pluralize(this);
+ };
+}
+
+
+
+/**
+ * @constructor
+ */
+var NounInflector = function() {
+ // Ambiguous a.k.a. invariant.
+ this.ambiguous = [
+ 'ともだち', '友だち', '友達', '遊び友達', '飲み友達', '酒飲み友達', '茶飲み友達',
+ '学校友達', '女友達', '男友達', '幼友達'
+ ];
+
+ this.customPluralForms = [];
+ this.customSingularForms = [];
+ this.singularForms = new FormSet();
+ this.pluralForms = new FormSet();
+
+ this.attach = attach;
+
+ this.addIrregular('神', '神神');
+ this.addIrregular('人', '人人');
+ this.addIrregular('年', '年年');
+ this.addIrregular('月', '月月');
+ this.addIrregular('日', '日日');
+ this.addIrregular('星', '星星');
+ this.addIrregular('島', '島島');
+ this.addIrregular('我', '我我');
+
+ /**
+ * Notes:
+ * -たち exceptions: いたち, おいたち, ついたち, かたち, かおかたち, なりかたち, いでたち, はたち, からたち, なりたち
+ * -達 exceptions: 伊達, 男伊達, 栄達, 上意下達, 熟達, 上達, 下意上達, 先達, 送達, 速達, 即日速達, 書留速達, 調達, 通達, 伝達, 到達, 配達, 牛乳配達, 新聞配達, 無料配達, 四通八達, 発達, 未発達, 御用達, 宮内庁御用達, 練達, 闊達
+ * -等 exceptions: 一等, 下等, 何等, 均等, 勲等, 高等, 三等, 初等, 上等, 親等, 二親等, 数等, 対等, 中等, 同等, 特等, 二等, 品等, 不等, 平等, 悪平等, 男女平等, 不平等, 優等, 劣等
+ */
+
+ // Pluralize
+ this.pluralForms.regularForms.push([/^(.+)$/i, '$1たち']);
+
+ // Singularize
+ this.singularForms.regularForms.push([/^(.+)たち$/i, function(a, mask) {
+ if (['い', 'おい', 'つい', 'か', 'かおか', 'なりか', 'いで', 'は', 'から',
+ 'なり'].indexOf(mask) >= 0)
+ return mask + 'たち';
+ return mask;
+ }]);
+ this.singularForms.regularForms.push([/^(.+)達$/i, function(a, mask) {
+ if (['伊', '男伊', '栄', '上意下', '熟', '上', '下意上', '先', '送', '速',
+ '即日速', '書留速', '調', '通', '伝', '到', '配', '牛乳配', '新聞配', '無料配',
+ '四通八', '発', '未発', '御用', '宮内庁御用', '練', '闊'].indexOf(mask) >= 0)
+ return mask + '達';
+ return mask;
+ }]); // Singularize nouns ending by -等, but not exceptions.
+ this.singularForms.regularForms.push([/^(.+)等$/i, function(a, mask) {
+ if (['一', '下', '何', '均', '勲', '高', '三', '初', '上', '親', '二親', '数',
+ '対', '中', '同', '特', '二', '品', '不', '平', '悪平', '男女平', '不平', '優',
+ '劣'].indexOf(mask) >= 0)
+ return mask + '等';
+ return mask;
+ }]);
+ this.singularForms.regularForms.push([/^(人間|わたくし|私|てまえ|手前|野郎|やろう|勇者|がき|ガキ|餓鬼)(共|ども)$/i, '$1']);
+ this.singularForms.regularForms.push([/^(神様|先生|あなた)(方|がた)$/i, '$1']);
+
+ this.pluralize = function(token) {
+ return this.ize(token, this.pluralForms, this.customPluralForms);
+ };
+
+ this.singularize = function(token) {
+ return this.ize(token, this.singularForms, this.customSingularForms);
+ };
+};
+
+util.inherits(NounInflector, SingularPluralInflector);
+
+module.exports = NounInflector;
View
4 lib/natural/stemmers/stemmer_ja.js
@@ -26,6 +26,8 @@
* Inspired by:
* http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
*
+ * This script assumes input is normalized using normalizer_ja().
+ *
* \@todo Use .bind() in StemmerJa.prototype.attach().
*/
@@ -117,7 +119,7 @@ StemmerJa.prototype.stemKatakana = function(token) {
* @return {boolean} True if the string has katakana only.
*/
StemmerJa.prototype.isKatakana = function(str) {
- return !!str.match(/^[ァ-ヴ]+$/);
+ return !!str.match(/^[゠-ヿ]+$/);
};
// Expose an attach function that will patch String with new methods.
View
14 lib/natural/tokenizers/tokenizer_ja.js
@@ -69,9 +69,9 @@ var Tokenizer = require('./tokenizer'),
var TokenizerJa = function() {
this.chartype_ = [
[/[〇一二三四五六七八九十百千万億兆]/, 'M'],
- [/[一-]/, 'H'],
- [/[ぁ-]/, 'I'],
- [/[ァ-ヶ]/, 'K'],
+ [/[一-]/, 'H'],
+ [/[ぁ-]/, 'I'],
+ [/[゠-ヿ]/, 'K'],
[/[a-zA-Z]/, 'A'],
[/[0-9]/, 'N']
];
@@ -132,7 +132,7 @@ util.inherits(TokenizerJa, Tokenizer);
* @private
*/
TokenizerJa.prototype.ctype_ = function(str) {
- for (var i in this.chartype_) {
+ for (var i = 0, length = this.chartype_.length; i < length; i++) {
if (str.match(this.chartype_[i][0])) {
return this.chartype_[i][1];
}
@@ -183,7 +183,9 @@ TokenizerJa.prototype.tokenize = function(text) {
var seg = ['B3', 'B2', 'B1'];
var ctype = ['O', 'O', 'O'];
var o = text.split('');
- for (i = 0; i < o.length; ++i) {
+ var i;
+ var length;
+ for (i = 0, length = o.length; i < length; ++i) {
seg.push(o[i]);
ctype.push(this.ctype_(o[i]));
}
@@ -197,7 +199,7 @@ TokenizerJa.prototype.tokenize = function(text) {
var p1 = 'U';
var p2 = 'U';
var p3 = 'U';
- for (var i = 4; i < seg.length - 3; ++i) {
+ for (i = 4, length = seg.length - 3; i < length; ++i) {
var score = this.BIAS__;
var w1 = seg[i - 3];
var w2 = seg[i - 2];
View
112 spec/noun_inflector_ja_spec.js
@@ -0,0 +1,112 @@
+/*
+ Copyright (c) 2012, Guillaume Marty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+var NounInflector = require('lib/natural/inflectors/ja/noun_inflector'),
+ inflector = new NounInflector();
+
+describe('NounInflector', function() {
+ describe('.pluralize()', function() {
+ describe('should pluralize nouns', function() {
+ expect(inflector.pluralize('ひと')).toBe('ひとたち');
+ expect(inflector.pluralize('わたし')).toBe('わたしたち');
+ expect(inflector.pluralize('私')).toBe('私たち');
+ });
+
+ describe('should not pluralize exceptions', function() {
+ expect(inflector.pluralize('ともだち')).toBe('ともだち');
+ expect(inflector.pluralize('友だち')).toBe('友だち');
+ expect(inflector.pluralize('友達')).toBe('友達');
+ });
+
+ describe('should pluralize archaic forms', function() {
+ expect(inflector.pluralize('神')).toBe('神神');
+ expect(inflector.pluralize('人')).toBe('人人');
+ expect(inflector.pluralize('我')).toBe('我我');
+ });
+ });
+
+ describe('.singularize()', function() {
+ describe('should singularize regular nouns ending by -tachi in Hiragana', function() {
+ expect(inflector.singularize('わたしたち')).toBe('わたし');
+ expect(inflector.singularize('人たち')).toBe('人');
+ expect(inflector.singularize('りかたち')).toBe('りか');
+ });
+
+ describe('should not singularize exception nouns ending by -tachi in Hiragana', function() {
+ expect(inflector.singularize('ついたち')).toBe('ついたち');
+ expect(inflector.singularize('かたち')).toBe('かたち');
+ expect(inflector.singularize('はたち')).toBe('はたち');
+ });
+
+ describe('should singularize regular nouns ending by -tachi in Kanji', function() {
+ expect(inflector.singularize('わたし達')).toBe('わたし');
+ expect(inflector.singularize('人達')).toBe('人');
+ expect(inflector.singularize('日伊達')).toBe('日伊');
+ });
+
+ describe('should not singularize exception nouns ending by -tachi in Kanji', function() {
+ expect(inflector.singularize('上達')).toBe('上達');
+ expect(inflector.singularize('配達')).toBe('配達');
+ expect(inflector.singularize('発達')).toBe('発達');
+ });
+
+ describe('should singularize regular nouns ending by -ra in Kanji', function() {
+ expect(inflector.singularize('僕等')).toBe('僕');
+ expect(inflector.singularize('貴様等')).toBe('貴様');
+ expect(inflector.singularize('圭一等')).toBe('圭一');
+ });
+
+ describe('should not singularize exception nouns ending by -ra in Kanji', function() {
+ expect(inflector.singularize('下等')).toBe('下等');
+ expect(inflector.singularize('初等')).toBe('初等');
+ expect(inflector.singularize('一等')).toBe('一等');
+ });
+
+ describe('should singularize regular nouns ending by -gata', function() {
+ expect(inflector.singularize('神様方')).toBe('神様');
+ expect(inflector.singularize('先生方')).toBe('先生');
+ expect(inflector.singularize('あなたがた')).toBe('あなた');
+ });
+
+ describe('should singularize regular nouns ending by -domo', function() {
+ expect(inflector.singularize('人間共')).toBe('人間');
+ expect(inflector.singularize('野郎共')).toBe('野郎');
+ expect(inflector.singularize('ガキども')).toBe('ガキ');
+ });
+
+ describe('should singularize archaic forms', function() {
+ expect(inflector.singularize('神神')).toBe('神');
+ expect(inflector.singularize('人人')).toBe('人');
+ expect(inflector.singularize('我我')).toBe('我');
+ });
+ });
+
+ describe('.attach()', function() {
+ describe('should attach new methods to String', function() {
+ inflector.attach();
+ expect('私'.pluralizeNoun()).toBe('私たち');
+ expect('私たち'.singularizeNoun()).toBe('私');
+ expect('人'.pluralizeNoun()).toBe('人人');
+ expect('人人'.singularizeNoun()).toBe('人');
+ });
+ });
+});
View
4 spec/stemmer_ja_spec.js
@@ -23,8 +23,8 @@ THE SOFTWARE.
var StemmerJa = require('lib/natural/stemmers/stemmer_ja');
var stemmer = new StemmerJa();
-var test = ['コピー', 'コーヒー', 'タクシー', 'パーティー', 'パーティ', 'センター'];
-var testResult = ['コピー', 'コーヒ', 'タクシ', 'パーティ', 'パーティ', 'センタ'];
+var test = ['コピー', 'コーヒー', 'タクシー', 'パーティー', 'パーティ', 'ヘルプ・センター'];
+var testResult = ['コピー', 'コーヒ', 'タクシ', 'パーティ', 'パーティ', 'ヘルプ・センタ'];
var text = '明後日パーティーに行く予定がある。図書館で資料をコピーしました。';
describe('StemmerJa', function() {
Something went wrong with that request. Please try again.