Permalink
Browse files

Add a noun inflector for Japanese

  • Loading branch information...
1 parent 4e47dcf commit e87d337dec2f779391f87df84f1b55eb3f50bb97 @gmarty gmarty committed Oct 10, 2012
Showing with 235 additions and 0 deletions.
  1. +123 −0 lib/natural/inflectors/ja/noun_inflector.js
  2. +112 −0 spec/noun_inflector_ja_spec.js
View
123 lib/natural/inflectors/ja/noun_inflector.js
@@ -0,0 +1,123 @@
+/*
+ Copyright (c) 2012, Guillaume Marty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/**
+ * A noun inflector for Japanese.
+ * Compiled from several sources including:
+ * \@see http://answers.yahoo.com/question/index?qid=20080528201740AASBWy6
+ * \@see http://www.excite.co.jp/dictionary/english_japanese/
+ *
+ * This script assumes input is normalized using normalizer_ja().
+ *
+ * \@todo Singularize nouns ending in -ら; not done yet because there are too many exceptions.
+ */
+
+var SingularPluralInflector = require('../singular_plural_inflector'),
+ util = require('util'),
+ FormSet = require('../form_set');
+
+function attach() {
+ var inflector = this;
+
+ String.prototype.singularizeNoun = function() {
+ return inflector.singularize(this);
+ };
+
+ String.prototype.pluralizeNoun = function() {
+ return inflector.pluralize(this);
+ };
+}
+
+
+
+/**
+ * @constructor
+ */
+var NounInflector = function() {
+ // Ambiguous a.k.a. invariant.
+ this.ambiguous = [
+ 'ともだち', '友だち', '友達', '遊び友達', '飲み友達', '酒飲み友達', '茶飲み友達',
+ '学校友達', '女友達', '男友達', '幼友達'
+ ];
+
+ this.customPluralForms = [];
+ this.customSingularForms = [];
+ this.singularForms = new FormSet();
+ this.pluralForms = new FormSet();
+
+ this.attach = attach;
+
+ this.addIrregular('', '神神');
+ this.addIrregular('', '人人');
+ this.addIrregular('', '年年');
+ this.addIrregular('', '月月');
+ this.addIrregular('', '日日');
+ this.addIrregular('', '星星');
+ this.addIrregular('', '島島');
+ this.addIrregular('', '我我');
+
+ /**
+ * Notes:
+ * -たち exceptions: いたち, おいたち, ついたち, かたち, かおかたち, なりかたち, いでたち, はたち, からたち, なりたち
+ * -達 exceptions: 伊達, 男伊達, 栄達, 上意下達, 熟達, 上達, 下意上達, 先達, 送達, 速達, 即日速達, 書留速達, 調達, 通達, 伝達, 到達, 配達, 牛乳配達, 新聞配達, 無料配達, 四通八達, 発達, 未発達, 御用達, 宮内庁御用達, 練達, 闊達
+ * -等 exceptions: 一等, 下等, 何等, 均等, 勲等, 高等, 三等, 初等, 上等, 親等, 二親等, 数等, 対等, 中等, 同等, 特等, 二等, 品等, 不等, 平等, 悪平等, 男女平等, 不平等, 優等, 劣等
+ */
+
+ // Pluralize
+ this.pluralForms.regularForms.push([/^(.+)$/i, '$1たち']);
+
+ // Singularize
+ this.singularForms.regularForms.push([/^(.+)たち$/i, function(a, mask) {
+ if (['', 'おい', 'つい', '', 'かおか', 'なりか', 'いで', '', 'から',
+ 'なり'].indexOf(mask) >= 0)
+ return mask + 'たち';
+ return mask;
+ }]);
+ this.singularForms.regularForms.push([/^(.+)達$/i, function(a, mask) {
+ if (['', '', '', '上意下', '', '', '下意上', '', '', '',
+ '即日速', '書留速', '調', '', '', '', '', '牛乳配', '新聞配', '無料配',
+ '四通八', '', '未発', '御用', '宮内庁御用', '', ''].indexOf(mask) >= 0)
+ return mask + '';
+ return mask;
+ }]); // Singularize nouns ending by -等, but not exceptions.
+ this.singularForms.regularForms.push([/^(.+)等$/i, function(a, mask) {
+ if (['', '', '', '', '', '', '', '', '', '二親', '', '',
+ '', '', '', '', '', '', '', '悪平', '男女平', '不平', '',
+ ''].indexOf(mask) >= 0)
+ return mask + '';
+ return mask;
+ }]);
+ this.singularForms.regularForms.push([/^(人間|わたくし||てまえ|手前|野郎|やろう|勇者|がき|ガキ|餓鬼)(共|ども)$/i, '$1']);
+ this.singularForms.regularForms.push([/^(神様|先生|あなた)(方|がた)$/i, '$1']);
+
+ this.pluralize = function(token) {
+ return this.ize(token, this.pluralForms, this.customPluralForms);
+ };
+
+ this.singularize = function(token) {
+ return this.ize(token, this.singularForms, this.customSingularForms);
+ };
+};
+
+util.inherits(NounInflector, SingularPluralInflector);
+
+module.exports = NounInflector;
View
112 spec/noun_inflector_ja_spec.js
@@ -0,0 +1,112 @@
+/*
+ Copyright (c) 2012, Guillaume Marty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+var NounInflector = require('lib/natural/inflectors/ja/noun_inflector'),
+ inflector = new NounInflector();
+
+describe('NounInflector', function() {
+ describe('.pluralize()', function() {
+ describe('should pluralize nouns', function() {
+ expect(inflector.pluralize('ひと')).toBe('ひとたち');
+ expect(inflector.pluralize('わたし')).toBe('わたしたち');
+ expect(inflector.pluralize('')).toBe('私たち');
+ });
+
+ describe('should not pluralize exceptions', function() {
+ expect(inflector.pluralize('ともだち')).toBe('ともだち');
+ expect(inflector.pluralize('友だち')).toBe('友だち');
+ expect(inflector.pluralize('友達')).toBe('友達');
+ });
+
+ describe('should pluralize archaic forms', function() {
+ expect(inflector.pluralize('')).toBe('神神');
+ expect(inflector.pluralize('')).toBe('人人');
+ expect(inflector.pluralize('')).toBe('我我');
+ });
+ });
+
+ describe('.singularize()', function() {
+ describe('should singularize regular nouns ending by -tachi in Hiragana', function() {
+ expect(inflector.singularize('わたしたち')).toBe('わたし');
+ expect(inflector.singularize('人たち')).toBe('');
+ expect(inflector.singularize('りかたち')).toBe('りか');
+ });
+
+ describe('should not singularize exception nouns ending by -tachi in Hiragana', function() {
+ expect(inflector.singularize('ついたち')).toBe('ついたち');
+ expect(inflector.singularize('かたち')).toBe('かたち');
+ expect(inflector.singularize('はたち')).toBe('はたち');
+ });
+
+ describe('should singularize regular nouns ending by -tachi in Kanji', function() {
+ expect(inflector.singularize('わたし達')).toBe('わたし');
+ expect(inflector.singularize('人達')).toBe('');
+ expect(inflector.singularize('日伊達')).toBe('日伊');
+ });
+
+ describe('should not singularize exception nouns ending by -tachi in Kanji', function() {
+ expect(inflector.singularize('上達')).toBe('上達');
+ expect(inflector.singularize('配達')).toBe('配達');
+ expect(inflector.singularize('発達')).toBe('発達');
+ });
+
+ describe('should singularize regular nouns ending by -ra in Kanji', function() {
+ expect(inflector.singularize('僕等')).toBe('');
+ expect(inflector.singularize('貴様等')).toBe('貴様');
+ expect(inflector.singularize('圭一等')).toBe('圭一');
+ });
+
+ describe('should not singularize exception nouns ending by -ra in Kanji', function() {
+ expect(inflector.singularize('下等')).toBe('下等');
+ expect(inflector.singularize('初等')).toBe('初等');
+ expect(inflector.singularize('一等')).toBe('一等');
+ });
+
+ describe('should singularize regular nouns ending by -gata', function() {
+ expect(inflector.singularize('神様方')).toBe('神様');
+ expect(inflector.singularize('先生方')).toBe('先生');
+ expect(inflector.singularize('あなたがた')).toBe('あなた');
+ });
+
+ describe('should singularize regular nouns ending by -domo', function() {
+ expect(inflector.singularize('人間共')).toBe('人間');
+ expect(inflector.singularize('野郎共')).toBe('野郎');
+ expect(inflector.singularize('ガキども')).toBe('ガキ');
+ });
+
+ describe('should pluralize archaic forms', function() {
+ expect(inflector.singularize('神神')).toBe('');
+ expect(inflector.singularize('人人')).toBe('');
+ expect(inflector.singularize('我我')).toBe('');
+ });
+ });
+
+ describe('.attach()', function() {
+ describe('should attach new methods to String', function() {
+ inflector.attach();
+ expect(''.pluralizeNoun()).toBe('私たち');
+ expect('私たち'.singularizeNoun()).toBe('');
+ expect(''.pluralizeNoun()).toBe('人人');
+ expect('人人'.singularizeNoun()).toBe('');
+ });
+ });
+});

0 comments on commit e87d337

Please sign in to comment.