Merge pull request #61 from gmarty/transliterator-ja

Japanese transliterator (Hiragana and Katakana) to Latin
NaturalNode · Sep 10, 2012 · 5a15dbc · 5a15dbc
2 parents 60d1f9b + 39227f1
commit 5a15dbc
Show file tree

Hide file tree

Showing 11 changed files with 1,494 additions and 12 deletions.
diff --git a/lib/natural/index.js b/lib/natural/index.js
@@ -26,12 +26,14 @@ exports.DoubleMetaphone = require('./phonetics/double_metaphone');
 exports.PorterStemmer = require('./stemmers/porter_stemmer');
 exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
 exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
+exports.StemmerJa = require('./stemmers/stemmer_ja');
 exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
 exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
 exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
 exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
 exports.WordPunctTokenizer = require('./tokenizers/regexp_tokenizer').WordPunctTokenizer;
 exports.TreebankWordTokenizer = require('./tokenizers/treebank_word_tokenizer');
+exports.TokenizerJa = require('./tokenizers/tokenizer_ja');
 exports.BayesClassifier = require('./classifiers/bayes_classifier');
 exports.LogisticRegressionClassifier = require('./classifiers/logistic_regression_classifier');
 exports.NounInflector = require('./inflectors/noun_inflector');
@@ -46,3 +48,4 @@ exports.JaroWinklerDistance = require('./distance/jaro-winkler_distance');
 exports.LevenshteinDistance = require('./distance/levenshtein_distance');
 exports.DiceCoefficient = require('./distance/dice_coefficient');
 exports.normalize_ja = require('./normalizers/normalizer_ja').normalize_ja;
+exports.transliterate_ja = require('./transliterators/ja');
diff --git a/lib/natural/normalizers/normalizer_ja.js b/lib/natural/normalizers/normalizer_ja.js
@@ -26,6 +26,7 @@
  * Note: The space character is treated like a roman character as it usually
  *   has the same width as them in Japanese texts.
  *
+ * \@todo Replace the characters in the ranges ㈠ to ㉃, ㊀ to ㊰ and ㇰ to ㇿ.
  * \@todo Lazy initializations of conversionTables and converters.
  * \@todo Would fixHalfwidthKana be useful?
  *
@@ -323,6 +324,174 @@ var fixFullwidthKana = {
   'ッノ': 'ンノ'
 };
 
+var fixCompositeSymbolsTable = { // composite symbol -> multi-character expansion
+  '㋀': '1月', // Months, 1 to 12.
+  '㋁': '2月',
+  '㋂': '3月',
+  '㋃': '4月',
+  '㋄': '5月',
+  '㋅': '6月',
+  '㋆': '7月',
+  '㋇': '8月',
+  '㋈': '9月',
+  '㋉': '10月',
+  '㋊': '11月',
+  '㋋': '12月',
+
+  '㏠': '1日', // Days of the month, 1 to 31.
+  '㏡': '2日',
+  '㏢': '3日',
+  '㏣': '4日',
+  '㏤': '5日',
+  '㏥': '6日',
+  '㏦': '7日',
+  '㏧': '8日',
+  '㏨': '9日',
+  '㏩': '10日',
+  '㏪': '11日',
+  '㏫': '12日',
+  '㏬': '13日',
+  '㏭': '14日',
+  '㏮': '15日',
+  '㏯': '16日',
+  '㏰': '17日',
+  '㏱': '18日',
+  '㏲': '19日',
+  '㏳': '20日',
+  '㏴': '21日',
+  '㏵': '22日',
+  '㏶': '23日',
+  '㏷': '24日',
+  '㏸': '25日',
+  '㏹': '26日',
+  '㏺': '27日',
+  '㏻': '28日',
+  '㏼': '29日',
+  '㏽': '30日',
+  '㏾': '31日',
+
+  '㍘': '0点', // Hours, 0 to 24.
+  '㍙': '1点',
+  '㍚': '2点',
+  '㍛': '3点',
+  '㍜': '4点',
+  '㍝': '5点',
+  '㍞': '6点',
+  '㍟': '7点',
+  '㍠': '8点',
+  '㍡': '9点',
+  '㍢': '10点',
+  '㍣': '11点',
+  '㍤': '12点',
+  '㍥': '13点',
+  '㍦': '14点',
+  '㍧': '15点',
+  '㍨': '16点',
+  '㍩': '17点',
+  '㍪': '18点',
+  '㍫': '19点',
+  '㍬': '20点',
+  '㍭': '21点',
+  '㍮': '22点',
+  '㍯': '23点',
+  '㍰': '24点',
+
+  '㍻': '平成', // Era names (this entry and the three below).
+  '㍼': '昭和',
+  '㍽': '大正',
+  '㍾': '明治',
+  '㍿': '株式会社', // Corporation.
+
+  '㌀': 'アパート', // Squared katakana words, expanded to fullwidth katakana.
+  '㌁': 'アルファ',
+  '㌂': 'アンペア',
+  '㌃': 'アール',
+  '㌄': 'イニング',
+  '㌅': 'インチ',
+  '㌆': 'ウオン', // NOTE(review): Unicode seems to decompose U+3306 to ウォン (small ォ) - confirm
+  '㌇': 'エスクード',
+  '㌈': 'エーカー',
+  '㌉': 'オンス',
+  '㌊': 'オーム',
+  '㌋': 'カイリ', // Nautical mile (海里).
+  '㌌': 'カラット',
+  '㌍': 'カロリー',
+  '㌎': 'ガロン',
+  '㌏': 'ガンマ',
+  '㌐': 'ギガ',
+  '㌑': 'ギニー',
+  '㌒': 'キュリー',
+  '㌓': 'ギルダー',
+  '㌔': 'キロ',
+  '㌕': 'キログラム',
+  '㌖': 'キロメートル',
+  '㌗': 'キロワット',
+  '㌘': 'グラム',
+  '㌙': 'グラムトン',
+  '㌚': 'クルゼイロ',
+  '㌛': 'クローネ',
+  '㌜': 'ケース',
+  '㌝': 'コルナ',
+  '㌞': 'コーポ',
+  '㌟': 'サイクル',
+  '㌠': 'サンチーム',
+  '㌡': 'シリング',
+  '㌢': 'センチ',
+  '㌣': 'セント',
+  '㌤': 'ダース',
+  '㌥': 'デシ',
+  '㌦': 'ドル',
+  '㌧': 'トン',
+  '㌨': 'ナノ',
+  '㌩': 'ノット',
+  '㌪': 'ハイツ',
+  '㌫': 'パーセント',
+  '㌬': 'パーツ',
+  '㌭': 'バーレル',
+  '㌮': 'ピアストル',
+  '㌯': 'ピクル',
+  '㌰': 'ピコ',
+  '㌱': 'ビル',
+  '㌲': 'ファラッド',
+  '㌳': 'フィート',
+  '㌴': 'ブッシェル',
+  '㌵': 'フラン',
+  '㌶': 'ヘクタール',
+  '㌷': 'ペソ',
+  '㌸': 'ペニヒ',
+  '㌹': 'ヘルツ',
+  '㌺': 'ペンス',
+  '㌻': 'ページ',
+  '㌼': 'ベータ',
+  '㌽': 'ポイント',
+  '㌾': 'ボルト',
+  '㌿': 'ホン',
+  '㍀': 'ポンド',
+  '㍁': 'ホール',
+  '㍂': 'ホーン',
+  '㍃': 'マイクロ',
+  '㍄': 'マイル',
+  '㍅': 'マッハ',
+  '㍆': 'マルク',
+  '㍇': 'マンション',
+  '㍈': 'ミクロン',
+  '㍉': 'ミリ',
+  '㍊': 'ミリバール',
+  '㍋': 'メガ',
+  '㍌': 'メガトン',
+  '㍍': 'メートル',
+  '㍎': 'ヤード',
+  '㍏': 'ヤール',
+  '㍐': 'ユアン',
+  '㍑': 'リットル',
+  '㍒': 'リラ',
+  '㍓': 'ルピー',
+  '㍔': 'ルーブル',
+  '㍕': 'レム',
+  '㍖': 'レントゲン',
+  '㍗': 'ワット'
+};
+
 // Fill in the conversion tables with the flipped tables.
 conversionTables.halfwidthToFullwidth.alphabet = flip(conversionTables.fullwidthToHalfwidth.alphabet);
 conversionTables.halfwidthToFullwidth.numbers = flip(conversionTables.fullwidthToHalfwidth.numbers);
@@ -356,6 +525,8 @@ var converters = {
   normalize: replacer(conversionTables.normalize)
 };
 
+var fixCompositeSymbols = replacer(fixCompositeSymbolsTable); // string converter built from the table above
+
 
 /**
  * Convert hiragana to fullwidth katakana.
@@ -405,18 +576,29 @@ converters.katakanaToHiragana = function(str) {
 
 /**
  * Fix kana and apply the following processes;
+ * * Replace repeat characters (iteration mark 々)
  * * Alphabet to halfwidth
  * * Numbers to halfwidth
  * * Punctuation to fullwidth
  * * Katakana to fullwidth
+ * * Fix fullwidth kana
+ * * Replace composite symbols (table keys become their expanded text)
  *
  * @param {string} str
  * @return {string}
  */
 var normalize_ja = function(str) {
+  // Replace repeat characters: 々 doubles the previous character, 々々 the previous two.
+  str = str
+    .replace(/(..)々々/g, '$1$1') // must run first, or the rule below would eat the first 々
+    .replace(/(.)々/g, '$1$1'); // note: . matches neither line terminators nor a full surrogate pair
+
   str = converters.normalize(str);
   str = converters.fixFullwidthKana(str);
 
+  // Replace composite symbols last, so the expansions are not converted again above.
+  str = fixCompositeSymbols(str);
+
   return str;
 };
 

diff --git a/lib/natural/stemmers/stemmer_ja.js b/lib/natural/stemmers/stemmer_ja.js
@@ -0,0 +1,136 @@
+/*
+ Copyright (c) 2012, Guillaume Marty
+
+ Permission is hereby granted, free of charge, to any person obtaining a copy
+ of this software and associated documentation files (the "Software"), to deal
+ in the Software without restriction, including without limitation the rights
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ copies of the Software, and to permit persons to whom the Software is
+ furnished to do so, subject to the following conditions:
+
+ The above copyright notice and this permission notice shall be included in
+ all copies or substantial portions of the Software.
+
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ THE SOFTWARE.
+ */
+
+/**
+ * A very basic stemmer that performs the following steps:
+ * * Stem katakana.
+ * Inspired by:
+ * http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
+ *
+ * \@todo Use .bind() in StemmerJa.prototype.attach().
+ */
+
+var Tokenizer = require('../tokenizers/tokenizer_ja');
+var stopwords = require('../util/stopwords_ja');
+
+
+
+/**
+ * @constructor Japanese stemmer without per-instance state; see the prototype.
+ */
+var StemmerJa = function() {
+};
+
+
+/**
+ * Tokenize a text, then lowercase and stem each token.
+ * Stop words are excluded unless the second argument is truthy.
+ *
+ * @param {string} text The text to tokenize and stem.
+ * @param {boolean} keepStops Whether to keep stop words in the output.
+ * @return {Array.<string>} The stemmed tokens, in the tokenizer's order.
+ */
+StemmerJa.prototype.tokenizeAndStem = function(text, keepStops) {
+  var self = this;
+  var stemmedTokens = [];
+  var tokens = new Tokenizer().tokenize(text);
+
+  // Test keepStops once, not per token (probably faster than an if in the loop).
+  if (keepStops) {
+    tokens.forEach(function(token) {
+      var resultToken = token.toLowerCase();
+      resultToken = self.stem(resultToken);
+      stemmedTokens.push(resultToken);
+    });
+  } else {
+    tokens.forEach(function(token) {
+      if (stopwords.indexOf(token) == -1) { // NOTE(review): looked up before lowercasing - confirm intended
+        var resultToken = token.toLowerCase();
+        resultToken = self.stem(resultToken);
+        stemmedTokens.push(resultToken);
+      }
+    });
+  }
+
+  return stemmedTokens;
+};
+
+
+/**
+ * Stem a term. The only step applied at present is stemKatakana().
+ *
+ * @param {string} token The term to stem.
+ * @return {string} The stemmed term.
+ */
+StemmerJa.prototype.stem = function(token) {
+  token = this.stemKatakana(token);
+
+  return token;
+};
+
+
+/**
+ * Remove the final prolonged sound mark (ー) from an all-katakana token whose
+ * length, mark included, is at least DEFAULT_MINIMUM_LENGTH (4).
+ *
+ * @param {string} token The string to stem.
+ * @return {string} The token without its trailing ー, or the token as given.
+ */
+StemmerJa.prototype.stemKatakana = function(token) {
+  var HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = 'ー';
+  var DEFAULT_MINIMUM_LENGTH = 4;
+
+  if (token.length >= DEFAULT_MINIMUM_LENGTH
+      && token.slice(-1) === HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK
+      && this.isKatakana(token)) {
+    token = token.slice(0, token.length - 1); // drops exactly one trailing mark
+  }
+  return token;
+};
+
+
+/**
+ * Is a non-empty string made only of characters from ァ to ヴ, or ー?
+ * This implementation is the fastest I know:
+ * http://jsperf.com/string-contain-katakana-only/2
+ *
+ * @param {string} str The string to test.
+ * @return {boolean} True if str is non-empty and each character is ァ-ヴ or ー.
+ */
+StemmerJa.prototype.isKatakana = function(str) {
+  return !!str.match(/^[ァ-ヴー]+$/); // !! turns the match result (array or null) into a boolean
+};
+
+// Patch String.prototype with stem() and tokenizeAndStem(), backed by this stemmer.
+StemmerJa.prototype.attach = function() {
+  var self = this; // the stemmer; inside the patched methods `this` is the string
+
+  String.prototype.stem = function() {
+    return self.stem(this); // NOTE(review): no strict mode, so `this` is a boxed String; unstemmed results come back as objects
+  };
+
+  String.prototype.tokenizeAndStem = function(keepStops) {
+    return self.tokenizeAndStem(this, keepStops); // same boxed `this` is handed to the tokenizer
+  };
+};
+
+module.exports = StemmerJa;