Skip to content

Commit

Permalink
Merge pull request #61 from gmarty/transliterator-ja
Browse files Browse the repository at this point in the history
Japanese transliterator (Hiragana and Katakana) to Latin
  • Loading branch information
chrisumbel committed Sep 10, 2012
2 parents 60d1f9b + 39227f1 commit 5a15dbc
Show file tree
Hide file tree
Showing 11 changed files with 1,494 additions and 12 deletions.
3 changes: 3 additions & 0 deletions lib/natural/index.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,12 +26,14 @@ exports.DoubleMetaphone = require('./phonetics/double_metaphone');
// Stemmers (modules under ./stemmers).
exports.PorterStemmer = require('./stemmers/porter_stemmer');
exports.PorterStemmerRu = require('./stemmers/porter_stemmer_ru');
exports.LancasterStemmer = require('./stemmers/lancaster_stemmer');
exports.StemmerJa = require('./stemmers/stemmer_ja');
// Tokenizers (modules under ./tokenizers). regexp_tokenizer exposes several
// named members, each re-exported individually here.
exports.AggressiveTokenizerRu = require('./tokenizers/aggressive_tokenizer_ru');
exports.AggressiveTokenizer = require('./tokenizers/aggressive_tokenizer');
exports.RegexpTokenizer = require('./tokenizers/regexp_tokenizer').RegexpTokenizer;
exports.WordTokenizer = require('./tokenizers/regexp_tokenizer').WordTokenizer;
exports.WordPunctTokenizer = require('./tokenizers/regexp_tokenizer').WordPunctTokenizer;
exports.TreebankWordTokenizer = require('./tokenizers/treebank_word_tokenizer');
exports.TokenizerJa = require('./tokenizers/tokenizer_ja');
// Classifiers (modules under ./classifiers).
exports.BayesClassifier = require('./classifiers/bayes_classifier');
exports.LogisticRegressionClassifier = require('./classifiers/logistic_regression_classifier');
// Inflectors (modules under ./inflectors).
exports.NounInflector = require('./inflectors/noun_inflector');
Expand All @@ -46,3 +48,4 @@ exports.JaroWinklerDistance = require('./distance/jaro-winkler_distance');
// Distance modules (under ./distance).
exports.LevenshteinDistance = require('./distance/levenshtein_distance');
exports.DiceCoefficient = require('./distance/dice_coefficient');
// Japanese helpers: the normalizer is re-exported through its named
// `normalize_ja` member, the transliterator through the module's own export.
exports.normalize_ja = require('./normalizers/normalizer_ja').normalize_ja;
exports.transliterate_ja = require('./transliterators/ja');
182 changes: 182 additions & 0 deletions lib/natural/normalizers/normalizer_ja.js
Original file line number Diff line number Diff line change
Expand Up @@ -26,6 +26,7 @@
* Note: The space character is treated like a roman character as it usually
* has the same width as them in Japanese texts.
*
* \@todo Replace the character ranges ㈠ to ㉃, ㊀ to ㊰ and ㇰ to ㇿ.
* \@todo Lazy initializations of conversionTables and converters.
* \@todo Would fixHalfwidthKana be useful?
*
Expand Down Expand Up @@ -323,6 +324,174 @@ var fixFullwidthKana = {
'ッノ': 'ンノ'
};

/**
 * Lookup table mapping each composite ("squared") symbol to the plain text it
 * stands for. Keys are the single composite characters; values are their
 * expansions written with halfwidth digits, kanji or fullwidth katakana.
 * The table is turned into a converter with replacer() and applied as the
 * last step of normalize_ja().
 */
var fixCompositeSymbolsTable = {
// Months: 1月 to 12月.
'㋀': '1月',
'㋁': '2月',
'㋂': '3月',
'㋃': '4月',
'㋄': '5月',
'㋅': '6月',
'㋆': '7月',
'㋇': '8月',
'㋈': '9月',
'㋉': '10月',
'㋊': '11月',
'㋋': '12月',

// Days of the month: 1日 to 31日.
'㏠': '1日',
'㏡': '2日',
'㏢': '3日',
'㏣': '4日',
'㏤': '5日',
'㏥': '6日',
'㏦': '7日',
'㏧': '8日',
'㏨': '9日',
'㏩': '10日',
'㏪': '11日',
'㏫': '12日',
'㏬': '13日',
'㏭': '14日',
'㏮': '15日',
'㏯': '16日',
'㏰': '17日',
'㏱': '18日',
'㏲': '19日',
'㏳': '20日',
'㏴': '21日',
'㏵': '22日',
'㏶': '23日',
'㏷': '24日',
'㏸': '25日',
'㏹': '26日',
'㏺': '27日',
'㏻': '28日',
'㏼': '29日',
'㏽': '30日',
'㏾': '31日',

// Hours: 0点 to 24点.
'㍘': '0点',
'㍙': '1点',
'㍚': '2点',
'㍛': '3点',
'㍜': '4点',
'㍝': '5点',
'㍞': '6点',
'㍟': '7点',
'㍠': '8点',
'㍡': '9点',
'㍢': '10点',
'㍣': '11点',
'㍤': '12点',
'㍥': '13点',
'㍦': '14点',
'㍧': '15点',
'㍨': '16点',
'㍩': '17点',
'㍪': '18点',
'㍫': '19点',
'㍬': '20点',
'㍭': '21点',
'㍮': '22点',
'㍯': '23点',
'㍰': '24点',

// Japanese era names and the "stock company" (株式会社) mark.
'㍻': '平成',
'㍼': '昭和',
'㍽': '大正',
'㍾': '明治',
'㍿': '株式会社',

// Squared katakana words, expanded to fullwidth katakana.
'㌀': 'アパート',
'㌁': 'アルファ',
'㌂': 'アンペア',
'㌃': 'アール',
'㌄': 'イニング',
'㌅': 'インチ',
'㌆': 'ウオン',
'㌇': 'エスクード',
'㌈': 'エーカー',
'㌉': 'オンス',
'㌊': 'オーム',
'㌋': 'カイリ', // nautical mile (海里)
'㌌': 'カラット',
'㌍': 'カロリー',
'㌎': 'ガロン',
'㌏': 'ガンマ',
'㌐': 'ギガ',
'㌑': 'ギニー',
'㌒': 'キュリー',
'㌓': 'ギルダー',
'㌔': 'キロ',
'㌕': 'キログラム',
'㌖': 'キロメートル',
'㌗': 'キロワット',
'㌘': 'グラム',
'㌙': 'グラムトン',
'㌚': 'クルゼイロ',
'㌛': 'クローネ',
'㌜': 'ケース',
'㌝': 'コルナ',
'㌞': 'コーポ',
'㌟': 'サイクル',
'㌠': 'サンチーム',
'㌡': 'シリング',
'㌢': 'センチ',
'㌣': 'セント',
'㌤': 'ダース',
'㌥': 'デシ',
'㌦': 'ドル',
'㌧': 'トン',
'㌨': 'ナノ',
'㌩': 'ノット',
'㌪': 'ハイツ',
'㌫': 'パーセント',
'㌬': 'パーツ',
'㌭': 'バーレル',
'㌮': 'ピアストル',
'㌯': 'ピクル',
'㌰': 'ピコ',
'㌱': 'ビル',
'㌲': 'ファラッド',
'㌳': 'フィート',
'㌴': 'ブッシェル',
'㌵': 'フラン',
'㌶': 'ヘクタール',
'㌷': 'ペソ',
'㌸': 'ペニヒ',
'㌹': 'ヘルツ',
'㌺': 'ペンス',
'㌻': 'ページ',
'㌼': 'ベータ',
'㌽': 'ポイント',
'㌾': 'ボルト',
'㌿': 'ホン',
'㍀': 'ポンド',
'㍁': 'ホール',
'㍂': 'ホーン',
'㍃': 'マイクロ',
'㍄': 'マイル',
'㍅': 'マッハ',
'㍆': 'マルク',
'㍇': 'マンション',
'㍈': 'ミクロン',
'㍉': 'ミリ',
'㍊': 'ミリバール',
'㍋': 'メガ',
'㍌': 'メガトン',
'㍍': 'メートル',
'㍎': 'ヤード',
'㍏': 'ヤール',
'㍐': 'ユアン',
'㍑': 'リットル',
'㍒': 'リラ',
'㍓': 'ルピー',
'㍔': 'ルーブル',
'㍕': 'レム',
'㍖': 'レントゲン',
'㍗': 'ワット'
};

// Fill in the conversion tables with the flipped tables: each
// halfwidth-to-fullwidth table is derived from its fullwidth-to-halfwidth
// counterpart instead of being written out a second time.
// NOTE(review): flip() is defined elsewhere in this file; presumably it swaps
// keys and values — confirm.
conversionTables.halfwidthToFullwidth.alphabet = flip(conversionTables.fullwidthToHalfwidth.alphabet);
conversionTables.halfwidthToFullwidth.numbers = flip(conversionTables.fullwidthToHalfwidth.numbers);
Expand Down Expand Up @@ -356,6 +525,8 @@ var converters = {
normalize: replacer(conversionTables.normalize)
};

// Converter built from the composite symbols table. normalize_ja() calls it
// with a string and uses the returned string as its result.
var fixCompositeSymbols = replacer(fixCompositeSymbolsTable);


/**
* Convert hiragana to fullwidth katakana.
Expand Down Expand Up @@ -405,18 +576,29 @@ converters.katakanaToHiragana = function(str) {

/**
 * Normalize a Japanese string. The steps run in this order:
 *   1. Expand the repeat mark (々) into the character(s) it repeats
 *   2. converters.normalize: alphabet and numbers to halfwidth, punctuation
 *      and katakana to fullwidth
 *   3. converters.fixFullwidthKana: fix fullwidth kana
 *   4. fixCompositeSymbols: replace composite symbols by their expansion
 *
 * @param {string} str The text to normalize.
 * @return {string} The normalized text.
 */
var normalize_ja = function(str) {
  // A doubled repeat mark repeats the two characters before it, so it must
  // be expanded before the single-mark rule gets a chance to consume it.
  var withoutDoubleMarks = str.replace(/(..)々々/g, '$1$1');
  var withoutRepeatMarks = withoutDoubleMarks.replace(/(.)々/g, '$1$1');

  var widthNormalized = converters.normalize(withoutRepeatMarks);
  var kanaFixed = converters.fixFullwidthKana(widthNormalized);

  // Composite symbols are expanded last.
  return fixCompositeSymbols(kanaFixed);
};

Expand Down
136 changes: 136 additions & 0 deletions lib/natural/stemmers/stemmer_ja.js
Original file line number Diff line number Diff line change
@@ -0,0 +1,136 @@
/*
Copyright (c) 2012, Guillaume Marty
Permission is hereby granted, free of charge, to any person obtaining a copy
of this software and associated documentation files (the "Software"), to deal
in the Software without restriction, including without limitation the rights
to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
copies of the Software, and to permit persons to whom the Software is
furnished to do so, subject to the following conditions:
The above copyright notice and this permission notice shall be included in
all copies or substantial portions of the Software.
THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
THE SOFTWARE.
*/

/**
* A very basic stemmer that performs the following steps:
* * Stem katakana.
* Inspired by:
* http://svn.apache.org/repos/asf/lucene/dev/trunk/lucene/analysis/kuromoji/src/java/org/apache/lucene/analysis/ja/JapaneseKatakanaStemFilter.java
*
* \@todo Use .bind() in StemmerJa.prototype.attach().
*/

var Tokenizer = require('../tokenizers/tokenizer_ja');
var stopwords = require('../util/stopwords_ja');



/**
 * Basic Japanese stemmer. The constructor stores nothing on the instance;
 * every method works only on its arguments.
 *
 * @constructor
 */
var StemmerJa = function() {
};


/**
 * Tokenize a text and stem every token.
 * Stop words are removed from the output unless the second argument is
 * truthy. The stop word lookup is done on the token exactly as the tokenizer
 * produced it; lowercasing happens afterwards, just before stemming.
 *
 * @param {string} text The text to tokenize.
 * @param {boolean} keepStops Whether to keep stop words in the output.
 * @return {Array.<string>} The stemmed tokens, in their original order.
 */
StemmerJa.prototype.tokenizeAndStem = function(text, keepStops) {
  var self = this;
  var tokens = new Tokenizer().tokenize(text);

  // Decide once whether to filter, rather than testing the flag per token.
  if (!keepStops) {
    tokens = tokens.filter(function(token) {
      return stopwords.indexOf(token) === -1;
    });
  }

  return tokens.map(function(token) {
    return self.stem(token.toLowerCase());
  });
};


/**
 * Stem a term. Katakana stemming is currently the only step applied.
 *
 * @param {string} token The term to stem.
 * @return {string} The stemmed term.
 */
StemmerJa.prototype.stem = function(token) {
  return this.stemKatakana(token);
};


/**
 * Remove the final prolonged sound mark (ー) from a katakana-only token whose
 * length is at least the minimum length threshold (4). Any other token is
 * returned unchanged.
 *
 * @param {string} token A katakana string to stem.
 * @return {string} A katakana string stemmed.
 */
StemmerJa.prototype.stemKatakana = function(token) {
  var HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK = 'ー';
  var DEFAULT_MINIMUM_LENGTH = 4;

  var isStemmable = token.length >= DEFAULT_MINIMUM_LENGTH &&
      token.slice(-1) === HIRAGANA_KATAKANA_PROLONGED_SOUND_MARK &&
      this.isKatakana(token);

  return isStemmable ? token.slice(0, -1) : token;
};


/**
 * Is a string made of fullwidth katakana only?
 * Accepted characters are the range ァ to ヴ plus the prolonged sound mark ー.
 * An empty string is not considered katakana.
 *
 * @param {string} str A string.
 * @return {boolean} True if the string has katakana only.
 */
StemmerJa.prototype.isKatakana = function(str) {
  return /^[ァ-ヴー]+$/.test(str);
};


/**
 * Patch String.prototype with `stem` and `tokenizeAndStem` methods that
 * delegate to this stemmer instance.
 *
 * In non-strict code, `this` inside a String.prototype method is a String
 * wrapper object. It is converted to a primitive before delegating, so that
 * a token left untouched by the stemmer comes back as a plain string rather
 * than as the wrapper object (which would fail `===` comparisons and report
 * `typeof` as 'object').
 */
StemmerJa.prototype.attach = function() {
  var self = this;

  String.prototype.stem = function() {
    return self.stem(String(this));
  };

  String.prototype.tokenizeAndStem = function(keepStops) {
    return self.tokenizeAndStem(String(this), keepStops);
  };
};

module.exports = StemmerJa;
Loading

0 comments on commit 5a15dbc

Please sign in to comment.