Skip to content
This repository

Normalizer ja #54

Merged
merged 11 commits into from over 1 year ago

2 participants

Guillaume Cedric Marty Chris Umbel
Guillaume Cedric Marty

Add a normalizer for Japanese:

  • Use normalize_ja() to get a consistent corpus before further processing.
  • Functions under the converters namespace can be used to perform several conversions (e.g. fullwidth to halfwidth characters, hiragana to katakana...).

Everything is thoroughly tested, but we should write tests for helper functions in lib/natural/util/utils.js.

Chris Umbel chrisumbel merged commit 60d1f9b into from August 21, 2012
Chris Umbel chrisumbel closed this August 21, 2012
Chris Umbel
Owner

excellent. thanks!

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment
This page is out of date. Refresh to see the latest.
1  lib/natural/index.js
@@ -45,3 +45,4 @@ exports.NGrams = require('./ngrams/ngrams');
45 45
 exports.JaroWinklerDistance = require('./distance/jaro-winkler_distance');
46 46
 exports.LevenshteinDistance = require('./distance/levenshtein_distance');
47 47
 exports.DiceCoefficient = require('./distance/dice_coefficient');
  48
+exports.normalize_ja = require('./normalizers/normalizer_ja').normalize_ja;
424  lib/natural/normalizers/normalizer_ja.js
... ...
@@ -0,0 +1,424 @@
  1
+/*
  2
+ Copyright (c) 2012, Guillaume Marty
  3
+
  4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
  5
+ of this software and associated documentation files (the "Software"), to deal
  6
+ in the Software without restriction, including without limitation the rights
  7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8
+ copies of the Software, and to permit persons to whom the Software is
  9
+ furnished to do so, subject to the following conditions:
  10
+
  11
+ The above copyright notice and this permission notice shall be included in
  12
+ all copies or substantial portions of the Software.
  13
+
  14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20
+ THE SOFTWARE.
  21
+ */
  22
+
  23
+/**
  24
+ * Normalize Japanese inputs and expose function to perform several conversions.
  25
+ *
  26
+ * Note: The space character is treated like a roman character as it usually
  27
+ *   has the same width as them in Japanese texts.
  28
+ *
  29
+ * @todo Lazy initialization of conversionTables and converters.
  30
+ * @todo Would fixHalfwidthKana be useful?
  31
+ *
  32
+ * Descriptions of functions exposed:
  33
+ * normalize_ja 「全角」英字・数字を「半角」、「半角」記号・カタカナを「全角」に変換
  34
+ * converters.fullwidthToHalfwidth.alphabet    「全角」英字・スペースを「半角」に変換
  35
+ * converters.halfwidthToFullwidth.alphabet    「半角」英字・スペースを「全角」に変換
  36
+ * converters.fullwidthToHalfwidth.numbers     「全角」数字を「半角」に変換
  37
+ * converters.halfwidthToFullwidth.numbers     「半角」数字を「全角」に変換
  38
+ * converters.fullwidthToHalfwidth.punctuation 「全角」記号を「半角」に変換
  39
+ * converters.halfwidthToFullwidth.punctuation 「半角」記号を「全角」に変換
  40
+ * converters.fullwidthToHalfwidth.katakana    「全角カタカナ」を「半角カタカナ」に変換
  41
+ * converters.halfwidthToFullwidth.katakana    「半角カタカナ」を「全角カタカナ」に変換
  42
+ * converters.hiraganaToKatakana               「ひらがな」を「カタカナ」に変換
  43
+ * converters.katakanaToHiragana               「カタカナ」を「ひらがな」に変換
  44
+ */
  45
+
  46
+var flip = require('../util/utils.js').flip;
  47
+var merge = require('../util/utils.js').merge;
  48
+var replacer = require('../util/utils').replacer;
  49
+
  50
+// From http://fernweh.jp/b/mb_convert_kana_js/
  51
+var conversionTables = {
  52
+  fullwidthToHalfwidth: {
  53
+    alphabet: {
  54
+      'a': 'a',
  55
+      'b': 'b',
  56
+      'c': 'c',
  57
+      'd': 'd',
  58
+      'e': 'e',
  59
+      'f': 'f',
  60
+      'g': 'g',
  61
+      'h': 'h',
  62
+      'i': 'i',
  63
+      'j': 'j',
  64
+      'k': 'k',
  65
+      'l': 'l',
  66
+      'm': 'm',
  67
+      'n': 'n',
  68
+      'o': 'o',
  69
+      'p': 'p',
  70
+      'q': 'q',
  71
+      'r': 'r',
  72
+      's': 's',
  73
+      't': 't',
  74
+      'u': 'u',
  75
+      'v': 'v',
  76
+      'w': 'w',
  77
+      'x': 'x',
  78
+      'y': 'y',
  79
+      'z': 'z',
  80
+      'A': 'A',
  81
+      'B': 'B',
  82
+      'C': 'C',
  83
+      'D': 'D',
  84
+      'E': 'E',
  85
+      'F': 'F',
  86
+      'G': 'G',
  87
+      'H': 'H',
  88
+      'I': 'I',
  89
+      'J': 'J',
  90
+      'K': 'K',
  91
+      'L': 'L',
  92
+      'M': 'M',
  93
+      'N': 'N',
  94
+      'O': 'O',
  95
+      'P': 'P',
  96
+      'Q': 'Q',
  97
+      'R': 'R',
  98
+      'S': 'S',
  99
+      'T': 'T',
  100
+      'U': 'U',
  101
+      'V': 'V',
  102
+      'W': 'W',
  103
+      'X': 'X',
  104
+      'Y': 'Y',
  105
+      'Z': 'Z',
  106
+      ' ': ' ' // Fullwidth space
  107
+    },
  108
+
  109
+    numbers: {
  110
+      '0': '0',
  111
+      '1': '1',
  112
+      '2': '2',
  113
+      '3': '3',
  114
+      '4': '4',
  115
+      '5': '5',
  116
+      '6': '6',
  117
+      '7': '7',
  118
+      '8': '8',
  119
+      '9': '9'
  120
+    },
  121
+
  122
+    punctuation: {
  123
+      '_': '_',
  124
+      '-': '-',
  125
+      '・': '・',
  126
+      ',': ',',
  127
+      '、': '、',
  128
+      ';': ';',
  129
+      ':': ':',
  130
+      '!': '!',
  131
+      '?': '?',
  132
+      '.': '.',
  133
+      '。': '。',
  134
+      '(': '(',
  135
+      ')': ')',
  136
+      '[': '[',
  137
+      ']': ']',
  138
+      '{': '{',
  139
+      '}': '}',
  140
+      '「': '「',
  141
+      '」': '」',
  142
+      '@': '@',
  143
+      '*': '*',
  144
+      '\': '\\',
  145
+      '/': '/',
  146
+      '&': '&',
  147
+      '#': '#',
  148
+      '%': '%',
  149
+      '`': '`',
  150
+      '^': '^',
  151
+      '+': '+',
  152
+      '<': '<',
  153
+      '=': '=',
  154
+      '>': '>',
  155
+      '|': '|',
  156
+      '~': '~',
  157
+      '≪': '«',
  158
+      '≫': '»',
  159
+      '─': '-',
  160
+      '$': '$',
  161
+      '"': '"'
  162
+    },
  163
+
  164
+    katakana: {
  165
+      '゛': '゙',
  166
+      '゜': '゚',
  167
+      'ー': 'ー',
  168
+
  169
+      'ヴ': 'ヴ',
  170
+      'ガ': 'ガ',
  171
+      'ギ': 'ギ',
  172
+      'グ': 'グ',
  173
+      'ゲ': 'ゲ',
  174
+      'ゴ': 'ゴ',
  175
+      'ザ': 'ザ',
  176
+      'ジ': 'ジ',
  177
+      'ズ': 'ズ',
  178
+      'ゼ': 'ゼ',
  179
+      'ゾ': 'ゾ',
  180
+      'ダ': 'ダ',
  181
+      'ヂ': 'ヂ',
  182
+      'ヅ': 'ヅ',
  183
+      'デ': 'デ',
  184
+      'ド': 'ド',
  185
+      'バ': 'バ',
  186
+      'パ': 'パ',
  187
+      'ビ': 'ビ',
  188
+      'ピ': 'ピ',
  189
+      'ブ': 'ブ',
  190
+      'プ': 'プ',
  191
+      'ベ': 'ベ',
  192
+      'ペ': 'ペ',
  193
+      'ボ': 'ボ',
  194
+      'ポ': 'ポ',
  195
+
  196
+      'ァ': 'ァ',
  197
+      'ア': 'ア',
  198
+      'ィ': 'ィ',
  199
+      'イ': 'イ',
  200
+      'ゥ': 'ゥ',
  201
+      'ウ': 'ウ',
  202
+      'ェ': 'ェ',
  203
+      'エ': 'エ',
  204
+      'ォ': 'ォ',
  205
+      'オ': 'オ',
  206
+      'カ': 'カ',
  207
+      'キ': 'キ',
  208
+      'ク': 'ク',
  209
+      'ケ': 'ケ',
  210
+      'コ': 'コ',
  211
+      'サ': 'サ',
  212
+      'シ': 'シ',
  213
+      'ス': 'ス',
  214
+      'セ': 'セ',
  215
+      'ソ': 'ソ',
  216
+      'タ': 'タ',
  217
+      'チ': 'チ',
  218
+      'ッ': 'ッ',
  219
+      'ツ': 'ツ',
  220
+      'テ': 'テ',
  221
+      'ト': 'ト',
  222
+      'ナ': 'ナ',
  223
+      'ニ': 'ニ',
  224
+      'ヌ': 'ヌ',
  225
+      'ネ': 'ネ',
  226
+      'ノ': 'ノ',
  227
+      'ハ': 'ハ',
  228
+      'ヒ': 'ヒ',
  229
+      'フ': 'フ',
  230
+      'ヘ': 'ヘ',
  231
+      'ホ': 'ホ',
  232
+      'マ': 'マ',
  233
+      'ミ': 'ミ',
  234
+      'ム': 'ム',
  235
+      'メ': 'メ',
  236
+      'モ': 'モ',
  237
+      'ャ': 'ャ',
  238
+      'ヤ': 'ヤ',
  239
+      'ュ': 'ュ',
  240
+      'ユ': 'ユ',
  241
+      'ョ': 'ョ',
  242
+      'ヨ': 'ヨ',
  243
+      'ラ': 'ラ',
  244
+      'リ': 'リ',
  245
+      'ル': 'ル',
  246
+      'レ': 'レ',
  247
+      'ロ': 'ロ',
  248
+      'ワ': 'ワ',
  249
+      'ヲ': 'ヲ',
  250
+      'ン': 'ン'
  251
+    }
  252
+  },
  253
+
  254
+  halfwidthToFullwidth: {}
  255
+};
  256
+
  257
+var fixFullwidthKana = {
  258
+  'ゝ゛': 'ゞ',
  259
+  'ヽ゛': 'ヾ',
  260
+
  261
+  'う゛': 'ゔ',
  262
+  'か゛': 'が',
  263
+  'き゛': 'ぎ',
  264
+  'く゛': 'ぐ',
  265
+  'け゛': 'げ',
  266
+  'こ゛': 'ご',
  267
+  'さ゛': 'ざ',
  268
+  'し゛': 'じ',
  269
+  'す゛': 'ず',
  270
+  'せ゛': 'ぜ',
  271
+  'そ゛': 'ぞ',
  272
+  'た゛': 'だ',
  273
+  'ち゛': 'ぢ',
  274
+  'つ゛': 'づ',
  275
+  'て゛': 'で',
  276
+  'と゛': 'ど',
  277
+  'は゛': 'ば',
  278
+  'は゜': 'ぱ',
  279
+  'ひ゛': 'び',
  280
+  'ひ゜': 'ぴ',
  281
+  'ふ゛': 'ぶ',
  282
+  'ふ゜': 'ぷ',
  283
+  'へ゛': 'べ',
  284
+  'へ゜': 'ぺ',
  285
+  'ほ゛': 'ぼ',
  286
+  'ほ゜': 'ぽ',
  287
+  'っな': 'んな',
  288
+  'っに': 'んに',
  289
+  'っぬ': 'んぬ',
  290
+  'っね': 'んね',
  291
+  'っの': 'んの',
  292
+
  293
+  'ウ゛': 'ヴ',
  294
+  'カ゛': 'ガ',
  295
+  'キ゛': 'ギ',
  296
+  'ク゛': 'グ',
  297
+  'ケ゛': 'ゲ',
  298
+  'コ゛': 'ゴ',
  299
+  'サ゛': 'ザ',
  300
+  'シ゛': 'ジ',
  301
+  'ス゛': 'ズ',
  302
+  'セ゛': 'ゼ',
  303
+  'ソ゛': 'ゾ',
  304
+  'タ゛': 'ダ',
  305
+  'チ゛': 'ヂ',
  306
+  'ツ゛': 'ヅ',
  307
+  'テ゛': 'デ',
  308
+  'ト゛': 'ド',
  309
+  'ハ゛': 'バ',
  310
+  'ハ゜': 'パ',
  311
+  'ヒ゛': 'ビ',
  312
+  'ヒ゜': 'ピ',
  313
+  'フ゛': 'ブ',
  314
+  'フ゜': 'プ',
  315
+  'ヘ゛': 'ベ',
  316
+  'ヘ゜': 'ペ',
  317
+  'ホ゛': 'ボ',
  318
+  'ホ゜': 'ポ',
  319
+  'ッナ': 'ンナ',
  320
+  'ッニ': 'ンニ',
  321
+  'ッヌ': 'ンヌ',
  322
+  'ッネ': 'ンネ',
  323
+  'ッノ': 'ンノ'
  324
+};
  325
+
  326
+// Fill in the conversion tables with the flipped tables.
  327
+conversionTables.halfwidthToFullwidth.alphabet = flip(conversionTables.fullwidthToHalfwidth.alphabet);
  328
+conversionTables.halfwidthToFullwidth.numbers = flip(conversionTables.fullwidthToHalfwidth.numbers);
  329
+conversionTables.halfwidthToFullwidth.punctuation = flip(conversionTables.fullwidthToHalfwidth.punctuation);
  330
+conversionTables.halfwidthToFullwidth.katakana = flip(conversionTables.fullwidthToHalfwidth.katakana);
  331
+
  332
+// Build the normalization table.
  333
+conversionTables.normalize = merge(
  334
+    conversionTables.fullwidthToHalfwidth.alphabet,
  335
+    conversionTables.fullwidthToHalfwidth.numbers,
  336
+    conversionTables.halfwidthToFullwidth.punctuation,
  337
+    conversionTables.halfwidthToFullwidth.katakana
  338
+    );
  339
+
  340
+var converters = {
  341
+  fullwidthToHalfwidth: {
  342
+    alphabet: replacer(conversionTables.fullwidthToHalfwidth.alphabet),
  343
+    numbers: replacer(conversionTables.fullwidthToHalfwidth.numbers),
  344
+    punctuation: replacer(conversionTables.fullwidthToHalfwidth.punctuation),
  345
+    katakana: replacer(conversionTables.fullwidthToHalfwidth.katakana)
  346
+  },
  347
+
  348
+  halfwidthToFullwidth: {
  349
+    alphabet: replacer(conversionTables.halfwidthToFullwidth.alphabet),
  350
+    numbers: replacer(conversionTables.halfwidthToFullwidth.numbers),
  351
+    punctuation: replacer(conversionTables.halfwidthToFullwidth.punctuation),
  352
+    katakana: replacer(conversionTables.halfwidthToFullwidth.katakana)
  353
+  },
  354
+
  355
+  fixFullwidthKana: replacer(fixFullwidthKana),
  356
+  normalize: replacer(conversionTables.normalize)
  357
+};
  358
+
  359
+
  360
/**
 * Turn every hiragana character of a string into fullwidth katakana.
 *
 * Halfwidth katakana is first widened and split voiced/semi-voiced marks are
 * recombined, so the result only holds fullwidth katakana. A code point
 * shift is used rather than a lookup table because, according to
 * http://jsperf.com/converting-japanese, it is faster.
 *
 * The archaic letter E is left untouched: the original author kept that
 * replacement disabled.
 *
 * @param {string} str A string.
 * @return {string} A string not containing hiragana.
 */
converters.hiraganaToKatakana = function(str) {
  var widened = converters.halfwidthToFullwidth.katakana(str);
  var repaired = converters.fixFullwidthKana(widened);

  return repaired
      // Iteration marks have no counterpart at a fixed offset.
      .replace(/ゝ/g, 'ヽ')
      .replace(/ゞ/g, 'ヾ')
      // Each katakana sits 96 code units after its hiragana twin.
      .replace(/[ぁ-ゖ]/g, function(hiragana) {
        return String.fromCharCode(hiragana.charCodeAt(0) + 96);
      });
};
  382
+
  383
+
  384
/**
 * Turn every katakana character of a string into hiragana.
 *
 * Halfwidth katakana is first widened and split voiced/semi-voiced marks are
 * recombined before the conversion, so halfwidth input is converted as well.
 *
 * The archaic letter E is left untouched: the original author kept that
 * replacement disabled.
 *
 * @param {string} str A string.
 * @return {string} A string not containing katakana.
 */
converters.katakanaToHiragana = function(str) {
  var widened = converters.halfwidthToFullwidth.katakana(str);
  var repaired = converters.fixFullwidthKana(widened);

  return repaired
      // Iteration marks have no counterpart at a fixed offset.
      .replace(/ヽ/g, 'ゝ')
      .replace(/ヾ/g, 'ゞ')
      // Each hiragana sits 96 code units before its katakana twin.
      .replace(/[ァ-ヶ]/g, function(katakana) {
        return String.fromCharCode(katakana.charCodeAt(0) - 96);
      });
};
  404
+
  405
+
  406
/**
 * Normalize a Japanese string. The normalization table is applied first:
 * * Alphabet to halfwidth
 * * Numbers to halfwidth
 * * Punctuation to fullwidth
 * * Katakana to fullwidth
 * Badly formed kana is then fixed on the result.
 *
 * @param {string} str
 * @return {string}
 */
var normalize_ja = function(str) {
  var widthNormalized = converters.normalize(str);

  return converters.fixFullwidthKana(widthNormalized);
};
  422
+
  423
+exports.normalize_ja = normalize_ja;
  424
+exports.converters = converters;
114  lib/natural/util/utils.js
... ...
@@ -0,0 +1,114 @@
  1
+/*
  2
+ Copyright (c) 2012, Guillaume Marty
  3
+
  4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
  5
+ of this software and associated documentation files (the "Software"), to deal
  6
+ in the Software without restriction, including without limitation the rights
  7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8
+ copies of the Software, and to permit persons to whom the Software is
  9
+ furnished to do so, subject to the following conditions:
  10
+
  11
+ The above copyright notice and this permission notice shall be included in
  12
+ all copies or substantial portions of the Software.
  13
+
  14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20
+ THE SOFTWARE.
  21
+ */
  22
+
  23
+
  24
/**
 * Generate a replacing function given a table of patterns. Inspired by:
 * http://code.google.com/p/jslibs/wiki/JavascriptTips#String_converter
 *
 * Keys are matched literally: regular expression special characters in them
 * are escaped. Keys are tried longest first, so when one key is a prefix of
 * another the longer one wins whatever the property order of the table is.
 * Only own, non-empty keys of the table are used. When no usable key is left
 * the returned function gives its input back unchanged.
 *
 * @param {Object.<string, string>} translationTable The translation table of key value.
 * @return {function(string): string} A translating function.
 */
function replacer(translationTable) {
  var hasOwn = Object.prototype.hasOwnProperty;

  /**
   * Own, non-empty keys of translationTable.
   * @type {Array.<string>}
   */
  var keys = [];

  /**
   * Used to iterate over translationTable.
   * @type {string}
   */
  var key;

  for (key in translationTable) {
    // An empty key would match at every position of the input.
    if (hasOwn.call(translationTable, key) && key !== '') {
      keys.push(key);
    }
  }

  // An empty alternation matches the empty string everywhere, which used to
  // splice "undefined" between all characters. Nothing to translate instead.
  if (keys.length === 0) {
    return function(str) {
      return str;
    };
  }

  // Alternation picks the first branch that matches: longest keys go first.
  keys.sort(function(a, b) {
    return b.length - a.length;
  });

  /**
   * The keys with regexp special chars escaped.
   * @type {Array.<string>}
   */
  var pattern = keys.map(function(literal) {
    return literal.replace(/[\\^$*+?.()|[\]{}]/g, '\\$&');
  });

  /**
   * The regular expression doing the replacement job.
   * @type {RegExp}
   */
  var regExp = new RegExp(pattern.join('|'), 'g');

  /**
   * @param {string} str Input string.
   * @return {string} The string replaced.
   */
  return function(str) {
    return str.replace(regExp, function(match) {
      return translationTable[match];
    });
  };
}
  69
+
  70
+
  71
/**
 * Exchanges all keys with their associated values in an object.
 *
 * Only own enumerable properties are flipped; inherited ones are ignored.
 * When several keys share the same value, the key enumerated last wins.
 * The returned object has no prototype.
 *
 * @param {Object.<string, string>} obj An object of strings.
 * @return {Object.<string, string>} An object of strings.
 */
function flip(obj) {
  var hasOwn = Object.prototype.hasOwnProperty,
      newObj = Object.create(null),
      key;

  for (key in obj) {
    // Skip properties coming from the prototype chain.
    if (hasOwn.call(obj, key)) {
      newObj[obj[key]] = key;
    }
  }

  return newObj;
}
  87
+
  88
+
  89
/**
 * Merge several objects. Properties from earlier objects are overwritten by
 * those of later ones in case of conflict.
 *
 * Every argument is visited: null or undefined arguments are skipped instead
 * of ending the merge. Only own enumerable properties are copied. The
 * returned object has no prototype.
 *
 * @param {...Object.<string, string>} var_args One or more objects of strings.
 * @return {!Object.<string, string>} An object of strings.
 */
function merge(var_args) {
  var hasOwn = Object.prototype.hasOwnProperty,
      newObj = Object.create(null),
      id, source, key;

  for (id = 0; id < arguments.length; id++) {
    source = arguments[id];

    // A missing table must not hide the tables listed after it.
    if (source === null || source === undefined) {
      continue;
    }

    for (key in source) {
      if (hasOwn.call(source, key)) {
        newObj[key] = source[key];
      }
    }
  }

  return newObj;
}
  111
+
  112
+exports.replacer = replacer;
  113
+exports.flip = flip;
  114
+exports.merge = merge;
140  spec/normalizer_ja_spec.js
... ...
@@ -0,0 +1,140 @@
  1
+/*
  2
+ Copyright (c) 2012, Guillaume Marty
  3
+
  4
+ Permission is hereby granted, free of charge, to any person obtaining a copy
  5
+ of this software and associated documentation files (the "Software"), to deal
  6
+ in the Software without restriction, including without limitation the rights
  7
+ to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
  8
+ copies of the Software, and to permit persons to whom the Software is
  9
+ furnished to do so, subject to the following conditions:
  10
+
  11
+ The above copyright notice and this permission notice shall be included in
  12
+ all copies or substantial portions of the Software.
  13
+
  14
+ THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15
+ IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16
+ FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
  17
+ AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
  18
+ LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
  19
+ OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
  20
+ THE SOFTWARE.
  21
+ */
  22
+
  23
+var normalize_ja = require('lib/natural/normalizers/normalizer_ja').normalize_ja;
  24
+var converters = require('lib/natural/normalizers/normalizer_ja').converters;
  25
+
  26
+describe('normalize_ja', function() {
  27
+  it('should fix badly formed hiragana', function() {
  28
+    expect(normalize_ja('う゛か゛き゛く゛は゜ひ゜ふ゜')).toEqual('ゔがぎぐぱぴぷ');
  29
+    expect(normalize_ja('ゔがぎぐぱぴぷ')).toEqual('ゔがぎぐぱぴぷ');
  30
+    expect(normalize_ja('まっなか')).toEqual('まんなか');
  31
+  });
  32
+
  33
+  it('should fix badly formed fullwidth katakana', function() {
  34
+    expect(normalize_ja('ウ゛カ゛キ゛ク゛ハ゜ヒ゜フ゜')).toEqual('ヴガギグパピプ');
  35
+    expect(normalize_ja('ヴガギグパピプ')).toEqual('ヴガギグパピプ');
  36
+  });
  37
+
  38
+  it('should fix badly formed halfwidth katakana', function() {
  39
+    expect(normalize_ja('ウ゛カ゛キ゛ク゛ハ゜ヒ゜フ゜')).toEqual('ヴガギグパピプ');
  40
+    expect(normalize_ja('ヴガギグパピプ')).toEqual('ヴガギグパピプ');
  41
+  });
  42
+
  43
+  it('should transform halfwidth katakana to fullwidth', function() {
  44
+    expect(normalize_ja('カタカナ')).toEqual('カタカナ');
  45
+  });
  46
+
  47
+  it('should transform fullwidth alphanumerical characters to halfwidth', function() {
  48
+    expect(normalize_ja('ABC123')).toEqual('ABC123');
  49
+  });
  50
+
  51
+  it('should transform fullwidth spaces to halfwidth', function() {
  52
+    expect(normalize_ja('空 空 空')).toEqual('空 空 空');
  53
+  });
  54
+
  55
+  it('should transform halfwidth punctuation signs to fullwidth', function() {
  56
+    // Taken from http://unicode.org/cldr/trac/browser/trunk/common/main/ja.xml
  57
+    expect(normalize_ja('‾ __ -- ‐ — ― 〜 ・ ・ ,, 、、 ;; :: !! ?? .. ‥ … 。。 '\ ‘ ’ "" “ ” (( )) [[ ]] {{ }} 〈 〉 《 》 「「 」」 『 』 【 】 〔 〕 ‖ § ¶ @@ ++ ^^ $$ ** // \\\ && ## %% ‰ † ‡ ′ ″ 〃 ※'))
  58
+      .toEqual('‾ __ ─- ‐ — ― 〜 ・ ・ ,, 、、 ;; :: !! ?? .. ‥ … 。。 '\ ‘ ’ "" “ ” (( )) [[ ]] {{ }} 〈 〉 《 》 「「 」」 『 』 【 】 〔 〕 ‖ § ¶ @@ ++ ^^ $$ ** // \\ && ## %% ‰ † ‡ ′ ″ 〃 ※');
  59
+  });
  60
+});
  61
+
  62
+var sample = 'ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ';
  63
+
  64
+describe('converters', function() {
  65
+  it('should all be reversible', function() {
  66
+    var sample = '半角カナ(はんかくカナ)とは、JIS X 0208など片仮名を含む他の文字集合と同時に運用される場合におけるJIS X 0201の片仮名文字集合の通称である。漢字を含む文字集合で定義された片仮名に対して、半分の文字幅で表示されることが一般的であったためこのように呼ばれる。JIS X 0201で規定される8ビット符号化およびShift_JISにおいて0xA1-0xDFの範囲の1バイト文字がこれにあたる。また、Shift_JISやEUC-JPなどの符号化方式やUnicodeでも互換性の目的でこの文字集合をもっている。';
  67
+    expect(converters.halfwidthToFullwidth.alphabet(converters.fullwidthToHalfwidth.alphabet(sample))).toEqual(converters.halfwidthToFullwidth.alphabet(sample));
  68
+    expect(converters.fullwidthToHalfwidth.alphabet(converters.halfwidthToFullwidth.alphabet(sample))).toEqual(converters.fullwidthToHalfwidth.alphabet(sample));
  69
+    expect(converters.halfwidthToFullwidth.numbers(converters.fullwidthToHalfwidth.numbers(sample))).toEqual(converters.halfwidthToFullwidth.numbers(sample));
  70
+    expect(converters.fullwidthToHalfwidth.numbers(converters.halfwidthToFullwidth.numbers(sample))).toEqual(converters.fullwidthToHalfwidth.numbers(sample));
  71
+    expect(converters.halfwidthToFullwidth.punctuation(converters.fullwidthToHalfwidth.punctuation(sample))).toEqual(converters.halfwidthToFullwidth.punctuation(sample));
  72
+    expect(converters.fullwidthToHalfwidth.punctuation(converters.halfwidthToFullwidth.punctuation(sample))).toEqual(converters.fullwidthToHalfwidth.punctuation(sample));
  73
+    expect(converters.halfwidthToFullwidth.katakana(converters.fullwidthToHalfwidth.katakana(sample))).toEqual(converters.halfwidthToFullwidth.katakana(sample));
  74
+    expect(converters.fullwidthToHalfwidth.katakana(converters.halfwidthToFullwidth.katakana(sample))).toEqual(converters.fullwidthToHalfwidth.katakana(sample));
  75
+  });
  76
+
  77
+  describe('.fullwidthToHalfwidth', function() {
  78
+    describe('.alphabet', function() {
  79
+      it('should transform fullwidth roman characters and space to halfwidth', function() {
  80
+        expect(converters.fullwidthToHalfwidth.alphabet(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  81
+      });
  82
+    });
  83
+
  84
+    describe('.numbers', function() {
  85
+      it('should transform fullwidth numerical characters to halfwidth', function() {
  86
+        expect(converters.fullwidthToHalfwidth.numbers(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  87
+      });
  88
+    });
  89
+
  90
+    describe('.punctuation', function() {
  91
+      it('should transform fullwidth punctuation signs to halfwidth', function() {
  92
+        expect(converters.fullwidthToHalfwidth.punctuation(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  93
+      });
  94
+    });
  95
+
  96
+    describe('.katakana', function() {
  97
+      it('should transform fullwidth katakana to halfwidth', function() {
  98
+        expect(converters.fullwidthToHalfwidth.katakana(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  99
+      });
  100
+    });
  101
+  });
  102
+
  103
+  describe('.halfwidthToFullwidth', function() {
  104
+    describe('.alphabet', function() {
  105
+      it('should transform halfwidth roman characters and space to fullwidth', function() {
  106
+        expect(converters.halfwidthToFullwidth.alphabet(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  107
+      });
  108
+    });
  109
+
  110
+    describe('.numbers', function() {
  111
+      it('should transform halfwidth numerical characters to fullwidth', function() {
  112
+        expect(converters.halfwidthToFullwidth.numbers(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  113
+      });
  114
+    });
  115
+
  116
+    describe('.punctuation', function() {
  117
+      it('should transform halfwidth punctuation signs to fullwidth', function() {
  118
+        expect(converters.halfwidthToFullwidth.punctuation(sample)).toEqual('ABC ABC 123123.,─.,─ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  119
+      });
  120
+    });
  121
+
  122
+    describe('.katakana', function() {
  123
+      it('should transform halfwidth katakana to fullwidth', function() {
  124
+        expect(converters.halfwidthToFullwidth.katakana(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ');
  125
+      });
  126
+    });
  127
+  });
  128
+
  129
+  describe('.hiraganaToKatakana', function() {
  130
+    it('should transform hiragana to katakana', function() {
  131
+      expect(converters.hiraganaToKatakana(sample)).toEqual('ABC ABC 123123.,-.,-ヴアイウエオハバパカキクケコハバパヴカキクケコハバパ');
  132
+    });
  133
+  });
  134
+
  135
+  describe('.katakanaToHiragana', function() {
  136
+    it('should transform katakana to hiragana', function() {
  137
+      expect(converters.katakanaToHiragana(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱかきくけこはばぱゔかきくけこはばぱ');
  138
+    });
  139
+  });
  140
+});
Commit_comment_tip

Tip: You can add notes to lines in a file. Hover to the left of a line to make a note

Something went wrong with that request. Please try again.