Fixed the Japanese normalizer and stemmer to work with TypeScript declarations
NaturalNode · Apr 7, 2024 · 14b05b0 · 14b05b0
1 parent 925058b
commit 14b05b0
Show file tree

Hide file tree

Showing 16 changed files with 1,219 additions and 274 deletions.
diff --git a/lib/natural/normalizers/index.d.ts b/lib/natural/normalizers/index.d.ts
@@ -26,3 +26,24 @@ export function normalize (tokens: string | string[]): string[]
 // eslint-disable-next-line @typescript-eslint/naming-convention
 export function normalize_ja (str: string): string
 export function removeDiacritics (str: string): string
+
+/**
+ * Declaration of the Converters class implemented in normalizer_ja.js and
+ * re-exported by normalizers/index.js.
+ *
+ * Method suffixes follow the conversion table each method applies:
+ * FH = fullwidth-to-halfwidth, HF = halfwidth-to-fullwidth.
+ *
+ * Members are declared as methods (not function-typed properties) because
+ * the implementation defines them as prototype/static methods.
+ */
+export class Converters {
+  alphabetFH (str: string): string
+  alphabetHF (str: string): string
+  numbersFH (str: string): string
+  numbersHF (str: string): string
+  punctuationFH (str: string): string
+  punctuationHF (str: string): string
+  symbolFH (str: string): string
+  symbolHF (str: string): string
+  purePunctuationFH (str: string): string
+  purePunctuationHF (str: string): string
+  katakanaFH (str: string): string
+  katakanaHF (str: string): string
+  /** Converts hiragana to fullwidth katakana. */
+  hiraganaToKatakana (str: string): string
+  /** Converts katakana to hiragana. */
+  katakanaToHiragana (str: string): string
+  static fixFullwidthKana (str: string): string
+  static normalize (str: string): string
+  static fixCompositeSymbols (str: string): string
+}
+
+type FixCompositeSymbolsTable = Record<string, string>
+
+type NormalizeJa = (str: string) => string
diff --git a/lib/natural/normalizers/index.js b/lib/natural/normalizers/index.js
@@ -24,4 +24,5 @@ THE SOFTWARE.
 
 exports.normalize = require('./normalizer').normalizeTokens
 exports.normalize_ja = require('./normalizer_ja').normalizeJa
+exports.Converters = require('./normalizer_ja').Converters
 exports.removeDiacritics = require('./remove_diacritics')
diff --git a/lib/natural/normalizers/normalizer_ja.js b/lib/natural/normalizers/normalizer_ja.js
@@ -48,7 +48,7 @@
 
 const flip = require('../util/utils.js').flip
 const merge = require('../util/utils.js').merge
-const replacer = require('../util/utils').replacer
+const replacer = require('../util/utils.js').replacer
 
 // From http://fernweh.jp/b/mb_convert_kana_js/
 const conversionTables = {
@@ -522,73 +522,110 @@ conversionTables.normalize = merge(
   conversionTables.halfwidthToFullwidth.katakana
 )
 
-const converters = {
-  fullwidthToHalfwidth: {
-    alphabet: replacer(conversionTables.fullwidthToHalfwidth.alphabet),
-    numbers: replacer(conversionTables.fullwidthToHalfwidth.numbers),
-    symbol: replacer(conversionTables.fullwidthToHalfwidth.symbol),
-    purePunctuation: replacer(conversionTables.fullwidthToHalfwidth.purePunctuation),
-    punctuation: replacer(conversionTables.fullwidthToHalfwidth.punctuation),
-    katakana: replacer(conversionTables.fullwidthToHalfwidth.katakana)
-  },
-
-  halfwidthToFullwidth: {
-    alphabet: replacer(conversionTables.halfwidthToFullwidth.alphabet),
-    numbers: replacer(conversionTables.halfwidthToFullwidth.numbers),
-    symbol: replacer(conversionTables.halfwidthToFullwidth.symbol),
-    purePunctuation: replacer(conversionTables.halfwidthToFullwidth.purePunctuation),
-    punctuation: replacer(conversionTables.halfwidthToFullwidth.punctuation),
-    katakana: replacer(conversionTables.halfwidthToFullwidth.katakana)
-  },
-
-  fixFullwidthKana: replacer(fixFullwidthKana),
-  normalize: replacer(conversionTables.normalize)
-}
-
-const fixCompositeSymbols = replacer(fixCompositeSymbolsTable)
-
-/**
- * Convert hiragana to fullwidth katakana.
- * According to http://jsperf.com/converting-japanese, these implementations are
- * faster than using lookup tables.
- *
- * @param {string} str A string.
- * @return {string} A string not containing hiragana.
- */
-converters.hiraganaToKatakana = function (str) {
-  str = converters.halfwidthToFullwidth.katakana(str)
-  str = converters.fixFullwidthKana(str)
-
-  str = str.replace(/ゝ/g, 'ヽ')
-  str = str.replace(/ゞ/g, 'ヾ')
-  // str = str.replace(/?/g, '𛀀'); // Letter archaic E
-
-  str = str.replace(/[ぁ-ゖ]/g, function (str) {
-    return String.fromCharCode(str.charCodeAt(0) + 96)
-  })
-
-  return str
-}
-
-/**
- * Convert katakana to hiragana.
- *
- * @param {string} str A string.
- * @return {string} A string not containing katakana.
- */
-converters.katakanaToHiragana = function (str) {
-  str = converters.halfwidthToFullwidth.katakana(str)
-  str = converters.fixFullwidthKana(str)
-
-  str = str.replace(/ヽ/g, 'ゝ')
-  str = str.replace(/ヾ/g, 'ゞ')
-  // str = str.replace(/?/g, '𛀁'); // Letter archaic E
-
-  str = str.replace(/[ァ-ヶ]/g, function (str) {
-    return String.fromCharCode(str.charCodeAt(0) - 96)
-  })
-
-  return str
+class Converters {
+  alphabetFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.alphabet)(str)
+  }
+
+  numbersFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.numbers)(str)
+  }
+
+  symbolFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.symbol)(str)
+  }
+
+  purePunctuationFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.purePunctuation)(str)
+  }
+
+  punctuationFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.punctuation)(str)
+  }
+
+  katakanaFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.katakana)(str)
+  }
+
+  static fixFullwidthKana(str) {
+    return replacer(fixFullwidthKana)(str)
+  }
+
+  static normalize(str) {
+    return replacer(conversionTables.normalize)(str)
+  }
+
+  alphabetHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.alphabet)(str)
+  }
+
+  numbersHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.numbers)(str)
+  }
+
+  symbolHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.symbol)(str)
+  }
+
+  purePunctuationHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.purePunctuation)(str)
+  }
+
+  punctuationHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.punctuation)(str)
+  }
+
+  katakanaHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.katakana)(str)
+  }
+
+  static fixCompositeSymbols (str) {
+    return replacer(fixCompositeSymbolsTable)(str)
+  }
+
+  /**
+   * Convert hiragana to fullwidth katakana.
+   * According to http://jsperf.com/converting-japanese, these implementations are
+   * faster than using lookup tables.
+   *
+   * @param {string} str A string.
+   * @return {string} A string not containing hiragana.
+   */
+  hiraganaToKatakana (str) {
+    str = this.katakanaHF(str)
+    str = Converters.fixFullwidthKana(str)
+
+    str = str.replace(/ゝ/g, 'ヽ')
+    str = str.replace(/ゞ/g, 'ヾ')
+    // str = str.replace(/?/g, '𛀀'); // Letter archaic E
+
+    str = str.replace(/[ぁ-ゖ]/g, function (str) {
+      return String.fromCharCode(str.charCodeAt(0) + 96)
+    })
+
+    return str
+  }
+
+  /**
+   * Convert katakana to hiragana.
+   *
+   * @param {string} str A string.
+   * @return {string} A string not containing katakana.
+   */
+  katakanaToHiragana (str) {
+    str = this.katakanaHF(str)
+    str = Converters.fixFullwidthKana(str)
+
+    str = str.replace(/ヽ/g, 'ゝ')
+    str = str.replace(/ヾ/g, 'ゞ')
+    // str = str.replace(/?/g, '𛀁'); // Letter archaic E
+
+    str = str.replace(/[ァ-ヶ]/g, function (str) {
+      return String.fromCharCode(str.charCodeAt(0) - 96)
+    })
+
+    return str
+  }
 }
 
 /**
@@ -610,14 +647,14 @@ const normalizeJa = function (str) {
     .replace(/(..)々々/g, '$1$1')
     .replace(/(.)々/g, '$1$1')
 
-  str = converters.normalize(str)
-  str = converters.fixFullwidthKana(str)
+  str = Converters.normalize(str)
+  str = Converters.fixFullwidthKana(str)
 
   // Replace composite symbols.
-  str = fixCompositeSymbols(str)
+  str = Converters.fixCompositeSymbols(str)
 
   return str
 }
 
 exports.normalizeJa = normalizeJa
-exports.converters = converters
+exports.Converters = Converters
diff --git a/lib/natural/stemmers/index.d.ts b/lib/natural/stemmers/index.d.ts
@@ -67,3 +67,23 @@ export let PorterStemmerRu: Stemmer
 export let PorterStemmerSv: Stemmer
 export let StemmerId: Stemmer
 export let StemmerJa: Stemmer
+
+export declare type TokenCallback = (...args: number[]) => number[] | number
+
+/**
+ * Type declaration for the Token class that stemmers/index.js exports from
+ * './token'. Only the signatures are declared here; the notes on individual
+ * members are review assumptions that must be confirmed against token.js.
+ */
+export declare class Token {
+  vowels: string[] | string
+  regions: Record<string, number>
+  string: string
+  original: string
+
+  constructor (s: string)
+  // The next four methods are typed to return a Token — presumably the
+  // receiver itself so that calls can be chained; TODO confirm in token.js.
+  usingVowels (vowels: string | string[]): Token
+  markRegion (region: string, args: number[] | number | null, callback?: TokenCallback, context?: unknown): Token
+  replaceAll (find: string, replace: string): Token
+  replaceSuffixInRegion (suffix: string, replace: string, region: string): Token
+  hasVowelAtIndex (index: number): boolean
+  nextVowelIndex (index: number): number
+  nextConsonantIndex (index: number): number
+  // NOTE(review): hasSuffix is typed as number while the sibling
+  // hasSuffixInRegion is typed as boolean — verify against token.js that
+  // this return type is not meant to be boolean.
+  hasSuffix (suffix: string): number
+  hasSuffixInRegion (suffix: string, region: string): boolean
+}
diff --git a/lib/natural/stemmers/index.js b/lib/natural/stemmers/index.js
@@ -38,3 +38,4 @@ exports.PorterStemmerNl = require('./porter_stemmer_nl')
 exports.LancasterStemmer = require('./lancaster_stemmer')
 exports.StemmerJa = require('./stemmer_ja')
 exports.StemmerId = require('./indonesian/stemmer_id')
+exports.Token = require('./token')