Skip to content

Commit

Permalink
Fixed the Japanese normalizer and stemmer to work with TypeScript declarations
Browse files Browse the repository at this point in the history
  • Loading branch information
Hugo-ter-Doest committed Apr 7, 2024
1 parent 925058b commit 14b05b0
Show file tree
Hide file tree
Showing 16 changed files with 1,219 additions and 274 deletions.
21 changes: 21 additions & 0 deletions lib/natural/normalizers/index.d.ts
Expand Up @@ -26,3 +26,24 @@ export function normalize (tokens: string | string[]): string[]
/**
 * Japanese string normalizer: takes a string and returns a string.
 * index.js binds this name to `normalizeJa` from `./normalizer_ja`.
 */
// eslint-disable-next-line @typescript-eslint/naming-convention
export function normalize_ja (str: string): string
/**
 * Takes a string and returns a string; index.js binds this name to the
 * export of `./remove_diacritics`.
 * NOTE(review): presumably strips diacritical marks — confirm against the implementation.
 */
export function removeDiacritics (str: string): string

/**
 * Typings for the `Converters` class exported from `./normalizer_ja`.
 *
 * Method suffixes encode the conversion direction:
 * `FH` = fullwidth to halfwidth, `HF` = halfwidth to fullwidth.
 * Directional and kana converters are instance members; the table-driven
 * fix-up helpers are static.
 */
export class Converters {
  /** Fullwidth alphabet to halfwidth. */
  alphabetFH: (str: string) => string
  /** Halfwidth alphabet to fullwidth. */
  alphabetHF: (str: string) => string
  /** Fullwidth numbers to halfwidth. */
  numbersFH: (str: string) => string
  /** Halfwidth numbers to fullwidth. */
  numbersHF: (str: string) => string
  /** Fullwidth punctuation to halfwidth. */
  punctuationFH: (str: string) => string
  /** Halfwidth punctuation to fullwidth. */
  punctuationHF: (str: string) => string
  /** Fullwidth symbols to halfwidth. */
  symbolFH: (str: string) => string
  /** Halfwidth symbols to fullwidth. */
  symbolHF: (str: string) => string
  /** Fullwidth "pure punctuation" table to halfwidth. */
  purePunctuationFH: (str: string) => string
  /** Halfwidth "pure punctuation" table to fullwidth. */
  purePunctuationHF: (str: string) => string
  /** Fullwidth katakana to halfwidth. */
  katakanaFH: (str: string) => string
  /** Halfwidth katakana to fullwidth. */
  katakanaHF: (str: string) => string
  /** Convert hiragana to fullwidth katakana. */
  hiraganaToKatakana: (str: string) => string
  /** Convert katakana to hiragana. */
  katakanaToHiragana: (str: string) => string
  /** Apply the fullwidth-kana fix-up table. */
  static fixFullwidthKana: (str: string) => string
  /** Apply the merged normalization table. */
  static normalize: (str: string) => string
  /** Replace composite symbols using the composite-symbol table. */
  static fixCompositeSymbols: (str: string) => string
}

// Shape of a lookup table with string keys and string replacement values.
// NOTE(review): not exported and not referenced by the visible declarations;
// presumably describes the composite-symbol table in normalizer_ja.js — confirm it is needed.
type FixCompositeSymbolsTable = Record<string, string>

// Signature of a string-to-string normalizer; same shape as the declared normalize_ja.
// NOTE(review): not exported and not referenced by the visible declarations — confirm it is needed.
type NormalizeJa = (str: string) => string
1 change: 1 addition & 0 deletions lib/natural/normalizers/index.js
Expand Up @@ -24,4 +24,5 @@ THE SOFTWARE.

// Generic token normalizer.
const { normalizeTokens } = require('./normalizer')
exports.normalize = normalizeTokens

// Japanese normalizer and its converter class live in the same module.
const normalizerJa = require('./normalizer_ja')
exports.normalize_ja = normalizerJa.normalizeJa
exports.Converters = normalizerJa.Converters

exports.removeDiacritics = require('./remove_diacritics')
181 changes: 109 additions & 72 deletions lib/natural/normalizers/normalizer_ja.js
Expand Up @@ -48,7 +48,7 @@

const flip = require('../util/utils.js').flip
const merge = require('../util/utils.js').merge
const replacer = require('../util/utils').replacer
const replacer = require('../util/utils.js').replacer

// From http://fernweh.jp/b/mb_convert_kana_js/
const conversionTables = {
Expand Down Expand Up @@ -522,73 +522,110 @@ conversionTables.normalize = merge(
conversionTables.halfwidthToFullwidth.katakana
)

/**
 * Build one replacer per character class from a single direction's tables.
 * Replacers are created in the order: alphabet, numbers, symbol,
 * purePunctuation, punctuation, katakana.
 */
const buildDirectionalConverters = function (tables) {
  return {
    alphabet: replacer(tables.alphabet),
    numbers: replacer(tables.numbers),
    symbol: replacer(tables.symbol),
    purePunctuation: replacer(tables.purePunctuation),
    punctuation: replacer(tables.punctuation),
    katakana: replacer(tables.katakana)
  }
}

// All converters are built once, at module load.
const converters = {
  fullwidthToHalfwidth: buildDirectionalConverters(conversionTables.fullwidthToHalfwidth),
  halfwidthToFullwidth: buildDirectionalConverters(conversionTables.halfwidthToFullwidth),

  fixFullwidthKana: replacer(fixFullwidthKana),
  normalize: replacer(conversionTables.normalize)
}

// Replacer for composite symbols, kept separate from the converters object.
const fixCompositeSymbols = replacer(fixCompositeSymbolsTable)

/**
 * Convert hiragana to fullwidth katakana.
 * According to http://jsperf.com/converting-japanese, these implementations are
 * faster than using lookup tables.
 *
 * Halfwidth katakana in the input is first widened and fixed up, so the
 * result contains fullwidth katakana only.
 *
 * @param {string} str A string.
 * @return {string} A string not containing hiragana.
 */
converters.hiraganaToKatakana = function (str) {
  str = converters.halfwidthToFullwidth.katakana(str)
  str = converters.fixFullwidthKana(str)

  // The iteration marks lie outside the ぁ-ゖ range handled below, so they
  // are mapped explicitly. Written as escapes so the pattern cannot be lost
  // to an encoding problem (an empty `//g` would start a line comment).
  str = str.replace(/\u309D/g, 'ヽ') // ゝ hiragana iteration mark
  str = str.replace(/\u309E/g, 'ヾ') // ゞ hiragana voiced iteration mark
  // str = str.replace(/?/g, '𛀀'); // Letter archaic E

  // Each hiragana code unit in ぁ-ゖ is shifted by 96 to its katakana counterpart.
  str = str.replace(/[ぁ-ゖ]/g, function (ch) {
    return String.fromCharCode(ch.charCodeAt(0) + 96)
  })

  return str
}

/**
* Convert katakana to hiragana.
*
* @param {string} str A string.
* @return {string} A string not containing katakana.
*/
converters.katakanaToHiragana = function (str) {
str = converters.halfwidthToFullwidth.katakana(str)
str = converters.fixFullwidthKana(str)

str = str.replace(//g, 'ゝ')
str = str.replace(//g, 'ゞ')
// str = str.replace(/?/g, '𛀁'); // Letter archaic E

str = str.replace(/[ァ-ヶ]/g, function (str) {
return String.fromCharCode(str.charCodeAt(0) - 96)
})

return str
class Converters {
alphabetFH (str) {
return replacer(conversionTables.fullwidthToHalfwidth.alphabet)(str)
}

numbersFH (str) {
return replacer(conversionTables.fullwidthToHalfwidth.numbers)(str)
}

symbolFH (str) {
return replacer(conversionTables.fullwidthToHalfwidth.symbol)(str)
}

purePunctuationFH (str) {
return replacer(conversionTables.fullwidthToHalfwidth.purePunctuation)(str)
}

punctuationFH (str) {
return replacer(conversionTables.fullwidthToHalfwidth.punctuation)(str)
}

katakanaFH (str) {
return replacer(conversionTables.fullwidthToHalfwidth.katakana)(str)
}

static fixFullwidthKana(str) {
return replacer(fixFullwidthKana)(str)
}

static normalize(str) {
return replacer(conversionTables.normalize)(str)
}

alphabetHF (str) {
return replacer(conversionTables.halfwidthToFullwidth.alphabet)(str)
}

numbersHF (str) {
return replacer(conversionTables.halfwidthToFullwidth.numbers)(str)
}

symbolHF (str) {
return replacer(conversionTables.halfwidthToFullwidth.symbol)(str)
}

purePunctuationHF (str) {
return replacer(conversionTables.halfwidthToFullwidth.purePunctuation)(str)
}

punctuationHF (str) {
return replacer(conversionTables.halfwidthToFullwidth.punctuation)(str)
}

katakanaHF (str) {
return replacer(conversionTables.halfwidthToFullwidth.katakana)(str)
}

static fixCompositeSymbols (str) {
return replacer(fixCompositeSymbolsTable)(str)
}

/**
* Convert hiragana to fullwidth katakana.
* According to http://jsperf.com/converting-japanese, these implementations are
* faster than using lookup tables.
*
* @param {string} str A string.
* @return {string} A string not containing hiragana.
*/
hiraganaToKatakana (str) {
str = this.katakanaHF(str)
str = Converters.fixFullwidthKana(str)

str = str.replace(//g, 'ヽ')
str = str.replace(//g, 'ヾ')
// str = str.replace(/?/g, '𛀀'); // Letter archaic E

str = str.replace(/[ぁ-ゖ]/g, function (str) {
return String.fromCharCode(str.charCodeAt(0) + 96)
})

return str
}

/**
* Convert katakana to hiragana.
*
* @param {string} str A string.
* @return {string} A string not containing katakana.
*/
katakanaToHiragana (str) {
str = this.katakanaHF(str)
str = Converters.fixFullwidthKana(str)

str = str.replace(//g, 'ゝ')
str = str.replace(//g, 'ゞ')
// str = str.replace(/?/g, '𛀁'); // Letter archaic E

str = str.replace(/[ァ-ヶ]/g, function (str) {
return String.fromCharCode(str.charCodeAt(0) - 96)
})

return str
}
}

/**
Expand All @@ -610,14 +647,14 @@ const normalizeJa = function (str) {
.replace(/(..)々々/g, '$1$1')
.replace(/(.)々/g, '$1$1')

str = converters.normalize(str)
str = converters.fixFullwidthKana(str)
str = Converters.normalize(str)
str = Converters.fixFullwidthKana(str)

// Replace composite symbols.
str = fixCompositeSymbols(str)
str = Converters.fixCompositeSymbols(str)

return str
}

exports.normalizeJa = normalizeJa
exports.converters = converters
exports.Converters = Converters
20 changes: 20 additions & 0 deletions lib/natural/stemmers/index.d.ts
Expand Up @@ -67,3 +67,23 @@ export let PorterStemmerRu: Stemmer
export let PorterStemmerSv: Stemmer
export let StemmerId: Stemmer
export let StemmerJa: Stemmer

// Callback type accepted as the optional `callback` argument of Token.markRegion:
// receives any number of numeric arguments and returns a number or an array of numbers.
export declare type TokenCallback = (...args: number[]) => number[] | number

// Declaration of the Token class that stemmers/index.js exports from './token'.
// Methods declared as returning Token presumably return the instance for
// chaining — confirm against token.js.
export declare class Token {
vowels: string[] | string
// Map from region name to a numeric value (presumably a start index — TODO confirm).
regions: Record<string, number>
string: string
original: string

constructor (s: string)
usingVowels (vowels: string | string[]): Token
markRegion (region: string, args: number[] | number | null, callback?: TokenCallback, context?: unknown): Token
replaceAll (find: string, replace: string): Token
replaceSuffixInRegion (suffix: string, replace: string, region: string): Token
hasVowelAtIndex (index: number): boolean
nextVowelIndex (index: number): number
nextConsonantIndex (index: number): number
// NOTE(review): declared as returning number while hasSuffixInRegion returns
// boolean — confirm the actual return type in token.js.
hasSuffix (suffix: string): number
hasSuffixInRegion (suffix: string, region: string): boolean
}
1 change: 1 addition & 0 deletions lib/natural/stemmers/index.js
Expand Up @@ -38,3 +38,4 @@ exports.PorterStemmerNl = require('./porter_stemmer_nl')
exports.LancasterStemmer = require('./lancaster_stemmer')
exports.StemmerJa = require('./stemmer_ja')
exports.StemmerId = require('./indonesian/stemmer_id')
exports.Token = require('./token')

0 comments on commit 14b05b0

Please sign in to comment.