From 14b05b08d423a4602b8b918fa479aeb41b56e722 Mon Sep 17 00:00:00 2001
From: Hugo-ter-Doest
Date: Sun, 7 Apr 2024 10:06:27 +0200
Subject: [PATCH] Fixed the Japanese normalizer and stemmer to work with
 TypeScript declarations

---
 lib/natural/normalizers/index.d.ts       |  21 ++
 lib/natural/normalizers/index.js         |   1 +
 lib/natural/normalizers/normalizer_ja.js | 181 ++++++-----
 lib/natural/stemmers/index.d.ts          |  20 ++
 lib/natural/stemmers/index.js            |   1 +
 lib/natural/stemmers/token.js            | 263 ++++++++--------
 lib/natural/tokenizers/index.d.ts        |   2 +-
 spec/normalizer_ja_spec.js               |  35 +--
 spec/tokenizer_ja_spec.js                |  11 +-
 ts_spec/spellcheck_test.ts               |   7 -
 ts_spec/stemmer_id_spec.ts               | 374 +++++++++++++++++++++++
 ts_spec/stemmer_ja_spec.ts               |  69 +++++
 ts_spec/stemmer_token_spec.ts            | 163 ++++++++++
 ts_spec/stemmers_test.ts                 |  40 ---
 ts_spec/tokenizer_case_spec.ts           | 242 +++++++++++++++
 ts_spec/tokenizer_ja_spec.ts             |  63 ++++
 16 files changed, 1219 insertions(+), 274 deletions(-)
 delete mode 100644 ts_spec/spellcheck_test.ts
 create mode 100644 ts_spec/stemmer_id_spec.ts
 create mode 100644 ts_spec/stemmer_ja_spec.ts
 create mode 100644 ts_spec/stemmer_token_spec.ts
 delete mode 100644 ts_spec/stemmers_test.ts
 create mode 100644 ts_spec/tokenizer_case_spec.ts
 create mode 100644 ts_spec/tokenizer_ja_spec.ts

diff --git a/lib/natural/normalizers/index.d.ts b/lib/natural/normalizers/index.d.ts
index 2694ad2d6..318cc337f 100644
--- a/lib/natural/normalizers/index.d.ts
+++ b/lib/natural/normalizers/index.d.ts
@@ -26,3 +26,24 @@ export function normalize (tokens: string | string[]): string[]
 // eslint-disable-next-line @typescript-eslint/naming-convention
 export function normalize_ja (str: string): string
 export function removeDiacritics (str: string): string
+
+export class Converters {
+  alphabetFH: (str: string) => string
+  alphabetHF: (str: string) => string
+  numbersFH: (str: string) => string
+  numbersHF: (str: string) => string
+  punctuationFH: (str: string) => string
+  punctuationHF: (str: string) => string
+  symbolFH: (str: string) => string
+  symbolHF: (str: string) => string
+  purePunctuationFH: (str: string) => string
+  purePunctuationHF: (str: string) => string
+  katakanaFH: (str: string) => string
+  katakanaHF: (str: string) => string
+  static fixFullwidthKana: (str: string) => string
+  static normalize: (str: string) => string
+}
+
+type FixCompositeSymbolsTable = Record<string, string>
+
+type NormalizeJa = (str: string) => string
diff --git a/lib/natural/normalizers/index.js b/lib/natural/normalizers/index.js
index 97bbf98a7..9f8ffa1a7 100644
--- a/lib/natural/normalizers/index.js
+++ b/lib/natural/normalizers/index.js
@@ -24,4 +24,5 @@ THE SOFTWARE.
exports.normalize = require('./normalizer').normalizeTokens exports.normalize_ja = require('./normalizer_ja').normalizeJa +exports.Converters = require('./normalizer_ja').Converters exports.removeDiacritics = require('./remove_diacritics') diff --git a/lib/natural/normalizers/normalizer_ja.js b/lib/natural/normalizers/normalizer_ja.js index 6721b9cb0..d0e1fed1b 100644 --- a/lib/natural/normalizers/normalizer_ja.js +++ b/lib/natural/normalizers/normalizer_ja.js @@ -48,7 +48,7 @@ const flip = require('../util/utils.js').flip const merge = require('../util/utils.js').merge -const replacer = require('../util/utils').replacer +const replacer = require('../util/utils.js').replacer // From http://fernweh.jp/b/mb_convert_kana_js/ const conversionTables = { @@ -522,73 +522,110 @@ conversionTables.normalize = merge( conversionTables.halfwidthToFullwidth.katakana ) -const converters = { - fullwidthToHalfwidth: { - alphabet: replacer(conversionTables.fullwidthToHalfwidth.alphabet), - numbers: replacer(conversionTables.fullwidthToHalfwidth.numbers), - symbol: replacer(conversionTables.fullwidthToHalfwidth.symbol), - purePunctuation: replacer(conversionTables.fullwidthToHalfwidth.purePunctuation), - punctuation: replacer(conversionTables.fullwidthToHalfwidth.punctuation), - katakana: replacer(conversionTables.fullwidthToHalfwidth.katakana) - }, - - halfwidthToFullwidth: { - alphabet: replacer(conversionTables.halfwidthToFullwidth.alphabet), - numbers: replacer(conversionTables.halfwidthToFullwidth.numbers), - symbol: replacer(conversionTables.halfwidthToFullwidth.symbol), - purePunctuation: replacer(conversionTables.halfwidthToFullwidth.purePunctuation), - punctuation: replacer(conversionTables.halfwidthToFullwidth.punctuation), - katakana: replacer(conversionTables.halfwidthToFullwidth.katakana) - }, - - fixFullwidthKana: replacer(fixFullwidthKana), - normalize: replacer(conversionTables.normalize) -} - -const fixCompositeSymbols = replacer(fixCompositeSymbolsTable) - -/** - * Convert hiragana to fullwidth katakana. - * According to http://jsperf.com/converting-japanese, these implementations are - * faster than using lookup tables. - * - * @param {string} str A string. - * @return {string} A string not containing hiragana. - */ -converters.hiraganaToKatakana = function (str) { - str = converters.halfwidthToFullwidth.katakana(str) - str = converters.fixFullwidthKana(str) - - str = str.replace(/ゝ/g, 'ヽ') - str = str.replace(/ゞ/g, 'ヾ') - // str = str.replace(/?/g, '𛀀'); // Letter archaic E - - str = str.replace(/[ぁ-ゖ]/g, function (str) { - return String.fromCharCode(str.charCodeAt(0) + 96) - }) - - return str -} - -/** - * Convert katakana to hiragana. - * - * @param {string} str A string. - * @return {string} A string not containing katakana. 
- */
-converters.katakanaToHiragana = function (str) {
-  str = converters.halfwidthToFullwidth.katakana(str)
-  str = converters.fixFullwidthKana(str)
-
-  str = str.replace(/ヽ/g, 'ゝ')
-  str = str.replace(/ヾ/g, 'ゞ')
-  // str = str.replace(/?/g, '𛀁'); // Letter archaic E
-
-  str = str.replace(/[ァ-ヶ]/g, function (str) {
-    return String.fromCharCode(str.charCodeAt(0) - 96)
-  })
-
-  return str
+class Converters {
+  alphabetFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.alphabet)(str)
+  }
+
+  numbersFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.numbers)(str)
+  }
+
+  symbolFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.symbol)(str)
+  }
+
+  purePunctuationFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.purePunctuation)(str)
+  }
+
+  punctuationFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.punctuation)(str)
+  }
+
+  katakanaFH (str) {
+    return replacer(conversionTables.fullwidthToHalfwidth.katakana)(str)
+  }
+
+  static fixFullwidthKana (str) {
+    return replacer(fixFullwidthKana)(str)
+  }
+
+  static normalize (str) {
+    return replacer(conversionTables.normalize)(str)
+  }
+
+  alphabetHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.alphabet)(str)
+  }
+
+  numbersHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.numbers)(str)
+  }
+
+  symbolHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.symbol)(str)
+  }
+
+  purePunctuationHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.purePunctuation)(str)
+  }
+
+  punctuationHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.punctuation)(str)
+  }
+
+  katakanaHF (str) {
+    return replacer(conversionTables.halfwidthToFullwidth.katakana)(str)
+  }
+
+  static fixCompositeSymbols (str) {
+    return replacer(fixCompositeSymbolsTable)(str)
+  }
+
+  /**
+   * Convert hiragana to fullwidth katakana.
+   * According to http://jsperf.com/converting-japanese, these implementations are
+   * faster than using lookup tables.
+   *
+   * @param {string} str A string.
+   * @return {string} A string not containing hiragana.
+   */
+  hiraganaToKatakana (str) {
+    str = this.katakanaHF(str)
+    str = Converters.fixFullwidthKana(str)
+
+    str = str.replace(/ゝ/g, 'ヽ')
+    str = str.replace(/ゞ/g, 'ヾ')
+    // str = str.replace(/?/g, '𛀀'); // Letter archaic E
+
+    str = str.replace(/[ぁ-ゖ]/g, function (str) {
+      return String.fromCharCode(str.charCodeAt(0) + 96)
+    })
+
+    return str
+  }
+
+  /**
+   * Convert katakana to hiragana.
+   *
+   * @param {string} str A string.
+   * @return {string} A string not containing katakana.
+   */
+  katakanaToHiragana (str) {
+    str = this.katakanaHF(str)
+    str = Converters.fixFullwidthKana(str)
+
+    str = str.replace(/ヽ/g, 'ゝ')
+    str = str.replace(/ヾ/g, 'ゞ')
+    // str = str.replace(/?/g, '𛀁'); // Letter archaic E
+
+    str = str.replace(/[ァ-ヶ]/g, function (str) {
+      return String.fromCharCode(str.charCodeAt(0) - 96)
+    })
+
+    return str
+  }
 }
 
 /**
@@ -610,14 +647,14 @@ const normalizeJa = function (str) {
     .replace(/(..)々々/g, '$1$1')
     .replace(/(.)々/g, '$1$1')
 
-  str = converters.normalize(str)
-  str = converters.fixFullwidthKana(str)
+  str = Converters.normalize(str)
+  str = Converters.fixFullwidthKana(str)
 
   // Replace composite symbols.
-  str = fixCompositeSymbols(str)
+  str = Converters.fixCompositeSymbols(str)
 
   return str
 }
 
 exports.normalizeJa = normalizeJa
-exports.converters = converters
+exports.Converters = Converters
diff --git a/lib/natural/stemmers/index.d.ts b/lib/natural/stemmers/index.d.ts
index daca041cf..527bb68db 100644
--- a/lib/natural/stemmers/index.d.ts
+++ b/lib/natural/stemmers/index.d.ts
@@ -67,3 +67,23 @@ export let PorterStemmerRu: Stemmer
 export let PorterStemmerSv: Stemmer
 export let StemmerId: Stemmer
 export let StemmerJa: Stemmer
+
+export declare type TokenCallback = (...args: number[]) => number[] | number
+
+export declare class Token {
+  vowels: string[] | string
+  regions: Record<string, number>
+  string: string
+  original: string
+
+  constructor (s: string)
+  usingVowels (vowels: string | string[]): Token
+  markRegion (region: string, args: number[] | number | null, callback?: TokenCallback, context?: unknown): Token
+  replaceAll (find: string, replace: string): Token
+  replaceSuffixInRegion (suffix: string, replace: string, region: string): Token
+  hasVowelAtIndex (index: number): boolean
+  nextVowelIndex (index: number): number
+  nextConsonantIndex (index: number): number
+  hasSuffix (suffix: string): boolean
+  hasSuffixInRegion (suffix: string, region: string): boolean
+}
diff --git a/lib/natural/stemmers/index.js b/lib/natural/stemmers/index.js
index 15988a4df..5e4e16a8e 100644
--- a/lib/natural/stemmers/index.js
+++ b/lib/natural/stemmers/index.js
@@ -38,3 +38,4 @@ exports.PorterStemmerNl = require('./porter_stemmer_nl')
 exports.LancasterStemmer = require('./lancaster_stemmer')
 exports.StemmerJa = require('./stemmer_ja')
 exports.StemmerId = require('./indonesian/stemmer_id')
+exports.Token = require('./token')
diff --git a/lib/natural/stemmers/token.js b/lib/natural/stemmers/token.js
index 41ec8dbae..9a713e722 100644
--- a/lib/natural/stemmers/token.js
+++ b/lib/natural/stemmers/token.js
@@ -22,141 +22,140 @@ THE SOFTWARE.
 
 'use strict'
 
-module.exports = (function () {
-  /**
-   * Stemmer token constructor.
-   *
-   * @param {String} string Token string.
-   */
-  const Token = function (string) {
-    this.vowels = ''
-    this.regions = {}
-    this.string = string
-    this.original = string
+/**
+ * Stemmer token constructor.
+ *
+ * @param {String} string Token string.
+ */
+function Token (string) {
+  this.vowels = ''
+  this.regions = {}
+  this.string = string
+  this.original = string
+}
+
+/**
+ * Set vowels.
+ *
+ * @param {String|Array} vowels List of vowels.
+ * @return {Token} Token instance.
+ */
+Token.prototype.usingVowels = function (vowels) {
+  this.vowels = vowels
+  return this
+}
+
+/**
+ * Marks a region by defining its starting index or providing a callback
+ * function that does.
+ *
+ * @param {String} region Region name.
+ * @param {Array|Number} args Callback arguments or region start index.
+ * @param {Function} callback Function that determines the start index (optional).
+ * @param {Object} context Callback context (optional, defaults to this).
+ * @return {Token} Token instance.
+ */
+Token.prototype.markRegion = function (region, args, callback, context) {
+  if (typeof callback === 'function') {
+    this.regions[region] = callback.apply(context || this, [].concat(args))
+  } else if (!isNaN(args)) {
+    this.regions[region] = args
+  }
-  /**
-   * Set vowels.
-   *
-   * @param {String|Array} vowels List of vowels.
-   * @return {Token} Token instance.
- */ - Token.prototype.usingVowels = function (vowels) { - this.vowels = vowels - return this - } - - /** - * Marks a region by defining its starting index or providing a callback - * function that does. - * - * @param {String} region Region name. - * @param {Array|Number} args Callback arguments or region start index. - * @param {Function} callback Function that determines the start index (optional). - * @param {Object} context Callback context (optional, defaults to this). - * @return {Token} Token instance. - */ - Token.prototype.markRegion = function (region, args, callback, context) { - if (typeof callback === 'function') { - this.regions[region] = callback.apply(context || this, [].concat(args)) - } else if (!isNaN(args)) { - this.regions[region] = args + return this +} + +/** + * Replaces all instances of a string with another. + * + * @param {String} find String to be replaced. + * @param {String} replace Replacement string. + * @return {Token} Token instance. + */ +Token.prototype.replaceAll = function (find, replace) { + this.string = this.string.split(find).join(replace) + return this +} + +/** + * Replaces the token suffix if in a region. + * + * @param {String} suffix Suffix to replace. + * @param {String} replace Replacement string. + * @param {String} region Region name. + * @return {Token} Token instance. + */ +Token.prototype.replaceSuffixInRegion = function (suffix, replace, region) { + const suffixes = [].concat(suffix) + for (let i = 0; i < suffixes.length; i++) { + if (this.hasSuffixInRegion(suffixes[i], region)) { + this.string = this.string.slice(0, -suffixes[i].length) + replace + return this } - - return this - } - - /** - * Replaces all instances of a string with another. - * - * @param {String} find String to be replaced. - * @param {String} replace Replacement string. - * @return {Token} Token instance. - */ - Token.prototype.replaceAll = function (find, replace) { - this.string = this.string.split(find).join(replace) - return this } - - /** - * Replaces the token suffix if in a region. - * - * @param {String} suffix Suffix to replace. - * @param {String} replace Replacement string. - * @param {String} region Region name. - * @return {Token} Token instance. - */ - Token.prototype.replaceSuffixInRegion = function (suffix, replace, region) { - const suffixes = [].concat(suffix) - for (let i = 0; i < suffixes.length; i++) { - if (this.hasSuffixInRegion(suffixes[i], region)) { - this.string = this.string.slice(0, -suffixes[i].length) + replace - return this - } - } - return this + return this +} + +/** + * Determines whether the token has a vowel at the provided index. + * + * @param {Integer} index Character index. + * @return {Boolean} Whether the token has a vowel at the provided index. + */ +Token.prototype.hasVowelAtIndex = function (index) { + return this.vowels.indexOf(this.string[index]) !== -1 +} + +/** + * Finds the next vowel in the token. + * + * @param {Integer} start Starting index offset. + * @return {Integer} Vowel index, or the end of the string. + */ +Token.prototype.nextVowelIndex = function (start) { + let index = (start >= 0 && start < this.string.length) ? start : this.string.length + while (index < this.string.length && !this.hasVowelAtIndex(index)) { + index++ } - - /** - * Determines whether the token has a vowel at the provided index. - * - * @param {Integer} index Character index. - * @return {Boolean} Whether the token has a vowel at the provided index. 
- */ - Token.prototype.hasVowelAtIndex = function (index) { - return this.vowels.indexOf(this.string[index]) !== -1 + return index +} + +/** + * Finds the next consonant in the token. + * + * @param {Integer} start Starting index offset. + * @return {Integer} Consonant index, or the end of the string. + */ +Token.prototype.nextConsonantIndex = function (start) { + let index = (start >= 0 && start < this.string.length) ? start : this.string.length + while (index < this.string.length && this.hasVowelAtIndex(index)) { + index++ } - - /** - * Finds the next vowel in the token. - * - * @param {Integer} start Starting index offset. - * @return {Integer} Vowel index, or the end of the string. - */ - Token.prototype.nextVowelIndex = function (start) { - let index = (start >= 0 && start < this.string.length) ? start : this.string.length - while (index < this.string.length && !this.hasVowelAtIndex(index)) { - index++ - } - return index - } - - /** - * Finds the next consonant in the token. - * - * @param {Integer} start Starting index offset. - * @return {Integer} Consonant index, or the end of the string. - */ - Token.prototype.nextConsonantIndex = function (start) { - let index = (start >= 0 && start < this.string.length) ? start : this.string.length - while (index < this.string.length && this.hasVowelAtIndex(index)) { - index++ - } - return index - } - - /** - * Determines whether the token has the provided suffix. - * @param {String} suffix Suffix to match. - * @return {Boolean} Whether the token string ends in suffix. - */ - Token.prototype.hasSuffix = function (suffix) { - return this.string.slice(-suffix.length) === suffix - } - - /** - * Determines whether the token has the provided suffix within the specified - * region. - * - * @param {String} suffix Suffix to match. - * @param {String} region Region name. - * @return {Boolean} Whether the token string ends in suffix. - */ - Token.prototype.hasSuffixInRegion = function (suffix, region) { - const regionStart = this.regions[region] || 0 - const suffixStart = this.string.length - suffix.length - return this.hasSuffix(suffix) && suffixStart >= regionStart - } - - return Token -})() + return index +} + +/** + * Determines whether the token has the provided suffix. + * @param {String} suffix Suffix to match. + * @return {Boolean} Whether the token string ends in suffix. + */ +Token.prototype.hasSuffix = function (suffix) { + return this.string.slice(-suffix.length) === suffix +} + +/** + * Determines whether the token has the provided suffix within the specified + * region. + * + * @param {String} suffix Suffix to match. + * @param {String} region Region name. + * @return {Boolean} Whether the token string ends in suffix. 
+ */ +Token.prototype.hasSuffixInRegion = function (suffix, region) { + const regionStart = this.regions[region] || 0 + const suffixStart = this.string.length - suffix.length + return this.hasSuffix(suffix) && suffixStart >= regionStart +} + +module.exports = Token diff --git a/lib/natural/tokenizers/index.d.ts b/lib/natural/tokenizers/index.d.ts index 82276d2b6..8ba40a33f 100644 --- a/lib/natural/tokenizers/index.d.ts +++ b/lib/natural/tokenizers/index.d.ts @@ -92,7 +92,7 @@ export class AggressiveTokenizer extends Tokenizer { } export class CaseTokenizer extends Tokenizer { - tokenize (text: string): string[] + tokenize (text: string, preserveApostrophe?: boolean): string[] } declare interface RegexTokenizerOptions { diff --git a/spec/normalizer_ja_spec.js b/spec/normalizer_ja_spec.js index 02a3be527..a6717a28a 100644 --- a/spec/normalizer_ja_spec.js +++ b/spec/normalizer_ja_spec.js @@ -23,7 +23,8 @@ THE SOFTWARE. 'use strict' const normalizeJa = require('../lib/natural/normalizers/normalizer_ja').normalizeJa -const converters = require('../lib/natural/normalizers/normalizer_ja').converters +const Converters = require('../lib/natural/normalizers/normalizer_ja').Converters +const converters = new Converters() describe('normalizeJa', function () { it('should fix badly formed hiragana', function () { @@ -86,38 +87,38 @@ const sample = 'ABC ABC 123123.,-.,-ゔあいうえおはば describe('converters', function () { it('should all be reversible', function () { const sample = '半角カナ(はんかくカナ)とは、JIS X 0208など片仮名を含む他の文字集合と同時に運用される場合におけるJIS X 0201の片仮名文字集合の通称である。漢字を含む文字集合で定義された片仮名に対して、半分の文字幅で表示されることが一般的であったためこのように呼ばれる。JIS X 0201で規定される8ビット符号化およびShift_JISにおいて0xA1-0xDFの範囲の1バイト文字がこれにあたる。また、Shift_JISやEUC-JPなどの符号化方式やUnicodeでも互換性の目的でこの文字集合をもっている。' - expect(converters.halfwidthToFullwidth.alphabet(converters.fullwidthToHalfwidth.alphabet(sample))).toEqual(converters.halfwidthToFullwidth.alphabet(sample)) - expect(converters.fullwidthToHalfwidth.alphabet(converters.halfwidthToFullwidth.alphabet(sample))).toEqual(converters.fullwidthToHalfwidth.alphabet(sample)) - expect(converters.halfwidthToFullwidth.numbers(converters.fullwidthToHalfwidth.numbers(sample))).toEqual(converters.halfwidthToFullwidth.numbers(sample)) - expect(converters.fullwidthToHalfwidth.numbers(converters.halfwidthToFullwidth.numbers(sample))).toEqual(converters.fullwidthToHalfwidth.numbers(sample)) - expect(converters.halfwidthToFullwidth.punctuation(converters.fullwidthToHalfwidth.punctuation(sample))).toEqual(converters.halfwidthToFullwidth.punctuation(sample)) - expect(converters.fullwidthToHalfwidth.punctuation(converters.halfwidthToFullwidth.punctuation(sample))).toEqual(converters.fullwidthToHalfwidth.punctuation(sample)) - expect(converters.halfwidthToFullwidth.katakana(converters.fullwidthToHalfwidth.katakana(sample))).toEqual(converters.halfwidthToFullwidth.katakana(sample)) - expect(converters.fullwidthToHalfwidth.katakana(converters.halfwidthToFullwidth.katakana(sample))).toEqual(converters.fullwidthToHalfwidth.katakana(sample)) + expect(converters.alphabetHF(converters.alphabetFH(sample))).toEqual(converters.alphabetHF(sample)) + expect(converters.alphabetFH(converters.alphabetHF(sample))).toEqual(converters.alphabetFH(sample)) + expect(converters.numbersHF(converters.numbersFH(sample))).toEqual(converters.numbersHF(sample)) + expect(converters.numbersFH(converters.numbersHF(sample))).toEqual(converters.numbersFH(sample)) + expect(converters.punctuationHF(converters.punctuationFH(sample))).toEqual(converters.punctuationHF(sample)) + 
expect(converters.punctuationFH(converters.punctuationHF(sample))).toEqual(converters.punctuationFH(sample)) + expect(converters.katakanaHF(converters.katakanaFH(sample))).toEqual(converters.katakanaHF(sample)) + expect(converters.katakanaFH(converters.katakanaHF(sample))).toEqual(converters.katakanaFH(sample)) }) describe('.fullwidthToHalfwidth', function () { describe('.alphabet', function () { it('should transform fullwidth roman characters and space to halfwidth', function () { - expect(converters.fullwidthToHalfwidth.alphabet(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.alphabetFH(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) describe('.numbers', function () { it('should transform fullwidth numerical characters to halfwidth', function () { - expect(converters.fullwidthToHalfwidth.numbers(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.numbersFH(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) describe('.punctuation', function () { it('should transform fullwidth punctuation signs to halfwidth', function () { - expect(converters.fullwidthToHalfwidth.punctuation(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.punctuationFH(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) describe('.katakana', function () { it('should transform fullwidth katakana to halfwidth', function () { - expect(converters.fullwidthToHalfwidth.katakana(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.katakanaFH(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) }) @@ -125,25 +126,25 @@ describe('converters', function () { describe('.halfwidthToFullwidth', function () { describe('.alphabet', function () { it('should transform halfwidth roman characters and space to fullwidth', function () { - expect(converters.halfwidthToFullwidth.alphabet(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.alphabetHF(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) describe('.numbers', function () { it('should transform halfwidth numerical characters to fullwidth', function () { - expect(converters.halfwidthToFullwidth.numbers(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.numbersHF(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) describe('.punctuation', function () { it('should transform halfwidth punctuation signs to fullwidth', function () { - expect(converters.halfwidthToFullwidth.punctuation(sample)).toEqual('ABC ABC 123123.,─.,─ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.punctuationHF(sample)).toEqual('ABC ABC 123123.,─.,─ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) describe('.katakana', function () { it('should transform halfwidth katakana to fullwidth', function () { - expect(converters.halfwidthToFullwidth.katakana(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') + expect(converters.katakanaHF(sample)).toEqual('ABC ABC 123123.,-.,-ゔあいうえおはばぱカキクケコハバパヴカキクケコハバパ') }) }) }) diff --git a/spec/tokenizer_ja_spec.js b/spec/tokenizer_ja_spec.js index 6e8ee18fd..6501f0628 100644 --- a/spec/tokenizer_ja_spec.js +++ b/spec/tokenizer_ja_spec.js @@ -49,12 +49,13 @@ describe('TokenizerJa', function () { }) it('should normalize input', function () { - const converters = require('../lib/natural/normalizers/normalizer_ja').converters + const Converters = 
require('../lib/natural/normalizers/normalizer_ja').Converters
+    const converters = new Converters()
     const tokens = tokenizer.tokenize(
-      converters.halfwidthToFullwidth.alphabet(
-        converters.halfwidthToFullwidth.numbers(
-          converters.fullwidthToHalfwidth.punctuation(
-            converters.fullwidthToHalfwidth.katakana(text)))))
+      converters.alphabetHF(
+        converters.numbersHF(
+          converters.punctuationFH(
+            converters.katakanaFH(text)))))
     expect(tokens).toEqual(result)
   })
 })
diff --git a/ts_spec/spellcheck_test.ts b/ts_spec/spellcheck_test.ts
deleted file mode 100644
index 596cb0800..000000000
--- a/ts_spec/spellcheck_test.ts
+++ /dev/null
@@ -1,7 +0,0 @@
-import { Spellcheck } from '../lib/natural/spellcheck'
-
-const corpus = ['something', 'soothing']
-const spellcheck = new Spellcheck(corpus)
-spellcheck.isCorrect('cat') // false
-console.log(spellcheck.getCorrections('soemthing', 1)) // ['something']
-console.log(spellcheck.getCorrections('soemthing', 2)) // ['something', 'soothing']
diff --git a/ts_spec/stemmer_id_spec.ts b/ts_spec/stemmer_id_spec.ts
new file mode 100644
index 000000000..cb93fb906
--- /dev/null
+++ b/ts_spec/stemmer_id_spec.ts
@@ -0,0 +1,374 @@
+/*
+Copyright (c) 2018
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in
+all copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+THE SOFTWARE.
+*/
+
+'use strict'
+
+// Adapted from original Mocha test script in tests folder
+
+import { StemmerId as Stemmer } from 'lib/natural'
+
+declare type Tests = Array<[string, string]>
+const data: Tests = []
+
+// THE TEST CASES
+// don't stem short words
+data.push(['mei', 'mei'])
+data.push(['bui', 'bui'])
+
+// look up the dictionary to prevent overstemming
+// don't stem nilai to nila
+data.push(['nilai', 'nilai'])
+
+// lah|kah|tah|pun
+data.push(['hancurlah', 'hancur'])
+// data.push(['benarkah', 'benar']);
+// data.push(['apatah', 'apa']);
+// data.push(['siapapun', 'siapa']);
+
+// ku|mu|nya
+data.push(['jubahku', 'jubah'])
+data.push(['bajumu', 'baju'])
+data.push(['celananya', 'celana'])
+
+// i|kan|an
+data.push(['hantui', 'hantu'])
+// data.push(['belikan', 'beli']);
+data.push(['jualan', 'jual'])
+
+// combination of suffixes
+data.push(['bukumukah', 'buku']) // fails because -ku is treated as a suffix and removed
+data.push(['miliknyalah', 'milik'])
+data.push(['kulitkupun', 'kulit']) // fails because -ku is treated as a suffix and removed
+data.push(['berikanku', 'beri'])
+data.push(['sakitimu', 'sakit'])
+data.push(['beriannya', 'beri'])
+data.push(['kasihilah', 'kasih'])
+
+// plain prefix
+data.push(['dibuang', 'buang'])
+data.push(['kesakitan', 'sakit'])
+data.push(['sesuap', 'suap'])
+
+// data.push(['teriakanmu', 'teriak']); // why does this become ria?
+// teriakanmu -> te-ria-kan-mu
+
+// template formulas for derivation prefix rules (disambiguation) //
+
+// rule 1a : berV -> ber-V
+data.push(['beradu', 'adu'])
+
+// rule 1b : berV -> be-rV
+data.push(['berambut', 'rambut'])
+
+// rule 2 : berCAP -> ber-CAP
+data.push(['bersuara', 'suara'])
+
+// rule 3 : berCAerV -> ber-CAerV where C != 'r'
+data.push(['berdaerah', 'daerah'])
+
+// rule 4 : belajar -> bel-ajar
+data.push(['belajar', 'ajar'])
+
+// rule 5 : beC1erC2 -> be-C1erC2 where C1 != {'r'|'l'}
+// data.push(['bekerja', 'kerja']);
+data.push(['beternak', 'ternak'])
+
+// rule 6a : terV -> ter-V
+data.push(['terasing', 'asing'])
+
+// rule 6b : terV -> te-rV
+data.push(['teraup', 'raup'])
+
+// rule 7 : terCerV -> ter-CerV where C != 'r'
+data.push(['tergerak', 'gerak'])
+
+// rule 8 : terCP -> ter-CP where C != 'r' and P != 'er'
+data.push(['terpuruk', 'puruk'])
+
+// rule 9 : teC1erC2 -> te-C1erC2 where C1 != 'r'
+data.push(['teterbang', 'terbang'])
+
+// rule 10 : me{l|r|w|y}V -> me-{l|r|w|y}V
+data.push(['melipat', 'lipat'])
+data.push(['meringkas', 'ringkas'])
+data.push(['mewarnai', 'warna'])
+// data.push(['meyakinkan', 'yakin']);
+
+// rule 11 : mem{b|f|v} -> mem-{b|f|v}
+data.push(['membangun', 'bangun'])
+data.push(['memfitnah', 'fitnah'])
+data.push(['memvonis', 'vonis'])
+
+// rule 12 : mempe{r|l} -> mem-pe
+data.push(['memperbarui', 'baru'])
+data.push(['mempelajari', 'ajar'])
+
+// rule 13a : mem{rV|V} -> mem{rV|V}
+data.push(['meminum', 'minum'])
+
+// rule 13b : mem{rV|V} -> me-p{rV|V}
+data.push(['memukul', 'pukul'])
+
+// rule 14 : men{c|d|j|z} -> men-{c|d|j|z}
+data.push(['mencinta', 'cinta'])
+data.push(['mendua', 'dua'])
+data.push(['menjauh', 'jauh'])
+data.push(['menziarah', 'ziarah'])
+
+// rule 15a : men{V} -> me-n{V}
+data.push(['menuklir', 'nuklir'])
+
+// rule 15b : men{V} -> me-t{V}
+data.push(['menangkap', 'tangkap'])
+
+// rule 16 : meng{g|h|q} -> meng-{g|h|q}
+data.push(['menggila', 'gila'])
+data.push(['menghajar', 'hajar'])
+data.push(['mengqasar', 'qasar'])
+
+// rule 17a : mengV -> meng-V
+data.push(['mengudara', 'udara'])
+
+// rule 17b : mengV -> meng-kV
+data.push(['mengupas', 'kupas'])
+
+// rule 18 : menyV -> meny-sV
+data.push(['menyuarakan', 'suara'])
+
+// rule 19 : mempV -> mem-pV where V != 'e'
+data.push(['mempopulerkan', 'populer'])
+
+// rule 20 : pe{w|y}V -> pe-{w|y}V
+data.push(['pewarna', 'warna'])
+data.push(['peyoga', 'yoga'])
+
+// rule 21a : perV -> per-V
+data.push(['peradilan', 'adil'])
+
+// rule 21b : perV -> pe-rV
+data.push(['perumahan', 'rumah'])
+
+// rule 22 is missing in the document?
+
+// rule 23 : perCAP -> per-CAP where C != 'r' and P != 'er'
+data.push(['permuka', 'muka'])
+
+// rule 24 : perCAerV -> per-CAerV where C != 'r'
+data.push(['perdaerah', 'daerah'])
+
+// rule 25 : pem{b|f|v} -> pem-{b|f|v}
+data.push(['pembangun', 'bangun'])
+data.push(['pemfitnah', 'fitnah'])
+data.push(['pemvonis', 'vonis'])
+
+// rule 26a : pem{rV|V} -> pe-m{rV|V}
+data.push(['peminum', 'minum'])
+
+// rule 26b : pem{rV|V} -> pe-p{rV|V}
+data.push(['pemukul', 'pukul'])
+
+// rule 27 : men{c|d|j|z} -> men-{c|d|j|z}
+data.push(['pencinta', 'cinta'])
+data.push(['pendahulu', 'dahulu'])
+data.push(['penjarah', 'jarah'])
+data.push(['penziarah', 'ziarah'])
+
+// rule 28a : pen{V} -> pe-n{V}
+data.push(['penasihat', 'nasihat'])
+
+// rule 28b : pen{V} -> pe-t{V}
+data.push(['penangkap', 'tangkap'])
+
+// rule 29 : peng{g|h|q} -> peng-{g|h|q}
+data.push(['penggila', 'gila'])
+data.push(['penghajar', 'hajar'])
+data.push(['pengqasar', 'qasar'])
+
+// rule 30a : pengV -> peng-V
+data.push(['pengudara', 'udara'])
+
+// rule 30b : pengV -> peng-kV
+data.push(['pengupas', 'kupas'])
+
+// rule 31 : penyV -> peny-sV
+data.push(['penyuara', 'suara'])
+
+// rule 32 : pelV -> pe-lV except pelajar -> ajar
+data.push(['pelajar', 'ajar'])
+data.push(['pelabuhan', 'labuh'])
+
+// rule 33 : peCerV -> per-erV where C != {r|w|y|l|m|n}
+// TODO : find the examples
+
+// rule 34 : peCP -> pe-CP where C != {r|w|y|l|m|n} and P != 'er'
+data.push(['petarung', 'tarung'])
+
+// CS additional rules
+
+// rule 35 : terC1erC2 -> ter-C1erC2 where C1 != 'r'
+data.push(['terpercaya', 'percaya'])
+
+// rule 36 : peC1erC2 -> pe-C1erC2 where C1 != {r|w|y|l|m|n}
+data.push(['pekerja', 'kerja'])
+data.push(['peserta', 'serta'])
+
+// CS modify rule 12
+data.push(['mempengaruhi', 'pengaruh'])
+
+// CS modify rule 16
+data.push(['mengkritik', 'kritik'])
+
+// CS adjusting rule precedence
+data.push(['bersekolah', 'sekolah']) // fails: sekolah -> seko, why?
+data.push(['bertahan', 'tahan'])
+data.push(['mencapai', 'capai']) // fails: mencapai -> capa
+// data.push(['dimulai', 'mulai']);
+data.push(['petani', 'tani']) // fails: petani -> petan
+data.push(['terabai', 'abai']) // fails: terabai -> aba
+
+// ECS
+data.push(['mensyaratkan', 'syarat'])
+data.push(['mensyukuri', 'syukur'])
+data.push(['mengebom', 'bom'])
+data.push(['mempromosikan', 'promosi'])
+data.push(['memproteksi', 'proteksi'])
+data.push(['memprediksi', 'prediksi'])
+data.push(['pengkajian', 'kaji'])
+data.push(['pengebom', 'bom'])
+
+// ECS suffix-restoration loop
+data.push(['bersembunyi', 'sembunyi'])
+data.push(['bersembunyilah', 'sembunyi'])
+data.push(['pelanggan', 'langgan'])
+data.push(['pelaku', 'laku'])
+data.push(['pelangganmukah', 'langgan'])
+data.push(['pelakunyalah', 'laku'])
+
+data.push(['perbaikan', 'baik'])
+data.push(['kebaikannya', 'baik'])
+data.push(['bisikan', 'bisik'])
+// data.push(['menerangi', 'terang']);
+// data.push(['berimanlah', 'iman']);
+
+// data.push(['memuaskan', 'puas']);
+data.push(['berpelanggan', 'langgan'])
+data.push(['bermakanan', 'makan'])
+
+// CC (Modified ECS)
+data.push(['menyala', 'nyala'])
+data.push(['menyanyikan', 'nyanyi'])
+data.push(['menyatakannya', 'nyata'])
+
+data.push(['penyanyi', 'nyanyi'])
+data.push(['penyawaan', 'nyawa'])
+
+// CC infix
+// data.push(['rerata', 'rata']);
+// data.push(['lelembut', 'lembut']);
+data.push(['lemigas', 'ligas'])
+data.push(['kinerja', 'kerja'])
+
+// plurals
+data.push(['buku-buku', 'buku'])
+data.push(['berbalas-balasan', 'balas'])
+data.push(['bolak-balik', 'bolak-balik'])
+
+// combination of prefix + suffix
+data.push(['bertebaran', 'tebar'])
+data.push(['terasingkan', 'asing'])
+data.push(['membangunkan', 'bangun'])
+data.push(['mencintai', 'cinta'])
+data.push(['menduakan', 'dua'])
+data.push(['menjauhi', 'jauh'])
+data.push(['menggilai', 'gila'])
+data.push(['pembangunan', 'bangun'])
+
+// return the word if not found in the dictionary
+data.push(['marwan', 'marwan'])
+data.push(['subarkah', 'subarkah'])
+
+// recursively remove prefix
+data.push(['memberdayakan', 'daya'])
+data.push(['persemakmuran', 'makmur'])
+data.push(['keberuntunganmu', 'untung'])
+data.push(['kesepersepuluhnya', 'sepuluh'])
+
+// test stem sentence
+// data.push(['siapakah memberdayakan pembangunan', 'siapa daya bangun']);
+
+// issues
+data.push(['Perekonomian', 'ekonomi'])
+data.push(['menahan', 'tahan'])
+
+// test stem multiple sentences
+// var multipleSentence1 = 'Cinta telah bertebaran.Keduanya saling mencintai.';
+// var multipleSentence2 = "(Cinta telah bertebaran)\n\n\n\nKeduanya saling mencintai.";
+// data.push([multipleSentence1, 'cinta telah tebar dua saling cinta']);
+// data.push([multipleSentence2, 'cinta telah tebar dua saling cinta']);
+
+// failed with other methods/algorithms, but we should succeed
+data.push(['peranan', 'peran'])
+// data.push(['memberikan', 'beri']);
+data.push(['medannya', 'medan'])
+
+// TODO:
+// data.push(['sebagai', 'bagai']);
+// data.push(['bagian', 'bagian']);
+data.push(['berbadan', 'badan'])
+data.push(['abdullah', 'abdullah'])
+
+// adopted foreign suffixes
+// data.push(['budayawan', 'budaya']);
+// data.push(['karyawati', 'karya']);
+// data.push(['idealis', 'ideal']);
+// data.push(['idealisme', 'ideal']);
+data.push(['finalisasi', 'final'])
+
+// sastrawi additional rules
+data.push(['penstabilan', 'stabil'])
+data.push(['pentranskripsi', 'transkripsi'])
+
+data.push(['mentaati', 'taat'])
+data.push(['meniru-nirukan', 'tiru'])
+data.push(['menyepak-nyepak', 'sepak']) + +data.push(['melewati', 'lewat']) +data.push(['menganga', 'nganga']) + +data.push(['kupukul', 'pukul']) +data.push(['kauhajar', 'hajar']) + +data.push(['kuasa-Mu', 'kuasa']) +data.push(['malaikat-malaikat-Nya', 'malaikat']) +data.push(['nikmat-Ku', 'nikmat']) +data.push(['allah-lah', 'allah']) + +describe('Indonesian stemmer', function () { + data.forEach(function (testPair) { + it('should correctly tokenize and stem ' + testPair[0] + ' to ' + testPair[1], function () { + const output = Stemmer.tokenizeAndStem(testPair[0]) + const outputConcat = output.join(' ') + // console.log(output); + expect(outputConcat).toEqual(testPair[1]) + }) + }) +}) diff --git a/ts_spec/stemmer_ja_spec.ts b/ts_spec/stemmer_ja_spec.ts new file mode 100644 index 000000000..b5435ebf1 --- /dev/null +++ b/ts_spec/stemmer_ja_spec.ts @@ -0,0 +1,69 @@ +/* +Copyright (c) 2012, Guillaume Marty + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +'use strict' + +import { StemmerJa as stemmer } from 'lib/natural' + +const test = ['コピー', 'コーヒー', 'タクシー', 'パーティー', 'パーティ', 'ヘルプ・センター'] +const testResult = ['コピー', 'コーヒ', 'タクシ', 'パーティ', 'パーティ', 'ヘルプ・センタ'] +const text = '明後日パーティーに行く予定がある。図書館で資料をコピーしました。' + +describe('StemmerJa', function () { + it('should stem words', function () { + for (let i = 0; i < test.length; i++) { + expect(stemmer.stem(test[i])).toBe(testResult[i]) + } + }) + + it('should not tokenize halfwidth katakana', function () { + expect(stemmer.stem('タクシー')).toEqual('タクシー') + }) + + it('should tokenize, stem and exclude stop words (default behavior)', function () { + const tokens = [ + '明後日', 'パーティ', '行く', '予定', // パーティー should be stemmed + '図書館', '資料', 'コピー', 'まし'] + expect(stemmer.tokenizeAndStem(text)).toEqual(tokens) + expect(stemmer.tokenizeAndStem(text, false)).toEqual(tokens) + }) + + it('should tokenize, stem and keep stop words', function () { + expect(stemmer.tokenizeAndStem(text, true)).toEqual([ + '明後日', 'パーティ', 'に', '行く', '予定', 'が', 'ある', + '図書館', 'で', '資料', 'を', 'コピー', 'し', 'まし', 'た']) + }) + + /* + it('should attach new methods to String', function() { + stemmer.attach(); + expect('コーヒー'.stem()).toEqual('コーヒ'); + expect('コピー'.stem()).toEqual('コピー'); + expect('図書館で資料をコピーしました。'.tokenizeAndStem()).toEqual([ + '図書館', '資料', 'コピー', 'まし']); + expect('図書館で資料をコピーしました。'.tokenizeAndStem(false)).toEqual([ + '図書館', '資料', 'コピー', 'まし']); + expect('図書館で資料をコピーしました。'.tokenizeAndStem(true)).toEqual([ + '図書館', 'で', '資料', 'を', 'コピー', 'し', 'まし', 'た']); + }); + */ +}) diff --git a/ts_spec/stemmer_token_spec.ts b/ts_spec/stemmer_token_spec.ts new file mode 100644 index 000000000..c50f3f087 --- /dev/null +++ b/ts_spec/stemmer_token_spec.ts @@ -0,0 +1,163 @@ +/* +Copyright (c) 2014, Luís Rodrigues + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +'use strict' + +import { Token } from 'lib/natural' + +describe('stemmer token', function () { + it('should receive a string', function () { + const string = 'test' + const token = new Token(string) + + expect(token.string).toBe(string) + }) + + it('should hold the original token string', function () { + const string = 'test' + const token = new Token(string) + + expect(token.original).toBe(string) + }) + + it('should replace all instances of a string', function () { + const string = 'tester' + const token = new Token(string) + + token.replaceAll('e', 'a') + expect(token.string).toBe('tastar') + + token.replaceAll('ar', 'er') + expect(token.string).toBe('taster') + + token.replaceAll('r', '') + expect(token.string).toBe('taste') + }) + + it('should allow vowels to be set', function () { + const vowels = 'aeiou' + const vowelsArray = vowels.split('') + const token = new Token('') + + token.usingVowels(vowels) + expect(token.vowels).toBe(vowels) + + token.usingVowels(vowelsArray) + expect(token.vowels).toBe(vowelsArray) + }) + + it('should check for vowels', function () { + const vowels = 'aeiou' + const token = new Token('test') + + token.usingVowels(vowels) + + expect(token.hasVowelAtIndex(0)).toBe(false) + expect(token.hasVowelAtIndex(1)).toBe(true) + expect(token.hasVowelAtIndex(99)).toBe(false) + + token.usingVowels(vowels.split('')) + + expect(token.hasVowelAtIndex(0)).toBe(false) + expect(token.hasVowelAtIndex(1)).toBe(true) + expect(token.hasVowelAtIndex(99)).toBe(false) + }) + + it('should find the next vowel', function () { + const token = new Token('tester').usingVowels('aeiou') + + expect(token.nextVowelIndex(0)).toBe(1) + expect(token.nextVowelIndex(1)).toBe(1) + expect(token.nextVowelIndex(2)).toBe(4) + expect(token.nextVowelIndex(5)).toBe(6) + expect(token.nextVowelIndex(99)).toBe(6) + expect(token.nextVowelIndex(-1)).toBe(6) + }) + + it('should find the next consonant', function () { + const token = new Token('testee').usingVowels('aeiou') + + expect(token.nextConsonantIndex(0)).toBe(0) + expect(token.nextConsonantIndex(1)).toBe(2) + expect(token.nextConsonantIndex(5)).toBe(6) + expect(token.nextConsonantIndex(99)).toBe(6) + expect(token.nextConsonantIndex(-1)).toBe(6) + }) + + it('should mark regions', function () { + const token = new Token('tester') + + token.markRegion('test', 1) + expect(token.regions.test).toBe(1) + }) + + it('should mark regions with a callback', function () { + const token = new Token('tester') + const context = { value: 99 } + + token.markRegion('test', 1, function (arg) { return arg }) + expect(token.regions.test).toBe(1) + + token.markRegion('test', [1], function (arg) { return arg }) + expect(token.regions.test).toBe(1) + + token.markRegion('test', [1, 1], function (a1, a2) { return a1 + a2 }) + expect(token.regions.test).toBe(2) + + token.markRegion('test', null, function (this: Token) { + return this.string.length + }) + expect(token.regions.test).toBe(6) + + token.markRegion('test', null, function (this: any) { return this.value }, context) + expect(token.regions.test).toBe(99) + }) + + it('should check for suffixes', function () { + const token = new Token('tester') + + expect(token.hasSuffix('er')).toBeTruthy() + expect(token.hasSuffix('st')).toBeFalsy() + }) + + it('should check for suffixes within a region', function () { + const token = new Token('tester').markRegion('region', 2) + + expect(token.hasSuffixInRegion('st', 'region')).toBe(false) + expect(token.hasSuffixInRegion('ster', 'region')).toBe(true) + 
expect(token.hasSuffixInRegion('ester', 'region')).toBe(false) + }) + + it('should replace the suffix within a region', function () { + const t1 = new Token('tester').markRegion('region', 4) + const t2 = new Token('tester').markRegion('region', 0) + + t1.replaceSuffixInRegion('ter', '', 'region') + expect(t1.string).toBe('tester') + + t1.replaceSuffixInRegion('er', '', 'region') + expect(t1.string).toBe('test') + + t2.replaceSuffixInRegion('protester', '', 'region') + expect(t2.string).toBe('tester') + }) +}) diff --git a/ts_spec/stemmers_test.ts b/ts_spec/stemmers_test.ts deleted file mode 100644 index 53b42bbbb..000000000 --- a/ts_spec/stemmers_test.ts +++ /dev/null @@ -1,40 +0,0 @@ -import { - CarryStemmerFr, - LancasterStemmer, - PorterStemmer, - PorterStemmerEs, - PorterStemmerFa, - PorterStemmerFr, - PorterStemmerIt, - PorterStemmerNl, - PorterStemmerNo, - PorterStemmerPt, - PorterStemmerRu, - PorterStemmerUk, - PorterStemmerSv, - StemmerId, - StemmerJa -} from '../lib/natural' - -// Carry stemmers -console.log(CarryStemmerFr.stem('jugaría')) - -// Lancaster stemmers -console.log(LancasterStemmer.stem('words')) - -// Porter stemmers -console.log(PorterStemmer.stem('words')) // stem a single word -console.log(PorterStemmerEs.stem('jugaría')) -console.log(PorterStemmerFa.stem('jugaría')) -console.log(PorterStemmerFr.stem('jugaría')) -console.log(PorterStemmerIt.stem('jugaría')) -console.log(PorterStemmerNl.stem('tulp')) -console.log(PorterStemmerNo.stem('jugaría')) -console.log(PorterStemmerPt.stem('jugaría')) -console.log(PorterStemmerUk.stem('весною')) -console.log(PorterStemmerRu.stem('падший')) -console.log(PorterStemmerSv.stem('Riksdag')) - -// Other stemmers -console.log(StemmerId.stem('mie')) -console.log(StemmerJa.stem('言葉')) diff --git a/ts_spec/tokenizer_case_spec.ts b/ts_spec/tokenizer_case_spec.ts new file mode 100644 index 000000000..7e6a806ea --- /dev/null +++ b/ts_spec/tokenizer_case_spec.ts @@ -0,0 +1,242 @@ +/* +Copyright (c) 2011, Chris Umbel, Alex Langberg, Martijn de Boer + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. 
+*/ + +'use strict' + +import { CaseTokenizer } from 'lib/natural' +const tokenizer = new CaseTokenizer() + +describe('case_tokenizer_numbers', function () { + it('should tokenize numbers', function () { + expect(tokenizer.tokenize('0 1 2 3 4 5 6 7 8 9 10')).toEqual(['0', '1', '2', '3', '4', '5', '6', '7', '8', '9', '10']) + }) +}) + +describe('case_tokenizer_es', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('hola yo me llamo eduardo y esudié ingeniería')).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']) + }) + + /* + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect('hola yo me llamo eduardo y esudié ingeniería'.tokenize()).toEqual(['hola', 'yo', 'me', 'llamo', 'eduardo', 'y', 'esudié', 'ingeniería']); + }); + */ +}) + +describe('case_tokenizer_fr', function () { + const text = "Affectueusement surnommé « Gabo » dans toute l'Amérique latine, le Colombien Gabriel Garcia Marquez, prix Nobel de littérature 1982, l'un des plus grands écrivains du XXe siècle, est mort À son domicile de Mexico jeudi 17 avril. Il était âgé de 87 ans. Son Œuvre a été traduite dans toutes les langues ou presque, et vendue à quelque 50 millions d'exemplaires." + + const tokenized = ['Affectueusement', + 'surnommé', + 'Gabo', + 'dans', + 'toute', + 'l', + 'Amérique', + 'latine', + 'le', + 'Colombien', + 'Gabriel', + 'Garcia', + 'Marquez', + 'prix', + 'Nobel', + 'de', + 'littérature', + '1982', + 'l', + 'un', + 'des', + 'plus', + 'grands', + 'écrivains', + 'du', + 'XXe', + 'siècle', + 'est', + 'mort', + 'À', + 'son', + 'domicile', + 'de', + 'Mexico', + 'jeudi', + '17', + 'avril', + 'Il', + 'était', + 'âgé', + 'de', + '87', + 'ans', + 'Son', + 'Œuvre', + 'a', + 'été', + 'traduite', + 'dans', + 'toutes', + 'les', + 'langues', + 'ou', + 'presque', + 'et', + 'vendue', + 'à', + 'quelque', + '50', + 'millions', + 'd', + 'exemplaires'] + + it('should tokenize strings', function () { + expect(tokenizer.tokenize(text)).toEqual(tokenized) + }) + + /* + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect(text.tokenize()).toEqual(tokenized); + }); + */ +}) + +describe('case_tokenizer_nl', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s', true)).toEqual(['\'s', 'Morgens', 'is', 'het', 'nog', 'erg', 'koud', 'vertelde', 'de', 'weerman', 'over', 'een', 'van', 'de', 'radio\'s']) + }) + + /* + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect('\'s Morgens is het nog erg koud, vertelde de weerman over een van de radio\'s'.tokenize(true)).toEqual(['\'s','Morgens','is','het','nog','erg','koud','vertelde','de','weerman','over','een','van','de','radio\'s']); + }); + */ +}) + +describe('case_tokenizer_pt', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('isso é coração')).toEqual(['isso', 'é', 'coração']) + }) + + /* + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect('isso é coração'.tokenize()).toEqual(['isso', 'é', 'coração']); + }); + + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect('isso é coração'.tokenize()).toEqual(['isso', 'é', 'coração']); + }); + */ + + it('should swallow punctuation', function () { + expect(tokenizer.tokenize('isso é coração, 
no')).toEqual(['isso', 'é', 'coração', 'no']) + }) + + it('should swallow final punctuation', function () { + expect(tokenizer.tokenize('isso é coração, no?')).toEqual(['isso', 'é', 'coração', 'no']) + }) + + it('should swallow initial punctuation', function () { + expect(tokenizer.tokenize('.isso é coração, no')).toEqual(['isso', 'é', 'coração', 'no']) + }) + + it('should swallow duplicate punctuation', function () { + expect(tokenizer.tokenize('eu vou... pause')).toEqual(['eu', 'vou', 'pause']) + }) +}) + +describe('case_tokenizer_aggressive_tokenizer', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('these are things')).toEqual(['these', 'are', 'things']) + }) + + /* + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect('these are things'.tokenize()).toEqual(['these', 'are', 'things']); + }); + + it('should tokenize strings via attached string method', function() { + tokenizer.attach(); + expect('these are things'.tokenize()).toEqual(['these', 'are', 'things']); + }); + */ + + it('should swallow punctuation', function () { + expect(tokenizer.tokenize('these are things, no')).toEqual(['these', 'are', 'things', 'no']) + }) + + it('should swallow final punctuation', function () { + expect(tokenizer.tokenize('these are things, no?')).toEqual(['these', 'are', 'things', 'no']) + }) + + it('should swallow initial punctuation', function () { + expect(tokenizer.tokenize('.these are things, no')).toEqual(['these', 'are', 'things', 'no']) + }) + + it('should swallow duplicate punctuation', function () { + expect(tokenizer.tokenize('i shal... pause')).toEqual(['i', 'shal', 'pause']) + }) +}) + +describe('case_tokenizer_it', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('Mi piacerebbe visitare l\'Italia un giorno di questi!')).toEqual(['Mi', 'piacerebbe', 'visitare', 'l', 'Italia', 'un', 'giorno', 'di', 'questi']) + }) +}) + +describe('case_tokenizer_no', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('Gå rett fram. Så tek du til venstre/høgre.')).toEqual(['Gå', 'rett', 'fram', 'Så', 'tek', 'du', 'til', 'venstre', 'høgre']) + }) +}) + +// Made up tests from here. No idea but seem to work. + +describe('case_tokenizer_pl', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('Bardzo za tobą tęskniłem/tęskniłam!')).toEqual(['Bardzo', 'za', 'tobą', 'tęskniłem', 'tęskniłam']) + }) +}) + +describe('case_tokenizer_pt', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('Siga em frente, depois vire à esquerda/direita!')).toEqual(['Siga', 'em', 'frente', 'depois', 'vire', 'à', 'esquerda', 'direita']) + }) +}) + +describe('case_tokenizer_ru', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('Vy mOzhite mne pamOch? 
Вы можете мне помочь?')).toEqual(['Vy', 'mOzhite', 'mne', 'pamOch', 'Вы', 'можете', 'мне', 'помочь']) + }) +}) + +describe('case_tokenizer_fi', function () { + it('should tokenize strings', function () { + expect(tokenizer.tokenize('Mene suoraan käänny sitten vasempaan/oikeaan!')).toEqual(['Mene', 'suoraan', 'käänny', 'sitten', 'vasempaan', 'oikeaan']) + }) +}) diff --git a/ts_spec/tokenizer_ja_spec.ts b/ts_spec/tokenizer_ja_spec.ts new file mode 100644 index 000000000..bc6174d2d --- /dev/null +++ b/ts_spec/tokenizer_ja_spec.ts @@ -0,0 +1,63 @@ +/* +Copyright (c) 2012, Guillaume Marty + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. +*/ + +'use strict' + +import { + Converters, + TokenizerJa +} from 'lib/natural' +const tokenizer = new TokenizerJa() +const converters = new Converters() + +const text = '計算機科学における字句解析 (じくかいせき、英: Lexical Analysis) とは、ソースコードを構成する文字の並びを、トークン (token) の並びに変換することをいう。\n' + + 'ここでいう「トークン」とは、意味を持つコードの最小単位のこと。字句解析を行うプログラムは、字句解析器 (lexical analyzer, 略称:lexer) と呼ばれる。\n' + + '字句解析器はスキャナ (scanner) とトークナイザ (tokenizer) から構成される。\n' +const result = ['計算', '機科', '学', 'に', 'おける', '字句', '解析', + 'じくかい', 'せき', '英', 'Lexical', 'Analysis', 'と', 'は', 'ソースコード', + 'を', '構成', 'する', '文字', 'の', '並び', 'を', 'トークン', 'token', 'の', + '並び', 'に', '変換', 'する', 'こと', 'を', 'いう', 'ここ', 'でいう', 'トークン', + 'と', 'は', '意味', 'を', '持つ', 'コード', 'の', '最小', '単位', 'の', 'こと', + '字句', '解析', 'を', '行う', 'プログラム', 'は', '字句', '解析', '器', 'lexical', + 'analyzer', '略称', 'lexer', 'と', '呼ば', 'れる', '字句', '解析', '器', 'は', + 'スキャナ', 'scanner', 'と', 'トークナイザ', 'tokenizer', 'から', '構成', 'さ', + 'れる'] + +describe('TokenizerJa', function () { + it('should tokenize', function () { + const tokens = tokenizer.tokenize(text) + expect(tokens).toEqual(result) + + // This test is very hard to pass through, so we comment for now. + // tokens = tokenizer.tokenize('すもももももももものうち。'); + // expect(tokens).toEqual(['すもも', 'も', 'もも', 'も', 'もも', 'の', 'うち', '。']); + }) + + it('should normalize input', function () { + const tokens = tokenizer.tokenize( + converters.alphabetHF( + converters.numbersHF( + converters.punctuationFH( + converters.katakanaFH(text))))) + expect(tokens).toEqual(result) + }) +})
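
---
A minimal TypeScript usage sketch of the APIs this patch exposes, for
reviewers. The bare 'lib/natural' import path and the expected values below
are taken from the ts_spec files in this patch; anything beyond that is an
assumption, not documented behavior.

import { Converters, Token, CaseTokenizer } from 'lib/natural'

// Width converters: *FH = fullwidth -> halfwidth, *HF = halfwidth -> fullwidth.
const conv = new Converters()
conv.katakanaFH('カキクケコ')  // halfwidth katakana out
conv.alphabetHF('ABC')        // fullwidth alphabet out
Converters.normalize('ＡＢＣ') // the static helpers need no instance

// Token is exported directly now that the IIFE wrapper is gone.
const token = new Token('tester').markRegion('region', 2)
token.hasSuffixInRegion('ster', 'region')  // true: the suffix starts at index 2
token.hasSuffixInRegion('ester', 'region') // false: it starts before the region

// CaseTokenizer.tokenize now declares the optional preserveApostrophe flag.
new CaseTokenizer().tokenize("van de radio's", true)
// ['van', 'de', "radio's"]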