From a7a8a235496f089484ceccbea223ed3d55998168 Mon Sep 17 00:00:00 2001 From: Hugo ter Doest Date: Mon, 29 May 2023 19:39:07 +0200 Subject: [PATCH] Simplified regular expression and added support for ellipsis (#687) * Simplified regular expression and added support for ellipsis * Standard syntax * Standard syntax --- lib/natural/tokenizers/sentence_tokenizer.js | 2 +- spec/sentence_tokenizer_spec.js | 15 +++++++++++++++ 2 files changed, 16 insertions(+), 1 deletion(-) diff --git a/lib/natural/tokenizers/sentence_tokenizer.js b/lib/natural/tokenizers/sentence_tokenizer.js index abd4f6794..1aaa7773f 100644 --- a/lib/natural/tokenizers/sentence_tokenizer.js +++ b/lib/natural/tokenizers/sentence_tokenizer.js @@ -34,7 +34,7 @@ util.inherits(SentenceTokenizer, Tokenizer) SentenceTokenizer.prototype.tokenize = function (text) { // break string up in to sentences based on punctation and quotation marks - let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!])(\s[.?!])*["'’”'"\])}⟩]?(?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g) + let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|[^.?!…]+)(\s[.?!…])*["'’”'"\])}⟩]?(?=\s+|$)/g) DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens) diff --git a/spec/sentence_tokenizer_spec.js b/spec/sentence_tokenizer_spec.js index 2d7c26550..330743f1b 100644 --- a/spec/sentence_tokenizer_spec.js +++ b/spec/sentence_tokenizer_spec.js @@ -158,4 +158,19 @@ describe('sentence_tokenizer', function () { 'Test: Test (test) test “Test.”' ]) }) + + it('should handle text with the ellipsis symbol … and it should handle last sentence without punctuation (issue #648)', function () { + expect( + tokenizer.tokenize('We’re heading for a catastrophic global temperature rise… Fires are blazing from the Amazon to the Arctic.') + ).toEqual([ + 'We’re heading for a catastrophic global temperature rise…', + 'Fires are blazing from the Amazon to the Arctic.' + ]) + expect( + tokenizer.tokenize('We’re heading for a catastrophic global temperature rise. Fires are blazing from the Amazon to the Arctic') + ).toEqual([ + 'We’re heading for a catastrophic global temperature rise.', + 'Fires are blazing from the Amazon to the Arctic' + ]) + }) })