Skip to content

Commit

Permalink
Simplified regular expression and added support for ellipsis (#687)
Browse files Browse the repository at this point in the history
* Simplified regular expression and added support for ellipsis

* Standard syntax

* Standard syntax
  • Loading branch information
Hugo-ter-Doest committed May 29, 2023
1 parent 9c78b9d commit a7a8a23
Show file tree
Hide file tree
Showing 2 changed files with 16 additions and 1 deletion.
2 changes: 1 addition & 1 deletion lib/natural/tokenizers/sentence_tokenizer.js
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ util.inherits(SentenceTokenizer, Tokenizer)

SentenceTokenizer.prototype.tokenize = function (text) {
// break string up into sentences based on punctuation and quotation marks
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!])(\s[.?!])*["'’”'"\])}⟩]?(?=\s+|$)|(?<=\s+|^)\S(.*?[.?!])(\s[.?!])*(?=\s+|$)/g)
let tokens = text.match(/(?<=\s+|^)["'‘“'"[({⟨]?(.*?[.?!…]|[^.?!…]+)(\s[.?!])*["'’”'"\])}⟩]?(?=\s+|$)/g)

DEBUG && console.log('SentenceTokenizer.tokenize: ' + tokens)

Expand Down
15 changes: 15 additions & 0 deletions spec/sentence_tokenizer_spec.js
Original file line number Diff line number Diff line change
Expand Up @@ -158,4 +158,19 @@ describe('sentence_tokenizer', function () {
'Test: Test (test) test “Test.”'
])
})

// Spec for issue #648: each case pairs an input text with the exact list of
// sentences that tokenizer.tokenize is expected to return for it.
it('should handle text with the ellipsis symbol … and it should handle last sentence without punctuation (issue #648)', function () {
  const cases = [
    {
      // First sentence ends with the single-character ellipsis symbol.
      input: 'We’re heading for a catastrophic global temperature rise… Fires are blazing from the Amazon to the Arctic.',
      sentences: [
        'We’re heading for a catastrophic global temperature rise…',
        'Fires are blazing from the Amazon to the Arctic.'
      ]
    },
    {
      // Last sentence has no closing punctuation mark at all.
      input: 'We’re heading for a catastrophic global temperature rise. Fires are blazing from the Amazon to the Arctic',
      sentences: [
        'We’re heading for a catastrophic global temperature rise.',
        'Fires are blazing from the Amazon to the Arctic'
      ]
    }
  ]

  // Cases are checked in declaration order, one expectation per case.
  cases.forEach(function (testCase) {
    expect(tokenizer.tokenize(testCase.input)).toEqual(testCase.sentences)
  })
})
})

0 comments on commit a7a8a23

Please sign in to comment.