Skip to content

HTTPS clone URL

Subversion checkout URL

You can clone with HTTPS or Subversion.
Download ZIP
Browse files

added test coverage for tokenizers

  • Loading branch information...
commit 5f893c64d7d1ed73b722cb4d69197e42bc32a044 1 parent 6180f77
@chrisumbel chrisumbel authored
View
3  .gitignore
@@ -1,2 +1,3 @@
*~
-\#*
+\#*
+*.kpf
View
2  Makefile
@@ -26,4 +26,4 @@
# POSSIBILITY OF SUCH DAMAGE.
clean:
- @rm -f **/*~ *~
+ @rm -f **/*~ *~ **/\#* \#*
View
12 lib/tokenizers.js
@@ -27,9 +27,19 @@ ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
POSSIBILITY OF SUCH DAMAGE.
*/
+function trim(array) { // strips at most one empty string from each end; mutates and returns the same array
+ if(array[array.length - 1] == '') // last element loosely equals '': drop it
+ array.pop();
+
+ if(array[0] == '') // first element loosely equals '': drop it (checked after the pop, so [''] ends up as [])
+ array.shift();
+
+ return array;
+}
+
// break a string up into an array of tokens by anything non-word
function tokenize(text) {
- return text.split(/\W+/);
+ return trim(text.split(/\W+/)); // trim() removes the empty string split() leaves at either end when text starts or ends with non-word characters
}
// expose the tokenize function
View
32 spec/porter_stemmer_spec.js
@@ -1,3 +1,31 @@
+/*
+Copyright (c) 2011, Chris Umbel
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+3. Neither the name of the PostgreSQL Global Development Group nor the names
+ of its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
stemmer = require('lib/porter_stemmer');
stemmer.attach();
@@ -100,8 +128,6 @@ describe('porter_stemmer', function() {
});
it('should tokenize and stem', function() {
- var result = 'scoring stinks'.tokenizeAndStem();
- expect(result[0]).toBe('score');
- expect(result[1]).toBe('stink');
+ expect('scoring stinks'.tokenizeAndStem()).toEqual(['score', 'stink']);
});
});
View
52 spec/tokenizers_spec.js
@@ -0,0 +1,52 @@
+/*
+Copyright (c) 2011, Chris Umbel
+All rights reserved.
+
+Redistribution and use in source and binary forms, with or without
+modification, are permitted provided that the following conditions are met:
+
+1. Redistributions of source code must retain the above copyright notice,
+ this list of conditions and the following disclaimer.
+2. Redistributions in binary form must reproduce the above copyright notice,
+ this list of conditions and the following disclaimer in the documentation
+ and/or other materials provided with the distribution.
+3. Neither the name of the PostgreSQL Global Development Group nor the names
+ of its contributors may be used to endorse or promote products derived
+ from this software without specific prior written permission.
+
+THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
+AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
+IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
+ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER OR CONTRIBUTORS BE
+LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
+CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
+SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
+INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
+CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
+ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE
+POSSIBILITY OF SUCH DAMAGE.
+*/
+
+var tokenizer = require('lib/tokenizers');
+tokenizer.attach();
+
+describe('tokenizer', function() { // exercises tokenizer.tokenize() and the String instance method (presumably installed by tokenizer.attach() above)
+ it('should tokenize strings', function() { // module-level function, whitespace-separated words
+ expect(tokenizer.tokenize('these are things')).toEqual(['these', 'are', 'things']);
+ });
+ it('should tokenize strings via instance method', function() { // same input, called on the string itself
+ expect('these are things'.tokenize()).toEqual(['these', 'are', 'things']);
+ });
+ it('should swollow punctuation', function() { // a comma between words must not yield a token
+ expect('these are things, no'.tokenize()).toEqual(['these', 'are', 'things', 'no']);
+ });
+ it('should swollow final punctuation', function() { // no trailing empty token after the closing '?'
+ expect('these are things, no?'.tokenize()).toEqual(['these', 'are', 'things', 'no']);
+ });
+ it('should swollow initial punctuation', function() { // no leading empty token before the opening '.'
+ expect('.these are things, no'.tokenize()).toEqual(['these', 'are', 'things', 'no']);
+ });
+ it('should swollow duplicate punctuation', function() { // a run of separators ('... ') counts as one split point
+ expect('i shal... pause'.tokenize()).toEqual(['i', 'shal', 'pause']);
+ });
+});
Please sign in to comment.
Something went wrong with that request. Please try again.