From abbe74d3102cb2c212d6e26ff680364d200cae80 Mon Sep 17 00:00:00 2001 From: Chris Umbel Date: Sat, 26 Nov 2011 23:12:15 -0500 Subject: [PATCH] classifiers accept text --- lib/natural/classifiers/bayes_classifier.js | 2 +- lib/natural/classifiers/classifier.js | 6 ++++ .../logistic_regression_classifier.js | 2 +- spec/bayes_classifier_spec.js | 17 ++++++++- spec/logistic_regression_classifier_spec.js | 35 +++++++++++++------ 5 files changed, 49 insertions(+), 13 deletions(-) diff --git a/lib/natural/classifiers/bayes_classifier.js b/lib/natural/classifiers/bayes_classifier.js index a88075b2d..23802cef4 100644 --- a/lib/natural/classifiers/bayes_classifier.js +++ b/lib/natural/classifiers/bayes_classifier.js @@ -26,7 +26,7 @@ Classifier = require('./classifier'), ApparatusBayesClassifier = require('apparatus').BayesClassifier; var BayesClassifier = function(stemmer) { - Classifier.call(this, new ApparatusBayesClassifier()); + Classifier.call(this, new ApparatusBayesClassifier(), stemmer); }; sys.inherits(BayesClassifier, Classifier); diff --git a/lib/natural/classifiers/classifier.js b/lib/natural/classifiers/classifier.js index 193468cf2..cf8a0a027 100644 --- a/lib/natural/classifiers/classifier.js +++ b/lib/natural/classifiers/classifier.js @@ -34,6 +34,9 @@ function addDocument(text, classification) { if(this.docs[classification] == null) this.docs[classification] = []; + if(typeof text === 'string') + text = this.stemmer.tokenizeAndStem(text); + this.docs[classification].push(text); for(var i = 0; i < text.length; i++) { @@ -44,6 +47,9 @@ function addDocument(text, classification) { function textToFeatures(observation) { var features = []; + if(typeof observation === 'string') + observation = this.stemmer.tokenizeAndStem(observation); + for(var feature in this.features) { if(observation.indexOf(feature) > -1) features.push(1); diff --git a/lib/natural/classifiers/logistic_regression_classifier.js b/lib/natural/classifiers/logistic_regression_classifier.js index 
05e47915b..2d5403ac7 100644 --- a/lib/natural/classifiers/logistic_regression_classifier.js +++ b/lib/natural/classifiers/logistic_regression_classifier.js @@ -26,7 +26,7 @@ Classifier = require('./classifier'), ApparatusLogisticRegressionClassifier = require('apparatus').LogisticRegressionClassifier; var LogisticRegressionClassifier = function(stemmer) { - Classifier.call(this, new ApparatusLogisticRegressionClassifier()); + Classifier.call(this, new ApparatusLogisticRegressionClassifier(), stemmer); }; sys.inherits(LogisticRegressionClassifier, Classifier); diff --git a/spec/bayes_classifier_spec.js b/spec/bayes_classifier_spec.js index 14a265379..0914a8602 100644 --- a/spec/bayes_classifier_spec.js +++ b/spec/bayes_classifier_spec.js @@ -24,7 +24,7 @@ var natural = require('lib/natural'); describe('bayes classifier', function() { describe('classifier', function() { - it('should classify with mixed training data', function() { + it('should classify with arrays', function() { var classifier = new natural.BayesClassifier(); classifier.addDocument(['fix', 'box'], 'computing'); classifier.addDocument(['write', 'code'], 'computing'); @@ -38,5 +38,20 @@ describe('bayes classifier', function() { expect(classifier.classify(['bug', 'code'])).toBe('computing'); expect(classifier.classify(['read', 'thing'])).toBe('literature'); }); + + it('should classify with strings', function() { + var classifier = new natural.BayesClassifier(); + classifier.addDocument('i fixed the box', 'computing'); + classifier.addDocument('i write code', 'computing'); + classifier.addDocument('nasty script code', 'computing'); + classifier.addDocument('write a book', 'literature'); + classifier.addDocument('read a book', 'literature'); + classifier.addDocument('study the books', 'literature'); + + classifier.train(); + + expect(classifier.classify('a bug in the code')).toBe('computing'); + expect(classifier.classify('read all the books')).toBe('literature'); + }); }); }); diff --git 
a/spec/logistic_regression_classifier_spec.js b/spec/logistic_regression_classifier_spec.js index de6e9f93d..3e107fde6 100644 --- a/spec/logistic_regression_classifier_spec.js +++ b/spec/logistic_regression_classifier_spec.js @@ -25,18 +25,33 @@ var natural = new require('lib/natural'), describe('logistic regression', function() { it('should classify with individually trained documents', function() { - var logistic = new LogisticRegressionClassifier(); + var classifier = new LogisticRegressionClassifier(); - logistic.addDocument(['have', 'computer'], 'IT'); - logistic.addDocument(['have', 'phone'], 'IT'); - logistic.addDocument(['computer', 'suck'], 'IT'); - logistic.addDocument(['field', 'goal'], 'sports'); - logistic.addDocument(['score', 'goal'], 'sports'); - logistic.addDocument(['great', 'speed'], 'sports'); + classifier.addDocument(['have', 'computer'], 'IT'); + classifier.addDocument(['have', 'phone'], 'IT'); + classifier.addDocument(['computer', 'suck'], 'IT'); + classifier.addDocument(['field', 'goal'], 'sports'); + classifier.addDocument(['score', 'goal'], 'sports'); + classifier.addDocument(['great', 'speed'], 'sports'); - logistic.train(); + classifier.train(); - expect(logistic.classify(['hate', 'computer'])).toBe('IT'); - expect(logistic.classify(['score', 'please'])).toBe('sports'); + expect(classifier.classify(['hate', 'computer'])).toBe('IT'); + expect(classifier.classify(['score', 'please'])).toBe('sports'); + }); + + it('should classify with strings', function() { + var classifier = new natural.LogisticRegressionClassifier(); + classifier.addDocument('i fixed the box', 'computing'); + classifier.addDocument('i write code', 'computing'); + classifier.addDocument('nasty script code', 'computing'); + classifier.addDocument('write a book', 'literature'); + classifier.addDocument('read a book', 'literature'); + classifier.addDocument('study the books', 'literature'); + + classifier.train(); + + expect(classifier.classify('a bug in the 
code')).toBe('computing'); + expect(classifier.classify('read all the books')).toBe('literature'); }); });