Added removeDocument and retrain #36

Merged
merged 1 commit on Nov 1, 2013
@@ -45,6 +45,33 @@ function addDocument(text, classification) {
}
}
/**
 * Removes one previously added document from the training set.
 *
 * A stored document matches when its label equals `classification` and its
 * token list, joined with single spaces, equals that of `text`. When several
 * stored documents match, only the last one is removed. Nothing happens when
 * there is no match.
 *
 * Only `this.docs` and `this.features` are modified here; the underlying
 * classifier is not touched, so retraining is a separate step (see `retrain`).
 *
 * @param {string|string[]} text - Document to remove. A string is first run
 *   through `this.stemmer.tokenizeAndStem`; an array is used as the token list.
 * @param {*} classification - Label the document was stored under.
 */
function removeDocument(text, classification) {
  var docs = this.docs
    , features = this.features
    , tokens = text
    , orphaned = Object.create(null)
    , pos = -1
    , wanted
    , i;

  if (typeof tokens === 'string') {
    tokens = this.stemmer.tokenizeAndStem(tokens);
  }

  // Nothing stored means nothing to remove.
  if (docs.length === 0) {
    return;
  }

  wanted = tokens.join(' ');

  // Walk backwards and stop at the first hit: when several stored documents
  // match, the one with the highest index is the one that gets removed.
  for (i = docs.length - 1; i >= 0; i--) {
    // Loose equality on the label is deliberate so that e.g. 1 and '1' keep
    // matching each other for existing callers.
    if (docs[i].label == classification && docs[i].text.join(' ') === wanted) {
      pos = i;
      break;
    }
  }

  if (pos === -1) {
    return;
  }

  docs.splice(pos, 1);

  // Drop a feature only when no remaining document still contains that token;
  // deleting every token of the removed document would also strip vocabulary
  // that other documents rely on.
  tokens.forEach(function(token) {
    orphaned[token] = true;
  });
  docs.forEach(function(doc) {
    doc.text.forEach(function(token) {
      delete orphaned[token];
    });
  });
  Object.keys(orphaned).forEach(function(token) {
    delete features[token];
  });
}
+
function textToFeatures(observation) {
var features = [];
@@ -71,6 +98,12 @@ function train() {
this.classifier.train();
}
/**
 * Rebuilds the underlying classifier from scratch.
 *
 * A new instance is created with the current classifier's own constructor
 * (called without arguments), `lastAdded` is reset to 0, and `train()` is
 * then invoked on this object.
 */
function retrain() {
  var FreshClassifier = this.classifier.constructor;

  this.classifier = new FreshClassifier();
  this.lastAdded = 0;
  this.train();
}
+
/**
 * Converts an observation to features via `this.textToFeatures` and returns
 * whatever the underlying classifier's `getClassifications` yields for them.
 *
 * @param {*} observation - Passed unchanged to `this.textToFeatures`.
 * @returns {*} The underlying classifier's result, unmodified.
 */
function getClassifications(observation) {
  var model = this.classifier;
  var observedFeatures = this.textToFeatures(observation);

  return model.getClassifications(observedFeatures);
}
@@ -106,7 +139,9 @@ function load(filename, callback) {
}
Classifier.prototype.addDocument = addDocument;
+Classifier.prototype.removeDocument = removeDocument;
Classifier.prototype.train = train;
+Classifier.prototype.retrain = retrain;
Classifier.prototype.classify = classify;
Classifier.prototype.textToFeatures = textToFeatures;
Classifier.prototype.save = save;
@@ -54,7 +54,7 @@ describe('bayes classifier', function() {
expect(classifier.getClassifications('i write code')[1].label).toBe('literature');
});
- it('should classify with arrays', function() {
+ it('should classify with strings', function() {
var classifier = new natural.BayesClassifier();
classifier.addDocument('i fixed the box', 'computing');
classifier.addDocument('i write code', 'computing');
@@ -69,6 +69,44 @@ describe('bayes classifier', function() {
expect(classifier.classify('read all the books')).toBe('literature');
});
it('should classify and re-classify after document-removal', function() {
  var classifier = new natural.BayesClassifier();
  var valuesByLabel = {};

  // Seed one "good" and two "bad" documents, then train.
  classifier.addDocument('foo bar baz', 'good');
  classifier.addDocument('qux zooby', 'bad');
  classifier.addDocument('asdf qwer', 'bad');
  classifier.train();

  expect(classifier.classify('foo')).toBe('good');
  expect(classifier.classify('qux')).toBe('bad');

  // Drop the "bad" document that contains qux and rebuild the classifier.
  classifier.removeDocument('qux zooby', 'bad');
  classifier.retrain();

  // `classify` always yields a single label, even on a tie, so compare the
  // raw per-label values instead: with qux no longer in any document, both
  // labels are expected to score the same.
  classifier.getClassifications('qux').forEach(function(entry) {
    valuesByLabel[entry.label] = entry.value;
  });
  expect(valuesByLabel.good).toEqual(valuesByLabel.bad);

  // Add the same text back under the other label and rebuild again.
  classifier.addDocument('qux zooby', 'good');
  classifier.retrain();

  // qux now belongs to "good"; the untouched document still classifies as before.
  expect(classifier.classify('foo')).toBe('good');
  expect(classifier.classify('qux')).toBe('good');
});
+
it('should serialize and deserialize a working classifier', function() {
var classifier = new natural.BayesClassifier();
classifier.addDocument('i fixed the box', 'computing');