
classification uses apparatus now

chrisumbel committed Nov 28, 2011
1 parent abbe74d commit 2e602203a821471ac05da5868ce0695be94d5a5b
@@ -1,7 +1,7 @@
*~
\#*
*.kpf
-classifier.json
+*classifier.json
node_modules
.*
io_spec/test_data/wordnet/download/*
@@ -29,6 +29,7 @@ test_clean:
clean: test_clean
@find ./ -name *~ | xargs rm -f
@find ./ -name \#* | xargs rm -f
+	@rm -f *classifier.json
test:
@NODE_PATH=. jasmine-node spec/
@@ -75,20 +75,25 @@ the same thing can be done with a lancaster stemmer
console.log("i am waking up to the sounds of chainsaws".tokenizeAndStem());
console.log("chainsaws".stem());
-Naive Bayes Classifier
+Classifiers
----------------------
+Two classifiers are currently supported: Naive Bayes and logistic regression.
+The following examples use the BayesClassifier class, but the
+LogisticRegressionClassifier class could be substituted instead.
+
var natural = require('natural'),
classifier = new natural.BayesClassifier();
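
To use logistic regression instead, only the constructor changes; the rest of the examples below work the same way:

    var classifier = new natural.LogisticRegressionClassifier();
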
you can train the classifier on sample text. it will use reasonable defaults to
tokenize and stem the text.
+
+ classifier.addDocument('i am long qqqq', 'buy');
+ classifier.addDocument("buy the q's", 'buy');
+ classifier.addDocument('short gold', 'sell');
+ classifier.addDocument('sell gold', 'sell');
- classifier.train([{classification: 'buy', text: "i am long qqqq"},
- {classification: 'buy', text: "buy the q's"},
- {classification: 'sell', text: "short gold"},
- {classification: 'sell', text: "sell gold"}
- ]);
+ classifier.train();
outputs "sell"
@@ -104,57 +109,32 @@ the classifier can also be trained on and classify arrays of tokens, strings, or
any mixture. arrays let you use entirely custom data with your own
tokenization/stemming if any at all.
- classifier.train([{classification: 'hockey', text: ['puck', 'shoot']},
- {classification: 'hockey', text: 'goalies stop pucks.'},
- {classification: 'stocks', text: ['stop', 'loss']},
- {classification: 'stocks', text: 'creat a stop order'}
- ]);
-
- console.log(classifier.classify('stop out at $100'));
- console.log(classifier.classify('stop the puck, fool!'));
-
- console.log(classifier.classify(['stop', 'out']));
- console.log(classifier.classify(['stop', 'puck', 'fool']));
-
-A classifier can also be persisted and recalled so you can reuse a training.
+ classifier.addDocument(['sell', 'gold'], 'sell');
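
for example, a quick sketch mixing pre-tokenized arrays with plain strings (the hockey/stocks labels are just illustrative):

    classifier.addDocument(['puck', 'shoot'], 'hockey');
    classifier.addDocument('goalies stop pucks', 'hockey');
    classifier.addDocument(['stop', 'loss'], 'stocks');
    classifier.addDocument('create a stop order', 'stocks');
    classifier.train();

    // strings and token arrays can both be passed to classify
    console.log(classifier.classify('stop out at $100'));
    console.log(classifier.classify(['stop', 'puck', 'fool']));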
- var classifier = new natural.BayesClassifier();
-
- classifier.train([{classification: 'buy', text: ['long', 'qqqq']},
- {classification: 'buy', text: "buy the q's"},
- {classification: 'sell', text: "short gold"},
- {classification: 'sell', text: ['sell', 'gold']}
- ]);
-
-persist to a file on disk named "classifier.json"
+A classifier can also be persisted and recalled so you can reuse a training.
classifier.save('classifier.json', function(err, classifier) {
// the classifier is saved to the classifier.json file!
});
and to recall from the classifier.json saved above (the second argument to load is an optional stemmer; passing null falls back to the default Porter stemmer):
- natural.BayesClassifier.load('classifier.json', function(err, classifier) {
+ natural.BayesClassifier.load('classifier.json', null, function(err, classifier) {
console.log(classifier.classify('long SUNW'));
console.log(classifier.classify('short SUNW'));
});
A classifier can also be serialized and deserialized as such
var classifier = new natural.BayesClassifier();
-
- classifier.train([{classification: 'buy', text: ['long', 'qqqq']},
- {classification: 'buy', text: "buy the q's"},
- {classification: 'sell', text: "short gold"},
- {classification: 'sell', text: ['sell', 'gold']}
- ]);
+ classifier.addDocument(['sell', 'gold'], 'sell');
+ classifier.addDocument(['buy', 'silver'], 'buy');
// serialize
var raw = JSON.stringify(classifier);
// deserialize
    var restoredClassifier = natural.BayesClassifier.restore(JSON.parse(raw));
- console.log(restoredClassifier.classify('i am short silver'));
- console.log(restoredClassifier.classify('i am long silver'));
+ console.log(restoredClassifier.classify('i should sell that'));
Phonetics
---------
@@ -31,4 +31,21 @@ var BayesClassifier = function(stemmer) {
sys.inherits(BayesClassifier, Classifier);
+function restore(classifier, stemmer) {
+ classifier = Classifier.restore(classifier, stemmer);
+ classifier.__proto__ = BayesClassifier.prototype;
+ classifier.classifier = ApparatusBayesClassifier.restore(classifier.classifier);
+
+ return classifier;
+}
+
+function load(filename, stemmer, callback) {
+ Classifier.load(filename, function(err, classifier) {
+ callback(err, restore(classifier, stemmer));
+ });
+}
+
+BayesClassifier.restore = restore;
+BayesClassifier.load = load;
+
module.exports = BayesClassifier;
@@ -25,19 +25,20 @@ sys = require('sys');
var Classifier = function(classifier, stemmer) {
this.classifier = classifier;
- this.docs = {};
+ this.docs = [];
this.features = {};
this.stemmer = stemmer || PorterStemmer;
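+ // index of the next document in this.docs to hand to the underlying apparatus classifier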
+ this.lastAdded = 0;
};
function addDocument(text, classification) {
- if(this.docs[classification] == null)
- this.docs[classification] = [];
-
if(typeof text === 'string')
text = this.stemmer.tokenizeAndStem(text);
- this.docs[classification].push(text);
+ this.docs.push({
+ label: classification,
+ text: text
+ });
for(var i = 0; i < text.length; i++) {
this.features[text[i]] = 1;
@@ -61,41 +62,51 @@ function textToFeatures(observation) {
}
function train() {
- for(var classification in this.docs) {
- for(var i = 0; i < this.docs[classification].length; i++) {
- var features = this.textToFeatures(this.docs[classification][i]);
- this.classifier.addExample(features, classification);
- }
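+ // incremental training: only hand documents added since the last train() call to the underlying classifier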
+ for(var i = this.lastAdded; i < this.docs.length; i++) {
+ var features = this.textToFeatures(this.docs[i].text);
+ this.classifier.addExample(features, this.docs[i].label);
+ this.lastAdded++;
}
this.classifier.train();
}
-/*
-function load(filename, callback) {
- Classifier.load(filename, function(err, classifier) {
- callback(err, restore(classifier));
- });
+function classify(observation) {
+ return this.classifier.classify(this.textToFeatures(observation));
}
function restore(classifier, stemmer) {
- classifier = Classifier.restore(classifier, stemmer);
- classifier.__proto__ = BayesClassifier.prototype;
-
- return classifier;
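+ // functions don't survive JSON serialization, so re-attach a stemmer here; subclasses re-attach their prototype and apparatus classifier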
+ classifier.stemmer = stemmer || PorterStemmer;
+ return classifier;
}
-*/
-function classify(observation) {
- return this.classifier.classify(this.textToFeatures(observation));
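+// persist the entire classifier (docs, features and the underlying apparatus classifier) to disk as JSON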
+function save(filename, callback) {
+ var data = JSON.stringify(this);
+ var fs = require('fs');
+ fs.writeFile(filename, data, 'utf8', callback);
+}
+
+function load(filename, callback) {
+ var fs = require('fs');
+
+ fs.readFile(filename, 'utf8', function(err, data) {
+ var classifier;
+
+ if(!err) {
+ classifier = JSON.parse(data);
+ }
+
+ if(callback)
+ callback(err, classifier);
+ });
}
Classifier.prototype.addDocument = addDocument;
Classifier.prototype.train = train;
Classifier.prototype.classify = classify;
Classifier.prototype.textToFeatures = textToFeatures;
-
-//Classifier.load = load;
-//Classifier.restore = restore;
+Classifier.prototype.save = save;
+Classifier.restore = restore;
+Classifier.load = load;
module.exports = Classifier;
@@ -31,4 +31,30 @@ var LogisticRegressionClassifier = function(stemmer) {
sys.inherits(LogisticRegressionClassifier, Classifier);
+function restore(classifier, stemmer) {
+ classifier = Classifier.restore(classifier, stemmer);
+ classifier.__proto__ = LogisticRegressionClassifier.prototype;
+ classifier.classifier = ApparatusLogisticRegressionClassifier.restore(classifier.classifier);
+
+ return classifier;
+}
+
+function load(filename, stemmer, callback) {
+ Classifier.load(filename, function(err, classifier) {
+ callback(err, restore(classifier, stemmer));
+ });
+}
+
+function train() {
+ // we need to reset the training state because logistic regression
+ // needs its matrices to have their widths synced, etc.
+ this.lastAdded = 0;
+ this.classifier = new ApparatusLogisticRegressionClassifier();
+ Classifier.prototype.train.call(this);
+}
+
+LogisticRegressionClassifier.prototype.train = train;
+LogisticRegressionClassifier.restore = restore;
+LogisticRegressionClassifier.load = load;
+
module.exports = LogisticRegressionClassifier;
@@ -1 +1 @@
-{"stemmer":{},"docs":{"buy":[["long","qqqq"],["bui"]],"sell":[["short","gold"],["sell","gold"]]},"features":["long","qqqq","bui","short","gold","sell"],"featurePositions":{"long":0,"qqqq":1,"bui":2,"short":3,"gold":4,"sell":5},"maxFeaturePosition":6,"classifications":["buy","sell"],"m":4,"theta":[{"elements":[0,2.994818955429701,2.994818955429701,-2.1878169311137983,-4.3756338622275965,-2.1878169311137983]},{"elements":[0,-2.9948189554297007,-2.9948189554297007,2.1878169311137987,4.375633862227597,2.1878169311137987]}]}
+{"classifier":{"examples":{"computing":[[1,1,0,0,0,0,0,0,0],[0,0,1,1,0,0,0,0,0],[0,0,0,1,1,1,0,0,0]],"literature":[[0,0,1,0,0,0,1,0,0],[0,0,0,0,0,0,1,1,0],[0,0,0,0,0,0,1,0,1]]},"features":[],"featurePositions":{},"maxFeaturePosition":0,"classifications":["computing","literature"],"exampleCount":6,"theta":[{"elements":[2.0057203693422903,2.0057203693422903,0.0904766662626176,3.447364098636818,0.9696521506748157,0.9696521506748157,-4.840706080886848,-1.2267353995937322,-1.2267353995937322]},{"elements":[-2.00572036934229,-2.00572036934229,-0.09047666626261795,-3.447364098636819,-0.9696521506748155,-0.9696521506748155,4.84070608088685,1.2267353995937313,1.2267353995937313]}]},"docs":[{"label":"computing","text":["fix","box"]},{"label":"computing","text":["write","code"]},{"label":"computing","text":["nasti","script","code"]},{"label":"literature","text":["write","book"]},{"label":"literature","text":["read","book"]},{"label":"literature","text":["studi","book"]}],"features":{"fix":1,"box":1,"write":1,"code":1,"nasti":1,"script":1,"book":1,"read":1,"studi":1},"stemmer":{},"lastAdded":6}
@@ -1,12 +1,13 @@
{
"name": "natural",
"description": "General natural language (tokenizing, stemming, classification, inflection, phonetics, tfidf, WordNet) facilities for node.",
- "version": "0.0.58",
+ "version": "0.0.60",
"engines": {
"node": ">=0.2.6"
},
"dependencies": {
- "sylvester": ">= 0.0.4",
+ "sylvester": ">= 0.0.8",
+ "apparatus": ">= 0.0.4",
"underscore": "*"
},
"devDependencies": {
@@ -53,5 +53,56 @@ describe('bayes classifier', function() {
expect(classifier.classify('a bug in the code')).toBe('computing');
expect(classifier.classify('read all the books')).toBe('literature');
});
+
+ it('should serialize and deserialize a working classifier', function() {
+ var classifier = new natural.BayesClassifier();
+ classifier.addDocument('i fixed the box', 'computing');
+ classifier.addDocument('i write code', 'computing');
+ classifier.addDocument('nasty script code', 'computing');
+ classifier.addDocument('write a book', 'literature');
+ classifier.addDocument('read a book', 'literature');
+ classifier.addDocument('study the books', 'literature');
+
+ var obj = JSON.stringify(classifier);
+ var newClassifier = natural.BayesClassifier.restore(JSON.parse(obj));
+
+ newClassifier.addDocument('kick a ball', 'sports');
+ newClassifier.addDocument('hit some balls', 'sports');
+ newClassifier.addDocument('kick and punch', 'sports');
+
+ newClassifier.train();
+
+ expect(newClassifier.classify('a bug in the code')).toBe('computing');
+ expect(newClassifier.classify('read all the books')).toBe('literature');
+ expect(newClassifier.classify('kick butt')).toBe('sports');
+ });
+
+ it('should save and load a working classifier', function() {
+ var classifier = new natural.BayesClassifier();
+ classifier.addDocument('i fixed the box', 'computing');
+ classifier.addDocument('i write code', 'computing');
+ classifier.addDocument('nasty script code', 'computing');
+ classifier.addDocument('write a book', 'literature');
+ classifier.addDocument('read a book', 'literature');
+ classifier.addDocument('study the books', 'literature');
+
+ classifier.train();
+
+ classifier.save('bayes_classifier.json', function(err) {
+ natural.BayesClassifier.load('bayes_classifier.json', null,
+ function(err, newClassifier){
+ newClassifier.addDocument('kick a ball', 'sports');
+ newClassifier.addDocument('hit some balls', 'sports');
+ newClassifier.addDocument('kick and punch', 'sports');
+
+ newClassifier.train();
+
+ expect(newClassifier.classify('a bug in the code')).toBe('computing');
+ expect(newClassifier.classify('read all the books')).toBe('literature');
+ expect(newClassifier.classify('kick butt')).toBe('sports');
+ asyncSpecDone();
+ });
+ });
+ });
});
});
@@ -54,4 +54,54 @@ describe('logistic regression', function() {
expect(classifier.classify('a bug in the code')).toBe('computing');
expect(classifier.classify('read all the books')).toBe('literature');
});
+
+ it('should serialize and deserialize a working classifier', function() {
+ var classifier = new natural.LogisticRegressionClassifier();
+ classifier.addDocument('i fixed the box', 'computing');
+ classifier.addDocument('i write code', 'computing');
+ classifier.addDocument('nasty script code', 'computing');
+ classifier.addDocument('write a book', 'literature');
+ classifier.addDocument('read a book', 'literature');
+ classifier.addDocument('study the books', 'literature');
+
+ var obj = JSON.stringify(classifier);
+ var newClassifier = natural.LogisticRegressionClassifier.restore(JSON.parse(obj));
+
+ newClassifier.addDocument('kick a ball', 'sports');
+ newClassifier.addDocument('hit some balls', 'sports');
+ newClassifier.addDocument('kick and punch', 'sports');
+
+ newClassifier.train();
+
+ expect(newClassifier.classify('a bug in the code')).toBe('computing');
+ expect(newClassifier.classify('read all the books')).toBe('literature');
+ expect(newClassifier.classify('kick butt')).toBe('sports');
+ });
+
+ it('should save and load a working classifier', function() {
+ var classifier = new natural.LogisticRegressionClassifier();
+ classifier.addDocument('i fixed the box', 'computing');
+ classifier.addDocument('i write code', 'computing');
+ classifier.addDocument('nasty script code', 'computing');
+ classifier.addDocument('write a book', 'literature');
+ classifier.addDocument('read a book', 'literature');
+ classifier.addDocument('study the books', 'literature');
+ classifier.train();
+
+ classifier.save('lr_classifier.json', function(err) {
+ natural.LogisticRegressionClassifier.load('lr_classifier.json', null,
+ function(err, newClassifier){
+ newClassifier.addDocument('hit some balls', 'sports');
+ newClassifier.addDocument('kick a ball', 'sports');
+ newClassifier.addDocument('kick and punch things', 'sports');
+ newClassifier.train();
+
+ expect(newClassifier.classify('a bug in the code')).toBe('computing');
+ expect(newClassifier.classify('read all the books')).toBe('literature');
+ expect(newClassifier.classify('kick butt')).toBe('sports');
+
+ asyncSpecDone();
+ });
+ });
+ });
});
