In [1]:
import classify
import score

Load the training and test documents with their class labels, and set the paths to the positive and negative word lists

In [2]:
def read_lines(path):
    """Return every line of the UTF-8 text file at `path`, stripped of surrounding whitespace."""
    # The context manager closes the handle as soon as the lines are read,
    # rather than leaving it open until the garbage collector gets to it.
    with open(path, encoding='utf8') as handle:
        return [line.strip() for line in handle]


# Documents and their class labels, one entry per line of the respective file.
train_texts = read_lines("Data/train.docs.txt")
train_classes = read_lines("Data/train.classes.txt")
test_texts = read_lines("Data/test.docs.txt")
test_classes = read_lines("Data/test.classes.txt")

# These two are file paths (not loaded word lists); they are handed to the
# lexicon classifier as-is.
pos_words = "Data/pos-words.txt"
neg_words = "Data/neg-words.txt"

Setting up the classifiers

In [3]:
# Keyword arguments shared by the three classifiers that are built from the
# labelled training documents.
labelled_training_data = {"texts": train_texts, "classes": train_classes}

# The baseline is given only the training labels.
baseline_classifier = classify.Baseline(train_classes=train_classes)

# The lexicon classifier is given the paths of the positive/negative word files.
lexicon_classifier = classify.Lexicon(pos_words=pos_words, neg_words=neg_words)

# Models built from the training texts and their labels.
logreg_classifier = classify.LogReg(**labelled_training_data)
naivebayes_classifier = classify.NaiveBayes(**labelled_training_data)
binary_naivebayes_classifier = classify.BinaryNaiveBayes(**labelled_training_data)

Classify every test document with each classifier

In [4]:
# One predicted label per test document, in the same order as test_texts,
# for each of the five classifiers.
baseline_results = list(map(baseline_classifier.classify, test_texts))
lexicon_results = list(map(lexicon_classifier.classify, test_texts))
logreg_results = list(map(logreg_classifier.classify, test_texts))
naivebayes_results = list(map(naivebayes_classifier.classify, test_texts))
binary_naivebayes_results = list(map(binary_naivebayes_classifier.classify, test_texts))

# Scores

Per-class precision, recall, and F1, plus accuracy and macro-averaged scores, for each classifier on the test documents

Baseline classifier results (predict the most common class always)

In [6]:
# Report per-class and aggregate metrics for the baseline predictions against test_classes.
# NOTE(review): in the recorded output every class shows identical Precision/Recall/F1,
# and the printed recalls (0, 0, 0.67) do not average to the reported macro R (0.333).
# The per-class Recall/F1 lines look like they repeat precision; verify score.score's printing.
score.score(baseline_results, test_classes)

Current class: positive
Precision: 0
Recall: 0
F1-Score: 0

Current class: neutral
Precision: 0
Recall: 0
F1-Score: 0

Current class: negative
Precision: 0.67
Recall: 0.67
F1-Score: 0.67

Accuracy: 0.67
Macro averaged P: 0.223
Macro averaged R: 0.333
Macro averaged F: 0.267


Lexicon classifier results (tokens pre-defined as positive or negative)

In [7]:
# Report per-class and aggregate metrics for the lexicon predictions against test_classes.
# NOTE(review): in the recorded output every class shows identical Precision/Recall/F1.
# The printed values average to the macro P (0.447) but not to the macro R (0.482) or
# macro F (0.411), so the per-class Recall/F1 lines look like they repeat precision;
# verify score.score's printing.
score.score(lexicon_results, test_classes)

Current class: positive
Precision: 0.308
Recall: 0.308
F1-Score: 0.308

Current class: neutral
Precision: 0.217
Recall: 0.217
F1-Score: 0.217

Current class: negative
Precision: 0.817
Recall: 0.817
F1-Score: 0.817

Accuracy: 0.44
Macro averaged P: 0.447
Macro averaged R: 0.482
Macro averaged F: 0.411


Logistic Regression classifier results

In [8]:
# Report per-class and aggregate metrics for the logistic regression predictions against test_classes.
# NOTE(review): in the recorded output every class shows identical Precision/Recall/F1.
# The printed values average to roughly the macro P (0.583) but not to the macro R (0.564),
# so the per-class Recall/F1 lines look like they repeat precision; verify score.score's printing.
score.score(logreg_results, test_classes)

Current class: positive
Precision: 0.343
Recall: 0.343
F1-Score: 0.343

Current class: neutral
Precision: 0.596
Recall: 0.596
F1-Score: 0.596

Current class: negative
Precision: 0.808
Recall: 0.808
F1-Score: 0.808

Accuracy: 0.7
Macro averaged P: 0.583
Macro averaged R: 0.564
Macro averaged F: 0.572


Non-binarized Naive Bayes classifier results

In [9]:
# Report per-class and aggregate metrics for the Naive Bayes predictions against test_classes.
# NOTE(review): in the recorded output every class shows identical Precision/Recall/F1.
# The printed values average to the macro P (0.601) but not to the macro R (0.561),
# so the per-class Recall/F1 lines look like they repeat precision; verify score.score's printing.
score.score(naivebayes_results, test_classes)

Current class: positive
Precision: 0.371
Recall: 0.371
F1-Score: 0.371

Current class: neutral
Precision: 0.64
Recall: 0.64
F1-Score: 0.64

Current class: negative
Precision: 0.792
Recall: 0.792
F1-Score: 0.792

Accuracy: 0.708
Macro averaged P: 0.601
Macro averaged R: 0.561
Macro averaged F: 0.576


Binarized Naive Bayes classifier results (each token counted at most once per document)

In [10]:
# Report per-class and aggregate metrics for the binarized Naive Bayes predictions against test_classes.
# NOTE(review): in the recorded output every class shows identical Precision/Recall/F1.
# The printed values average to the macro P (0.608) but not to the macro R (0.571),
# so the per-class Recall/F1 lines look like they repeat precision; verify score.score's printing.
score.score(binary_naivebayes_results, test_classes)

Current class: positive
Precision: 0.388
Recall: 0.388
F1-Score: 0.388

Current class: neutral
Precision: 0.633
Recall: 0.633
F1-Score: 0.633

Current class: negative
Precision: 0.803
Recall: 0.803
F1-Score: 0.803

Accuracy: 0.713
Macro averaged P: 0.608
Macro averaged R: 0.571
Macro averaged F: 0.585
