In [37]:
from nltk.corpus import stopwords
from nltk.collocations import BigramCollocationFinder
from nltk.metrics import BigramAssocMeasures
from nltk.corpus import movie_reviews
import collections
from nltk.classify import NaiveBayesClassifier
from nltk.classify.util import accuracy
from nltk.classify import DecisionTreeClassifier
from nltk.classify import MaxentClassifier
from nltk.classify import scikitlearn
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB,BernoulliNB
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC,LinearSVC,NuSVC
from nltk import metrics
from nltk.metrics import BigramAssocMeasures
from nltk.probability import FreqDist, ConditionalFreqDist
import itertools
from nltk.classify import ClassifierI

In [38]:
def bag_of_words(words):
    """Build a presence-only feature dict from an iterable of words.

    Each distinct word becomes a key mapped to ``True``; repeated words
    collapse into a single entry.
    """
    return {token: True for token in words}


In [39]:
def bag_of_words_not_in_set(words,badwords):
    """Return a bag-of-words feature dict for ``words`` with ``badwords`` removed.

    Both arguments are converted to sets, so duplicates are ignored.
    """
    kept = set(words).difference(set(badwords))
    return bag_of_words(kept)


In [40]:
def bag_of_non_stopwords(words, stopfile='english'):
    """Return a bag-of-words feature dict for ``words`` minus stopwords.

    ``stopfile`` is passed to ``stopwords.words()`` to select which NLTK
    stopword list is filtered out (``'english'`` by default).
    """
    return bag_of_words_not_in_set(words, stopwords.words(stopfile))


In [41]:
def bag_of_bigrams_words(words, score_fn = BigramAssocMeasures.chi_sq, n =200):
    """Return a bag-of-words feature dict containing words and their top bigrams.

    Args:
        words: iterable of word tokens (list, tuple, corpus view, generator, ...).
        score_fn: association measure handed to ``nbest`` to rank bigrams.
        n: how many of the best-scoring bigrams to include.

    Returns:
        A dict mapping every word and every selected bigram to ``True``.
    """
    # Materialize once: the tokens are needed twice (collocation finder and
    # the final concatenation). A generator would be exhausted after the first
    # use, and ``tuple + list`` raises TypeError.
    words = list(words)

    bigram_finder = BigramCollocationFinder.from_words(words)

    bigrams = bigram_finder.nbest(score_fn,n)

    return bag_of_words(words+bigrams)


The label_feats_from_corpus() function assumes that the corpus is categorized,
and that a single file represents a single instance for feature extraction. It iterates over
each category label, and extracts features from each file in that category using the
feature_detector() function, which defaults to bag_of_words(). It returns a dict
whose keys are the category labels, and the values are lists of instances for that category.

In [42]:
def label_feats_from_corpus(corp, feature_detector = bag_of_words):
    """Extract one feature set per corpus file, grouped by category label.

    Args:
        corp: categorized corpus exposing ``categories()``,
            ``fileids(categories=...)`` and ``words(fileids=...)``.
        feature_detector: callable applied to the words of each file.

    Returns:
        A ``defaultdict(list)`` mapping each label to the list of feature
        sets extracted from that label's files.
    """
    feats_by_label = collections.defaultdict(list)

    for category in corp.categories():
        for file_id in corp.fileids(categories=[category]):
            file_words = corp.words(fileids=[file_id])
            extracted = feature_detector(file_words)
            feats_by_label[category].append(extracted)

    return feats_by_label


Now we need to split the labeled feature sets into training and testing instances using
split_label_feats(). This function allows us to take a fair sample of labeled feature
sets from each label, using the split keyword argument to determine the size of the sample.
The split argument defaults to 0.75, which means the first 75% of the labeled feature sets
for each label will be used for training, and the remaining 25% will be used for testing.

In [43]:
def split_label_feats(lfeats,split=0.75):
    """Split per-label feature sets into training and testing lists.

    For every label, the first ``int(len(feats) * split)`` feature sets go
    to the training list and the rest go to the testing list. No shuffling
    is done.

    Args:
        lfeats: mapping of label -> sequence of feature sets.
        split: fraction of each label's feature sets used for training.

    Returns:
        ``(train_feats, test_feats)``, each a list of ``(feats, label)`` tuples.
    """
    training, testing = [], []

    for label, instances in lfeats.items():
        boundary = int(len(instances) * split)
        head, tail = instances[:boundary], instances[boundary:]
        training += [(instance, label) for instance in head]
        testing += [(instance, label) for instance in tail]

    return training, testing

In [44]:
# Extract bag-of-words features for every file in movie_reviews, grouped by label.
lfeats = label_feats_from_corpus(movie_reviews)

# Show which category labels were collected.
lfeats.keys()

dict_keys(['pos', 'neg'])

In [45]:
# First 75% of each label's feature sets for training, the remainder for testing.
train_feats, test_feats = split_label_feats(lfeats,split=0.75)

In [46]:
# Report the size of the training and testing sets, one per line.
print(len(train_feats), len(test_feats), sep="\n")

1500
500


In [47]:
# Train NLTK's Naive Bayes classifier on the training feature sets.
nb_classifier = NaiveBayesClassifier.train(train_feats)

In [48]:
# List the labels the trained classifier can assign.
nb_classifier.labels()

['pos', 'neg']

In [49]:
# Build a feature dict for a tiny hand-written review and classify it.
review = bag_of_words(['the','plot','was','accessible'])
print(nb_classifier.classify(review))

pos


We can test the accuracy of the classifier using nltk.classify.util.accuracy()
and the test_feats variable created previously:

In [50]:
# Fraction of test instances the Naive Bayes classifier labels correctly.
accuracy(nb_classifier,test_feats)

0.728

While the classify() method returns only a single label, you can use the
prob_classify() method to get the classification probability of each label.
This can be useful if you want to use probability thresholds for classification

In [51]:
# test_feats[0] is a (feats, label) tuple; [0] selects its feature dict.
probs = nb_classifier.prob_classify(test_feats[0][0])
# Labels covered by the returned probability distribution.
probs.samples()

dict_keys(['pos', 'neg'])

In [52]:
# Print the probability assigned to each label, 'pos' first, then 'neg'.
print(probs.prob('pos'), probs.prob('neg'), sep='\n')

1.0
1.744195869104262e-21


The show_most_informative_features() method will print out the results from
most_informative_features() and will also include the probability of a feature
pair belonging to each label:

The informativeness, or information gain, of each feature pair is based on the prior
probability of the feature pair occurring for each label. More informative features are
those that occur primarily in one label and not in the other.

In [35]:
# Print the 5 most informative features together with their label likelihood ratios.
nb_classifier.show_most_informative_features(n=5)

Most Informative Features
             magnificent = True              pos : neg    =     15.0 : 1.0
             outstanding = True              pos : neg    =     13.6 : 1.0
               insulting = True              neg : pos    =     13.0 : 1.0
              vulnerable = True              pos : neg    =     12.3 : 1.0
               ludicrous = True              neg : pos    =     11.8 : 1.0


Using DecisionTreeClassifier

The entropy_cutoff value is used during the tree refinement process. The tree refinement
process is how the decision tree decides to create new branches. If the entropy of the
probability distribution of label choices in the tree is greater than the entropy_cutoff
value, then the tree is refined further by creating more branches. But if the entropy is lower
than the entropy_cutoff value, then tree refinement is halted.

The depth_cutoff value is also used during refinement to control the depth of the tree.
The final decision tree will never be deeper than the depth_cutoff value. The default
value is 100, which means that classification may require up to 100 decisions before
reaching a leaf node. Decreasing the depth_cutoff value will decrease the training time
and most likely decrease the accuracy as well.

In [36]:
# Train a decision tree with explicit entropy, depth and support cutoffs.
# NOTE(review): the recorded run of this cell ended in KeyboardInterrupt, so
# dt_classifier only exists if this cell is allowed to run to completion.
dt_classifier = DecisionTreeClassifier.train(train_feats,binary=True, entropy_cutoff=0.8, depth_cutoff=5, support_cutoff=30)

KeyboardInterrupt: 

In [None]:
# Evaluate the decision tree on the test set; requires the training cell
# that defines dt_classifier to have completed.
accuracy(dt_classifier,test_feats)

Training a maximum entropy classifier

The third classifier we will cover is the MaxentClassifier class, also known as a
conditional exponential classifier or logistic regression classifier. The maximum
entropy classifier converts labeled feature sets to vectors using encoding. This encoded
vector is then used to calculate weights for each feature that can then be combined to
determine the most likely label for a feature set.

In [None]:
# Train a maximum entropy classifier with the GIS algorithm, silent tracing
# (trace=0) and at most 10 iterations; min_lldelta=0.5 is an additional
# stopping cutoff (see the NLTK MaxentClassifier documentation).
me_classifier = MaxentClassifier.train(train_feats, algorithm='gis', trace=0, max_iter=10, min_lldelta=0.5)

In [None]:
# Evaluate the maximum entropy classifier on the test set.
accuracy(me_classifier, test_feats)

Training scikit-learn classifiers

We won't be accessing the scikit-learn models directly in this recipe. Instead,
we'll be using NLTK's SklearnClassifier class, which is a wrapper class around a
scikit-learn model to make it conform to NLTK's ClassifierI interface

Training an SklearnClassifier class has a slightly different series of steps than classifiers
covered in the previous recipes of this chapter:
1. Create training features (covered in the previous recipes).
2. Choose and import an sklearn algorithm.
3. Construct an SklearnClassifier class with the chosen algorithm.
4. Train the SklearnClassifier class with your training features.



The SklearnClassifier class is a small wrapper class whose main job is to convert NLTK
feature dictionaries into sklearn compatible feature vectors

But not all the classification algorithms are compatible with the
SklearnClassifier class, because it uses sparse vectors. Sparse vectors are more
efficient because they only store the data they need, using a kind of data compression

In [53]:
# Wrap scikit-learn's MultinomialNB (default parameters) in NLTK's
# SklearnClassifier and train it on the NLTK-style feature sets.
sk_classifier = SklearnClassifier(MultinomialNB())
sk_classifier.train(train_feats)

<SklearnClassifier(MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))>

In [54]:
# Evaluate the MultinomialNB-backed classifier on the test set.
accuracy(sk_classifier, test_feats)

0.83

We can try it with BernoulliNB

In [55]:
# Same wrapper, now around BernoulliNB; sk_classifier is rebound to the new model.
sk_classifier = SklearnClassifier(BernoulliNB())
sk_classifier.train(train_feats)

<SklearnClassifier(BernoulliNB(alpha=1.0, binarize=0.0, class_prior=None, fit_prior=True))>

In [56]:
# Evaluate the BernoulliNB-backed classifier on the test set.
accuracy(sk_classifier, test_feats)

0.812

We can also try it with LogisticRegression

In [57]:
# Wrap scikit-learn's LogisticRegression (default parameters) and train it.
sk_classifier = SklearnClassifier(LogisticRegression())
sk_classifier.train(train_feats)

<SklearnClassifier(LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False))>

In [58]:
# Evaluate the LogisticRegression-backed classifier on the test set.
accuracy(sk_classifier, test_feats)

0.892

Again, we see that the sklearn algorithm has better performance than NLTK's
MaxentClassifier, which only had 72.2% accuracy (a reference figure: the MaxentClassifier
cells above show no output in this run, so that number is not reproduced here). The
logistic regression algorithm also has a much faster training time than the IIS or GIS
algorithms, even when those algorithms have a limited number of iterations. This can be
explained by sklearn's focus on optimized numeric processing using NumPy.

A third family of algorithms that NLTK does not support directly is Support Vector
Machines, or SVM. These algorithms have been shown to be effective at learning
on high-dimensional data, such as text classification, where every word feature
counts as a dimension

In [59]:
# Train and evaluate an SVC with default parameters through the NLTK wrapper.
sk_classifier = SklearnClassifier(SVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.69

In [60]:
# Train and evaluate a LinearSVC with default parameters through the NLTK wrapper.
sk_classifier = SklearnClassifier(LinearSVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.864

In [61]:
# Train and evaluate a NuSVC with default parameters through the NLTK wrapper.
sk_classifier = SklearnClassifier(NuSVC())
sk_classifier.train(train_feats)
accuracy(sk_classifier, test_feats)

0.882

In addition to accuracy, there are a number of other metrics used to evaluate classifiers.
Two of the most common are precision and recall. To understand these two metrics, we must
first understand false positives and false negatives. False positives happen when a classifier
classifies a feature set with a label it shouldn't have gotten. False negatives happen when a
classifier doesn't assign a label to a feature set that should have it. In a binary classifier, these
errors happen at the same time.

The low information words are words that are common to all labels. It may be counter-intuitive,
but eliminating these words from the training data can actually improve accuracy, precision, and
recall. The reason this works is that using only high information words reduces the noise and
confusion of a classifier's internal model. If all the words/features are highly biased one way or
the other, it's much easier for the classifier to make a correct guess.

In [62]:
def bag_of_words_in_set(words, goodwords):
    """Return a bag-of-words feature dict restricted to words in ``goodwords``.

    Both arguments are converted to sets, so duplicates are ignored.
    """
    allowed = set(words).intersection(set(goodwords))
    return bag_of_words(allowed)

The high_information_words() function starts by counting the frequency of every word,
as well as the conditional frequency for each word within each label. Once we have the
FreqDist and ConditionalFreqDist variables, we can score each word on a per-label basis.


The default score_fn is nltk.metrics.BigramAssocMeasures.chi_sq(), which
calculates the chi-square score for each word using the following parameters:
1. n_ii: This is the frequency of the word for the label
2. n_ix: This is the total frequency of the word across all labels
3. n_xi: This is the total frequency of all words that occurred for the label
4. n_xx: This is the total frequency for all words in all labels

The simplest way to think about these numbers is that the closer n_ii is to n_ix, the higher
the score. Or, the more often a word occurs in a label, relative to its overall occurrence, the
higher the score.

In [63]:
def high_information_words(labelled_words, score_fn=BigramAssocMeasures.chi_sq, min_score=5):
    """Select words that score at least ``min_score`` for at least one label.

    Args:
        labelled_words: iterable of ``(label, words)`` pairs.
        score_fn: callable invoked as ``score_fn(n_ii, (n_ix, n_xi), n_xx)``.
        min_score: minimum score a word needs, for some label, to be kept.

    Returns:
        The set of all words that reached ``min_score`` under any label.
    """
    overall_counts = FreqDist()
    per_label_counts = ConditionalFreqDist()

    # Count each word overall and within its label.
    for label, words in labelled_words:
        for word in words:
            overall_counts[word] += 1
            per_label_counts[label][word] += 1

    # n_xx: total number of word occurrences across every label.
    grand_total = per_label_counts.N()
    selected = set()

    for label in per_label_counts.conditions():
        # n_xi: total number of word occurrences within this label.
        label_total = per_label_counts[label].N()

        # n_ii is the word's count in this label, n_ix its count overall.
        selected.update(
            word
            for word, count_in_label in per_label_counts[label].items()
            if score_fn(count_in_label, (overall_counts[word], label_total), grand_total) >= min_score
        )

    return selected
            
    
    

With this new feature detector, we can call label_feats_from_corpus() and get
new train_feats and test_feats variables using split_label_feats().

In [64]:
# Pair every category label with all words that appear under that category.
labels = movie_reviews.categories()
labeled_words = [(label,movie_reviews.words(categories=[label])) for label in labels]

In [65]:
# high_information_words() already returns a set, so no extra set() copy is needed.
high_info_words = high_information_words(labeled_words)

In [66]:
def feat_det(words):
    """Feature detector that keeps only the words found in high_info_words."""
    return bag_of_words_in_set(words, high_info_words)

# Re-extract the per-label features using only high-information words.
lfeats = label_feats_from_corpus(movie_reviews,feature_detector=feat_det)

In [67]:
# Re-split with the default split (0.75); this rebinds train_feats/test_feats
# to the high-information-word feature sets.
train_feats,test_feats = split_label_feats(lfeats)

Now that we have new training and testing feature sets, let's train and evaluate a
NaiveBayesClassifier class:

In [68]:
# Retrain Naive Bayes on the high-information-word training features.
nb_classifier = NaiveBayesClassifier.train(train_feats)

In [69]:
# Evaluate the retrained Naive Bayes classifier on the new test features.
accuracy(nb_classifier, test_feats)

0.91

One way to improve classification performance is to combine classifiers. The simplest way to
combine multiple classifiers is to use voting, and choose whichever label gets the most votes.
For this style of voting, it's best to have an odd number of classifiers so that there are no ties.
This means combining at least three classifiers together. The individual classifiers should
also use different algorithms; the idea is that multiple algorithms are better than one, and
the combination of many can compensate for individual bias.

In [70]:
class MaxVoteClassifier(ClassifierI):
    """Ensemble classifier that returns the label chosen by the most members."""

    def __init__(self, *classifiers):
        """Store the member classifiers and the sorted union of their labels."""
        self._classifiers = classifiers
        every_label = itertools.chain.from_iterable(
            [member.labels() for member in classifiers]
        )
        self._labels = sorted(set(every_label))

    def labels(self):
        """Return the sorted list of labels known to any member classifier."""
        return self._labels

    def classify(self, feats):
        """Let each member classify ``feats`` and return the most voted label."""
        votes = FreqDist(
            member.classify(feats) for member in self._classifiers
        )
        return votes.max()

In [71]:
# Train three classifiers that use different algorithms on the same features.
nb_classifier = NaiveBayesClassifier.train(train_feats)
svc_classifier = SklearnClassifier(NuSVC()).train(train_feats)
mnb_classifier = SklearnClassifier(MultinomialNB()).train(train_feats)

# Combine them; three voters means a two-label vote cannot tie.
mv_classifier = MaxVoteClassifier(nb_classifier, svc_classifier,mnb_classifier)

In [72]:
# Evaluate the voting ensemble on the test set.
accuracy(mv_classifier, test_feats)

0.918