## Text Classification

In [1]:
import nltk
import random
from nltk.corpus import movie_reviews
import pickle
from nltk.classify.scikitlearn import SklearnClassifier 
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC
from nltk.classify import ClassifierI
from statistics import mode

In [2]:
documents = [(list(movie_reviews.words(fileid)), category)
            for category in movie_reviews.categories()
            for fileid in movie_reviews.fileids(category)]

In [3]:
random.shuffle(documents)

In [4]:
print(documents[1])

(['i', 'never', 'understood', 'what', 'the', 'clich', '?', '"', 'hell', 'on', 'earth', '"', 'truly', 'meant', 'until', 'very', 'recently', '.', 'i', "'", 've', 'just', 'never', 'experienced', 'anything', 'in', 'my', 'life', 'which', 'was', 'so', 'terrifying', ',', 'so', 'horrible', ',', 'so', 'monstrously', 'deplorable', 'that', 'it', 'justified', 'being', 'termed', 'as', '"', 'hell', 'on', 'earth', '"', '.', 'after', 'all', ',', 'i', "'", 've', 'never', 'been', 'to', 'war', ',', 'i', "'", 've', 'never', 'been', 'the', 'victim', 'of', 'any', 'violent', 'crime', ',', 'i', "'", 've', 'never', 'really', 'been', 'broken', 'hearted', ',', 'and', 'i', "'", 've', 'never', 'been', 'audited', ';', 'so', 'i', 'really', 'had', 'no', 'frame', 'of', 'reference', 'as', 'to', 'people', 'meant', 'when', 'they', 'said', 'that', 'something', 'was', '"', 'hell', 'on', 'earth', '"', 'then', 'i', 'saw', 'cruel', 'intentions', '.', 'in', 'a', 'perfect', 'world', ',', 'i', 'would', 'be', 'in', 'charge', 'of'

In [5]:
every_words = []

for w in movie_reviews.words():
    every_words.append(w.lower())
    
every_words = nltk.FreqDist(every_words)
print(every_words.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [6]:
print(every_words['vampires'])
print(every_words['death'])
print(every_words['flick'])

83
468
196


### Words as feature

In [7]:
word_features = list(every_words.keys())[:4000]

In [8]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)

    return features

In [9]:
print((find_features(movie_reviews.words('neg/cv000_29416.txt'))))

featuresets = [(find_features(rev), category) for (rev, category) in documents]



In [10]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

### Naive Bayes Classifier

Naive Base algorithm makes the assumption that the occurrence of a certain feature is independent of the occurrence of the features.

Posterior = prior occurences * liklihood/evidence


Scalable and easy to understand.


In [11]:
classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

Classifier accuracy percent: 82.0
Most Informative Features
                   sucks = True              neg : pos    =     10.4 : 1.0
                  annual = True              pos : neg    =      9.5 : 1.0
               stupidity = True              neg : pos    =      9.2 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
           unimaginative = True              neg : pos    =      7.8 : 1.0
                 wasting = True              neg : pos    =      7.8 : 1.0
                 frances = True              pos : neg    =      7.5 : 1.0
                 idiotic = True              neg : pos    =      7.3 : 1.0
                  sexist = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
                  crappy = True              neg : pos    =      7.1 : 1.0
              schumacher = True         

### Pickle

Pickle is python object serialization.

It is used for serializing and de-serializing a Python object structure. 

Any object in python can be pickled so that it can be saved on disk. What pickle does is that it “serialises” the object first before writing it to file.

Pickling is a way to convert a python object (list, dict, etc.) into a character stream. The idea is that this character stream contains all the information necessary to reconstruct the object in another python script.

In [13]:
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [14]:
classifier_f = open('naivebayes.pickle', 'rb')
classifier = pickle.load(classifier_f)
classifier_f.close()

In [15]:
print("Classifier accuracy percent:",(nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

Classifier accuracy percent: 82.0
Most Informative Features
                   sucks = True              neg : pos    =     10.4 : 1.0
                  annual = True              pos : neg    =      9.5 : 1.0
               stupidity = True              neg : pos    =      9.2 : 1.0
             silverstone = True              neg : pos    =      7.8 : 1.0
           unimaginative = True              neg : pos    =      7.8 : 1.0
                 wasting = True              neg : pos    =      7.8 : 1.0
                 frances = True              pos : neg    =      7.5 : 1.0
                 idiotic = True              neg : pos    =      7.3 : 1.0
                  sexist = True              neg : pos    =      7.1 : 1.0
                  suvari = True              neg : pos    =      7.1 : 1.0
                    mena = True              neg : pos    =      7.1 : 1.0
                  crappy = True              neg : pos    =      7.1 : 1.0
              schumacher = True         

### Scikit-Learn with nltk

In [17]:
MNB_classifier = SklearnClassifier(MultinomialNB()) 
MNB_classifier.train(training_set)
print("MNB Classifier accuracy percent:",(nltk.classify.accuracy(MNB_classifier, testing_set))*100)

MNB Classifier accuracy percent: 84.0


In [18]:
BNB_classifier = SklearnClassifier(BernoulliNB()) 
BNB_classifier.train(training_set)
print("BNB Classifier accuracy percent:",(nltk.classify.accuracy(BNB_classifier, testing_set))*100)

BNB Classifier accuracy percent: 80.0


In [19]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression()) 
LogisticRegression_classifier.train(training_set)
print("LogisticRegression Classifier accuracy percent:",(nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression Classifier accuracy percent: 85.0


In [20]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier()) 
SGDClassifier_classifier.train(training_set)
print("SGDClassifier Classifier accuracy percent:",(nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)



SGDClassifier Classifier accuracy percent: 85.0


In [21]:
SVC_classifier = SklearnClassifier(SVC()) 
SVC_classifier.train(training_set)
print("SVC Classifier accuracy percent:",(nltk.classify.accuracy(SVC_classifier, testing_set))*100)

SVC Classifier accuracy percent: 79.0


In [22]:
LinearSVC_classifier = SklearnClassifier(LinearSVC()) 
LinearSVC_classifier.train(training_set)
print("LinearSVC Classifier accuracy percent:",(nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

LinearSVC Classifier accuracy percent: 82.0


In [23]:
NuSVC_classifier = SklearnClassifier(NuSVC()) 
NuSVC_classifier.train(training_set)
print("NuSVC Classifier accuracy percent:",(nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

NuSVC Classifier accuracy percent: 85.0


### Voted Accuracy

In [25]:
class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers):
        self._classifiers = classifiers

    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes)

    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)

        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [26]:
voted_classifier = VoteClassifier(classifier,
                                  NuSVC_classifier,
                                  LinearSVC_classifier,
                                  SGDClassifier_classifier,
                                  MNB_classifier,
                                  BNB_classifier,
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100) 

voted_classifier accuracy percent: 86.0


In [27]:
print("Classification:", voted_classifier.classify(testing_set[0][0]), "Confidence %:",voted_classifier.confidence(testing_set[0][0])*100)
print("Classification:", voted_classifier.classify(testing_set[1][0]), "Confidence %:",voted_classifier.confidence(testing_set[1][0])*100)
print("Classification:", voted_classifier.classify(testing_set[2][0]), "Confidence %:",voted_classifier.confidence(testing_set[2][0])*100)
print("Classification:", voted_classifier.classify(testing_set[3][0]), "Confidence %:",voted_classifier.confidence(testing_set[3][0])*100)
print("Classification:", voted_classifier.classify(testing_set[4][0]), "Confidence %:",voted_classifier.confidence(testing_set[4][0])*100)
print("Classification:", voted_classifier.classify(testing_set[5][0]), "Confidence %:",voted_classifier.confidence(testing_set[5][0])*100)


Classification: neg Confidence %: 100.0
Classification: neg Confidence %: 85.71428571428571
Classification: pos Confidence %: 100.0
Classification: pos Confidence %: 57.14285714285714
Classification: pos Confidence %: 100.0
Classification: neg Confidence %: 57.14285714285714
