In [2]:
import nltk
import random
from nltk.corpus import movie_reviews

In [3]:
documents = [(list(movie_reviews.words(fileid)), category)
                for category in movie_reviews.categories()
                    for fileid in movie_reviews.fileids(category)]

In [4]:
random.shuffle(documents)

In [5]:
print(documents[1])

(['john', 'cusack', 'is', 'the', 'kind', 'of', 'actor', 'who', 'seems', 'to', 'effortlessly', 'slide', 'into', 'his', 'respective', 'film', 'roles', '.', 'so', 'effortlessly', 'that', 'people', 'tend', 'to', 'forget', 'he', "'", 's', 'there', ',', 'much', 'in', 'the', 'way', 'people', 'rarely', 'recall', 'many', 'of', 'the', 'great', 'character', 'actors', '(', 'anyone', 'who', 'can', 'put', 'the', 'name', 'james', 'rebhorn', 'with', 'that', 'actor', "'", 's', 'face', 'is', 'invited', 'to', 'treat', 'themselves', 'to', 'a', 'product', 'from', 'one', 'of', 'my', 'sponsors', ')', '.', 'example', ':', 'the', 'other', 'day', 'my', 'mother', 'asked', 'me', '(', 'the', 'expert', ',', 'of', 'course', ')', 'if', 'there', 'were', 'any', 'movies', 'out', 'worth', 'seeing', ',', 'never', 'mind', 'that', 'our', 'tastes', 'couldn', "'", 't', 'be', 'more', 'divergent', '---', 'i', "'", 'll', 'never', 'forget', 'the', 'day', 'she', 'recommended', 'that', 'i', 'go', 'see', 'a', 'night', 'at', 'the', '

In [6]:
all_words = []
for w in movie_reviews.words():
    all_words.append(w.lower())

all_words_dist = nltk.FreqDist(all_words)

In [7]:
print(all_words_dist.most_common(15))

[(',', 77717), ('the', 76529), ('.', 65876), ('a', 38106), ('and', 35576), ('of', 34123), ('to', 31937), ("'", 30585), ('is', 25195), ('in', 21822), ('s', 18513), ('"', 17612), ('it', 16107), ('that', 15924), ('-', 15595)]


In [8]:
from nltk.corpus import stopwords
import string

stop_words = set(stopwords.words("english"))
stop_words = stop_words.union(set(string.punctuation))

print(string.punctuation)

filtered_words = []
for w in all_words:
    if w not in stop_words:
        filtered_words.append(w)
        
filtered_words_dist = nltk.FreqDist(filtered_words)

!"#$%&'()*+,-./:;<=>?@[\]^_`{|}~


In [9]:
print(filtered_words_dist.most_common(15))

[('film', 9517), ('one', 5852), ('movie', 5771), ('like', 3690), ('even', 2565), ('good', 2411), ('time', 2411), ('story', 2169), ('would', 2109), ('much', 2049), ('character', 2020), ('also', 1967), ('get', 1949), ('two', 1911), ('well', 1906)]


In [10]:
print(filtered_words_dist["stupid"])

253


In [11]:
print(filtered_words_dist["awesome"])

35


In [12]:
print(filtered_words_dist["awful"])

132


## CH 12 - Words as Features for Learning

In [13]:
word_features = list(filtered_words_dist.keys())[:3000]
###############################
# BUGBUG ??
###############################
word_features = filtered_words_dist.most_common(3000)
word_features = [word_feature[0]
                    for word_feature in word_features]
word_features.sort()
print(word_features)



In [14]:
def find_features(document):
    words = set(document)
    features = {}
    for w in word_features: # Why not enumerating words in the document? --> Then no way to find "false" words.
        features[w] = (w in words) # True if w is in words/the document.
        
    return features

print((find_features(movie_reviews.words('neg/cv000_29416.txt')))) # Negative review 



In [15]:
featuresets = [(find_features(rev), category) 
                   for (rev, category) in documents]

In [16]:
print(featuresets[0][1])

neg


# CH 13 Naive Bayes

In [17]:
training_set = featuresets[:1900]
testing_set = featuresets[1900:]

classifier = nltk.NaiveBayesClassifier.train(training_set)
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier, testing_set))*100)
classifier.show_most_informative_features(15)

Naive Bayes Algo accuracy percent: 85.0
Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
               ludicrous = True              neg : pos    =      9.9 : 1.0
                  finest = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.4 : 1.0
                   anger = True              pos : neg    =      7.2 : 1.0
                   mulan = True              pos : neg    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
             wonderfully = True              pos : neg    =      6.8 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
            breathtaking = True              pos : neg    =      6.3 : 1.0
                   damon = True              pos : neg    =      6.1 : 1.0
                    lame = True              neg : pos    =      5.7 : 1.0
                   flynt = True   

# CH 14 Pickle

In [18]:
import pickle

In [19]:
save_classifier = open("naivebayes.pickle", "wb")
pickle.dump(classifier, save_classifier)
save_classifier.close()

In [20]:
classifier_f = open("naivebayes.pickle", "rb")
classifier_loaded = pickle.load(classifier_f)
classifier_f.close()

print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(classifier_loaded, testing_set))*100)
classifier_loaded.show_most_informative_features(15)

Naive Bayes Algo accuracy percent: 85.0
Most Informative Features
             outstanding = True              pos : neg    =     10.6 : 1.0
               ludicrous = True              neg : pos    =      9.9 : 1.0
                  finest = True              pos : neg    =      7.9 : 1.0
                  seagal = True              neg : pos    =      7.4 : 1.0
                   anger = True              pos : neg    =      7.2 : 1.0
                   mulan = True              pos : neg    =      7.0 : 1.0
                 idiotic = True              neg : pos    =      7.0 : 1.0
             wonderfully = True              pos : neg    =      6.8 : 1.0
              schumacher = True              neg : pos    =      6.6 : 1.0
            breathtaking = True              pos : neg    =      6.3 : 1.0
                   damon = True              pos : neg    =      6.1 : 1.0
                    lame = True              neg : pos    =      5.7 : 1.0
                   flynt = True   

# Ch15

In [21]:
from nltk.classify.scikitlearn import SklearnClassifier
from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC, NuSVC

In [22]:
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(training_set)
print("MNB_classifier accuracy percent:", (nltk.classify.accuracy(MNB_classifier, testing_set))*100)

MNB_classifier accuracy percent: 89.0


In [23]:
GaussianNB_classifier = SklearnClassifier(GaussianNB())
GaussianNB_classifier.train(training_set)
print("GaussianNB_classifier accuracy percent:", (nltk.classify.accuracy(GaussianNB_classifier, testing_set))*100)

TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.

In [24]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(training_set)
print("BernoulliNB_classifier accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier, testing_set))*100)

BernoulliNB_classifier accuracy percent: 85.0


In [26]:
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(training_set)
print("LogisticRegression_classifier accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier, testing_set))*100)

LogisticRegression_classifier accuracy percent: 81.0


In [28]:
SGDClassifier_classifier = SklearnClassifier(SGDClassifier())
SGDClassifier_classifier.train(training_set)
print("SGDClassifier_classifier accuracy percent:", (nltk.classify.accuracy(SGDClassifier_classifier, testing_set))*100)



SGDClassifier_classifier accuracy percent: 82.0


In [29]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(training_set)
print("SVC_classifier accuracy percent:", (nltk.classify.accuracy(SVC_classifier, testing_set))*100)

SVC_classifier accuracy percent: 83.0


In [30]:
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(training_set)
print("LinearSVC_classifier accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier, testing_set))*100)

LinearSVC_classifier accuracy percent: 81.0


In [31]:
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(training_set)
print("NuSVC_classifier accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier, testing_set))*100)

NuSVC_classifier accuracy percent: 82.0


In [41]:
from nltk.classify import ClassifierI
from statistics import mode

class VoteClassifier(ClassifierI):
    def __init__(self, *classifiers): # don't forget *
        self._classifiers = classifiers
        
    def classify(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        return mode(votes) # pick the most popular one. pos or neg
    
    def confidence(self, features):
        votes = []
        for c in self._classifiers:
            v = c.classify(features)
            votes.append(v)
        
        choice_votes = votes.count(mode(votes)) # the number of most popular one
        conf = choice_votes / len(votes)        # the ratio of the most popular one.
        
        return conf

In [42]:
voted_classifier = VoteClassifier(classifier,
                                  MNB_classifier,
                                  BernoulliNB_classifier,                                   
                                  LinearSVC_classifier,
                                  SVC_classifier,
                                  NuSVC_classifier, 
                                  LogisticRegression_classifier)

print("voted_classifier accuracy percent:", (nltk.classify.accuracy(voted_classifier, testing_set))*100)

voted_classifier accuracy percent: 89.0


In [50]:
        print("Classification", voted_classifier.classify(testing_set[0][0]), ", Confidence Score", voted_classifier.confidence(testing_set[0][0])*100, "%" )

Classification pos , Confidence Score 100.0 %


In [53]:
print("Classification", voted_classifier.classify(testing_set[1][0]), ", Confidence Score", voted_classifier.confidence(testing_set[1][0])*100, "%" )
print("Classification", voted_classifier.classify(testing_set[2][0]), ", Confidence Score", voted_classifier.confidence(testing_set[2][0])*100, "%" )
print("Classification", voted_classifier.classify(testing_set[3][0]), ", Confidence Score", voted_classifier.confidence(testing_set[3][0])*100, "%" )
print("Classification", voted_classifier.classify(testing_set[4][0]), ", Confidence Score", voted_classifier.confidence(testing_set[4][0])*100, "%" )
print("Classification", voted_classifier.classify(testing_set[5][0]), ", Confidence Score", voted_classifier.confidence(testing_set[5][0])*100, "%" )
print("Classification", voted_classifier.classify(testing_set[6][0]), ", Confidence Score", voted_classifier.confidence(testing_set[6][0])*100, "%" )
print("Classification", voted_classifier.classify(testing_set[7][0]), ", Confidence Score", voted_classifier.confidence(testing_set[7][0])*100, "%" )

Classification pos , Confidence Score 100.0 %
Classification neg , Confidence Score 100.0 %
Classification pos , Confidence Score 100.0 %
Classification pos , Confidence Score 100.0 %
Classification neg , Confidence Score 100.0 %
Classification pos , Confidence Score 100.0 %
Classification pos , Confidence Score 57.14285714285714 %
