In [15]:
import nltk
nltk.download('averaged_perceptron_tagger')
import random
#from nltk.corpus import movie_reviews
from nltk.classify.scikitlearn import SklearnClassifier #for scilearn classifier
import pickle
from nltk.tokenize import word_tokenize

from sklearn.naive_bayes import MultinomialNB, GaussianNB, BernoulliNB
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.svm import SVC, LinearSVC,NuSVC

from nltk.classify import ClassifierI #so we can inherate from the nltk classifier class
from statistics import mode #for the classifier vote system

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\grego\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [16]:
class ScoreClassifier(ClassifierI): #we pass a list of classifiers through this class
    def __init__(self, *classifiers):#init method to run any methods
        self._classifiers = classifiers #classifier list will be whatever list of classifiers passed 
        
    def classify(self, features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
        return mode(votes)#returns number of votes
    
    def confidence(self,features):
        votes=[]
        for c in self._classifiers:
            v=c.classify(features)
            votes.append(v)
        choice_votes = votes.count(mode(votes))
        conf = choice_votes / len(votes)
        return conf

In [31]:
#building training sets

pos_review = open("positive_review.txt","r").read()
neg_review = open("negative_review.txt","r").read()

all_words = []
documents = []

#J is adjectiv, R is adverb and v is verb
allowed_word_types = ["J"]

for r in pos_review.split('\n'):
    documents.append((r,"pos"))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())
        
for r in neg_review.split('\n'):
    documents.append((r,"neg"))
    words = word_tokenize(r)
    pos = nltk.pos_tag(words)
    for w in pos:
        if w[1][0] in allowed_word_types:
            all_words.append(w[0].lower())

#saving documents and all_words in pickle
save_doc=open("documents.pickle","wb")
pickle.dump(documents,save_doc)
save_doc.close

save_words=open("all_words.pickle","wb")
pickle.dump(all_words,save_words)
save_words.close

<function BufferedWriter.close>

In [32]:
#transforming all words in nltk freq distribution
all_words = nltk.FreqDist(all_words)

#top 15 most common words
print(all_words.most_common(15))

[('good', 369), ('more', 331), ('little', 265), ('funny', 245), ('much', 234), ('bad', 234), ('best', 208), ('new', 206), ('own', 185), ('many', 183), ('most', 167), ('other', 167), ('great', 160), ('big', 156), ('few', 139)]


In [33]:
print(all_words["bad"])

234


In [34]:
#features : 5000 words for training 
word_features = list(all_words.keys())[:5000]

#saving in pickle
save_features=open("word_features.pickle","wb")
pickle.dump(word_features,save_features)
save_features.close

<function BufferedWriter.close>

In [35]:
def find_features(document):
    words = word_tokenize(document)
    features = {}
    for w in word_features:
        features[w] = (w in words)
    
    return features

In [36]:
#convert words in dictionnary of 5000 words with category true (neg) or false (pos)
featuresets = [(find_features(rev), category) for (rev,category) in documents ]

save_featuresets=open("featuresets.pickle","wb")
pickle.dump(featuresets,save_featuresets)
save_featuresets.close

In [37]:
random.shuffle(featuresets)

In [38]:
D_train = featuresets[:10000]
D_test = featuresets[10000:]

In [39]:
NB_classifier = nltk.NaiveBayesClassifier.train(D_train)

In [40]:
#15 most informative features of our dictionnary:
NB_classifier.show_most_informative_features(15)
#engrossing appears 20.3 times more in a neg review than a pos

Most Informative Features
              engrossing = True              pos : neg    =     21.0 : 1.0
                 generic = True              neg : pos    =     16.4 : 1.0
                mediocre = True              neg : pos    =     16.4 : 1.0
                 routine = True              neg : pos    =     15.0 : 1.0
                    dull = True              neg : pos    =     14.8 : 1.0
               inventive = True              pos : neg    =     14.3 : 1.0
                    flat = True              neg : pos    =     14.2 : 1.0
                  boring = True              neg : pos    =     13.9 : 1.0
              refreshing = True              pos : neg    =     13.6 : 1.0
                    warm = True              pos : neg    =     12.6 : 1.0
                powerful = True              pos : neg    =     12.4 : 1.0
                   stale = True              neg : pos    =     11.7 : 1.0
                mindless = True              neg : pos    =     11.7 : 1.0

In [57]:
print("Naive Bayes Algo accuracy percent:", (nltk.classify.accuracy(NB_classifier,D_test))*100)

Naive Bayes Algo accuracy percent: 73.90648567119156


In [70]:
#saving classifier so that we dont have to re train it
save_classifier=open("NB_classifier.pickle","wb")
pickle.dump(NB_classifier,save_classifier)
save_classifier.close()

In [71]:
#Multinomial
MNB_classifier = SklearnClassifier(MultinomialNB())
MNB_classifier.train(D_train)

save_classifier=open("MNB_classifier.pickle","wb")
pickle.dump(MNB_classifier,save_classifier)
save_classifier.close()

print("Multinomial accuracy percent:", (nltk.classify.accuracy(MNB_classifier,D_test))*100)

Multinomial accuracy percent: 70.58823529411765


In [72]:
BernoulliNB_classifier = SklearnClassifier(BernoulliNB())
BernoulliNB_classifier.train(D_train)

save_classifier=open("BernoulliNB_classifier.pickle","wb")
pickle.dump(BernoulliNB_classifier,save_classifier)
save_classifier.close()

print("BernoulliNB accuracy percent:", (nltk.classify.accuracy(BernoulliNB_classifier,D_test))*100)

BernoulliNB accuracy percent: 74.81146304675717


In [73]:
#Logistic reg
LogisticRegression_classifier = SklearnClassifier(LogisticRegression())
LogisticRegression_classifier.train(D_train)

save_classifier=open("LogisticRegression_classifier.pickle","wb")
pickle.dump(LogisticRegression_classifier,save_classifier)
save_classifier.close()

print("LogisticRegression accuracy percent:", (nltk.classify.accuracy(LogisticRegression_classifier,D_test))*100)



LogisticRegression accuracy percent: 71.94570135746606


In [74]:
#Stochastic gradient classifier
StochasticGradient_classifier = SklearnClassifier(SGDClassifier())
StochasticGradient_classifier.train(D_train)

save_classifier=open("StochasticGradient_classifier.pickle","wb")
pickle.dump(StochasticGradient_classifier,save_classifier)
save_classifier.close()

print("StochasticGradient_classifier accuracy percent:", (nltk.classify.accuracy(StochasticGradient_classifier,D_test))*100)



StochasticGradient_classifier accuracy percent: 72.54901960784314


In [75]:
SVC_classifier = SklearnClassifier(SVC())
SVC_classifier.train(D_train)

save_classifier=open("SVC_classifier.pickle","wb")
pickle.dump(SVC_classifier,save_classifier)
save_classifier.close()

print("SVC accuracy percent:", (nltk.classify.accuracy(SVC_classifier,D_test))*100)



SVC accuracy percent: 49.17043740573152


In [76]:
#Linear SVM
LinearSVC_classifier = SklearnClassifier(LinearSVC())
LinearSVC_classifier.train(D_train)

save_classifier=open("LinearSVC_classifier.pickle","wb")
pickle.dump(LinearSVC_classifier,save_classifier)
save_classifier.close()

print("Linear SVC accuracy percent:", (nltk.classify.accuracy(LinearSVC_classifier,D_test))*100)

Linear SVC accuracy percent: 70.2865761689291


In [77]:
#with nu we can customize the nb of support vectors used
NuSVC_classifier = SklearnClassifier(NuSVC())
NuSVC_classifier.train(D_train)

save_classifier=open("NuSVC_classifier.pickle","wb")
pickle.dump(NuSVC_classifier,save_classifier)
save_classifier.close()

print("Nu SVC accuracy percent:", (nltk.classify.accuracy(NuSVC_classifier,D_test))*100)

Nu SVC accuracy percent: 72.2473604826546


In [78]:
voted_classifier = ScoreClassifier(NB_classifier, MNB_classifier,BernoulliNB_classifier, LogisticRegression_classifier,StochasticGradient_classifier,SVC_classifier,NuSVC_classifier)

save_classifier=open("voted_classifier.pickle","wb")
pickle.dump(voted_classifier,save_classifier)
save_classifier.close()

print("voted accuracy percent:", (nltk.classify.accuracy(voted_classifier ,D_test))*100)

voted accuracy percent: 73.00150829562594


In [67]:
print("Classification:", voted_classifier.classify(D_test[0][0]),"Confidence %:",voted_classifier.confidence(D_test[0][0])*100)

Classification: pos Confidence %: 100.0


In [68]:
print("Classification:", voted_classifier.classify(D_test[1][0]),"Confidence %:",voted_classifier.confidence(D_test[1][0])*100)

Classification: pos Confidence %: 100.0


In [69]:
print("Classification:", voted_classifier.classify(D_test[2][0]),"Confidence %:",voted_classifier.confidence(D_test[2][0])*100)

Classification: pos Confidence %: 100.0


In [54]:
print("Classification:", voted_classifier.classify(D_test[3][0]),"Confidence %:",voted_classifier.confidence(D_test[3][0])*100)

Classification: neg Confidence %: 85.71428571428571


In [55]:
print("Classification:", voted_classifier.classify(D_test[4][0]),"Confidence %:",voted_classifier.confidence(D_test[4][0])*100)

Classification: pos Confidence %: 100.0


In [56]:
print("Classification:", voted_classifier.classify(D_test[5][0]),"Confidence %:",voted_classifier.confidence(D_test[5][0])*100)

Classification: neg Confidence %: 85.71428571428571
