In [1]:
import nltk.classify.util
from nltk.classify import NaiveBayesClassifier
from nltk.corpus import movie_reviews

In [2]:
def extract_features(word_list):
    return dict([(word, True) for word in word_list ])

In [3]:
# Load positive and negative reviews
positive_fileids = movie_reviews.fileids('pos')
negative_fileids = movie_reviews.fileids('neg')

In [4]:
features_positive = [(extract_features(movie_reviews.words(fileids=[f])), 
                     'Positive') for f in positive_fileids]

features_negative = [(extract_features(movie_reviews.words(fileids=[f])), 
                     'Negative') for f in negative_fileids]

In [8]:
# features_positive[0]

In [9]:
# Split the data into train and test
threshold_factor = 0.8
threshold_positive = int(threshold_factor * len(features_positive))
threshold_negative = int(threshold_factor * len(features_negative))

In [10]:
features_train = features_positive[:threshold_positive] + features_negative[:threshold_negative]
features_test = features_positive[threshold_positive:] + features_negative[threshold_negative:]
print("Number of training datapoints :",len(features_train))
print("Number of test datapoints :",len(features_test))

Number of training datapoints : 1600
Number of test datapoints : 400


In [12]:
# Training a Naive Bayes Classifier
classfier = NaiveBayesClassifier.train(features_train)
print("Accuracy :",nltk.classify.accuracy(classfier, features_test))

Accuracy : 0.735


In [13]:
print("Top 10 most informative words :")
for item in classfier.most_informative_features()[:10]:
    print(item[0])

Top 10 most informative words :
outstanding
insulting
vulnerable
ludicrous
uninvolving
astounding
avoids
fascination
animators
affecting


In [9]:
# Sample input reviews
input_reviews = ["The movie is really nice",
                "Will watch it again",
                "That was totally bullshit",
                "Starting was boring but at the end it was nice",
                "Not so good"]

print("\nPredictions")
for review in input_reviews:
    print("\nReview :",review)
    
    probdist = classfier.prob_classify(extract_features(review.split()))
    pred_sentiment = probdist.max()
    
    print("Predicted Sentiment :",pred_sentiment)
    print("Probability :",round(probdist.prob(pred_sentiment), 2))


Predictions

Review : The movie is really nice
Predicted Sentiment : Positive
Probability : 0.53

Review : Will watch it again
Predicted Sentiment : Positive
Probability : 0.51

Review : That was totally bullshit
Predicted Sentiment : Negative
Probability : 0.57

Review : Starting was boring but at the end it was nice
Predicted Sentiment : Negative
Probability : 0.74

Review : Not so good
Predicted Sentiment : Negative
Probability : 0.52
