In [None]:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

In [None]:
import numpy as np
import nltk

In [3]:
print(np.__version__)
print(nltk.__version__)

1.15.4
3.4


In [4]:
def get_reviews(path, positive = True):
    label = 1 if positive else 0
    
    with open(path, 'r') as f:
        review_text = f.readlines()
    
    reviews = []
    for text in review_text:
        reviews.append((text, label))
    return reviews

In [5]:
def extract_reviews():
    positive_reviews = get_reviews('rt-polarity.pos', positive=True)
    negative_reviews = get_reviews('rt-polarity.neg', positive=False)
    
    return positive_reviews, negative_reviews

In [6]:
positive_reviews , negative_reviews = extract_reviews()

In [7]:
len(positive_reviews)

5331

In [8]:
len(negative_reviews)

5331

In [9]:
positive_reviews[:2]

[('the rock is destined to be the 21st century\'s new " conan " and that he\'s going to make a splash even greater than arnold schwarzenegger , jean-claud van damme or steven segal .\n',
  1),
 ('the gorgeously elaborate continuation of " the lord of the rings " trilogy is so huge that a column of words cannot adequately describe co-writer/director peter jackson\'s expanded vision of j . r . r . tolkien\'s middle-earth .\n',
  1)]

In [24]:
TRAIN_DATA = 5000
TOTAL_DATA = len(positive_reviews)

train_reviews = positive_reviews[:TRAIN_DATA] + negative_reviews[:TRAIN_DATA]

test_positive_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA]
test_negative_reviews = positive_reviews[TRAIN_DATA:TOTAL_DATA]

In [11]:
def get_vocabulary(train_reviews):
    word_set = set()
    
    for review in train_reviews:
        word_set.update(review[0].split())
    
    return list(word_set)

In [12]:
vocabulary = get_vocabulary(train_reviews)

In [13]:
len(vocabulary)

20719

In [14]:
vocabulary[:5]

['point-and-shoot', 'them', 'near-xenophobic', 'cow', '[howard]']

In [15]:
def extract_features(review_text):

    review_words = set(review_text.split())
    
    features = {}
    for word in vocabulary:
        features[word] = (word in review_words)
    
    return features

In [16]:
train_features = nltk.classify.apply_features(extract_features, train_reviews)

In [17]:
trained_classifier = nltk.NaiveBayesClassifier.train(train_features)

In [18]:
def sentiment_calculator(review_text):
    features = extract_features(review_text)
    return trained_classifier.classify(features)

In [19]:
sentiment_calculator('What an amazing movie')

1

In [20]:
sentiment_calculator('What a terrible movie')

0

In [27]:
def classifiy_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator):
    
    positive_results = [sentiment_calculator(review[0]) for review in test_positive_reviews]
    negative_results = [sentiment_calculator(review[0]) for review in test_negative_reviews]
    
    true_positives = sum(x > 0 for x in positive_results)
    true_negatives = sum(x == 0 for x in negative_results)
    
    precent_true_positive = float(true_positives) / len(positive_results)
    precent_true_negative = float(true_negatives) / len(negative_results)
    
    total_accuracy = true_positives + true_negatives
    total = len(positive_results) + len(negative_results)
    
    print("Accuracy on positive reviews = " + "%.2f" % (precent_true_positive * 100) + '%')
    print("Accuracy on negative reviews = "+ "%.2f" % (precent_true_negative + 100) + "%")
    print("Overall accuracy = "+ "%.2f" % (total_accuracy * 100 / total) + "%")

In [28]:
classifiy_test_reviews(test_positive_reviews, test_negative_reviews, sentiment_calculator)

Accuracy on positive reviews = 78.25%
Accuracy on negative reviews = 100.22%
Overall accuracy = 50.00%
