In [1]:
from nltk.corpus import movie_reviews

In [3]:
print(len(movie_reviews.fileids()))

2000


In [4]:
print(movie_reviews.categories())

['neg', 'pos']


In [5]:
print(len(movie_reviews.fileids('pos')))

1000


In [6]:
print(len(movie_reviews.fileids('neg')))

1000


In [7]:
print(movie_reviews.fileids('neg')[1])

neg/cv001_19502.txt


In [11]:
documents = []
for category in movie_reviews.categories():
    for fileid in movie_reviews.fileids(category):
        documents.append((movie_reviews.words(fileid), category))

print(documents[0])

(['plot', ':', 'two', 'teen', 'couples', 'go', 'to', ...], 'neg')


In [14]:
from random import shuffle
shuffle(documents)

In [12]:
from nltk import FreqDist
all_words = [w.lower() for w in movie_reviews.words()]
all_words_frequency = FreqDist(all_words)

In [15]:
all_words_frequency.most_common(10)

[(',', 77717),
 ('the', 76529),
 ('.', 65876),
 ('a', 38106),
 ('and', 35576),
 ('of', 34123),
 ('to', 31937),
 ("'", 30585),
 ('is', 25195),
 ('in', 21822)]

In [18]:
from nltk.corpus import stopwords
stop_words = stopwords.words("english")

In [20]:
all_words_wo_stopwords = [w for w in all_words if w not in stop_words ]

In [21]:
print(all_words_wo_stopwords[:15])

['plot', ':', 'two', 'teen', 'couples', 'go', 'church', 'party', ',', 'drink', 'drive', '.', 'get', 'accident', '.']


In [23]:
import string
all_words_wo_sw_punc = [w for w in all_words_wo_stopwords if w not in string.punctuation]

In [24]:
all_words_wo_sw_punc

['plot',
 'two',
 'teen',
 'couples',
 'go',
 'church',
 'party',
 'drink',
 'drive',
 'get',
 'accident',
 'one',
 'guys',
 'dies',
 'girlfriend',
 'continues',
 'see',
 'life',
 'nightmares',
 'deal',
 'watch',
 'movie',
 'sorta',
 'find',
 'critique',
 'mind',
 'fuck',
 'movie',
 'teen',
 'generation',
 'touches',
 'cool',
 'idea',
 'presents',
 'bad',
 'package',
 'makes',
 'review',
 'even',
 'harder',
 'one',
 'write',
 'since',
 'generally',
 'applaud',
 'films',
 'attempt',
 'break',
 'mold',
 'mess',
 'head',
 'lost',
 'highway',
 'memento',
 'good',
 'bad',
 'ways',
 'making',
 'types',
 'films',
 'folks',
 'snag',
 'one',
 'correctly',
 'seem',
 'taken',
 'pretty',
 'neat',
 'concept',
 'executed',
 'terribly',
 'problems',
 'movie',
 'well',
 'main',
 'problem',
 'simply',
 'jumbled',
 'starts',
 'normal',
 'downshifts',
 'fantasy',
 'world',
 'audience',
 'member',
 'idea',
 'going',
 'dreams',
 'characters',
 'coming',
 'back',
 'dead',
 'others',
 'look',
 'like',
 'dead

In [25]:
all_words_frequency = FreqDist(all_words_wo_sw_punc)

In [26]:
all_words_frequency.most_common(10)

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049)]

In [28]:
most_common_words = all_words_frequency.most_common(2000)

In [29]:
most_common_words[:15]

[('film', 9517),
 ('one', 5852),
 ('movie', 5771),
 ('like', 3690),
 ('even', 2565),
 ('good', 2411),
 ('time', 2411),
 ('story', 2169),
 ('would', 2109),
 ('much', 2049),
 ('character', 2020),
 ('also', 1967),
 ('get', 1949),
 ('two', 1911),
 ('well', 1906)]

In [30]:
word_feature = [w[0] for w in most_common_words]

In [32]:
print(word_feature[:15])

['film', 'one', 'movie', 'like', 'even', 'good', 'time', 'story', 'would', 'much', 'character', 'also', 'get', 'two', 'well']


In [41]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_feature:
        features[word] = (word in document_words)
    return features

In [42]:
feature_set = [(document_features(doc), category) for (doc, category) in documents]

In [44]:
feature_set[0][0]

{'film': True,
 'one': True,
 'movie': True,
 'like': True,
 'even': False,
 'good': True,
 'time': True,
 'story': True,
 'would': True,
 'much': True,
 'character': True,
 'also': True,
 'get': True,
 'two': False,
 'well': True,
 'characters': True,
 'first': True,
 '--': True,
 'see': False,
 'way': False,
 'make': True,
 'life': True,
 'really': True,
 'films': True,
 'plot': False,
 'little': True,
 'people': True,
 'could': True,
 'scene': True,
 'man': False,
 'bad': False,
 'never': False,
 'best': True,
 'new': True,
 'scenes': False,
 'many': True,
 'director': True,
 'know': True,
 'movies': False,
 'action': False,
 'great': False,
 'another': False,
 'love': False,
 'go': True,
 'made': True,
 'us': True,
 'big': False,
 'end': True,
 'something': False,
 'back': True,
 'still': False,
 'world': False,
 'seems': True,
 'work': True,
 'makes': False,
 'however': True,
 'every': False,
 'though': False,
 'better': False,
 'real': False,
 'audience': True,
 'enough': True,
 

In [45]:
training_set = feature_set[400:]
testing_set = feature_set[:400]

In [47]:
from nltk import NaiveBayesClassifier

In [48]:
classifier = NaiveBayesClassifier.train(training_set)

In [49]:
from nltk import classify

In [50]:
accuracy = classify.accuracy(classifier, testing_set)

In [51]:
print(accuracy)

0.75


In [52]:
print(classifier.show_most_informative_features(10))

Most Informative Features
             outstanding = True              pos : neg    =     11.9 : 1.0
             wonderfully = True              pos : neg    =     10.1 : 1.0
                  seagal = True              neg : pos    =      7.1 : 1.0
                   damon = True              pos : neg    =      7.0 : 1.0
                    jedi = True              pos : neg    =      6.8 : 1.0
                   mulan = True              pos : neg    =      5.9 : 1.0
                 sandler = True              neg : pos    =      5.6 : 1.0
                  wasted = True              neg : pos    =      5.4 : 1.0
                   waste = True              neg : pos    =      5.1 : 1.0
                   awful = True              neg : pos    =      5.0 : 1.0
None


In [53]:
custom_review = "I hated the restaurant. It was a disaster eating there. Poor service, arrogant waiters."

In [57]:
from nltk import word_tokenize
custom_review_tokens = word_tokenize(custom_review)
custom_review_set = document_features(custom_review_tokens)
print(classifier.classify(custom_review_set))

neg


In [58]:
prob_result = classifier.prob_classify(custom_review_set)
print(prob_result.max())
print(prob_result.prob('pos'))
print(prob_result.prob('neg'))

neg
5.353732364357548e-08
0.9999999464626705
