In [1]:
# This code is from: https://www.datacamp.com/community/tutorials/simplifying-sentiment-analysis-python
import nltk
from nltk.corpus import movie_reviews
import random

# importing documents and shuffling them at random
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

random.shuffle(documents)

In [2]:
# format of documents is [(['i','am','mr','sentence','!'], 'pos')]
documents[0:1]

[(['my',
   'first',
   'press',
   'screening',
   'of',
   '1998',
   'and',
   'already',
   'i',
   "'",
   've',
   'gotten',
   'a',
   'prime',
   'candidate',
   'for',
   'my',
   'worst',
   'ten',
   'of',
   'the',
   'year',
   'list',
   '.',
   'what',
   'an',
   'auspicious',
   'beginning',
   '!',
   'welcome',
   'to',
   'the',
   'dog',
   'days',
   'of',
   'winter',
   'when',
   'the',
   'only',
   'film',
   'openings',
   'of',
   'merit',
   'are',
   'those',
   'oscar',
   'contenders',
   'that',
   'the',
   'studios',
   'opened',
   'in',
   'late',
   'december',
   'in',
   'new',
   'york',
   'and',
   'l',
   '.',
   'a',
   '.',
   'and',
   'which',
   'are',
   'just',
   'now',
   'beginning',
   'to',
   'appear',
   'elsewhere',
   '.',
   'firestorm',
   ',',
   'the',
   'directorial',
   'debut',
   'of',
   'dances',
   'with',
   'wolves',
   "'",
   's',
   'academy',
   'award',
   'winning',
   'cinematographer',
   'dean',
   'sem

In [3]:
# gathering all words and taking only the top 2000
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = list(all_words)[:2000]
word_features[0:10]

['plot', ':', 'two', 'teen', 'couples', 'go', 'to', 'a', 'church', 'party']

In [4]:
# defining a function which tells whether a document contains a word
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    return features

In [5]:
# format of featuresets is [({'contains(wow)' : True, ..., 'contains(wonderful)' : True}, 'pos')]
featuresets = [(document_features(d), c) for (d,c) in documents]
featuresets[0]

({'contains(plot)': False,
  'contains(:)': True,
  'contains(two)': True,
  'contains(teen)': False,
  'contains(couples)': False,
  'contains(go)': False,
  'contains(to)': True,
  'contains(a)': True,
  'contains(church)': False,
  'contains(party)': False,
  'contains(,)': True,
  'contains(drink)': False,
  'contains(and)': True,
  'contains(then)': True,
  'contains(drive)': False,
  'contains(.)': True,
  'contains(they)': True,
  'contains(get)': True,
  'contains(into)': True,
  'contains(an)': True,
  'contains(accident)': False,
  'contains(one)': True,
  'contains(of)': True,
  'contains(the)': True,
  'contains(guys)': True,
  'contains(dies)': False,
  'contains(but)': True,
  'contains(his)': True,
  'contains(girlfriend)': False,
  'contains(continues)': False,
  'contains(see)': False,
  'contains(him)': False,
  'contains(in)': True,
  'contains(her)': True,
  'contains(life)': True,
  'contains(has)': True,
  'contains(nightmares)': False,
  'contains(what)': True,
 

In [6]:
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [7]:
nltk.classify.accuracy(classifier,test_set)

0.78