In [7]:
import nltk
from nltk.corpus import movie_reviews
import random
from nltk.corpus import wordnet as wn

In [8]:
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]

In [9]:
all_synsets = nltk.FreqDist(wn.synsets(w.lower())[0] for w in movie_reviews.words() if len(wn.synsets(w))>0)

In [10]:
len(all_synsets)

18775

In [11]:
all_synsets.most_common(20)

[(Synset('angstrom.n.01'), 38106),
 (Synset('be.v.01'), 28462),
 (Synset('inch.n.01'), 21852),
 (Synset('second.n.01'), 18988),
 (Synset('information_technology.n.01'), 18377),
 (Synset('movie.n.01'), 18048),
 (Synset('arsenic.n.02'), 11378),
 (Synset('merely.r.01'), 9222),
 (Synset('iodine.n.01'), 8889),
 (Synset('helium.n.01'), 8869),
 (Synset('on.a.01'), 7385),
 (Synset('are.n.01'), 6949),
 (Synset('thymine.n.01'), 6410),
 (Synset('one.n.01'), 6336),
 (Synset('by.r.01'), 6261),
 (Synset('beryllium.n.01'), 6185),
 (Synset('associate_in_nursing.n.01'), 5744),
 (Synset('not.r.01'), 5743),
 (Synset('world_health_organization.n.01'), 5695),
 (Synset('astatine.n.01'), 4986)]

In [10]:
synset_features = list(all_synsets)[:2000]

In [11]:
def document_features(document): 
    document_synsets = set([wn.synsets(w.lower())[0] for w in document if len(wn.synsets(w))>0]) 
    features = {}
    for synset in synset_features:
        '''
        boolean = False
        for lemma in synset.lemma_names():
            if lemma in document_words:
                boolean = True
                break
        features['contains({})'.format(synset)] = (boolean)
        '''
        features['contains({})'.format(synset)] = (synset in document_synsets)
    return features
# 0.85, 0.81, 0.88, 0.79, 0.79, 0.86, 0.82, 0.77, 0.85, 0.77, 0.81
#new: 0.84, 0.86, 0.85, 0.78

In [25]:
random.shuffle(documents)

In [26]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [27]:
print(nltk.classify.accuracy(classifier, test_set))

0.78


In [28]:
classifier.show_most_informative_features(5)

Most Informative Features
contains(Synset('annual.n.01')) = True              pos : neg    =      9.0 : 1.0
contains(Synset('chaff.n.01')) = True              neg : pos    =      8.4 : 1.0
contains(Synset('sterile.s.03')) = True              neg : pos    =      8.4 : 1.0
contains(Synset('dumbbell.n.02')) = True              neg : pos    =      7.7 : 1.0
contains(Synset('turkey.n.01')) = True              neg : pos    =      7.3 : 1.0


In [29]:
house = wn.synsets('house')[0]
house.path_similarity(wn.synset('sign_of_the_zodiac.n.01'))

0.1111111111111111