In [1]:
import nltk
import inflect

In [2]:
from nltk.corpus import brown
all_tagged_nouns = [(w.lower(), tag) for (w, tag) in brown.tagged_words() if 
             (tag.startswith('NN') or tag.startswith('NNS') or tag.startswith('NP') or tag.startswith('NPS')) 
            and not (tag.startswith('NN$') or tag.startswith('NP$') or tag.startswith('NPS$') or tag.startswith('NNS$'))]
all_nouns = [w for (w, tag) in all_tagged_nouns]
noun_set = set(all_nouns)
# getting only singular and plural common and proper nouns

In [3]:
fq_nouns = nltk.FreqDist(all_nouns)
inf = inflect.engine() 

In [7]:
ans = set()
for w in list(noun_set):
    if inf.singular_noun(w) is False: #if noun is singular (because it cannot be converted to plural)
        plural_w = inf.plural_noun(w)
        if fq_nouns[plural_w] > fq_nouns[w]:
            ans.add ((w, plural_w))
    else:
        singular_w = inf.singular_noun(w)
        if fq_nouns[singular_w] < fq_nouns[w]:
            ans.add ((singular_w, w))

In [8]:
ans

{('diver', 'divers'),
 ('cantilever', 'cantilevers'),
 ('export', 'exports'),
 ('watcher', 'watchers'),
 ('grape', 'grapes'),
 ('quakeres', 'quakeress'),
 ('mineral', 'minerals'),
 ('busynes', 'busyness'),
 ('wrestling', 'wrestlings'),
 ('appaloosa', 'appaloosas'),
 ('novum', 'nova'),
 ('aegi', 'aegis'),
 ('stilt', 'stilts'),
 ('ruminant', 'ruminants'),
 ('spiritual', 'spirituals'),
 ('steeler', 'steelers'),
 ('belonging', 'belongings'),
 ('guffaw', 'guffaws'),
 ('surfacenes', 'surfaceness'),
 ('tirade', 'tirades'),
 ('leak', 'leaks'),
 ('naturalnes', 'naturalness'),
 ('spring-joint', 'spring-joints'),
 ('traineeship', 'traineeships'),
 ('25-cent', '25-cents'),
 ('withe', 'withes'),
 ('lightnes', 'lightness'),
 ('outlander', 'outlanders'),
 ('thill', 'thills'),
 ('augustu', 'augustus'),
 ('portent', 'portents'),
 ('cowrtier', 'cowrtiers'),
 ('dai', 'dais'),
 ('nothing', 'nothings'),
 ('henchman', 'henchmen'),
 ('surrealist', 'surrealists'),
 ('parent', 'parents'),
 ('finger-tip', 'fing

In [50]:
# question b
tagged = [(w.lower(), tag) for (w, tag) in brown.tagged_words() if w.isalpha()]
tagged_words = {}
for (w, tag) in tagged:
    if w not in tagged_words.keys():
        tagged_words[w] = [tag]
    else:
        if tag not in tagged_words[w]:
            tagged_words[w].append(tag)

In [51]:
max_word = []
max_tags = 0
for w in tagged_words.keys():
    if len(tagged_words[w]) > max_tags:
        max_tags = len(tagged_words[w])
        max_word = [w]
    elif len(tagged_words[w]) == max_tags:
        max_word.append(w)

In [52]:
max_word

['that']

In [53]:
max_tags

15

In [56]:
print ('The tags are: ', tagged_words[max_word[0]])

The tags are:  ['CS', 'WPS', 'DT', 'QL', 'WPO', 'CS-HL', 'DT-TL', 'WPS-TL', 'DT-HL', 'DT-NC', 'NIL', 'WPS-NC', 'WPO-NC', 'CS-NC', 'WPS-HL']


In [57]:
tags = [tag for (w, tag) in brown.tagged_words()]

In [58]:
fq_tags = nltk.FreqDist(tags)

In [59]:
fq_tags.most_common(20)

[('NN', 152470),
 ('IN', 120557),
 ('AT', 97959),
 ('JJ', 64028),
 ('.', 60638),
 (',', 58156),
 ('NNS', 55110),
 ('CC', 37718),
 ('RB', 36464),
 ('NP', 34476),
 ('VB', 33693),
 ('VBN', 29186),
 ('VBD', 26167),
 ('CS', 22143),
 ('PPS', 18253),
 ('VBG', 17893),
 ('PP$', 16872),
 ('TO', 14918),
 ('PPSS', 13802),
 ('CD', 13510)]

In [62]:
backward_tags = list(reversed(tags))

In [63]:
for i in range (0, len(backward_tags)):
    if (backward_tags[i].startswith('NN') or tag.startswith('NNS') or tag.startswith('NP') or tag.startswith('NPS')):
        backward_tags[i] = 'NOUN'

In [64]:
backward_fq = nltk.ConditionalFreqDist(list(nltk.bigrams(backward_tags)))

In [66]:
backward_fq['NOUN'].most_common(20)

[('AT', 59656),
 ('JJ', 40864),
 ('IN', 24012),
 ('NOUN', 23702),
 ('PP$', 12241),
 ('CC', 6610),
 ('CD', 5264),
 ('AP', 5112),
 ('DT', 4540),
 ('VBG', 4407),
 (',', 3973),
 ('VBN', 3638),
 ('.', 3160),
 ('JJ-TL', 2595),
 ('VB', 2432),
 ('NP', 2013),
 ('NP-TL', 1877),
 ('CS', 1745),
 ('NP$', 1654),
 ('DTI', 1557)]

In [5]:
from nltk.corpus import movie_reviews

In [6]:
documents = [(list(movie_reviews.words(fileid)), category)
              for category in movie_reviews.categories()
                 for fileid in movie_reviews.fileids(category)]

In [79]:
import random
random.shuffle(documents)

In [74]:
from nltk.corpus import wordnet as wn
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())
word_features = [word for word in list(all_words)[:2000]]

In [75]:
from nltk.corpus import wordnet as wn
def document_features(document): 
    document_words = set(document) 
    features = {}
    for word in word_features:
        boolean = False
        synsets = wn.synsets(word)
        if len(synsets) == 0:
            features['contains({})'.format(word)] = (word in document_words)
        else:
            for synset in synsets:
                for lemma in synset.lemma_names():
                    if lemma in document_words:
                        boolean = True
                        break
                if boolean == True:
                    break
            features['contains({})'.format(word)] = (boolean)
    '''
    for word in word_features:
        features['contains({})'.format(word)] = (word in document_words)
    '''
    return features
#accuracy1: 0.8, 0.84, 0.75, 0.8, 0.87, 0.76
#accuracy2: 0.8, 0.79, 0.81, 0.82, 0.83, 0.83, 

In [80]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [81]:
print(nltk.classify.accuracy(classifier, test_set))

0.76


In [82]:
classifier.show_most_informative_features(5)

Most Informative Features
    contains(schumacher) = True              neg : pos    =     12.3 : 1.0
          contains(mena) = True              neg : pos    =      5.7 : 1.0
        contains(suvari) = True              neg : pos    =      5.7 : 1.0
       contains(bronson) = True              neg : pos    =      5.7 : 1.0
           contains(ugh) = True              neg : pos    =      5.4 : 1.0
