## Chapter 5

In [1]:
import nltk
from nltk import *
from nltk.corpus import brown
from collections import defaultdict
from operator import itemgetter


#### 1. What are the most common adverbs in the brown corpus (categories="news")? Please output the 10 most frequent ones (Please use the universal tagset).

In [24]:
brown_news_tagged = brown.tagged_words(categories = 'news', tagset = 'universal')
cfd_1 = nltk.ConditionalFreqDist((tag,word) for (word,tag) in brown_news_tagged)
cfd_1['ADV'].most_common(10)

[('not', 254),
 ('when', 128),
 ('also', 120),
 ('now', 76),
 ('as', 75),
 ('here', 67),
 ('where', 58),
 ('then', 56),
 ('back', 55),
 ('about', 49)]

#### 2. What are the part-of-speech tags before the word “religion” in the brown corpus (categories="religion")? (Please use the universal tagset)

In [36]:
religion_tagged = brown.tagged_words(categories='religion', tagset='universal')
rel_tags = [a[1] for (a,b) in nltk.bigrams(religion_tagged) if b[0]=='religion']
fd_rel = nltk.FreqDist(rel_tags)
fd_rel.tabulate()


NOUN    .  ADJ  ADP CONJ  DET 
   3    2    2    1    1    1 


#### 3. What are the words that are highly ambiguous as to their part-of-speech tags ((i.e. the word has more than 3 pos tags) in the brown corpus (categories="adventure") (Please use the universal tagset).

In [13]:
brown_adv_tagged = brown.tagged_words(categories = 'adventure', tagset = 'universal')
data_3 = nltk.ConditionalFreqDist((word.lower(), tag)
                                  for (word,tag) in brown_adv_tagged)


for word in sorted(data_3.conditions()):
    if len(data_3[word]) > 3:
        tags = [tag for (tag, _) in data_3[word].most_common()]
        print(word, ' '.join(tags))


back ADV NOUN VERB ADJ
last ADJ ADV VERB NOUN
outside ADV ADP ADJ NOUN
past ADP ADV ADJ NOUN
that ADP DET PRON ADV


#### 4. Train a unigram tagger on the brown corpus (categories="humor"). a) Split the data into training and testing dataset- training on the 95% of data and testing on the remaining 5%. b) Evaluate the performance of this tagger on the test dataset. c) Use this tagger to tag some new text ['this','is','a','NLP','class']. d) Observe that some words are not assigned a tag. Explain why not?  (Please do not use the universal tagset)

In [27]:
#a)
brown_humor_tagged_sents = brown.tagged_sents(categories = 'humor', tagset = 'universal')
size_4 = int(len(brown_humor_tagged_sents) * .95)
train_sents_4 = brown_humor_tagged_sents[:size_4]
test_sents_4 = brown_humor_tagged_sents[size_4:]
unigram_tagger = nltk.UnigramTagger(train_sents_4)
#b)
unigram_tagger.evaluate(test_sents_4)

0.7271062271062271

In [62]:
string_4 = 'this is a string diddly doo da'
tokens_4 = word_tokenize(string_4)
(unigram_tagger.tag(tokens_4))

[('this', 'DET'),
 ('is', 'VERB'),
 ('a', 'DET'),
 ('string', None),
 ('diddly', None),
 ('doo', None),
 ('da', None)]

### Some are given a None tag because the word was not encountered in the training data set

#### 5. Explore the nps_chat corpus and find out what part-of-speech tags occur before a noun, with the most frequent ones first (Please use the universal tagset).

In [14]:
chat_tagged_words = nltk.corpus.nps_chat.tagged_words(tagset = 'universal')
chat_tag_pairs=nltk.bigrams(chat_tagged_words)
noun_preceders = [a[1] for (a, b) in chat_tag_pairs if b[1] == 'NOUN']
fdist = nltk.FreqDist(noun_preceders)
[tag for (tag, _) in fdist.most_common()]

['X',
 'DET',
 'NOUN',
 'VERB',
 'ADJ',
 'PRON',
 '.',
 'ADP',
 'CONJ',
 'NUM',
 'ADV',
 'PRT']

#### 6. Explore the brown corpus (categories="romance") to find out all tags starting with VB and its associated (word, frequency) pairs (no more than 6 pairs). (Please do not use the universal tagset)
#### For example, one of the outputs should look like:
#### VBG [('going', 59), ('looking', 36), ('trying', 23), ('thinking', 21), ('watching', 20), ('taking', 19)]

In [16]:
def findtags(tag_prefix, tagged_text):
    cfd = nltk.ConditionalFreqDist((tag, word) for (word, tag) in tagged_text
if tag.startswith(tag_prefix))
    return dict((tag, cfd[tag].most_common(6)) for tag in cfd.conditions())


In [17]:
tagdict = findtags('VB', nltk.corpus.brown.tagged_words(categories='romance'))

In [18]:
for tag in sorted(tagdict):
    print(tag, tagdict[tag])

VB [('get', 92), ('know', 88), ('go', 76), ('see', 74), ('take', 62), ('say', 59)]
VB+PPO [("Let's", 10), ("let's", 5)]
VBD [('said', 318), ('went', 82), ('thought', 80), ('came', 75), ('knew', 69), ('looked', 68)]
VBG [('going', 59), ('looking', 36), ('trying', 23), ('thinking', 21), ('watching', 20), ('taking', 19)]
VBG+TO [('gonna', 4)]
VBG-TL [("Racin'", 1), ('Dancing', 1), ('Surviving', 1)]
VBN [('got', 36), ('come', 29), ('done', 29), ('gone', 25), ('seen', 20), ('made', 20)]
VBN+TO [('gotta', 1)]
VBN-TL [('United', 3), ('Armed', 1), ('Forked', 1)]
VBZ [('says', 7), ('wants', 7), ('goes', 5), ('gets', 4), ('thinks', 4), ('makes', 4)]


#### 7. Write programs to process the brown corpus (categories="editorial")  and find answers to the following questions (Please do not use the universal tagset):
#### a.Which nouns are more common in their plural form (e.g. tag='NNS'), rather than their singular form (e.g.tag='NN')? (Only consider regular plurals, formed with the -s suffix.)
#### b.What do the 10 most frequent tags represent in the Brown Corpus? Please output the tags and explain the meaning for each tag.


In [10]:
#A
brown_ed_tagged = brown.tagged_words(categories = 'editorial')
cfd_7 = nltk.ConditionalFreqDist(brown_ed_tagged)


In [11]:
conditions = cfd_7.conditions()

In [13]:
for condition in conditions:
    if cfd_7[condition]['NNS'] > cfd_7[condition]['NN']:
        print(condition)

schools
appropriations
powers
procedures
legislators
programs
ways
musts
years
bills
pensions
roads
municipalities
limits
problems
fireworks
sales
questions
resolutions
minds
answers
heroics
stands
contributions
women
affairs
assemblies
judges
governors
wolves
Newspapermen
politicians
workers
pros
cons
issues
Cities
counties
months
eyes
members
$83,750
$30,000
neighbors
results
aggressions
forces
wars
shows
jobs
efforts
expenditures
services
taxes
commissioners
officials
difficulties
signs
imperialists
backs
dollars
forecasts
demands
$1,750,000
$1,250,000
conditions
needs
employes
meals
attendants
hours
nurses
doctors
Patients
funds
$3.15
$4
experts
$9
reforms
areas
men
means
people
scores
deaths
schemes
leaders
countries
arms
things
facilities
jails
prisoners
lobbies
relations
confines
lines
slogans
institutions
buses
plans
projects
stations
barricades
tanks
zones
troops
effects
laws
offices
governments
notes
taunts
replies
authorities
remains
facts
fusillades
shots
residents
incident

In [14]:
#B
tags_7 = [tag for word, tag in brown_ed_tagged]

In [15]:
fd_7 = nltk.FreqDist(tags_7)

In [18]:
common_tags = fd_7.most_common(10)

In [19]:
common_tags

[('NN', 7675),
 ('IN', 6204),
 ('AT', 5311),
 ('JJ', 3593),
 ('.', 2988),
 ('NNS', 2972),
 (',', 2741),
 ('VB', 2129),
 ('NP', 1884),
 ('CC', 1835)]

#### NN - Noun, common, singular or mass
#### IN - Preposition or conjunction, subordinating
#### AT - Article
#### JJ - Adjective or numeral, ordinal
#### . - Period
#### NNS - Noun, common, plural
#### , - Comma
#### VB - Verb
#### NP - Proper Noun
#### CC - Conjunction, coordinating

#### 8. Write code to search the brown corpus (categories="hobbies") for particular words and phrases according to tags, to answer the following questions (please do not use the universal tagset):
#### a. Produce an alphabetically sorted list of the distinct words tagged as MD.
#### b. Identify three-word prepositional phrases of the form IN + AT + NN (eg. in the lab).

In [45]:
brown_hobbies_tagged_words = brown.tagged_words(categories = 'hobbies')
hobbies_tag_fd = nltk.FreqDist(brown_hobbies_tagged_words)
sorted(set([wt[0].lower() for (wt, _) in hobbies_tag_fd.most_common() if wt[1] == 'MD']))

['can',
 'could',
 'dare',
 'may',
 'might',
 'must',
 'need',
 'shall',
 'should',
 'will',
 'would']

In [25]:
brown_hobbies_tagged_sents = brown.tagged_sents(categories = 'hobbies')
def process(sentence):
    for (w1,t1), (w2,t2), (w3,t3) in nltk. trigrams(sentence):
        if (t1 == ('IN') and t2 =='AT' and t3 == 'NN'):
            print(w1,w2,w3)
            
for tagged_sent in brown_hobbies_tagged_sents:
    process(tagged_sent)

at the time
from the sterno-cleido
of the neck
of the leg
to the value
of a muscle
of the wide-grip
on the chest
with the barbell
in the pecs
to the Aj
of the pin-point
with a bit
to the pecs
into the serratus
of every muscle
with the knowledge
in the limbo
at the hipline
of the champion
under the skin
on the leg
for the bodybuilder
to the height
against the back
to the rear
to the front
in a nutshell
at the back
of the neck
with the bar
of the neck
to the front
from a pansy
from a dealer
in a week
of the year
with a fog
with a board
with the plant
over the bed
of a compost
to the color
through the winter
in a flat
but a mat
over the glass
in a border
at the expense
to the earth
for the rest
of the season
from the mother
at a time
over the winter
in the year
to a meal
in the light
throughout the world
to the season
in the world
on the tree
into a sort
of the fruit
of the avocado
to the consumer
of the blood
at the pit
of a weapon
With a nation
to the advantage
to the missile
in the air

in a closet
From the coil
in the yard
in a mild-winter
above the cost
in the basement
in the attic
to a point
than the price
on a variety
besides the nature
from the outside
of a conditioner
in an hour
to the cooling
for the horsepower
of the compressor
of the unit
With a unit
on the outside
of the house
in the roof
of the house
of a gas
to the moisture
in the ceiling
in the side
to a minimum
in the installation
on the basis
above a bedroom
of a site
through the work
of the site
from the county
in the field
during the time
of the year
of the climate
to the sun
in the field
at the site
at the office
in the area
regarding the site
to the site
by a group
on the character
of the site
of the investigator
of the area
for the future
to the public
of the recreation
at the site
on a body
for a park
of a valley
on the coast
of the surf
on a beach
through the beauty
for the public
for a recreation
on an ocean
of the water
in the springtime
on a reservoir
of the water
of the water
to the site
to t

#### 9. Use a default dictionary and itemgetter (n) to sort the most frequent tags used in the brown corpus (categories="reviews"). Please first convert the tags into the universal tags.

In [26]:
brown_reviews_tagged_words = brown.tagged_words(categories = 'reviews',tagset = 'universal')
counts = defaultdict(int)
for (word, tag) in brown_reviews_tagged_words:
    counts[tag]+=1

[t for t, c in sorted(counts.items(), key=itemgetter(1), reverse = True)]


['NOUN',
 'VERB',
 '.',
 'ADP',
 'DET',
 'ADJ',
 'ADV',
 'CONJ',
 'PRON',
 'PRT',
 'NUM',
 'X']

#### 10. Explore the brown corpus (categories="learned") to find out the most 200 frequent words and store their most likely tags. We can then use this information as the model for a "lookup tagger" (an NLTK UnigramTagger). If the words are not among the 200 most frequent words, we would like to assign the default tag of "NN" to them. Then use this lookup tagger to tag a new sentence of your own. 

In [27]:
brown_tagged_sents = brown.tagged_sents(categories ='learned')
learned_fd = nltk.FreqDist(brown.words(categories='learned'))
learned_cfd = nltk.ConditionalFreqDist(brown.tagged_words(categories='learned'))
most_freq_10 = learned_fd.most_common(200)
likely_tags = dict((word,learned_cfd[word].max()) for (word, _) in most_freq_10)
baseline_tagger_10 = nltk.UnigramTagger(model=likely_tags)
baseline_tagger_10.evaluate(brown_tagged_sents)

0.5305627638986629

In [28]:
baseline_tagger_10 = nltk.UnigramTagger(model=likely_tags, backoff=nltk.DefaultTagger('NN'))

In [31]:
string_10 = 'yes indeed this is a very fine and dandy sentence indeed'
tokens_10 = word_tokenize(string_10)
(baseline_tagger_10.tag(tokens_10))

[('yes', 'NN'),
 ('indeed', 'NN'),
 ('this', 'DT'),
 ('is', 'BEZ'),
 ('a', 'AT'),
 ('very', 'QL'),
 ('fine', 'NN'),
 ('and', 'CC'),
 ('dandy', 'NN'),
 ('sentence', 'NN'),
 ('indeed', 'NN')]