# Notes
- Confusion matrix
    - observations and truth
                   TRUTH
                   T   F
              T    TP  FP
      OBS
              F    FN  TN
    - Example
            Noun  Verb  Det
      Noun
      
      Verb
      
      Det
      
- TP: are relevant items we correctly identify as relevant
- TN: are irrelevant items we correctly identify as irrelevant
- FP (Type I Errors) : are irrelevant items we incorrectly identified as relevant
- FN (Type II Errors): are relevant items we incorrectly identified as irrelevant

- Precision: indicates how many of the items identified were relevant
    - TP/(TP+FP)
- Recall: indicates how many of the actually relevant items
    - TP/(TP+FN) #trues found divided by the total of truths we should (could) have found
- F-measure
    - (2 \* Precision \* Recall) / (Precision + Recall)
- Accuracy
    - (TP+TN)/(TP+TN+FP+FN)

In [1]:
import nltk
from nltk.corpus import brown

In [2]:
# Check today's notes "Examples" on the course website

In [4]:
brown_tagged_sents = brown.tagged_sents()
len(brown_tagged_sents)

57340

In [7]:
brown_sents = brown.sents()
brown_sents[5]

['It',
 'recommended',
 'that',
 'Fulton',
 'legislators',
 'act',
 '``',
 'to',
 'have',
 'these',
 'laws',
 'studied',
 'and',
 'revised',
 'to',
 'the',
 'end',
 'of',
 'modernizing',
 'and',
 'improving',
 'them',
 "''",
 '.']

In [6]:
brown_tagged_sents[5]

[('It', 'PPS'),
 ('recommended', 'VBD'),
 ('that', 'CS'),
 ('Fulton', 'NP'),
 ('legislators', 'NNS'),
 ('act', 'VB'),
 ('``', '``'),
 ('to', 'TO'),
 ('have', 'HV'),
 ('these', 'DTS'),
 ('laws', 'NNS'),
 ('studied', 'VBN'),
 ('and', 'CC'),
 ('revised', 'VBN'),
 ('to', 'IN'),
 ('the', 'AT'),
 ('end', 'NN'),
 ('of', 'IN'),
 ('modernizing', 'VBG'),
 ('and', 'CC'),
 ('improving', 'VBG'),
 ('them', 'PPO'),
 ("''", "''"),
 ('.', '.')]

In [8]:
brown_sents = brown.sents()
tags = [tag for (word, tag) in brown.tagged_words()]
tags_fd = nltk.FreqDist(tags)
tags_fd.max()

'NN'

In [9]:
tags_fd['NN']

152470

In [12]:
text = 'The simplest possible tagger assigns the same tag to each token. This may seem to be a rather banal step, but it estabilishes an important baseline for tagger performance.'
tokens = nltk.word_tokenize(text)
tokens

['The',
 'simplest',
 'possible',
 'tagger',
 'assigns',
 'the',
 'same',
 'tag',
 'to',
 'each',
 'token',
 '.',
 'This',
 'may',
 'seem',
 'to',
 'be',
 'a',
 'rather',
 'banal',
 'step',
 ',',
 'but',
 'it',
 'estabilishes',
 'an',
 'important',
 'baseline',
 'for',
 'tagger',
 'performance',
 '.']

In [13]:
tokens = nltk.word_tokenize ('jr. took the test')
tokens

['jr.', 'took', 'the', 'test']

In [14]:
tokens = nltk.word_tokenize ('i went with jr.')
tokens #didn't handle the sentence correctly, period should be attached to jr

['i', 'went', 'with', 'jr', '.']

In [15]:
tokens = nltk.word_tokenize(text)
default_tagger = nltk.DefaultTagger('NN') #tag everything as noun
tokens_default_tagged = default_tagger.tag(tokens)
tokens_default_tagged

[('The', 'NN'),
 ('simplest', 'NN'),
 ('possible', 'NN'),
 ('tagger', 'NN'),
 ('assigns', 'NN'),
 ('the', 'NN'),
 ('same', 'NN'),
 ('tag', 'NN'),
 ('to', 'NN'),
 ('each', 'NN'),
 ('token', 'NN'),
 ('.', 'NN'),
 ('This', 'NN'),
 ('may', 'NN'),
 ('seem', 'NN'),
 ('to', 'NN'),
 ('be', 'NN'),
 ('a', 'NN'),
 ('rather', 'NN'),
 ('banal', 'NN'),
 ('step', 'NN'),
 (',', 'NN'),
 ('but', 'NN'),
 ('it', 'NN'),
 ('estabilishes', 'NN'),
 ('an', 'NN'),
 ('important', 'NN'),
 ('baseline', 'NN'),
 ('for', 'NN'),
 ('tagger', 'NN'),
 ('performance', 'NN'),
 ('.', 'NN')]

In [18]:
tokens_tagged_sents = [[('The', 'NN'), ('simplest', 'NN'), ('possible',
'NN'), ('tagger', 'NN'), ('assigns', 'NN'), ('the', 'NN'), ('same', 'NN'),
('tag', 'NN'), ('to', 'NN'), ('each', 'NN'), ('token.', 'NN')], [('This',
'NN'), ('may', 'NN'), ('seem', 'NN'), ('to', 'NN'), ('be', 'NN'), ('a',
'NN'), ('rather', 'NN'), ('banal', 'NN'), ('step', 'NN'), (',', 'NN'),
('but', 'NN'), ('it', 'NN'), ('estabilishes', 'NN'), ('an', 'NN'),
('important', 'NN'), ('baseline', 'NN'), ('for', 'NN'), ('tagger', 'NN'),
('performance', 'NN'), ('.', 'NN')]]
default_tagger.evaluate(tokens_tagged_sents) #returns 100% accuracy

1.0

In [19]:
brown_news_tagged = brown.tagged_sents(categories='news')
default_tagger.evaluate(brown_news_tagged)

0.13089484257215028

In [20]:
brown_humor_tagged = brown.tagged_sents(categories='humor')
default_tagger.evaluate(brown_humor_tagged)

0.11832219405392948

In [21]:
patterns = [
	(r'.*ing$', 'VBG'),              # gerunds
	(r'.*ed$', 'VBD'),               # simple past
	(r'.*es$', 'VBZ'),               # 3rd singular present
	(r'.*ould$', 'MD'),              # modals
	(r'.*\'s$', 'NN$'),              # possesive nouns
	(r'.*s$', 'NNS'),                # plural nouns
	(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
	(r'.*', 'NN')                    # nouns (default)
	]

In [22]:
re_tagger = nltk.RegexpTagger(patterns)
re_tagger.tag(tokens)

[('The', 'NN'),
 ('simplest', 'NN'),
 ('possible', 'NN'),
 ('tagger', 'NN'),
 ('assigns', 'NNS'),
 ('the', 'NN'),
 ('same', 'NN'),
 ('tag', 'NN'),
 ('to', 'NN'),
 ('each', 'NN'),
 ('token', 'NN'),
 ('.', 'NN'),
 ('This', 'NNS'),
 ('may', 'NN'),
 ('seem', 'NN'),
 ('to', 'NN'),
 ('be', 'NN'),
 ('a', 'NN'),
 ('rather', 'NN'),
 ('banal', 'NN'),
 ('step', 'NN'),
 (',', 'NN'),
 ('but', 'NN'),
 ('it', 'NN'),
 ('estabilishes', 'VBZ'),
 ('an', 'NN'),
 ('important', 'NN'),
 ('baseline', 'NN'),
 ('for', 'NN'),
 ('tagger', 'NN'),
 ('performance', 'NN'),
 ('.', 'NN')]

In [23]:
brown_news_tagged = brown.tagged_sents(categories='news')
brown_sents[5]

['It',
 'recommended',
 'that',
 'Fulton',
 'legislators',
 'act',
 '``',
 'to',
 'have',
 'these',
 'laws',
 'studied',
 'and',
 'revised',
 'to',
 'the',
 'end',
 'of',
 'modernizing',
 'and',
 'improving',
 'them',
 "''",
 '.']

In [24]:
brown_re_tagged = re_tagger.tag(brown_sents[5])
brown_re_tagged

[('It', 'NN'),
 ('recommended', 'VBD'),
 ('that', 'NN'),
 ('Fulton', 'NN'),
 ('legislators', 'NNS'),
 ('act', 'NN'),
 ('``', 'NN'),
 ('to', 'NN'),
 ('have', 'NN'),
 ('these', 'NN'),
 ('laws', 'NNS'),
 ('studied', 'VBD'),
 ('and', 'NN'),
 ('revised', 'VBD'),
 ('to', 'NN'),
 ('the', 'NN'),
 ('end', 'NN'),
 ('of', 'NN'),
 ('modernizing', 'VBG'),
 ('and', 'NN'),
 ('improving', 'VBG'),
 ('them', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [25]:
patterns2 = [
	(r'.*ing$', 'VBG'),              # gerunds
	(r'.*ed$', 'VBD'),               # simple past
	(r'.*es$', 'VBZ'),               # 3rd singular present
	(r'.*ould$', 'MD'),              # modals
	(r'.*\'s$', 'NN$'),              # possesive nouns
	(r'.*s$', 'NNS'),                # plural nouns
	(r'^-?[0-9]+(.[0-9]+)?$', 'CD'), # cardinal numbers
	]

In [26]:
re_tagger_backoff = nltk.RegexpTagger(patterns2)
re_tagger_backoff

<Regexp Tagger: size=7>

In [27]:
brown_re_tagged_backoff = re_tagger_backoff.tag(brown_sents[5])
brown_re_tagged_backoff

[('It', None),
 ('recommended', 'VBD'),
 ('that', None),
 ('Fulton', None),
 ('legislators', 'NNS'),
 ('act', None),
 ('``', None),
 ('to', None),
 ('have', None),
 ('these', None),
 ('laws', 'NNS'),
 ('studied', 'VBD'),
 ('and', None),
 ('revised', 'VBD'),
 ('to', None),
 ('the', None),
 ('end', None),
 ('of', None),
 ('modernizing', 'VBG'),
 ('and', None),
 ('improving', 'VBG'),
 ('them', None),
 ("''", None),
 ('.', None)]

In [28]:
re_tagger_backoff = nltk.RegexpTagger(patterns2, backoff=default_tagger) #Refer to default tagger when None
brown_re_tagged_backoff = re_tagger_backoff.tag(brown_sents[5])
brown_re_tagged_backoff

[('It', 'NN'),
 ('recommended', 'VBD'),
 ('that', 'NN'),
 ('Fulton', 'NN'),
 ('legislators', 'NNS'),
 ('act', 'NN'),
 ('``', 'NN'),
 ('to', 'NN'),
 ('have', 'NN'),
 ('these', 'NN'),
 ('laws', 'NNS'),
 ('studied', 'VBD'),
 ('and', 'NN'),
 ('revised', 'VBD'),
 ('to', 'NN'),
 ('the', 'NN'),
 ('end', 'NN'),
 ('of', 'NN'),
 ('modernizing', 'VBG'),
 ('and', 'NN'),
 ('improving', 'VBG'),
 ('them', 'NN'),
 ("''", 'NN'),
 ('.', 'NN')]

In [29]:
fd = nltk.FreqDist(brown.words())
cfd = nltk.ConditionalFreqDist(brown.tagged_words())
fd_keys = list( fd.keys() )
most_freq_words = fd_keys[:100]
most_freq_words

['The',
 'Fulton',
 'County',
 'Grand',
 'Jury',
 'said',
 'Friday',
 'an',
 'investigation',
 'of',
 "Atlanta's",
 'recent',
 'primary',
 'election',
 'produced',
 '``',
 'no',
 'evidence',
 "''",
 'that',
 'any',
 'irregularities',
 'took',
 'place',
 '.',
 'jury',
 'further',
 'in',
 'term-end',
 'presentments',
 'the',
 'City',
 'Executive',
 'Committee',
 ',',
 'which',
 'had',
 'over-all',
 'charge',
 'deserves',
 'praise',
 'and',
 'thanks',
 'Atlanta',
 'for',
 'manner',
 'was',
 'conducted',
 'September-October',
 'term',
 'been',
 'charged',
 'by',
 'Superior',
 'Court',
 'Judge',
 'Durwood',
 'Pye',
 'to',
 'investigate',
 'reports',
 'possible',
 'hard-fought',
 'won',
 'Mayor-nominate',
 'Ivan',
 'Allen',
 'Jr.',
 'Only',
 'a',
 'relative',
 'handful',
 'such',
 'received',
 'considering',
 'widespread',
 'interest',
 'number',
 'voters',
 'size',
 'this',
 'city',
 'it',
 'did',
 'find',
 'many',
 "Georgia's",
 'registration',
 'laws',
 'are',
 'outmoded',
 'or',
 'inadeq

In [30]:
likely_tags = dict((word, cfd[word].max()) for word in most_freq_words)
likely_tags

{"''": "''",
 ',': ',',
 '.': '.',
 'Allen': 'NP',
 'Atlanta': 'NP',
 "Atlanta's": 'NP$',
 'City': 'NN-TL',
 'Committee': 'NN-TL',
 'County': 'NN-TL',
 'Court': 'NN-TL',
 'Durwood': 'NP',
 'Executive': 'JJ-TL',
 'Friday': 'NR',
 'Fulton': 'NP-TL',
 "Georgia's": 'NP$',
 'Grand': 'JJ-TL',
 'It': 'PPS',
 'Ivan': 'NP',
 'Jr.': 'NP',
 'Judge': 'NN-TL',
 'Jury': 'NN-TL',
 'Mayor-nominate': 'NN-TL',
 'Only': 'RB',
 'Pye': 'NP',
 'September-October': 'NP',
 'Superior': 'JJ-TL',
 'The': 'AT',
 '``': '``',
 'a': 'AT',
 'act': 'NN',
 'ambiguous': 'JJ',
 'an': 'AT',
 'and': 'CC',
 'any': 'DTI',
 'are': 'BER',
 'been': 'BEN',
 'by': 'IN',
 'charge': 'NN',
 'charged': 'VBN',
 'city': 'NN',
 'conducted': 'VBN',
 'considering': 'VBG',
 'deserves': 'VBZ',
 'did': 'DOD',
 'election': 'NN',
 'evidence': 'NN',
 'find': 'VB',
 'for': 'IN',
 'further': 'RBR',
 'had': 'HVD',
 'handful': 'NN',
 'hard-fought': 'JJ',
 'have': 'HV',
 'in': 'IN',
 'inadequate': 'JJ',
 'interest': 'NN',
 'investigate': 'VB',
 'inv

In [31]:
lookup_tagger = nltk.UnigramTagger(model=likely_tags)
lookup_tagger.tag(brown_sents[5])

[('It', 'PPS'),
 ('recommended', 'VBN'),
 ('that', 'CS'),
 ('Fulton', 'NP-TL'),
 ('legislators', 'NNS'),
 ('act', 'NN'),
 ('``', '``'),
 ('to', 'TO'),
 ('have', 'HV'),
 ('these', None),
 ('laws', 'NNS'),
 ('studied', None),
 ('and', 'CC'),
 ('revised', None),
 ('to', 'TO'),
 ('the', 'AT'),
 ('end', None),
 ('of', 'IN'),
 ('modernizing', None),
 ('and', 'CC'),
 ('improving', None),
 ('them', None),
 ("''", "''"),
 ('.', '.')]

In [33]:
lookup_tagger = nltk.UnigramTagger(model=likely_tags, backoff=re_tagger_backoff)
lookup_tagger.tag(brown_sents[5])

[('It', 'PPS'),
 ('recommended', 'VBN'),
 ('that', 'CS'),
 ('Fulton', 'NP-TL'),
 ('legislators', 'NNS'),
 ('act', 'NN'),
 ('``', '``'),
 ('to', 'TO'),
 ('have', 'HV'),
 ('these', 'NN'),
 ('laws', 'NNS'),
 ('studied', 'VBD'),
 ('and', 'CC'),
 ('revised', 'VBD'),
 ('to', 'TO'),
 ('the', 'AT'),
 ('end', 'NN'),
 ('of', 'IN'),
 ('modernizing', 'VBG'),
 ('and', 'CC'),
 ('improving', 'VBG'),
 ('them', 'NN'),
 ("''", "''"),
 ('.', '.')]

In [35]:
size = int(len(brown_news_tagged) * 0.9)
train = brown_news_tagged[:size]
train

[[('The', 'AT'), ('Fulton', 'NP-TL'), ('County', 'NN-TL'), ('Grand', 'JJ-TL'), ('Jury', 'NN-TL'), ('said', 'VBD'), ('Friday', 'NR'), ('an', 'AT'), ('investigation', 'NN'), ('of', 'IN'), ("Atlanta's", 'NP$'), ('recent', 'JJ'), ('primary', 'NN'), ('election', 'NN'), ('produced', 'VBD'), ('``', '``'), ('no', 'AT'), ('evidence', 'NN'), ("''", "''"), ('that', 'CS'), ('any', 'DTI'), ('irregularities', 'NNS'), ('took', 'VBD'), ('place', 'NN'), ('.', '.')], [('The', 'AT'), ('jury', 'NN'), ('further', 'RBR'), ('said', 'VBD'), ('in', 'IN'), ('term-end', 'NN'), ('presentments', 'NNS'), ('that', 'CS'), ('the', 'AT'), ('City', 'NN-TL'), ('Executive', 'JJ-TL'), ('Committee', 'NN-TL'), (',', ','), ('which', 'WDT'), ('had', 'HVD'), ('over-all', 'JJ'), ('charge', 'NN'), ('of', 'IN'), ('the', 'AT'), ('election', 'NN'), (',', ','), ('``', '``'), ('deserves', 'VBZ'), ('the', 'AT'), ('praise', 'NN'), ('and', 'CC'), ('thanks', 'NNS'), ('of', 'IN'), ('the', 'AT'), ('City', 'NN-TL'), ('of', 'IN-TL'), ('Atlant

In [36]:
test = brown_tagged_sents[size:]
test

[[('But', 'CC'), ('in', 'IN'), ('all', 'ABN'), ('its', 'PP$'), ('175', 'CD'), ('years', 'NNS'), (',', ','), ('not', '*'), ('a', 'AT'), ('single', 'AP'), ('Negro', 'NP'), ('student', 'NN'), ('has', 'HVZ'), ('entered', 'VBN'), ('its', 'PP$'), ('classrooms', 'NNS'), ('.', '.')], [('Last', 'AP'), ('week', 'NN'), ('Federal', 'JJ-TL'), ('District', 'NN-TL'), ('Judge', 'NN-TL'), ('William', 'NP'), ('A.', 'NP'), ('Bootle', 'NP'), ('ordered', 'VBD'), ('the', 'AT'), ('university', 'NN'), ('to', 'TO'), ('admit', 'VB'), ('immediately', 'RB'), ('a', 'AT'), ('``', '``'), ('qualified', 'VBN'), ("''", "''"), ('Negro', 'NP'), ('boy', 'NN'), ('and', 'CC'), ('girl', 'NN'), ('.', '.')], ...]

In [37]:
tagger = nltk.UnigramTagger(train, backoff=re_tagger_backoff)
tagger.evaluate(test)

0.8445133939370731