In [1]:
import nltk
from nltk.corpus import brown

train = brown.tagged_sents(categories = 'news')
validation = brown.tagged_sents(categories = 'editorial')
test = brown.tagged_sents(categories = 'reviews')

In [2]:
patterns = [('.*ing$', 'VBG'), 
           ('.*ed$', 'VBD'), 
           ('.*es$', 'VBZ'), 
           ('.*ould$', 'MD'), 
           ('.*\'s$', 'NN$'), 
           ('.*s$', 'NNS'), 
           ('^-?[0-9]+(.[0-9]+)?$', 'CD'), 
           ('.*', 'NN')]

In [3]:
t0 = nltk.RegexpTagger(patterns)
t1 = nltk.UnigramTagger(train, backoff = t0)
t2 = nltk.BigramTagger(train, backoff = t1)
t3 = nltk.TrigramTagger(train, backoff = t2)

In [4]:
t3.accuracy(test)

0.8320066823899371

### observe errors 

In [5]:
valid_words = nltk.corpus.brown.words(categories = 'editorial')

In [6]:
guess_tags = t3.tag(valid_words)
data_tags = [(_, tag) for sent in validation for (_, tag) in sent]

In [7]:
valid_zip = list(zip(guess_tags, data_tags))

In [8]:
total_errors = []
for i, j in valid_zip:
    if i != j:
        total_errors.append((i[1], j[1]))

In [9]:
total_errors

[('NN-TL', 'NN-HL'),
 ('NN', 'NN-HL'),
 ('VBD', 'VBD-HL'),
 ('AP', 'AP-HL'),
 ('JJ', 'NN-HL'),
 ('NNS', 'VBZ'),
 ('VBD', 'VBN'),
 ('JJ', 'NN'),
 ('VBN', 'VBD'),
 ('IN', 'TO'),
 ('NN', 'VB'),
 ('VBD', 'VBN'),
 ('NNS', 'JJ'),
 ('VBD', 'VBN'),
 ('NN-HL', 'NN'),
 ('VBD', 'VBN'),
 ('NN', 'VB'),
 ('VBG', 'NN'),
 ('NN', 'JJ'),
 ('VBG', 'NN'),
 ('VBN', 'VBD'),
 ('QL', 'AP'),
 ('NN', 'RB'),
 ('NN', 'AP'),
 ('JJ-TL', 'NR-TL'),
 ('NN', 'VB'),
 ('NN', 'VB'),
 ('TO', 'IN'),
 ('QL', 'JJ'),
 ('NN-TL', 'NN-HL'),
 ('RB', 'RB-HL'),
 ('VBZ', 'VBZ-HL'),
 ('IN', 'IN-HL'),
 ('AT', 'AT-HL'),
 ('NN', 'NN-HL'),
 ('IN', 'IN-HL'),
 ('NN', 'NN-HL'),
 ('NNS', 'NNS-TL'),
 ('PPO', 'PPS'),
 ('NN', 'RB'),
 ('JJ', 'VBG'),
 ('RP', 'IN'),
 ('TO', 'IN'),
 ('NN$', 'NP$'),
 ('CD', 'PN'),
 ('NN', 'VB'),
 ('RB', 'CS'),
 ('VBN', 'VB'),
 ('NN', 'JJ'),
 ('JJ', 'NN'),
 ('VBN', 'VB'),
 ('VBZ', 'NNS'),
 ('NN', 'NNS'),
 ('VBG', 'JJ'),
 ('NN', 'JJ'),
 ('TO', 'IN'),
 ('NN', 'VB'),
 ('TO', 'IN'),
 ('JJ', 'NN'),
 ('RBR', 'JJR'),
 ('NN',

In [10]:
len(total_errors)

8414

In [11]:
nltk.FreqDist(total_errors).most_common()

[(('NN', 'JJ'), 782),
 (('VBD', 'VBN'), 589),
 (('NN', 'VB'), 563),
 (('NN', 'NP'), 550),
 (('TO', 'IN'), 374),
 (('VBZ', 'NNS'), 268),
 (('NN', 'RB'), 238),
 (('NN', 'NN-TL'), 222),
 (('IN', 'TO'), 218),
 (('NN', 'NN-HL'), 175),
 (('NNS', 'VBZ'), 150),
 (('VBN', 'VBD'), 137),
 (('NP', 'NP-TL'), 125),
 (('VB', 'NN'), 122),
 (('VBG', 'NN'), 113),
 (('NN$', 'NP$'), 98),
 (('IN', 'CS'), 97),
 (('JJ', 'NN'), 96),
 (('VBD', 'JJ'), 96),
 (('AT', 'AT-HL'), 94),
 (('IN', 'IN-HL'), 76),
 (('AP', 'QL'), 76),
 (('RB', 'QL'), 73),
 (('NNS', 'NN'), 70),
 (('IN', 'RP'), 66),
 (('NP-TL', 'NP'), 65),
 (('NN', 'NP-TL'), 62),
 (('NNS', 'JJ'), 59),
 (('QL', 'RB'), 59),
 (('CS', 'QL'), 54),
 (('VBG', 'JJ'), 51),
 (('CS', 'DT'), 50),
 (('RP', 'IN'), 48),
 (('CS', 'WPS'), 47),
 (('NP', 'NP-HL'), 45),
 (('NNS', 'NNS-TL'), 43),
 (('NNS', 'NPS'), 42),
 (('NNS', 'NNS-HL'), 40),
 (('NN', 'VBG'), 39),
 (('IN', 'IN-TL'), 39),
 (('NN', 'NNS'), 38),
 ((':', ':-HL'), 36),
 (('NN', 'JJ-TL'), 34),
 (('JJ-TL', 'JJ'), 33

### filter errors made by nltk.RegexpTagger

In [12]:
valid_words = nltk.corpus.brown.words(categories = 'editorial')

In [13]:
t1_test = nltk.UnigramTagger(train)
t2_test = nltk.BigramTagger(train, backoff = t1_test)
t3_test = nltk.TrigramTagger(train, backoff = t2_test)

In [14]:
t3_test.tag(valid_words)

[('Assembly', 'NN-TL'),
 ('session', 'NN'),
 ('brought', 'VBD'),
 ('much', 'AP'),
 ('good', 'JJ'),
 ('The', 'AT'),
 ('General', 'JJ-TL'),
 ('Assembly', 'NN-TL'),
 (',', ','),
 ('which', 'WDT'),
 ('adjourns', None),
 ('today', 'NR'),
 (',', ','),
 ('has', 'HVZ'),
 ('performed', 'VBD'),
 ('in', 'IN'),
 ('an', 'AT'),
 ('atmosphere', 'NN'),
 ('of', 'IN'),
 ('crisis', 'NN'),
 ('and', 'CC'),
 ('struggle', 'NN'),
 ('from', 'IN'),
 ('the', 'AT'),
 ('day', 'NN'),
 ('it', 'PPS'),
 ('convened', None),
 ('.', '.'),
 ('It', 'PPS'),
 ('was', 'BEDZ'),
 ('faced', 'VBN'),
 ('immediately', 'RB'),
 ('with', 'IN'),
 ('a', 'AT'),
 ('showdown', 'NN'),
 ('on', 'IN'),
 ('the', 'AT'),
 ('schools', 'NNS'),
 (',', ','),
 ('an', 'AT'),
 ('issue', 'NN'),
 ('which', 'WDT'),
 ('was', 'BEDZ'),
 ('met', 'VBN'),
 ('squarely', 'RB'),
 ('in', 'IN'),
 ('conjunction', None),
 ('with', 'IN'),
 ('the', 'AT'),
 ('governor', 'NN'),
 ('with', 'IN'),
 ('a', 'AT'),
 ('decision', 'NN'),
 ('not', '*'),
 ('to', 'TO'),
 ('risk', 'VB'

In [15]:
data_tags = [(_, tag) for sent in validation for (_, tag) in sent]
test_tags = t3_test.tag(valid_words) # 얘는 nltk.RegExpTagger 안 거친 아이 

compa_I = list(zip(data_tags, test_tags))

- extract lexicons which value is assigned as None and its true label

In [16]:
Nones = [word for word, _ in t3_test.tag(valid_words) if _ == None]

In [17]:
Nones

['adjourns',
 'convened',
 'conjunction',
 'abandoning',
 'budgeting',
 'Long-range',
 'musts',
 'crisis-to-crisis',
 'insured',
 'shuffle',
 'await',
 'drag',
 'limits',
 'repeal',
 'penal',
 'banning',
 'fireworks',
 'reappraisal',
 'Fortunately',
 'spared',
 'spate',
 'resolutions',
 'empire',
 'congratulate',
 'answers',
 'proudly',
 'windup',
 "Jefferson's",
 'dictum',
 'Keep',
 'inattentive',
 'governors',
 'wolves',
 'Newspapermen',
 'penetrating',
 'cons',
 'wade',
 'regularity',
 'Coosa',
 'Cities',
 'peeled',
 'Coosa',
 'Coupling',
 '$83,750',
 '$30,000',
 'five-member',
 'undertaking',
 'abundant',
 'banding',
 'area-wide',
 'go-it-alone',
 'strengthens',
 'replacing',
 'retaliation',
 'removes',
 'A-bombs',
 'deter',
 'hers',
 'delusion',
 'deterrence',
 'besides',
 'stance',
 'guess',
 'extremity',
 'Soviets',
 'leeway',
 'low-grade',
 'brushfire',
 'aggressions',
 'gearing',
 'junks',
 'bluff',
 'brinkmanship',
 'builds',
 'DeKalb',
 'beam',
 "DeKalb's",
 'minimum',
 'bee

In [18]:
data_tags_ofNone = [i for i, j in compa_I if i != j and j[1] == None] #validation의 정답들

In [19]:
data_tags_ofNone

[('adjourns', 'VBZ'),
 ('convened', 'VBD'),
 ('conjunction', 'NN'),
 ('abandoning', 'VBG'),
 ('budgeting', 'VBG'),
 ('Long-range', 'NN'),
 ('musts', 'NNS'),
 ('crisis-to-crisis', 'JJ'),
 ('insured', 'VBN'),
 ('shuffle', 'NN'),
 ('await', 'VB'),
 ('drag', 'NN'),
 ('limits', 'NNS'),
 ('repeal', 'NN'),
 ('penal', 'JJ'),
 ('banning', 'NN'),
 ('fireworks', 'NNS'),
 ('reappraisal', 'NN'),
 ('Fortunately', 'RB'),
 ('spared', 'VBD'),
 ('spate', 'NN'),
 ('resolutions', 'NNS'),
 ('empire', 'NN'),
 ('congratulate', 'VB'),
 ('answers', 'NNS'),
 ('proudly', 'RB'),
 ('windup', 'NN'),
 ("Jefferson's", 'NP$'),
 ('dictum', 'NN'),
 ('Keep', 'VB'),
 ('inattentive', 'JJ'),
 ('governors', 'NNS'),
 ('wolves', 'NNS'),
 ('Newspapermen', 'NNS'),
 ('penetrating', 'JJ'),
 ('cons', 'NNS'),
 ('wade', 'VB'),
 ('regularity', 'NN'),
 ('Coosa', 'NP-TL'),
 ('Cities', 'NNS'),
 ('peeled', 'VBN'),
 ('Coosa', 'NP-TL'),
 ('Coupling', 'VBG'),
 ('$83,750', 'NNS'),
 ('$30,000', 'NNS'),
 ('five-member', 'JJ'),
 ('undertaking', 

- tag results of lexicons that has value None 

In [20]:
Reg1tags_ofNone = t0.tag(Nones)

In [21]:
Reg1tags_ofNone

[('adjourns', 'NNS'),
 ('convened', 'VBD'),
 ('conjunction', 'NN'),
 ('abandoning', 'VBG'),
 ('budgeting', 'VBG'),
 ('Long-range', 'NN'),
 ('musts', 'NNS'),
 ('crisis-to-crisis', 'NNS'),
 ('insured', 'VBD'),
 ('shuffle', 'NN'),
 ('await', 'NN'),
 ('drag', 'NN'),
 ('limits', 'NNS'),
 ('repeal', 'NN'),
 ('penal', 'NN'),
 ('banning', 'VBG'),
 ('fireworks', 'NNS'),
 ('reappraisal', 'NN'),
 ('Fortunately', 'NN'),
 ('spared', 'VBD'),
 ('spate', 'NN'),
 ('resolutions', 'NNS'),
 ('empire', 'NN'),
 ('congratulate', 'NN'),
 ('answers', 'NNS'),
 ('proudly', 'NN'),
 ('windup', 'NN'),
 ("Jefferson's", 'NN$'),
 ('dictum', 'NN'),
 ('Keep', 'NN'),
 ('inattentive', 'NN'),
 ('governors', 'NNS'),
 ('wolves', 'VBZ'),
 ('Newspapermen', 'NN'),
 ('penetrating', 'VBG'),
 ('cons', 'NNS'),
 ('wade', 'NN'),
 ('regularity', 'NN'),
 ('Coosa', 'NN'),
 ('Cities', 'VBZ'),
 ('peeled', 'VBD'),
 ('Coosa', 'NN'),
 ('Coupling', 'VBG'),
 ('$83,750', 'NN'),
 ('$30,000', 'NN'),
 ('five-member', 'NN'),
 ('undertaking', 'VBG')

In [22]:
compa_II = list(zip(Reg1tags_ofNone, data_tags_ofNone))

- errors made by t0

In [23]:
errors = [((guess[1], answer[1]), guess[0]) for guess, answer in compa_II if guess != answer]

In [24]:
errors

[(('NNS', 'VBZ'), 'adjourns'),
 (('NNS', 'JJ'), 'crisis-to-crisis'),
 (('VBD', 'VBN'), 'insured'),
 (('NN', 'VB'), 'await'),
 (('NN', 'JJ'), 'penal'),
 (('VBG', 'NN'), 'banning'),
 (('NN', 'RB'), 'Fortunately'),
 (('NN', 'VB'), 'congratulate'),
 (('NN', 'RB'), 'proudly'),
 (('NN$', 'NP$'), "Jefferson's"),
 (('NN', 'VB'), 'Keep'),
 (('NN', 'JJ'), 'inattentive'),
 (('VBZ', 'NNS'), 'wolves'),
 (('NN', 'NNS'), 'Newspapermen'),
 (('VBG', 'JJ'), 'penetrating'),
 (('NN', 'VB'), 'wade'),
 (('NN', 'NP-TL'), 'Coosa'),
 (('VBZ', 'NNS'), 'Cities'),
 (('VBD', 'VBN'), 'peeled'),
 (('NN', 'NP-TL'), 'Coosa'),
 (('NN', 'NNS'), '$83,750'),
 (('NN', 'NNS'), '$30,000'),
 (('NN', 'JJ'), 'five-member'),
 (('VBG', 'NN'), 'undertaking'),
 (('NN', 'JJ'), 'abundant'),
 (('NN', 'JJ'), 'area-wide'),
 (('NN', 'JJ'), 'go-it-alone'),
 (('NNS', 'VBZ-HL'), 'strengthens'),
 (('NNS', 'NN'), 'A-bombs'),
 (('NN', 'VB'), 'deter'),
 (('NNS', 'PP$$'), 'hers'),
 (('VBZ', 'IN'), 'besides'),
 (('NNS', 'VB'), 'guess'),
 (('NNS',

In [25]:
len(errors)

3829

In [26]:
len(errors) / len(total_errors)

0.4550748752079867

- this amount of error can be solved by enchancing t0


### reviews on steps 
- have found data_tags_ofNone by comparing data_tags and test1_tags(applied t3, t2, t1)
- extract errors made by RegexpTagger by comparison data_tags_ofNone and Reg1tags_ofNone


- let's edit Regexp based on the high frequency guess and answer pair 
- look for characteristics of errors based on its types


In [27]:
fd_error_tags = nltk.FreqDist([i[0] for i in errors])

In [28]:
fd_error_tags.most_common()

[(('NN', 'JJ'), 695),
 (('NN', 'NP'), 549),
 (('NN', 'VB'), 306),
 (('VBD', 'VBN'), 283),
 (('VBZ', 'NNS'), 238),
 (('NN', 'RB'), 234),
 (('NN', 'NN-TL'), 211),
 (('NN$', 'NP$'), 98),
 (('VBD', 'JJ'), 92),
 (('NNS', 'VBZ'), 81),
 (('NNS', 'NN'), 66),
 (('NNS', 'JJ'), 59),
 (('NN', 'NP-TL'), 59),
 (('VBG', 'NN'), 51),
 (('VBG', 'JJ'), 50),
 (('NN', 'NN-HL'), 50),
 (('NNS', 'NPS'), 42),
 (('NNS', 'NNS-TL'), 37),
 (('NN', 'NNS'), 35),
 (('NN', 'JJ-TL'), 32),
 (('NN', 'NP-HL'), 29),
 (('NN', 'QL'), 28),
 (('NN', 'VBN'), 24),
 (('NNS', 'NP'), 19),
 (('NN', 'CD'), 18),
 (('NN', 'JJR'), 17),
 (('NNS', 'NNS-HL'), 17),
 (('NN', 'JJT'), 17),
 (('NN$', 'NN$-TL'), 16),
 (('NN', 'JJ-HL'), 16),
 (('VBZ', 'DOZ'), 15),
 (('NN', 'NNS$'), 14),
 (('VBZ', 'NNS-TL'), 14),
 (('VBG', 'IN'), 13),
 (('NN', 'IN'), 12),
 (('NN', 'OD-TL'), 12),
 (('VBG', 'VBG-HL'), 10),
 (('NNS', 'VB'), 9),
 (('NN', 'FW-NN'), 9),
 (('VBG', 'NP'), 8),
 (('NN', 'NP$'), 7),
 (('VBG', 'VBG-TL'), 7),
 (('NN', 'VB-TL'), 7),
 (('NN', 'C

- tags that was not listed in the original tags are observed. Such as JJ, NP, VB, NNS, RB, NN-TL and stuff
- NN shows the highest error rate which was the default tag of t0 


> observe lexicons with the following order of (error label, true label) list
('NN', 'JJ'), ('NN', 'NP'), ('NN', 'VB'), ('NN', 'RB'), ('NN', 'NN-TL'), ('NN$', 'NP$')

In [29]:
idx_errors = nltk.Index(errors)

In [30]:
idx_errors

Index(list,
      {('NNS', 'VBZ'): ['adjourns',
        'junks',
        'builds',
        'sounds',
        'tends',
        'exists',
        'tends',
        'awaits',
        'hangs',
        'awaits',
        'veers',
        'draws',
        'tends',
        'warns',
        'confirms',
        'sits',
        'reflects',
        'balks',
        'rests',
        'fears',
        'blankets',
        'labels',
        'bears',
        'suffers',
        'answers',
        'tends',
        'treats',
        'treats',
        'stiffens',
        'straightens',
        'suggests',
        'predicts',
        'considers',
        'fills',
        'tends',
        'awakens',
        'reflects',
        'suffers',
        'clings',
        'wanders',
        'hails',
        'wears',
        'puffs',
        'sits',
        'sits',
        'remembers',
        'squirms',
        'compounds',
        'sounds',
        'fosters',
        'suggests',
        'fears',
        'equals',
    

In [31]:
idx_errors[('NN', 'JJ')]

['penal',
 'inattentive',
 'five-member',
 'abundant',
 'area-wide',
 'go-it-alone',
 'minimum',
 'beefed-up',
 'half-million',
 'intensive',
 'helpful',
 'repulsive',
 'blood-thirsty',
 'Venezuelan',
 'Dominican',
 'Venezuelan',
 'benevolent',
 'oppressive',
 'tacit',
 'Dominican',
 'Communist-type',
 'Dominican',
 'provocative',
 'quadripartite',
 'stray',
 'step-by-step',
 'dusty-green',
 'abundant',
 'moth-eaten',
 'unbroken',
 'ecological',
 'customary',
 'anti-liquor',
 'lax',
 'awkward',
 'helicopter-borne',
 'Australian',
 'callable',
 'unprovocative',
 'road-shy',
 "guerrilla-th'-wisp",
 'ironic',
 'unreasonable',
 'pro-neutralist',
 'guaranteed-neutral',
 'coup-proof',
 'militant',
 'sturdy',
 'forceful',
 'flamboyant',
 'oratorical',
 'partisan',
 'dissident',
 'contrary',
 'incapable',
 'weak',
 'inconsistent',
 'Additional',
 'applicable',
 'peaceful',
 'backward',
 'humble',
 'unwelcome',
 'symptomatic',
 '100-ton',
 'regulatory',
 'tragic',
 'bloody',
 'overt',
 'Katanga

- l, y, e, t are the common last words

In [32]:
nltk.FreqDist(word[-1] for word in idx_errors[('NN', 'JJ')])

FreqDist({'e': 179, 'l': 119, 't': 85, 'n': 80, 'y': 78, 'c': 69, 'r': 26, 'd': 20, 'h': 11, 'p': 7, ...})

In [33]:
nltk.FreqDist(word[-1] for word in idx_errors[('NN', 'JJ')]).most_common()

[('e', 179),
 ('l', 119),
 ('t', 85),
 ('n', 80),
 ('y', 78),
 ('c', 69),
 ('r', 26),
 ('d', 20),
 ('h', 11),
 ('p', 7),
 ('m', 4),
 ('f', 4),
 ('o', 4),
 ('k', 3),
 ('x', 2),
 ('b', 1),
 ('i', 1),
 ('u', 1),
 ('w', 1)]

- frequencly ends with e, l, t, n, y

In [34]:
import re
len([word for word in idx_errors[('NN', 'JJ')] if re.search('.*[eltny]$', word)])

541

In [35]:
len([word for word in idx_errors[('NN', 'JJ')] if re.search('.*[eltny]$', word)]) / len(idx_errors[('NN', 'JJ')])

0.7784172661870503

In [36]:
len([word for word in idx_errors[('NN', 'JJ')] if re.search('-', word)])

131

In [37]:
len([word for word in idx_errors[('NN', 'JJ')] if re.search('-', word)]) / len(idx_errors[('NN', 'JJ')])

0.1884892086330935

> words end with e, l, t, n, y and the ones where - is at word-medial position > tags as JJ

In [38]:
idx_errors[('NN', 'NP')]

['Emmerich',
 'Mennen',
 'Zanzibar',
 'Soapy',
 'Rafael',
 'Trujillo',
 'Ciudad',
 'Trujillo',
 'Caribbean',
 'Trujillo',
 'Maria',
 'Galindez',
 'Romulo',
 'Betancourt',
 'Venezuela',
 'Perez',
 'Jimenez',
 'Trujillo',
 'Trujillo',
 'Trujillo',
 'Caribbean',
 'Trujillo',
 'Trujillo',
 'Brookmont',
 'Darwin',
 'Stolzenbach',
 'Ulbricht',
 'Berlin-West',
 'Adenauer',
 'Ulbricht',
 'Gerhard',
 'Eisler',
 'Danzig',
 'Oopsie-Cola',
 'Bangkok',
 'Thai',
 'Minh',
 'Calvin',
 'Algerian',
 'Colombian',
 'Che',
 'Guevara',
 'Montevideo',
 'Venezuela',
 'Dag',
 'Ndola',
 'Goodbody',
 'Metro',
 'Interama',
 'Interama',
 'Interama',
 'Interama',
 'Colee',
 'Singer',
 'Bundestag',
 'Erhart',
 'Erhart',
 'Adenauer',
 'Erhart',
 'Konrad',
 'Adenauer',
 'Bismarck',
 'Mosk',
 'Islandia',
 'Key',
 'Balaguer',
 'Trujillo',
 'Ciudad',
 'Trujillo',
 'Trujillo',
 'Balaguer',
 'Ciudad',
 'Juan',
 'Bosch',
 'Reuther',
 'Sihanouk',
 'Udall',
 'Provincetown',
 'Chatham',
 'Schwada',
 'Carbondale',
 'Herrin-Murp

- 딱 봐도 맨 앞 글자가 대문자로 시작하는 게 많다. 

In [39]:
len([word for word in idx_errors[('NN', 'NP')] if re.search('^[A-Z]', word)])

549

In [40]:
len([word for word in idx_errors[('NN', 'NP')] if re.search('^[A-Z]', word)]) / len(idx_errors[('NN', 'NP')])

1.0

> 'NN', 'NP'의 모든 오류 어휘들이 대문자로 시작한다 
>> ('NN', 'NP') 오류는 맨 앞글자가 대문자라는 조건으로 바로 잡기로 한다. 이 조건은 적절하게 앞에 배치해야 한다. 

In [41]:
idx_errors[('NN', 'VB')]

['await',
 'congratulate',
 'Keep',
 'wade',
 'deter',
 'deserve',
 'deserve',
 'breathe',
 'constitute',
 'acquiesce',
 'usurp',
 'mitigate',
 'roam',
 'clinch',
 'suffer',
 'bog',
 'shore',
 'seal',
 'classify',
 'subside',
 'Sound',
 'recall',
 'bury',
 'wake',
 'invade',
 'subjugate',
 'urge',
 'propel',
 'drink',
 'ignite',
 'validate',
 'ascend',
 'assail',
 'rewrite',
 'purport',
 'capitalize',
 'devise',
 'hide',
 'plug',
 'submit',
 'eradicate',
 'over-produce',
 'revitalize',
 'restore',
 'nest',
 'dovetail',
 'accommodate',
 'shy',
 'disturb',
 'repeat',
 'bolster',
 'urge',
 'recede',
 'succumb',
 'grind',
 'stipulate',
 'vest',
 'reorganize',
 'contribute',
 'avert',
 'deserve',
 'sprout',
 'bud',
 'blossom',
 'harbor',
 'Forgive',
 'Forgive',
 'sin',
 'sin',
 'beg',
 'frighten',
 'bore',
 'Bore',
 'visualize',
 'bore',
 'reassure',
 'Imagine',
 'covet',
 'conform',
 'bruise',
 'bestow',
 'sweep',
 'wrap',
 'resume',
 'suffer',
 'accelerate',
 'decelerate',
 'jam',
 'plug'

- often ends with e

In [42]:
nltk.FreqDist(word[-1] for word in idx_errors[('NN', 'VB')])

FreqDist({'e': 146, 't': 43, 'r': 23, 'n': 14, 'd': 13, 'y': 12, 'h': 11, 'l': 11, 'p': 10, 'k': 7, ...})

- need to compare the ratio of which particular alphabet appears as the last character with other POS type

In [43]:
len([word for word in idx_errors[('NN', 'VB')] if word.endswith('e')]) / len(idx_errors[('NN', 'VB')])

0.477124183006536

In [44]:
len([word for word in idx_errors[('NN', 'JJ')] if word.endswith('e')]) / len(idx_errors[('NN', 'JJ')])

0.25755395683453236

- the ratio by which word ends with e is much higher in 'NN', 'VB' pair

In [45]:
len([word for word in idx_errors[('NN', 'VB')] if word.endswith('t')]) / len(idx_errors[('NN', 'VB')])

0.14052287581699346

In [46]:
len([word for word in idx_errors[('NN', 'JJ')] if word.endswith('t')]) / len(idx_errors[('NN', 'JJ')])

0.1223021582733813

- t is similar. VB is somewhat higher 

In [47]:
len([word for word in idx_errors[('NN', 'VB')] if word.endswith('n')]) / len(idx_errors[('NN', 'VB')])

0.0457516339869281

In [48]:
len([word for word in idx_errors[('NN', 'JJ')] if word.endswith('n')]) / len(idx_errors[('NN', 'JJ')])

0.11510791366906475

> for now lets tag words end with e or t as VB

In [49]:
idx_errors[('NN', 'RB')]

['Fortunately',
 'proudly',
 'professedly',
 'Unquestionably',
 'sycophantically',
 'altogether',
 'Tardily',
 'Thereupon',
 'frankly',
 'respectively',
 'illegally',
 'westward',
 'Simply',
 'unfairly',
 'mercilessly',
 'softly',
 'softly',
 'Fortunately',
 'tacitly',
 'Twice',
 'sensibly',
 'patiently',
 'Simply',
 'demonstrably',
 'fantastically',
 'Otherwise',
 'blindly',
 'necessarily',
 'experimentally',
 'precariously',
 'desperately',
 'earnestly',
 'courageously',
 'approvingly',
 'Inevitably',
 'politically',
 'undeniably',
 'offshore',
 'collectively',
 'continuously',
 'speedily',
 'Outwardly',
 'Recently',
 'Well',
 'diligently',
 'aloof',
 'Nowhere',
 'blithely',
 'heretofore',
 'warmly',
 'thoughtlessly',
 'oft',
 'searchingly',
 'no-o',
 'ruthlessly',
 'silently',
 'endlessly',
 'vainly',
 'readily',
 'obligingly',
 'pleasantly',
 'harshly',
 'Hardly',
 'gravely',
 'Never',
 'peacefully',
 'incidentally',
 'individually',
 'smoothly',
 'resolutely',
 'alternately',
 'un

- much of the words end with ly

In [50]:
len([word for word in idx_errors[('NN', 'RB')] if word.endswith('ly')]) / len(idx_errors[('NN', 'RB')])

0.7991452991452992

In [51]:
len([word for word in idx_errors[('NN', 'JJ')] if word.endswith('ly')]) / len(idx_errors[('NN', 'JJ')])

0.017266187050359712

In [52]:
len([word for word in idx_errors[('NN', 'NP')] if word.endswith('ly')]) / len(idx_errors[('NN', 'NP')])

0.01092896174863388

- words end with ly tag as RB 

In [53]:
idx_errors[('NN', 'NN-TL')]

['Power',
 'Benefactor',
 'Parkway',
 'Cabin',
 'Echo',
 'Capital',
 'Deal',
 'Deal',
 'Frontier',
 'Shadow',
 'Tract',
 'Tract',
 'Interior',
 'Survey',
 'Side',
 'Colonel',
 'Presidency',
 'Eagle',
 'Diem',
 'Charter',
 'Charter',
 'Charter',
 'Estimate',
 'Estimate',
 'Charter',
 'Borough',
 'Estimate',
 'Borough',
 'Estimate',
 'Hemisphere',
 'Horn',
 'Kingdom',
 'Father',
 'Father',
 'Father',
 'Loon',
 'Dad',
 'Rock',
 'Lanesmanship',
 'Curtain',
 'Gazette',
 'Gazette',
 'Gazette',
 'Prison',
 'Bust',
 'Wagon',
 'Train',
 'Car',
 'Frontier',
 'Version',
 'Version',
 'Version',
 'Divinity',
 'Version',
 'Version',
 'Version',
 'Spirit',
 'Spirit',
 'Sermon',
 'Heaven',
 'Revelation',
 'Prof.',
 'Wisdom',
 'Laboratory',
 'Victor',
 'Grad',
 'Forum',
 'Problem',
 'Knife',
 'Bulletin',
 'Rescue',
 'Presidency',
 'Voice',
 'Voice',
 'Voice',
 'Congresswoman',
 'Declaration',
 'Independence',
 'Charter',
 'Speedup',
 'Parkway',
 'Deep',
 'Peep',
 'Deep',
 'Peep',
 'Legislation',
 'Monu

In [54]:
len(idx_errors[('NN', 'NN-TL')])

211

In [55]:
len(idx_errors[('NN', 'NP')])

549

- just like NP words, many of the words starts with a capital letter 
- however len is much higher because the POS is NP 
- for now, let's just give up NN-TL

In [56]:
idx_errors[('NN$', 'NP$')]

["Jefferson's",
 "DeKalb's",
 "Trujillo's",
 "Hammarskjold's",
 "Congo's",
 "Miami's",
 "Miami's",
 "Florida's",
 "Florida's",
 "Adenauer's",
 "Brandt's",
 "Germany's",
 "Germany's",
 "Einstein's",
 "California's",
 "Trujillo's",
 "Balaguer's",
 "Freeman's",
 "Soviet's",
 "Jackson's",
 "Cambodia's",
 "Sihanouk's",
 "Washington's",
 "January's",
 "Dalton's",
 "Faget's",
 "Batista's",
 "Truman's",
 "Cunard's",
 "Taylor's",
 "Johnson's",
 "Hammarskjold's",
 "Batista's",
 "God's",
 "God's",
 "God's",
 "God's",
 "God's",
 "God's",
 "God's",
 "God's",
 "Podger's",
 "Podger's",
 "Richard's",
 "Robby's",
 "Brown's",
 "California's",
 "Pasadena's",
 "Franklin's",
 "Sing's",
 "Beebe's",
 "Pullman's",
 "Frelinghuysen's",
 "Dwyer's",
 "Edison's",
 "Lincoln's",
 "Sandburg's",
 "Lincoln's",
 "Mommy's",
 "Mundt's",
 "Griffin's",
 "Miriani's",
 "Luther's",
 "Ahmad's",
 "Washington's",
 "Stalin's",
 "Stalin's",
 "Hammarskjold's",
 "Hammarskjold's",
 "Ruark's",
 "Pauling's",
 "McCone's",
 "Browning's",


In [57]:
patterns

[('.*ing$', 'VBG'),
 ('.*ed$', 'VBD'),
 ('.*es$', 'VBZ'),
 ('.*ould$', 'MD'),
 (".*'s$", 'NN$'),
 ('.*s$', 'NNS'),
 ('^-?[0-9]+(.[0-9]+)?$', 'CD'),
 ('.*', 'NN')]

- possessive case with the first letter being capital, need to tag as NP\\$ not NN\\$


> NP\\$ as '[A-Z]^.*\'s$'로

### review on steps  
- edit the pattern by the observed characteristics 

- ('NN', 'JJ')는 (1)word-medial '-' (2) last letter is either l, t, n, y
- ('NN', 'NP')는 (1) first letter is capital
- ('NN', 'VB')는 (1) last letter is e
- ('NN', 'RB')는 (1) word ends with ly
- ('NN$', 'NP$')는 (1) first letter is capital (2) ends with 's

- write regular expression based on the suggested notes 
> 각각'.*-?.+[ltny]$', '^[A-Z].*', '.*[et]$', '.*ly$', '^[A-Z].*\'s$'

In [66]:
editted_patterns = [('^[A-Z].*\'s$', 'NP$'),
                    ('^[A-Z].*', 'NP'),
                    ('.*ing$', 'VBG'), 
                    ('.*ed$', 'VBD'),
                    ('.*es$', 'VBZ'), 
                    ('.*ould$', 'MD'), 
                    ('.*\'s$', 'NN$'), 
                    ('.*s$', 'NNS'), 
                    ('^-?[0-9]+(.[0-9]+)?$', 'CD'), 
                    ('.*[et]$', 'VB'),
                    ('.*ly$', 'RB'),
                    ('.*-?.+[lny]$', 'JJ'),
                    ('.*', 'NN')]

editted_t0 = nltk.RegexpTagger(editted_patterns)
editted_t1 = nltk.UnigramTagger(train, backoff = editted_t0)
editted_t2 = nltk.BigramTagger(train, backoff = editted_t1)
editted_t3 = nltk.TrigramTagger(train, backoff = editted_t2)

editted_t3.accuracy(validation)

0.8672975780793455

In [68]:
editted1_patterns = [('^[A-Z].*\'s$', 'NP$'),
                    ('^[A-Z].*', 'NP'),
                    ('.*ing$', 'VBG'), 
                    ('.*ed$', 'VBD'),
                    ('.*es$', 'VBZ'), 
                    ('.*ould$', 'MD'), 
                    ('.*\'s$', 'NN$'), 
                    ('.*s$', 'NNS'), 
                    ('^-?[0-9]+(.[0-9]+)?$', 'CD'), 
                    ('.*[et]$', 'VB'),
                    ('.*ly$', 'RB'),
                    ('.*-.*[lny]$', 'JJ'),
                    ('.*', 'NN')]

editted1_t0 = nltk.RegexpTagger(editted1_patterns)
editted1_t1 = nltk.UnigramTagger(train, backoff = editted1_t0)
editted1_t2 = nltk.BigramTagger(train, backoff = editted1_t1)
editted1_t3 = nltk.TrigramTagger(train, backoff = editted1_t2)

editted1_t3.accuracy(validation)

0.8720050646061944

In [69]:
editted2_patterns = [('^[A-Z].*\'s$', 'NP$'),
                    ('^[A-Z].*', 'NP'),
                    ('.*ing$', 'VBG'), 
                    ('.*ed$', 'VBD'),
                    ('.*es$', 'VBZ'), 
                    ('.*ould$', 'MD'), 
                    ('.*\'s$', 'NN$'), 
                    ('.*s$', 'NNS'), 
                    ('^-?[0-9]+(.[0-9]+)?$', 'CD'), 
                    ('.*[et]$', 'VB'),                    
                    ('.*ly$', 'RB'),
                    ('.*-.*', 'JJ'),
                    ('.*', 'NN')]

editted2_t0 = nltk.RegexpTagger(editted2_patterns)
editted2_t1 = nltk.UnigramTagger(train, backoff = editted2_t0)
editted2_t2 = nltk.BigramTagger(train, backoff = editted2_t1)
editted2_t3 = nltk.TrigramTagger(train, backoff = editted2_t2)

editted2_t3.accuracy(validation)

0.8722810207129407

In [70]:
editted3_patterns = [('.*-.*', 'JJ'),
                    ('^[A-Z].*\'s$', 'NP$'),
                    ('^[A-Z].*', 'NP'),
                    ('.*ing$', 'VBG'), 
                    ('.*ed$', 'VBD'),
                    ('.*es$', 'VBZ'), 
                    ('.*ould$', 'MD'), 
                    ('.*\'s$', 'NN$'), 
                    ('.*s$', 'NNS'), 
                    ('^-?[0-9]+(.[0-9]+)?$', 'CD'), 
                    ('.*[et]$', 'VB'),                    
                    ('.*ly$', 'RB'),                
                    ('.*', 'NN')]

editted3_t0 = nltk.RegexpTagger(editted3_patterns)
editted3_t1 = nltk.UnigramTagger(train, backoff = editted3_t0)
editted3_t2 = nltk.BigramTagger(train, backoff = editted3_t1)
editted3_t3 = nltk.TrigramTagger(train, backoff = editted3_t2)

editted3_t3.accuracy(validation)

0.8737257320953185

In [72]:
editted4_patterns = [('^[A-Z].*\'s$', 'NP$'),
                    ('^[A-Z].*', 'NP'),
                    ('.*-.*', 'JJ'),
                    ('.*ing$', 'VBG'), 
                    ('.*ed$', 'VBD'),
                    ('.*es$', 'VBZ'), 
                    ('.*ould$', 'MD'), 
                    ('.*\'s$', 'NN$'), 
                    ('.*s$', 'NNS'), 
                    ('^-?[0-9]+(.[0-9]+)?$', 'CD'),  
                    ('.*[et]$', 'VB'),
                    ('.*ly$', 'RB'),
                    ('.*', 'NN')]

editted4_t0 = nltk.RegexpTagger(editted4_patterns)
editted4_t1 = nltk.UnigramTagger(train, backoff = editted4_t0)
editted4_t2 = nltk.BigramTagger(train, backoff = editted4_t1)
editted4_t3 = nltk.TrigramTagger(train, backoff = editted4_t2)

editted4_t3.accuracy(validation)

0.8737906629439647

'VB'에서 e를 빼봤다 

In [73]:
editted5_patterns = [('^[A-Z].*\'s$', 'NP$'),
                    ('^[A-Z].*', 'NP'),
                    ('.*-.*', 'JJ'),
                    ('.*ing$', 'VBG'), 
                    ('.*ed$', 'VBD'),
                    ('.*es$', 'VBZ'), 
                    ('.*ould$', 'MD'), 
                    ('.*\'s$', 'NN$'), 
                    ('.*s$', 'NNS'), 
                    ('^-?[0-9]+(.[0-9]+)?$', 'CD'),  
                    ('.*e$', 'VB'),
                    ('.*ly$', 'RB'),
                    ('.*', 'NN')]

editted5_t0 = nltk.RegexpTagger(editted5_patterns)
editted5_t1 = nltk.UnigramTagger(train, backoff = editted5_t0)d
editted5_t2 = nltk.BigramTagger(train, backoff = editted5_t1)
editted5_t3 = nltk.TrigramTagger(train, backoff = editted5_t2)

editted5_t3.accuracy(validation)

0.8751379780533731

- 마지막 editted5_pattern을 이용한 태거를 선택하기로 한다. 

In [64]:
revised_patterns = editted5_patterns
revised_t3 = editted5_t3

In [65]:
revised_t3.accuracy(test)

0.8576798349056604

In [74]:
patterns

[('.*ing$', 'VBG'),
 ('.*ed$', 'VBD'),
 ('.*es$', 'VBZ'),
 ('.*ould$', 'MD'),
 (".*'s$", 'NN$'),
 ('.*s$', 'NNS'),
 ('^-?[0-9]+(.[0-9]+)?$', 'CD'),
 ('.*', 'NN')]

In [75]:
revised_patterns

[("^[A-Z].*'s$", 'NP$'),
 ('^[A-Z].*', 'NP'),
 ('.*-.*', 'JJ'),
 ('.*ly$', 'RB'),
 ('.*e$', 'VB'),
 ('.*ing$', 'VBG'),
 ('.*ed$', 'VBD'),
 ('.*es$', 'VBZ'),
 ('.*ould$', 'MD'),
 (".*'s$", 'NN$'),
 ('.*s$', 'NNS'),
 ('^-?[0-9]+(.[0-9]+)?$', 'CD'),
 ('.*', 'NN')]

In [76]:
t3.accuracy(test)

0.8320066823899371

In [77]:
revised_t3.accuracy(test)

0.8576798349056604