In [36]:
import nltk
from nltk.tag import brill
from nltk.corpus import brown
from nltk.corpus import treebank
from nltk.tag import untag

Initially we use the following regex patterns to tag the incoming text. This means if the input ends with "ould" I will call it a modal. If the words dont have any of the following patterns, I will call it a noun.
I can later correct it with rules from  Brill's  tagger.
I noticed having a good initial tagger decreases the number of rules and improves accuracy.

In [46]:
word_patterns = [
    (r'^-?[0-9]+(.[0-9]+)?$', 'CD'), #cardinal numbers
    (r'(The|the|A|a|An|an)$', 'AT'),   # articles
    (r'.*s$', 'NNS'),                  # plural nouns
    (r'.*ly$', 'RB'),                  # adverbs
    (r'.*ould$', 'MD'),         #modal
    (r'.*ing$', 'VBG'),          # gerunds
    (r'.*ed$', 'VBD'),           # past tense verbs
    (r'.*ness$', 'NN'),         # nouns formed from adjectives
    (r'.*ment$', 'NN'),
    (r'.*ful$', 'JJ'),
    (r'.*ious$', 'JJ'),
    (r'.*ble$', 'JJ'),
    (r'.*ic$', 'JJ'),
    (r'.*ive$', 'JJ'),
    (r'.*est$', 'JJ'),
    (r'^a$', 'PREP')
]

In [47]:
#creating an initial tagger with the word patterns above
regexp_tagger = nltk.RegexpTagger(word_patterns)

In [64]:
#for starters I used the brown's tagger available with NLTK. the Penn Tree bank data of NLTK is 3000 sentences long 
# and it misses many words
sentences = list(brown.tagged_sents())
print(len(sentences))

57340


In [65]:
#Training set for brill's tagger. The next block shows the structure of the training instance
training_data = sentences[:55000]
gold_data = sentences[55000:57340] 
testing_data = [untag(s) for s in gold_data]

In [66]:
training_data [0]

[('The', 'AT'),
 ('Fulton', 'NP-TL'),
 ('County', 'NN-TL'),
 ('Grand', 'JJ-TL'),
 ('Jury', 'NN-TL'),
 ('said', 'VBD'),
 ('Friday', 'NR'),
 ('an', 'AT'),
 ('investigation', 'NN'),
 ('of', 'IN'),
 ("Atlanta's", 'NP$'),
 ('recent', 'JJ'),
 ('primary', 'NN'),
 ('election', 'NN'),
 ('produced', 'VBD'),
 ('``', '``'),
 ('no', 'AT'),
 ('evidence', 'NN'),
 ("''", "''"),
 ('that', 'CS'),
 ('any', 'DTI'),
 ('irregularities', 'NNS'),
 ('took', 'VBD'),
 ('place', 'NN'),
 ('.', '.')]

In [67]:
#fntbl37 tells the tagger how the rules can be derived.These templates are based on Brill's seminal paper.
# For instance
# for eg. rule template word_1 word_0 => pos , is defining a rule that based on the previous and current word will change the POS feature, 
# while pos_2 pos_1 pos_0 => pos will change the POS of a word based on a POS trigram ending on the current position.

brill.Template._cleartemplates()
templates = brill.fntbl37()
trainer = nltk.tag.brill_trainer.BrillTaggerTrainer(initial_tagger=regexp_tagger,
                                   templates=templates, trace=3,
                                   deterministic=True)

In [None]:
#Basic tagger.. the output below shows some example rules
tagger1 = trainer.train(training_data, max_rules=1000)

TBL train (fast) (seqs: 55000; tokens: 1120386; tpls: 37; min score: 2; min acc: None)
Finding initial useful rules...
    Found 7174445 useful rules.

           B      |
   S   F   r   O  |        Score = Fixed - Broken
   c   i   o   t  |  R     Fixed = num tags changed incorrect -> correct
   o   x   k   h  |  u     Broken = num tags changed correct -> incorrect
   r   e   e   e  |  l     Other = num tags changed incorrect -> incorrect
   e   d   n   r  |  e
------------------+-------------------------------------------------------
130261130261   0588913  | None->NN if Pos:None@[1,2,3]
5458154581   0 162  | NN->, if Word:,@[0]
5099650996   013398  | None->. if Pos:NN@[-3,-2,-1]
3818539060 87522661  | NN->IN if Pos:AT@[1]
2335823358   0 202  | NN->CC if Word:and@[0]
2253422534   0 949  | NN->IN if Word:of@[0]
1407614076   06500  | NN->TO if Word:to@[0]
1138711388   1 454  | NN->IN if Word:in@[0]
98259825   0  35  | NNS->BEZ if Word:is@[0]
92269226   0   9  | NNS->BEDZ if Word:was@[0

In [None]:
tagger1.evaluate(gold_data)

In [None]:
#A slight viariation
tagger2 = trainer.train(training_data, min_acc=0.99)

In [None]:
tagger2.evaluate(gold_data)

In [57]:
Rules = tagger2.rules()


In [59]:
# The rules in this list can be used. The rules must be used in the order shown
Rules[1].format('verbose')

'AT -> DT if the Pos of words i+1...i+3 is "NN"'