In [1]:
import nltk
from nltk.corpus import brown

# Tally the trailing 1-, 2- and 3-character slices of every lowercased Brown token.
# A token shorter than the slice length yields the whole token, so very short
# tokens (e.g. single punctuation marks) increment the same key more than once.
suffix_fdist = nltk.FreqDist()
for token in brown.words():
    lowered = token.lower()
    for length in (1, 2, 3):
        suffix_fdist[lowered[-length:]] += 1

# Keep just the suffix strings of the 100 most frequent entries.
common_suffixes = [entry[0] for entry in suffix_fdist.most_common(100)]
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [2]:
def pos_features(word, suffixes=None):
    """Build suffix-indicator features for ``word``.

    Args:
        word: Token to describe; it is lowercased before matching.
        suffixes: Iterable of suffix strings to test. When omitted, the
            notebook-level ``common_suffixes`` list is used (looked up at
            call time, so re-running the cell that defines it takes effect).

    Returns:
        dict mapping ``'endswith(<suffix>)'`` to ``True``/``False``, with one
        entry per distinct suffix.
    """
    if suffixes is None:
        suffixes = common_suffixes
    # Lowercase once up front instead of once per suffix.
    lowered = word.lower()
    return {'endswith({})'.format(suffix): lowered.endswith(suffix)
            for suffix in suffixes}

In [4]:
# Inspect the feature dict produced for a single example word.
pos_features("the")

{"endswith('')": False,
 "endswith(')": False,
 "endswith('s)": False,
 'endswith(()': False,
 'endswith())': False,
 'endswith(,)': False,
 'endswith(--)': False,
 'endswith(.)': False,
 'endswith(:)': False,
 'endswith(;)': False,
 'endswith(?)': False,
 'endswith(`)': False,
 'endswith(``)': False,
 'endswith(a)': False,
 'endswith(ad)': False,
 'endswith(al)': False,
 'endswith(an)': False,
 'endswith(and)': False,
 'endswith(are)': False,
 'endswith(as)': False,
 'endswith(at)': False,
 'endswith(ay)': False,
 'endswith(be)': False,
 'endswith(by)': False,
 'endswith(c)': False,
 'endswith(ce)': False,
 'endswith(ch)': False,
 'endswith(d)': False,
 'endswith(e)': True,
 'endswith(ed)': False,
 'endswith(en)': False,
 'endswith(ent)': False,
 'endswith(er)': False,
 'endswith(ere)': False,
 'endswith(ers)': False,
 'endswith(es)': False,
 'endswith(ey)': False,
 'endswith(f)': False,
 'endswith(for)': False,
 'endswith(g)': False,
 'endswith(h)': False,
 'endswith(had)': False,
 '

In [5]:
# Load the tagged tokens from the Brown corpus, restricted to the 'news' category.
tagged_words = brown.tagged_words(categories='news')

In [7]:
# Display a preview of the (word, tag) pairs.
tagged_words

[('The', 'AT'), ('Fulton', 'NP-TL'), ...]

In [8]:
# Pair each token's suffix features with its tag: one (features, tag) tuple per tagged token.
featuresets = [(pos_features(token), tag) for token, tag in tagged_words]

In [9]:
# 10% of the number of feature sets, truncated to an integer.
size = int(len(featuresets) * 0.1)

In [11]:
# Show the total number of feature sets and the computed `size`, one per line.
print(len(featuresets), size, sep='\n')

100554
10055


In [12]:
# The first `size` feature sets are held out for testing; the rest are used for training.
test_set = featuresets[:size]
train_set = featuresets[size:]

# Fit a decision tree on the training portion.
classifier = nltk.DecisionTreeClassifier.train(train_set)

In [13]:
# Classify a single example word using its suffix features.
print(classifier.classify(pos_features('cats')))

NNS


In [14]:
# Report the classifier's accuracy on test_set.
print(nltk.classify.accuracy(classifier, test_set))

0.6270512182993535


In [15]:
# Print the classifier's decision rules as pseudocode, limited to depth 4.
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'

