# Chapter 6 Learning to Classify Text

## 6.1 Supervised Classification

### Gender Identification

In [1]:
def gender_features(word):
    return {'last_letter': word[-1]}

In [2]:
gender_features('Shrek')

{'last_letter': 'k'}

In [28]:
import nltk
from nltk.corpus import names

In [4]:
import random

In [29]:
names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]

In [30]:
random.shuffle(names)

In [31]:
featuresets = [(gender_features(n), g) for (n,g) in names]

In [32]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [33]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [34]:
classifier.classify(gender_features('Neo'))

'male'

In [35]:
classifier.classify(gender_features('Trinity'))

'female'

In [36]:
print(nltk.classify.accuracy(classifier, test_set))

0.772


In [37]:
classifier.show_most_informative_features(5)

Most Informative Features
             last_letter = 'a'            female : male   =     34.4 : 1.0
             last_letter = 'k'              male : female =     33.6 : 1.0
             last_letter = 'f'              male : female =     17.2 : 1.0
             last_letter = 'p'              male : female =     11.8 : 1.0
             last_letter = 'v'              male : female =     10.5 : 1.0


### Choosing the Right Features

In [38]:
def gender_features2(name):
    features = {}
    features["firstletter"] = name[0].lower()
    features["lastletter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count(%s)" % letter] = name.lower().count(letter)
        features["has(%s)" % letter] = (letter in name.lower())
    return features

In [39]:
print(gender_features2('John'))

{'firstletter': 'j', 'lastletter': 'n', 'count(a)': 0, 'has(a)': False, 'count(b)': 0, 'has(b)': False, 'count(c)': 0, 'has(c)': False, 'count(d)': 0, 'has(d)': False, 'count(e)': 0, 'has(e)': False, 'count(f)': 0, 'has(f)': False, 'count(g)': 0, 'has(g)': False, 'count(h)': 1, 'has(h)': True, 'count(i)': 0, 'has(i)': False, 'count(j)': 1, 'has(j)': True, 'count(k)': 0, 'has(k)': False, 'count(l)': 0, 'has(l)': False, 'count(m)': 0, 'has(m)': False, 'count(n)': 1, 'has(n)': True, 'count(o)': 1, 'has(o)': True, 'count(p)': 0, 'has(p)': False, 'count(q)': 0, 'has(q)': False, 'count(r)': 0, 'has(r)': False, 'count(s)': 0, 'has(s)': False, 'count(t)': 0, 'has(t)': False, 'count(u)': 0, 'has(u)': False, 'count(v)': 0, 'has(v)': False, 'count(w)': 0, 'has(w)': False, 'count(x)': 0, 'has(x)': False, 'count(y)': 0, 'has(y)': False, 'count(z)': 0, 'has(z)': False}


In [40]:
featuresets = [(gender_features2(n), g) for (n,g) in names]

In [41]:
train_set, test_set = featuresets[500:], featuresets[:500]

In [42]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [43]:
print(nltk.classify.accuracy(classifier, test_set))

0.77


In [44]:
train_names = names[1500:]

In [45]:
devtest_names = names[500:1500]

In [46]:
test_names = names[:500]

In [47]:
train_set = [(gender_features(n), g) for (n,g) in train_names]

In [48]:
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]

In [49]:
test_set = [(gender_features(n), g) for (n,g) in test_names]

In [50]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [51]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.776


In [52]:
errors = []

In [53]:
for (name, tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append( (tag, guess, name) )

In [56]:
for i, (tag, guess, name) in enumerate(sorted(errors)):
    print('correct=%-8s guess=%-8s name=%-30s' % (tag, guess, name))
    if i > 20:
        break

correct=female   guess=male     name=Aeriel                        
correct=female   guess=male     name=Agnes                         
correct=female   guess=male     name=Ailyn                         
correct=female   guess=male     name=Aimil                         
correct=female   guess=male     name=Alis                          
correct=female   guess=male     name=Alisun                        
correct=female   guess=male     name=Astrid                        
correct=female   guess=male     name=Avis                          
correct=female   guess=male     name=Bel                           
correct=female   guess=male     name=Bill                          
correct=female   guess=male     name=Bird                          
correct=female   guess=male     name=Birgit                        
correct=female   guess=male     name=Bo                            
correct=female   guess=male     name=Caitrin                       
correct=female   guess=male     name=Candis     

In [57]:
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

In [58]:
train_set = [(gender_features(n), g) for (n,g) in train_names]

In [59]:
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]

In [60]:
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [61]:
print(nltk.classify.accuracy(classifier, devtest_set))

0.794


### Document Classification

In [62]:
from nltk.corpus import movie_reviews

In [63]:
documents = [(list(movie_reviews.words(fileid)), category)
             for category in movie_reviews.categories()
             for fileid in movie_reviews.fileids(category)]

In [64]:
random.shuffle(documents)

In [65]:
all_words = nltk.FreqDist(w.lower() for w in movie_reviews.words())

In [67]:
word_features = list(all_words.keys())[:2000]

In [68]:
def document_features(document):
    document_words = set(document)
    features = {}
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features

In [None]:
print(document_features(movie_reviews.words('pos/cv957_8737.txt')))

In [70]:
featuresets = [(document_features(d), c) for (d,c) in documents]
train_set, test_set = featuresets[100:], featuresets[:100]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [71]:
print(nltk.classify.accuracy(classifier, test_set))

0.76


In [72]:
classifier.show_most_informative_features(5)

Most Informative Features
 contains(unimaginative) = True              neg : pos    =      7.8 : 1.0
        contains(suvari) = True              neg : pos    =      7.1 : 1.0
          contains(mena) = True              neg : pos    =      7.1 : 1.0
    contains(schumacher) = True              neg : pos    =      7.1 : 1.0
     contains(atrocious) = True              neg : pos    =      6.7 : 1.0


### Part-of-Speech Tagging

In [73]:
from nltk.corpus import brown

In [97]:
suffix_fdist = nltk.FreqDist()

In [98]:
for word in brown.words():
    word = word.lower()
    suffix_fdist[word[-1:]] += 1
    suffix_fdist[word[-2:]] += 1
    suffix_fdist[word[-3:]] += 1

In [104]:
common_suffixes = sorted(suffix_fdist, key=lambda x: suffix_fdist[x], reverse=True)[:100]

In [105]:
print(common_suffixes)

['e', ',', '.', 's', 'd', 't', 'he', 'n', 'a', 'of', 'the', 'y', 'r', 'to', 'in', 'f', 'o', 'ed', 'nd', 'is', 'on', 'l', 'g', 'and', 'ng', 'er', 'as', 'ing', 'h', 'at', 'es', 'or', 're', 'it', '``', 'an', "''", 'm', ';', 'i', 'ly', 'ion', 'en', 'al', '?', 'nt', 'be', 'hat', 'st', 'his', 'th', 'll', 'le', 'ce', 'by', 'ts', 'me', 've', "'", 'se', 'ut', 'was', 'for', 'ent', 'ch', 'k', 'w', 'ld', '`', 'rs', 'ted', 'ere', 'her', 'ne', 'ns', 'ith', 'ad', 'ry', ')', '(', 'te', '--', 'ay', 'ty', 'ot', 'p', 'nce', "'s", 'ter', 'om', 'ss', ':', 'we', 'are', 'c', 'ers', 'uld', 'had', 'so', 'ey']


In [107]:
def pos_features(word):
    features = {}
    for suffix in common_suffixes:
        features['endswith(%s)' % suffix] = word.lower().endswith(suffix)
    return features

In [108]:
tagged_words = brown.tagged_words(categories='news')
featuresets = [(pos_features(n), g) for (n,g) in tagged_words]

size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]

In [109]:
classifier = nltk.DecisionTreeClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.6270512182993535

In [110]:
classifier.classify(pos_features('cats'))

'NNS'

In [111]:
print(classifier.pseudocode(depth=4))

if endswith(the) == False: 
  if endswith(,) == False: 
    if endswith(s) == False: 
      if endswith(.) == False: return '.'
      if endswith(.) == True: return '.'
    if endswith(s) == True: 
      if endswith(is) == False: return 'PP$'
      if endswith(is) == True: return 'BEZ'
  if endswith(,) == True: return ','
if endswith(the) == True: return 'AT'



### Exploiting Context

In [112]:
def pos_features(sentence, i):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
    return features

In [113]:
pos_features(brown.sents()[0], 8)

{'suffix(1)': 'n', 'suffix(2)': 'on', 'suffix(3)': 'ion', 'prev-word': 'an'}

In [114]:
tagged_sents = brown.tagged_sents(categories='news')

In [115]:
featuresets = []

In [116]:
for tagged_sent in tagged_sents:
    untagged_sent = nltk.tag.untag(tagged_sent)
    for i, (word, tag) in enumerate(tagged_sent):
        featuresets.append((pos_features(untagged_sent, i), tag) )

In [117]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [118]:
nltk.classify.accuracy(classifier, test_set)

0.7891596220785678

### Sequence Classification

In [126]:
def pos_features(sentence, i, history):
    features = {"suffix(1)": sentence[i][-1:],
                "suffix(2)": sentence[i][-2:],
                "suffix(3)": sentence[i][-3:]}
    if i == 0:
        features["prev-word"] = "<START>"
        features["prev-tag"] = "<START>"
    else:
        features["prev-word"] = sentence[i-1]
        features["prev-tag"] = history[i-1]
    return features

In [132]:
class ConsecutivePosTagger(nltk.TaggerI):
    def __init__(self, train_sents):
        super().__init__()
        train_set = []
        for tagged_sent in train_sents:
            untagged_sent = nltk.tag.untag(tagged_sent)
            history = []
            for i, (word, tag) in enumerate(tagged_sent):
                featureset = pos_features(untagged_sent, i, history)
                train_set.append( (featureset, tag) )
                history.append(tag)
        self.classifier = nltk.NaiveBayesClassifier.train(train_set)
    
    def tag(self, sentence):
        history = []
        for i, word in enumerate(sentence):
            featureset = pos_features(sentence, i, history)
            tag = self.classifier.classify(featureset)
            history.append(tag)
        return zip(sentence, history)

In [133]:
tagged_sents = brown.tagged_sents(categories='news')
size = int(len(tagged_sents) * 0.1)
train_sents, test_sents = tagged_sents[size:], tagged_sents[:size]
tagger = ConsecutivePosTagger(train_sents)
print(tagger.evaluate(test_sents))

0.7980528511821975


## 6.2 Further Examples of Supervised Classification

### Sentence Segmentation

In [135]:
sents = nltk.corpus.treebank_raw.sents()
tokens = []
boundaries = set()
offset = 0
for sent in nltk.corpus.treebank_raw.sents():
    tokens.extend(sent)
    offset += len(sent)
    boundaries.add(offset-1)

In [136]:
def punct_features(tokens, i):
    return {'next-word-capitalized': tokens[i+1][0].isupper(),
            'prevword': tokens[i-1].lower(),
            'punct': tokens[i],
            'prev-word-is-one-char': len(tokens[i-1]) == 1}

In [137]:
featuresets = [(punct_features(tokens, i), (i in boundaries))
               for i in range(1, len(tokens)-1)
               if tokens[i] in '.?!']

In [138]:
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
nltk.classify.accuracy(classifier, test_set)

0.936026936026936

In [139]:
def segment_sentences(words):
    start = 0
    sents = []
    for i, word in words:
        if word in '.?!' and classifier.classify(words, i) == True:
            sents.append(words[start:i+1])
            start = i+1
    if start < len(words):
        sents.append(words[start:])

### Identifying Dialogue Act Types

In [140]:
posts = nltk.corpus.nps_chat.xml_posts()[:10000]

In [141]:
def dialogue_act_features(post):
    features = {}
    for word in nltk.word_tokenize(post):
        features['contains(%s)' % word.lower()] = True
    return features

In [142]:
featuresets = [(dialogue_act_features(post.text), post.get('class'))
               for post in posts]
size = int(len(featuresets) * 0.1)
train_set, test_set = featuresets[size:], featuresets[:size]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, test_set))

0.668
