In [14]:
import nltk
from nltk.corpus import names
import random

**2. Using any of the three classifiers described in this chapter, and any features you can think of, build the best name gender classifier you can. Begin by splitting the Names Corpus into three subsets: 500 words for the test set, 500 words for the dev-test set, and the remaining 6900 words for the training set. Then, starting with the example name gender classifier, make incremental improvements. Use the dev-test set to check your progress. Once you are satisfied with your classifier, check its final performance on the test set. How does the performance on the test set compare to the performance on the dev-test set? Is this what you'd expect?**

In [5]:
def get_feature(name):
    return {'suffix[1]':name[-1],
           'suffix[2]': name[-2],
           'startswith': name[0]}

In [7]:
labeled_names = [(name, 'male') for name in names.words('male.txt')] + [(name, 'female') for name in names.words('female.txt')]


In [9]:
len(labeled_names)

7944

In [11]:
random.shuffle(labeled_names)
feature_set = [(get_feature(name), label) for name, label in labeled_names]
train_set, dev_set, test_set = feature_set[:6000], feature_set[6000:7000], feature_set[7000:]

In [15]:
csf1 = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(csf1, dev_set))
print(nltk.classify.accuracy(csf1, test_set))

0.789
0.7796610169491526


In [16]:
csf2 = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(csf2, dev_set))
print(nltk.classify.accuracy(csf2, test_set))

0.791
0.7796610169491526


In [18]:
csf2 = nltk.MaxentClassifier.train(train_set)


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.374
             2          -0.45425        0.757
             3          -0.39822        0.784
             4          -0.37251        0.788
             5          -0.35851        0.793
             6          -0.35000        0.793
             7          -0.34442        0.796
             8          -0.34058        0.796
             9          -0.33781        0.796
            10          -0.33577        0.796
            11          -0.33421        0.800
            12          -0.33300        0.799
            13          -0.33204        0.798
            14          -0.33127        0.797
            15          -0.33064        0.798
            16          -0.33012        0.799
            17          -0.32969        0.799
            18          -0.32933        0.800
            19          -0.32902        0.800
 

In [19]:
print(nltk.classify.accuracy(csf2, dev_set))
print(nltk.classify.accuracy(csf2, test_set))

0.797
0.8050847457627118


In [21]:
def get_feature(name):
    features = {}
    features["first_letter"] = name[0].lower()
    features["last_letter"] = name[-1].lower()
    for letter in 'abcdefghijklmnopqrstuvwxyz':
        features["count({})".format(letter)] = name.lower().count(letter)
        features["has({})".format(letter)] = (letter in name.lower())
    return features

In [22]:
random.shuffle(labeled_names)
feature_set = [(get_feature(name), label) for name, label in labeled_names]
train_set, dev_set, test_set = feature_set[:6000], feature_set[6000:7000], feature_set[7000:]

In [23]:
csf1 = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(csf1, dev_set))
print(nltk.classify.accuracy(csf1, test_set))

0.744
0.7701271186440678


In [24]:
csf2 = nltk.DecisionTreeClassifier.train(train_set)
print(nltk.classify.accuracy(csf2, dev_set))
print(nltk.classify.accuracy(csf2, test_set))

0.787
0.798728813559322


In [27]:
csf2 = nltk.MaxentClassifier.train(train_set)


  ==> Training (100 iterations)

      Iteration    Log Likelihood    Accuracy
      ---------------------------------------
             1          -0.69315        0.374
             2          -0.61409        0.626
             3          -0.59938        0.626
             4          -0.58553        0.627
             5          -0.57253        0.635
             6          -0.56035        0.652
             7          -0.54893        0.674
             8          -0.53824        0.692
             9          -0.52822        0.707
            10          -0.51883        0.719
            11          -0.51003        0.730
            12          -0.50177        0.739
            13          -0.49402        0.745
            14          -0.48673        0.753
            15          -0.47987        0.760
            16          -0.47340        0.766
            17          -0.46731        0.770
            18          -0.46156        0.771
            19          -0.45612        0.774
 

In [28]:
print(nltk.classify.accuracy(csf2, dev_set))
print(nltk.classify.accuracy(csf2, test_set))

0.787
0.8008474576271186
