### Define feature extractor

In [80]:
def gender_features(word):
    return {'last_letter': word[-1:]}

### Create list of names

In [81]:
from nltk.corpus import names
import random
import nltk

names=([(name, 'male') for name in names.words('male.txt')]+
       [(name, 'female') for name in names.words('female.txt')])
random.shuffle(names)

### Create our feature and test sets and classify the data

In [82]:
train_names,devtest_names, test_names = names[:6901], names[6901:7402], names[7402:7903]
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
test_set = [(gender_features(n), g) for (n,g) in test_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)

In [83]:
classifier.classify(gender_features('Avery'))

'female'

### Now lets test the accuracy

In [84]:
print(nltk.classify.accuracy(classifier, test_set))

0.7524950099800399


### The model is about 76% accurate

### Rebuilding the gender features function to incorporate the last 2 letetrs in a persons name

In [85]:
def gender_features(word):
    return {'suffix1': word[-1:],
            'suffix2': word[-2:]}

### Re train the set

In [86]:
train_set = [(gender_features(n), g) for (n,g) in train_names]
devtest_set = [(gender_features(n), g) for (n,g) in devtest_names]
classifier = nltk.NaiveBayesClassifier.train(train_set)
print(nltk.classify.accuracy(classifier, devtest_set))

0.7405189620758483


### Our accuracy improved by 4%! Lets store the errors in a list to examine them more closely

In [87]:
errors = []
for (name,tag) in devtest_names:
    guess = classifier.classify(gender_features(name))
    if guess != tag:
        errors.append((tag,guess,name))
for (tag, guess, name) in sorted(errors):  # doctest: +ELLIPSIS +NORMALIZE_WHITESPACE
    print('correct={:<8} guess={:<8} name={:<30}'.format(tag, guess, name))

correct=female   guess=male     name=Adey                          
correct=female   guess=male     name=Aleen                         
correct=female   guess=male     name=Alex                          
correct=female   guess=male     name=Alexis                        
correct=female   guess=male     name=Allsun                        
correct=female   guess=male     name=Annabal                       
correct=female   guess=male     name=Ansley                        
correct=female   guess=male     name=Beau                          
correct=female   guess=male     name=Bridget                       
correct=female   guess=male     name=Buffy                         
correct=female   guess=male     name=Calypso                       
correct=female   guess=male     name=Cam                           
correct=female   guess=male     name=Carlin                        
correct=female   guess=male     name=Carolan                       
correct=female   guess=male     name=Charil     

### Lets see if the number of vowels has effect on the gender of the name

In [88]:
def count_vowels(name):
    return sum(1 for letter in name if letter.lower() in 'aeiou')

def gender_features(name):
    return {
        'suffix2': name[-2:],  
        'num_vowels': count_vowels(name) 
    }

train_set = [(gender_features(n), g) for (n, g) in train_names]
devtest_set = [(gender_features(n), g) for (n, g) in devtest_names]
test_set = [(gender_features(n), g) for (n, g) in test_names]

classifier = nltk.NaiveBayesClassifier.train(train_set)

devtest_accuracy = nltk.classify.accuracy(classifier, devtest_set)
print(f'Dev-Test Set Accuracy: {devtest_accuracy}')

Dev-Test Set Accuracy: 0.7504990019960079


### Now that we have seen a 1% improvment, lets evaluate the classifier on the test set

In [89]:
test_accuracy = nltk.classify.accuracy(classifier, test_set)
print(f'Test Set Accuracy: {test_accuracy}')

Test Set Accuracy: 0.7524950099800399


### Seeing that the test set is less acurate than the Dev test-set, lets perform cross validation to perform multiple evaluations on different test sets

In [90]:
from nltk.corpus import names  
from nltk.classify.util import accuracy

male_names = names.words('male.txt')
female_names = names.words('female.txt')
labeled_male_names = [(name, 'male') for name in male_names]
labeled_female_names = [(name, 'female') for name in female_names]

labeled_names = labeled_male_names + labeled_female_names
random.shuffle(labeled_names)

data = [(gender_features(n), g) for (n, g) in labeled_names]
def cross_validate(data, folds=5):
    subset_size = len(data) // folds
    accuracy_scores = []
    
    for i in range(folds):
        test_data = data[i*subset_size:(i+1)*subset_size]
        train_data = data[:i*subset_size] + data[(i+1)*subset_size:]
        
        classifier = nltk.NaiveBayesClassifier.train(train_data)
        accuracy_scores.append(accuracy(classifier, test_data))
    
    return sum(accuracy_scores) / len(accuracy_scores)


average_accuracy = cross_validate(data, folds=5)
print(f'Average Cross-Validation Accuracy: {average_accuracy}')

Average Cross-Validation Accuracy: 0.778337531486146
