In [10]:
import random 
import nltk
from nltk.corpus import names 


# read the names from the files
# label each name wiht the corresponding gender
male_names = [(name, 'male') for name in names.words('male.txt')]
female_names = [(name, 'female') for name in names.words('female.txt')]

# combine the list
labeled_names = male_names + female_names

# shuffle the list 
random.shuffle(labeled_names)


In [11]:
def gender_features(name):
    
    '''The first step in creating a classifier is deciding what features of 
the input are relevant, and how to encode those features. 
For this example, we'll start by just looking at the final letter of a given name. 
The following feature extractor function builds a 
dictionary containing relevant information about a given name:'''

    return {'last_char': name[-1],
        'last_two': name[-2:],
        'last_three': name[-3:],
        'first': name[0],
        'first2': name[:1],
        'first3':name[:2]}

In [12]:
from nltk import NaiveBayesClassifier

# Extract freatures using the 'gender_features()' function 
featuresets = [(gender_features(n), gender) for (n, gender) in labeled_names]

# Split the dataset into train and test set 
train, test = featuresets[500:], featuresets[:500]

# Train A Naive Bayes classifier
classifier = NaiveBayesClassifier.train(train)

# Testing accuracy 
from nltk.classify import accuracy
print(accuracy(classifier, test))

0.822


In [13]:
# Showing most important features 
classifier.show_most_informative_features(10)

Most Informative Features
                last_two = 'na'           female : male   =    164.1 : 1.0
                last_two = 'la'           female : male   =     73.7 : 1.0
                last_two = 'ia'           female : male   =     39.7 : 1.0
               last_char = 'a'            female : male   =     37.8 : 1.0
                last_two = 'sa'           female : male   =     33.8 : 1.0
               last_char = 'k'              male : female =     32.7 : 1.0
                last_two = 'ta'           female : male   =     32.6 : 1.0
                last_two = 'rd'             male : female =     32.1 : 1.0
              last_three = 'ard'            male : female =     29.2 : 1.0
                last_two = 'us'             male : female =     28.7 : 1.0


In [14]:
male_gender = classifier.classify(gender_features('Rudolph'))
female_gender = classifier.classify(gender_features('Samantha'))
print("Rudolph is most probably a {}.".format(male_gender))
print("Samantha is most probably a {}.".format(female_gender))

Rudolph is most probably a male.
Samantha is most probably a female.


# pickling our model 

In [15]:
import pickle

gender_classifier = 'Gender_Classifier_Modek.pkl'

with open(gender_classifier, 'wb') as file: 
    pickle.dump(classifier, file)

# testing our pickled model 

In [16]:
with open(gender_classifier, 'rb') as file: 
    pickled_gndr_cls = pickle.load(file)

In [17]:
pickled_gndr_cls

<nltk.classify.naivebayes.NaiveBayesClassifier at 0xc859f19e48>

In [18]:
# Testing
pickled_gndr_cls.classify(gender_features('Bram'))

'male'

In [19]:
list_names = ['John', 'Mike', 'Michelle', 'Kedrin', 'Debbie', 
    'Courtney', 'Mark', 'Chris', 'Jeff', 'Adrian', 
    'Shannon', 'Michael','Bram', 'Max', 'Hana']

In [20]:
genders = []

for name in list_names: 
    genders.append(pickled_gndr_cls.classify(gender_features(name)))

In [21]:
import pandas as pd

# converting into dataframes
list_names = pd.DataFrame(list_names)
genders = pd.DataFrame(genders)

# merging
complete = list_names.merge(genders, how='outer', left_index=True, right_index=True)
complete.columns = ['Name', 'Gender']
complete

Unnamed: 0,Name,Gender
0,John,male
1,Mike,female
2,Michelle,female
3,Kedrin,female
4,Debbie,female
5,Courtney,female
6,Mark,male
7,Chris,female
8,Jeff,male
9,Adrian,male
