## Gender Classification of Names

In [2]:
import pandas as pd
import numpy as np

In [3]:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer

In [14]:
df = pd.read_csv("names_dataset.csv", index_col= 0)
df.head()

Unnamed: 0_level_0,name,sex
index,Unnamed: 1_level_1,Unnamed: 2_level_1
0,Mary,F
1,Anna,F
2,Emma,F
3,Elizabeth,F
4,Minnie,F


In [7]:
df.size

190050

In [5]:
df['sex'].value_counts()

F    60600
M    34425
Name: sex, dtype: int64

In [15]:
df['sex'].replace({'F':0,'M':1},inplace=True)

In [19]:
df_names = df
df_names.sex.unique()

array([0, 1], dtype=int64)

In [22]:
Xfeatures = df_names['name']

In [23]:
cv = CountVectorizer()
X = cv.fit_transform(Xfeatures)

In [27]:
# Features
X
y = df_names.sex

In [28]:
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=42)

In [31]:
from sklearn.naive_bayes import MultinomialNB
clf = MultinomialNB()
clf.fit(X_train, y_train)
print("Accuracy of Model", clf.score(X_test,y_test)*100,"%")

Accuracy of Modal 63.98163206734908 %


In [32]:
print("Accuracy of Model", clf.score(X_train,y_train)*100,"%")

Accuracy of Modal 100.0 %


### Prediction

In [37]:
sample_name = ["Mary"]
vect = cv.transform(sample_name).toarray()

In [38]:
# Female is 0 Male is 1
clf.predict(vect)

array([0], dtype=int64)

In [42]:
sample_name_1 = ["Mark"]
vect1 = cv.transform(sample_name_1).toarray()

In [43]:
# Sample 2
clf.predict(vect1)

array([1], dtype=int64)

In [46]:
sample_name_2 = ["Natasha"]
vect2 = cv.transform(sample_name_2).toarray()

In [47]:
# Sample 3
clf.predict(vect2)

array([0], dtype=int64)

In [49]:
sample_name_3 = ["Lewis"]
vect3 = cv.transform(sample_name_3).toarray()

In [50]:
# Sample 4
clf.predict(vect3)

array([1], dtype=int64)

In [52]:
def genderpredictor(a):
    test_name = [a]
    vector = cv.transform(test_name).toarray()
    if clf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [55]:
genderpredictor("Laura")

Female


In [67]:
def features(name):
    name = name.lower()
    return {
        'first-letter': name[0],
        'first2-letters': name[0:2],
        'first3-letters': name[0:3],
        'last-letter': name[-1],
        'last2-letters': name[-2:],
        'last3-letters': name[-3:],
    }

In [68]:
features = np.vectorize(features)
print(features(["Anna", "Hannah", "Peter", "John", "Vladimir", "Mohammed"]))

[{'first-letter': 'a', 'first2-letters': 'an', 'first3-letters': 'ann', 'last-letter': 'a', 'last2-letters': 'na', 'last3-letters': 'nna'}
 {'first-letter': 'h', 'first2-letters': 'ha', 'first3-letters': 'han', 'last-letter': 'h', 'last2-letters': 'ah', 'last3-letters': 'nah'}
 {'first-letter': 'p', 'first2-letters': 'pe', 'first3-letters': 'pet', 'last-letter': 'r', 'last2-letters': 'er', 'last3-letters': 'ter'}
 {'first-letter': 'j', 'first2-letters': 'jo', 'first3-letters': 'joh', 'last-letter': 'n', 'last2-letters': 'hn', 'last3-letters': 'ohn'}
 {'first-letter': 'v', 'first2-letters': 'vl', 'first3-letters': 'vla', 'last-letter': 'r', 'last2-letters': 'ir', 'last3-letters': 'mir'}
 {'first-letter': 'm', 'first2-letters': 'mo', 'first3-letters': 'moh', 'last-letter': 'd', 'last2-letters': 'ed', 'last3-letters': 'med'}]


In [69]:
df_X = features(df_names['name'])

In [70]:
df_y = df_names['sex']

In [74]:
from sklearn.feature_extraction import DictVectorizer

corpus = features(['Mike', 'Julian'])
dv = DictVectorizer()
dv.fit(corpus)
transformed = dv.transform(corpus)
print(transformed)

  (0, 1)	1.0
  (0, 3)	1.0
  (0, 5)	1.0
  (0, 6)	1.0
  (0, 9)	1.0
  (0, 11)	1.0
  (1, 0)	1.0
  (1, 2)	1.0
  (1, 4)	1.0
  (1, 7)	1.0
  (1, 8)	1.0
  (1, 10)	1.0


In [75]:
dv.get_feature_names()

['first-letter=j',
 'first-letter=m',
 'first2-letters=ju',
 'first2-letters=mi',
 'first3-letters=jul',
 'first3-letters=mik',
 'last-letter=e',
 'last-letter=n',
 'last2-letters=an',
 'last2-letters=ke',
 'last3-letters=ian',
 'last3-letters=ike']

In [76]:
#Train - test split
dfX_train, dfX_test, dfy_train, dfy_test = train_test_split(df_X, df_y, test_size=0.33, random_state=42)

In [78]:
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<63666x8194 sparse matrix of type '<class 'numpy.float64'>'
	with 381996 stored elements in Compressed Sparse Row format>

In [79]:
# Modal build with Decision Tree
from sklearn.tree import DecisionTreeClassifier

dclf = DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dclf.fit(my_xfeatures, dfy_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [80]:
sample_name_eg = ["Alex"]
transform_dv = dv.transform(features(sample_name_eg))

In [81]:
vect3 = transform_dv.toarray()

In [82]:
# Predicting gender of name
# Male is 1, female is 0
dclf.predict(vect3)

array([1], dtype=int64)

In [84]:
if dclf.predict(vect3) == 0:
    print("Female")
else:
    print("Male")

Male


In [85]:
name_eg1 = ["Chioma"]
transform_dv = dv.transform(features(name_eg1))
vect4 = transform_dv.toarray()
if dclf.predict(vect4) == 0:
    print("Female")
else:
    print("Male")

Female


In [92]:
def genderpredictor1(a):
    test_name1 = [a]
    transform_dv = dv.transform(features(test_name1))
    vector = transform_dv.toarray()
    if dclf.predict(vector) == 0:
        print("Female")
    else:
        print("Male")

In [94]:
random_name_list = ["Alex", "Alice", "Chioma", "Vladimir", "Clairese", "Mohammed", "Chan"]

In [95]:
for n in random_name_list:
    print(genderpredictor1(n))

Male
None
Female
None
Female
None
Male
None
Female
None
Male
None
Male
None


In [96]:
# Accuracy of modal Decision Tree is better than Naive Bayes
print(dclf.score(dv.transform(dfX_train), dfy_train))

0.9888951716771903


In [97]:
# Accuracy on test set
print(dclf.score(dv.transform(dfX_test), dfy_test))

0.8670557096846201


### Exporting the model

In [98]:
import pickle
dctreeModel = open("namesdetectormodel.pkl", "wb")

In [99]:
pickle.dump(dclf,dctreeModel)

In [100]:
dctreeModel.close()