In [15]:
import pandas as pd
import numpy as np
import matplotlib as mtpl
import seaborn as sns
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction import DictVectorizer
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import KNeighborsClassifier

# Introduction
#### 1.Load data
We found two useful files from Carnegie Mellon University and have combined them to a dataframe. 
We set "male" to 0 and "female" to 1 as classification for machine learning.

In [16]:
baby= pd.read_csv("data/name.csv")
baby.columns=['name','sex']
baby=baby.drop_duplicates()
baby.sex = pd.to_numeric(baby.sex, errors='coerce')
baby.head(2)

Unnamed: 0,name,sex
0,Aamir,0
1,Aaron,0


#### 2. Data processing 
split the name into pieces, and use these pieces as features for our model.

In [17]:
def features(name):
    return{
        'first-letter':name[0],
        'first2-letter':name[0:2],
        'first3-letter':name[0:3],
        'last-letter':name[-1],
        'last2-letter':name[-2:],
        'last3-letter':name[-3:],
    }
features = np.vectorize(features)

### 3. Building model
We use 4 models to find the best classifier to predict the gender of a English name. 
#### fitting data

In [18]:
df_X=features(baby.name)
df_y = baby.sex
dfX_train,dfX_test,dfy_train,dfy_test = train_test_split(df_X,df_y,test_size = 0.2)
dv = DictVectorizer()
dv.fit_transform(dfX_train)

<6355x3140 sparse matrix of type '<class 'numpy.float64'>'
	with 38130 stored elements in Compressed Sparse Row format>

#### 3.1 Model KNN
Accuracy of of KNN model is 78.4%

In [19]:
knnmodel = KNeighborsClassifier()
my_xfeatures = dv.transform(dfX_train)
knnmodel.fit(my_xfeatures,dfy_train)

KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
           metric_params=None, n_jobs=None, n_neighbors=5, p=2,
           weights='uniform')

In [20]:
knnmodel.score(dv.transform(dfX_test),dfy_test)

0.7709251101321586

#### 3.2 Model Decision Tree
Accuracy of of Decision model is 77.5%

In [21]:
dtmodel = DecisionTreeClassifier()
my_xfeatures = dv.transform(dfX_train)
dtmodel.fit(my_xfeatures,dfy_train)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [22]:
dtmodel.score(dv.transform(dfX_test),dfy_test)

0.7577092511013216

#### 3.3 Model Naive bayes
Accuracy of of Naive bayes model is 83.4%

In [23]:
nbmodel = MultinomialNB()
my_xfeatures = dv.transform(dfX_train)
nbmodel.fit(my_xfeatures,dfy_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [24]:
nbmodel.score(dv.transform(dfX_test),dfy_test)

0.8042794210195091

#### 3.4 Model Random Forest
Accuracy of of random forest model is 78.8%

In [25]:
rfmodel = RandomForestClassifier()
my_xfeatures = dv.transform(dfX_train)
rfmodel.fit(my_xfeatures,dfy_train)



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='auto', max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)

In [26]:
rfmodel.score(dv.transform(dfX_test),dfy_test)

0.7658904971680303

## 4 Rule based
However, charactors in  shakspears' plays are not all from Britain, some from Danmark, some from Spain, Classifier from English name dataset cannot predict the plays well, that's why we need rule based before classify.

* make two arrays of titles which can indicate the gender of the name.

In [27]:
    ms = ["king","sir","lord","prince", 'capulet']
    fs = ["queen","lady","princess","nurse", 'juliet']## classifier learned from English name dataset can not predict Juliet well.

## 5 Build the gender predicting function with classifier and rule based

In [28]:
def gender_by_name(name):
    
    name = name.lower()
    ms = ["king","sir","lord","prince", 'capulet']
    fs = ["queen","lady","princess","nurse", 'juliet']
    if(any(m in name for m in ms)): return 0
    if(any(f in name for f in fs)): return 1
    vector = dv.transform(features([name])).toarray()

    return nb.predict(vector)[0]