# Evaluate `genderize.io` 

In [1]:
from genderize import Genderize
import pandas as pd
from gender_evaluator import GenderEvaluator

### Can it handle surnames?

In [21]:
# Full names (given name plus surname): does the API still resolve the given name?
results = Genderize().get([
    'Hans Joachim Schmidt',
    'Anna Meier',
])

In [22]:
# Show the raw list of response dicts returned for the full names.
print(results)

[{'gender': None, 'name': 'Hans Joachim Schmidt'}, {'gender': None, 'name': 'Anna Meier'}]


### Double names (where the order matters)

In [23]:
# Spelling variants of double names: separator (space vs. hyphen vs. none),
# order of the two parts, and accented vs. unaccented letters.
results = Genderize().get([
    'Hans Joachim',
    'Hans-Joachim',
    'Maria-José',
    'José Maria',
    'Jose Maria',
    'José-Maria',
    'Josémaria',
    'theo c. m',
])

In [24]:
# Print one response dict per line so the spelling variants can be compared.
for r in results:
    print(r)

{'gender': None, 'name': 'Hans Joachim'}
{'gender': 'male', 'name': 'Hans-Joachim', 'count': 1, 'probability': 1.0}
{'gender': 'female', 'name': 'Maria-José', 'count': 2, 'probability': 1.0}
{'gender': 'male', 'name': 'José Maria', 'count': 3, 'probability': 1.0}
{'gender': 'male', 'name': 'Jose Maria', 'count': 125, 'probability': 0.99}
{'gender': None, 'name': 'José-Maria'}
{'gender': None, 'name': 'Josémaria'}
{'gender': None, 'name': 'theo c. m'}


The examples show that the API: 

* accepts double names
* distinguishes between separators such as '-' and ' ' (cf. `Hans Joachim`, which gets no gender, and `Hans-Joachim`, which is classified as male)
* works fine with non-ASCII characters (e.g. `é`)
* treats accented and unaccented spellings as different names (cf. `José Maria` with a count of 3 and `Jose Maria` with a count of 125)

### Names with different gender depending on ethnicity

In [4]:
# Names whose typical gender differs between countries, plus common unisex names.
results = Genderize().get([
    'Nicola',
    'Andrea',
    'Alex',
    'Mika',
    'Addison',
    'Ash',
    'Dakota',
])

In [5]:
# Print one response dict per line; 'probability' and 'count' are the values
# discussed in the remarks below.
for r in results:
    print(r)

{'name': 'Nicola', 'gender': 'female', 'probability': 0.71, 'count': 1226}
{'name': 'Andrea', 'gender': 'female', 'probability': 0.79, 'count': 5794}
{'name': 'Alex', 'gender': 'male', 'probability': 0.87, 'count': 5856}
{'name': 'Mika', 'gender': 'male', 'probability': 0.51, 'count': 182}
{'name': 'Addison', 'gender': 'male', 'probability': 0.64, 'count': 11}
{'name': 'Ash', 'gender': 'male', 'probability': 0.56, 'count': 243}
{'name': 'Dakota', 'gender': 'male', 'probability': 0.75, 'count': 139}


These examples show that:

* names like `Andrea` or `Nicola` where the gender is highly country-specific have a higher score than common unisex names like `Mika` or `Ash`
* `Alex` is a nickname for either Alexander or Alexandra and is one of the most evenly divided gender-neutral names; nevertheless, the API classifies it as male with a fairly high probability of 0.87

### Check for nonsense words

In [6]:
# Common English words that are not given names.
results = Genderize().get([
    'the',
    'a',
    'with',
    'an',
    'I',
    'my',
])

In [7]:
# Print one response dict per line to see which non-name words get a gender.
for r in results:
    print(r)

{'name': 'the', 'gender': 'female', 'probability': 1.0, 'count': 1}
{'name': 'a', 'gender': 'male', 'probability': 0.59, 'count': 56}
{'name': 'with', 'gender': None}
{'name': 'an', 'gender': 'female', 'probability': 0.83, 'count': 170}
{'name': 'I', 'gender': None}
{'name': 'my', 'gender': 'female', 'probability': 0.73, 'count': 44}


Not every word that is assigned a gender is actually a name. This is because such words sometimes appear as part of social media profile names, which are the data the API is based on.

## Test on zbMATH data

In [8]:
# Create an evaluator for the zbMATH test set (CSV path relative to the notebook).
zbmath = GenderEvaluator("test_data/test_data_zbmath.csv")
# presumably reads the CSV into zbmath.test_data, which the next cell previews — confirm in gender_evaluator
zbmath.load_data()
# presumably validates that the expected columns are present — confirm in gender_evaluator
zbmath.check_data_columns()

In [9]:
# Preview the first rows of the test data before any API results are attached.
zbmath.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,gender
0,ben,da,zhou,u
1,agnese,,di-castro,f
2,michael,,szarek,m
3,yumiko,,watanabe,f
4,shaoqing,,liu,m


In [10]:
# Fetch predictions from genderize.io. Judging by the preview in the next cell,
# this adds the columns count, gender_infered and probability to zbmath.test_data.
# NOTE(review): the preview after this call shows different rows (and repeated
# names) than the preview before it — verify that the method does not reorder
# or duplicate rows, since that would distort the confusion matrix and errors.
zbmath.fetch_gender_from_genderizeio()

In [11]:
# Preview the test data again, now including the API results.
zbmath.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,ben,da,zhou,u,3363.0,m,0.99
1,agnese,,di-castro,f,33.0,f,1.0
2,michael,,szarek,m,11094.0,m,1.0
3,michael,,szarek,m,11094.0,m,1.0
4,michael,p,robertson,u,11094.0,m,1.0


### Compute metrics on zbMATH data

In [12]:
# Build the confusion matrix; the result is read from zbmath.confusion_matrix
# in the next cell.
zbmath.compute_confusion_matrix()

In [13]:
# Display the matrix: rows f/m/u against columns f_pred/m_pred/u_pred
# (rows are presumably the human label, columns the API prediction — confirm).
zbmath.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,9,0,2
m,1,53,9
u,5,13,16


In [14]:
# Rows that the human annotator labelled 'u' although the API predicted 'm' or 'f'.
zbmath.test_data[
    (zbmath.test_data['gender'] == 'u')
    & (zbmath.test_data['gender_infered'] != 'u')
]

Unnamed: 0,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,ben,da,zhou,u,3363.0,m,0.99
4,michael,p,robertson,u,11094.0,m,1.0
5,michael,p,robertson,u,11094.0,m,1.0
12,po,,zhang,u,45.0,m,0.89
13,hon,kit,wai,u,16.0,m,0.81
14,bao,wen,sun,u,42.0,m,0.79
15,byoung,chan,lee,u,1.0,m,1.0
18,rosario,,di-nardo,u,343.0,f,0.75
23,jacques,,pontier,u,280.0,m,0.99
38,chao,zhu,zhang,u,16.0,m,0.81


The API returns 'm' for most of the Chinese names in the list.

In [15]:
# Compute the four error measures. The results are read in the next cell from
# the attributes error_with_unknown, error_without_unknown, error_unknown and
# error_gender_bias.
zbmath.compute_error_with_unknown()
zbmath.compute_error_without_unknown()
zbmath.compute_error_unknown()
zbmath.compute_error_gender_bias()

In [16]:
# Report the four error measures computed in the previous cell.
# NOTE(review): the labels contain typos ("suggestes", "missclassified"); they
# are left unchanged here so that they keep matching the recorded output.
# NOTE(review): with the confusion matrix shown above, the printed
# error_with_unknown (0.1935...) equals 12/62, i.e. the denominator seems to be
# the correctly classified names only rather than all 74 names labelled f or m —
# confirm that this is the intended definition.
print("error counting prediction as 'unknown gender' as classification errors: ", zbmath.error_with_unknown)
print("error ignoring prediction as 'unknown gender' : ", zbmath.error_without_unknown)
print("error counting proportion of names with unpredicted gender: ", zbmath.error_unknown)
print("error where negative value suggestes that more women than men are missclassified: ", zbmath.error_gender_bias)

error counting prediction as 'unknown gender' as classification errors:  0.193548387097
error ignoring prediction as 'unknown gender' :  0.015873015873
error counting proportion of names with unpredicted gender:  0.148648648649
error where negative value suggestes that more women than men are missclassified:  0.015873015873


## Test on genderizeR paper data

In [17]:
# Create an evaluator for the test set taken from the genderizeR paper.
genderizeR = GenderEvaluator("test_data/test_data_genderizeR.csv")
# presumably reads the CSV into genderizeR.test_data, which the next cell previews — confirm in gender_evaluator
genderizeR.load_data()
# presumably validates that the expected columns are present — confirm in gender_evaluator
genderizeR.check_data_columns()

In [18]:
# Preview the first rows of the test data before any API results are attached.
genderizeR.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,gender
0,ann,,thayer,u
1,paolo,,chiesa,m
2,ernesto,,abbate,m
3,john,h,epstein,m
4,margaret,,cotroneo,f


In [19]:
# Fetch predictions from genderize.io. Judging by the preview in the next cell,
# this adds the columns count, gender_infered and probability to
# genderizeR.test_data.
# NOTE(review): the preview after this call repeats a row that appeared only
# once in the preview before it — verify that the method does not duplicate
# rows, since that would distort the confusion matrix and errors.
genderizeR.fetch_gender_from_genderizeio()

In [20]:
# Preview the test data again, now including the API results.
genderizeR.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,ann,,thayer,u,1818.0,f,0.99
1,paolo,,chiesa,m,781.0,m,0.99
2,ernesto,,abbate,m,381.0,m,1.0
3,john,h,epstein,m,9931.0,m,0.99
4,john,h,epstein,m,9931.0,m,0.99


### Compute metrics on genderizeR paper data

In [21]:
# Build the confusion matrix; the result is read from
# genderizeR.confusion_matrix in the next cell.
genderizeR.compute_confusion_matrix()

In [22]:
# Display the matrix: rows f/m/u against columns f_pred/m_pred/u_pred
# (rows are presumably the human label, columns the API prediction — confirm).
genderizeR.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,185,3,6
m,19,1122,27
u,82,291,8


In [23]:
# Rows that the human annotator labelled 'u' although the API predicted 'm' or 'f'.
genderizeR.test_data[
    (genderizeR.test_data['gender'] == 'u')
    & (genderizeR.test_data['gender_infered'] != 'u')
]

Unnamed: 0,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,ann,,thayer,u,1818.0,f,0.99
163,john,,roche,u,9931.0,m,0.99
164,john,,roche,u,9931.0,m,0.99
165,john,,roche,u,9931.0,m,0.99
166,john,,roche,u,9931.0,m,0.99
167,john,,roche,u,9931.0,m,0.99
168,john,,roche,u,9931.0,m,0.99
169,john,,roche,u,9931.0,m,0.99
170,john,,roche,u,9931.0,m,0.99
171,john,,roche,u,9931.0,m,0.99


In the rows shown above, the API assigns a gender to common Western first names (`ann` → 'f', `john` → 'm') that the human annotator labelled 'u'.

In [24]:
# Compute the four error measures. The results are read in the next cell from
# the attributes error_with_unknown, error_without_unknown, error_unknown and
# error_gender_bias.
genderizeR.compute_error_with_unknown()
genderizeR.compute_error_without_unknown()
genderizeR.compute_error_unknown()
genderizeR.compute_error_gender_bias()

In [25]:
# Report the four error measures computed in the previous cell.
# NOTE(review): the labels contain typos ("suggestes", "missclassified"); they
# are left unchanged here so that they keep matching the recorded output.
# NOTE(review): the printed error_gender_bias equals error_without_unknown
# (0.0165...), which is 22/1329 for the confusion matrix shown above. A signed
# difference of the two off-diagonal counts (19 and 3) would give 16/1329
# instead — check compute_error_gender_bias against the intended definition.
# NOTE(review): the printed error_with_unknown (0.0420...) equals 55/1307, i.e.
# the denominator seems to be the correctly classified names only — confirm.
print("error counting prediction as 'unknown gender' as classification errors: ", genderizeR.error_with_unknown)
print("error ignoring prediction as 'unknown gender' : ", genderizeR.error_without_unknown)
print("error counting proportion of names with unpredicted gender: ", genderizeR.error_unknown)
print("error where negative value suggestes that more women than men are missclassified: ", genderizeR.error_gender_bias)

error counting prediction as 'unknown gender' as classification errors:  0.0420811017598
error ignoring prediction as 'unknown gender' :  0.0165537998495
error counting proportion of names with unpredicted gender:  0.0242290748899
error where negative value suggestes that more women than men are missclassified:  0.0165537998495


## From here: TODO

## Define different models based on `count` and `probability`

### Grid Search

In [93]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split