# Evaluate `genderize.io` 

In [2]:
from genderize import Genderize
import pandas as pd
from gender_evaluator import GenderEvaluator

### Can it handle surnames?

In [2]:
# Query the API with full names (given names plus surname).
full_names = ['Hans Joachim Schmidt', 'Anna Meier']
results = Genderize().get(full_names)

In [3]:
# Show the raw list of response dicts returned for the full-name query.
print(results)

[{'name': 'Hans Joachim Schmidt', 'gender': None}, {'name': 'Anna Meier', 'gender': None}]


### Double names (where the order matters)

In [4]:
# Variants of double names: with space, with hyphen, with and without accent,
# written as one word, and with abbreviated middle initials.
double_names = [
    'Hans Joachim',
    'Hans-Joachim',
    'Maria-José',
    'José Maria',
    'Jose Maria',
    'José-Maria',
    'Josémaria',
    'theo c. m',
]
results = Genderize().get(double_names)

In [5]:
# One response dict per line, in the order the names were submitted.
for record in results:
    print(record)

{'name': 'Hans Joachim', 'gender': None}
{'name': 'Hans-Joachim', 'gender': 'male', 'probability': 1.0, 'count': 1}
{'name': 'Maria-José', 'gender': 'female', 'probability': 1.0, 'count': 2}
{'name': 'José Maria', 'gender': 'male', 'probability': 1.0, 'count': 3}
{'name': 'Jose Maria', 'gender': 'male', 'probability': 0.99, 'count': 125}
{'name': 'José-Maria', 'gender': None}
{'name': 'Josémaria', 'gender': None}
{'name': 'theo c. m', 'gender': None}


The examples show that the API: 

* accepts double names
* is sensitive to non-letter characters such as '-' or ' ' (cf. `Hans Joachim` and `Hans-Joachim`)
* works fine with non-ASCII characters (e.g. `é`)
* is sensitive to accents (cf. `José Maria` with count 3 and `Jose Maria` with count 125)

### Names with different gender depending on ethnicity

In [6]:
# Names whose usual gender differs between countries, plus common unisex names.
ambiguous_names = ['Nicola', 'Andrea', 'Alex', 'Mika', 'Addison', 'Ash', 'Dakota']
results = Genderize().get(ambiguous_names)

In [7]:
# One response dict per line, in the order the names were submitted.
for record in results:
    print(record)

{'name': 'Nicola', 'gender': 'female', 'probability': 0.71, 'count': 1226}
{'name': 'Andrea', 'gender': 'female', 'probability': 0.79, 'count': 5794}
{'name': 'Alex', 'gender': 'male', 'probability': 0.87, 'count': 5856}
{'name': 'Mika', 'gender': 'male', 'probability': 0.51, 'count': 182}
{'name': 'Addison', 'gender': 'male', 'probability': 0.64, 'count': 11}
{'name': 'Ash', 'gender': 'male', 'probability': 0.56, 'count': 243}
{'name': 'Dakota', 'gender': 'male', 'probability': 0.75, 'count': 139}


These examples show that:

* names like `Andrea` or `Nicola` where the gender is highly country-specific have a higher score than common unisex names like `Mika` or `Ash`
* `Alex` is a nickname for either Alexander or Alexandra and is one of the most evenly divided gender-neutral names. Nevertheless, the API assigns it 'male' with a rather high probability of 0.87.

### Check for nonsense words

In [8]:
# Common English words that are not names, to see whether the API still assigns a gender.
common_words = ['the', 'a', 'with', 'an', 'I', 'my']
results = Genderize().get(common_words)

In [9]:
# One response dict per line, in the order the words were submitted.
for record in results:
    print(record)

{'name': 'the', 'gender': 'female', 'probability': 1.0, 'count': 1}
{'name': 'a', 'gender': 'male', 'probability': 0.59, 'count': 56}
{'name': 'with', 'gender': None}
{'name': 'an', 'gender': 'female', 'probability': 0.83, 'count': 170}
{'name': 'I', 'gender': None}
{'name': 'my', 'gender': 'female', 'probability': 0.73, 'count': 44}


Not every word that gets a gender assigned is a name. Such words are sometimes part of social media user names, and social media profiles are the data the API is based on.

## Test on zbMATH data - full 400 records

In [3]:
# Set up the evaluator for the zbMATH sample and load the CSV into `zbmath.test_data`.
zbmath_csv = "test_data/test_data_zbmath_full.csv"
zbmath = GenderEvaluator(zbmath_csv)
zbmath.load_data()
# NOTE(review): check_data_columns() is defined in gender_evaluator; presumably it
# validates the column layout of the loaded data - confirm there.
zbmath.check_data_columns()

In [4]:
# Preview the first rows of the loaded zbMATH test data.
zbmath.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,gender
0,pierre,paul,grivel,m
1,raul,p,serapioni,m
2,adriano,a,moura,m
3,ralf,,kieser,m
4,teppei,,ariyoshi,u


In [8]:
# Number of records in the zbMATH test set.
len(zbmath.test_data)

400

In [9]:
# Fetch the inferred gender from genderize.io; afterwards `test_data` shows the
# additional columns `count`, `gender_infered` and `probability` (see the next preview).
zbmath.fetch_gender_from_genderizeio()

In [10]:
# Preview the test data again, now including the API response columns.
zbmath.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,pierre,paul,grivel,m,852.0,m,0.99
1,raul,p,serapioni,m,821.0,m,1.0
2,adriano,a,moura,m,166.0,m,0.99
3,ralf,,kieser,m,86.0,m,1.0
4,teppei,,ariyoshi,u,,u,


### Compute metrics on zbMATH data

In [11]:
# Build the confusion matrix; the result is read from `zbmath.confusion_matrix` in the next cell.
zbmath.compute_confusion_matrix()

In [12]:
# Display the confusion matrix: rows f/m/u, columns f_pred/m_pred/u_pred.
zbmath.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,55,0,3
m,3,271,17
u,5,27,19


In [13]:
# Rows that the human annotators labelled 'u' (unknown) although the API
# nevertheless returned a gender ('m' or 'f').
zbmath_records = zbmath.test_data
labelled_unknown = zbmath_records.gender == 'u'
inferred_known = zbmath_records.gender_infered != 'u'
zbmath_records[labelled_unknown & inferred_known]

Unnamed: 0,first_name,middle_name,last_name,gender,count,gender_infered,probability
10,sushil,k,singh,u,91.0,m,0.99
33,seiya,,haze,u,5.0,m,1.0
44,jae,il,lee,u,90.0,m,0.58
66,koji,,okuguchi,u,10.0,m,0.9
67,takashi,,amisaki,u,16.0,m,1.0
95,shoichi,,suzuki,u,5.0,m,1.0
132,makoto,,idzumi,u,28.0,m,1.0
173,lutfi,,avazpour,u,25.0,m,0.96
179,cuong,,le,u,53.0,m,1.0
218,sung,hoon,hong,u,31.0,m,0.84


The API returns 'm' for most of the Asian names in the list (the rows shown are mostly Japanese and Korean names), which the human evaluators had labelled 'u'.

In [14]:
# Compute the four error metrics. The results are read in the next cell from the
# attributes error_with_unknown, error_without_unknown, error_unknown and
# error_gender_bias of the evaluator.
zbmath.compute_error_with_unknown()
zbmath.compute_error_without_unknown()
zbmath.compute_error_unknown()
zbmath.compute_error_gender_bias()

In [15]:
# Report the four error metrics of the zbMATH evaluation, one per line.
# Each entry pairs the printed description with the evaluator attribute holding the value.
zbmath_error_report = [
    ("error counting prediction as 'unknown gender' as classification errors: ", "error_with_unknown"),
    ("error ignoring prediction as 'unknown gender' : ", "error_without_unknown"),
    ("error counting proportion of names with unpredicted gender: ", "error_unknown"),
    ("error where negative value suggestes that more women than men are missclassified: ", "error_gender_bias"),
]
for description, attribute in zbmath_error_report:
    print(description, getattr(zbmath, attribute))

error counting prediction as 'unknown gender' as classification errors:  0.0705521472393
error ignoring prediction as 'unknown gender' :  0.00911854103343
error counting proportion of names with unpredicted gender:  0.0573065902579
error where negative value suggestes that more women than men are missclassified:  0.00911854103343


## Test on genderizeR paper data

Uncomment the cell below if you want to use the stored data with inferred gender instead of calling the API. In that case, do not execute the other cells in this section.

In [16]:
#genderizeR = GenderEvaluator("test_data/test_data_genderizeR_genderize_io.csv")
#genderizeR.load_data()
#genderizeR.gender_evaluator='genderize_io'
#genderizeR.file_path = "test_data/test_data_genderizeR.csv"

In [2]:
# Set up the evaluator for the genderizeR paper data and load the CSV into `genderizeR.test_data`.
genderizeR_csv = "test_data/test_data_genderizeR.csv"
genderizeR = GenderEvaluator(genderizeR_csv)
genderizeR.load_data()
# NOTE(review): check_data_columns() is defined in gender_evaluator; presumably it
# validates the column layout of the loaded data - confirm there.
genderizeR.check_data_columns()

In [3]:
# Preview the first rows of the loaded genderizeR test data.
genderizeR.test_data.head()

Unnamed: 0,raw_name,first_name,middle_name,last_name,gender
0,"Thayer, Ann",ann,,thayer,u
1,"Chiesa, Paolo",paolo,,chiesa,m
2,"Abbate, Ernesto",ernesto,,abbate,m
3,"Epstein, John H.",john,,epstein,m
4,"Cotroneo, Margaret",margaret,,cotroneo,f


In [5]:
# Fetch the inferred gender from genderize.io; afterwards `test_data` shows the
# additional columns `count`, `gender_infered` and `probability` (see the next preview).
genderizeR.fetch_gender_from_genderizeio()

In [3]:
# Preview the test data again, now including the API response columns.
genderizeR.test_data.head()

Unnamed: 0,raw_name,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,"Thayer, Ann",ann,,thayer,u,1818.0,f,0.99
1,"Chiesa, Paolo",paolo,,chiesa,m,781.0,m,0.99
2,"Abbate, Ernesto",ernesto,,abbate,m,381.0,m,1.0
3,"Epstein, John H.",john,,epstein,m,9931.0,m,0.99
4,"Cotroneo, Margaret",margaret,,cotroneo,f,1101.0,f,0.98


In [6]:
# Write the test data together with the inferred gender to a file so the API need not be called again.
# NOTE(review): the target path is chosen inside gender_evaluator; presumably it is the
# "test_data/test_data_genderizeR_genderize_io.csv" file read by the commented-out cell above - confirm.
genderizeR.dump_test_data_with_gender_inference_to_file()

### Compute metrics on genderizeR paper data

In [17]:
# Build the confusion matrix; the result is read from `genderizeR.confusion_matrix` in the next cell.
genderizeR.compute_confusion_matrix()

In [18]:
# Display the confusion matrix: rows f/m/u, columns f_pred/m_pred/u_pred.
genderizeR.confusion_matrix

Unnamed: 0,f_pred,m_pred,u_pred
f,83,3,4
m,13,303,16
u,51,94,7


In [19]:
# Rows whose ground-truth label is 'u' but for which the API inferred 'f'.
genderizeR.compare_ground_truth_with_inference(true_gender='u', gender_infered='f')

Unnamed: 0,raw_name,first_name,middle_name,last_name,gender,count,gender_infered,probability
0,"Thayer, Ann",ann,,thayer,u,1818.0,f,0.99
22,"Schmid, Judith Louisa",judith,louisa,schmid,u,750.0,f,1.0
42,"Haight, Molly I.",molly,,haight,u,780.0,f,0.99
43,"Harlow, Lisa L.",lisa,,harlow,u,6394.0,f,1.0
65,"Reid, Sarah",sarah,,reid,u,8371.0,f,1.0
78,"Canning-Glass, Judy",judy,,canning-glass,u,1375.0,f,1.0
79,"Lo, Selina",selina,,lo,u,176.0,f,1.0
95,"Oray, Linda Esther",linda,esther,oray,u,4323.0,f,1.0
103,"Money, Anita",anita,,money,u,1519.0,f,1.0
119,"Giesecke, Robin",robin,,giesecke,u,1628.0,f,0.59


The names look like typical female names. This suggests that the human evaluators labelled them 'u' because they could not find exactly those persons online.

In [20]:
# Rows whose ground-truth label is 'f' but for which the API inferred 'm'.
genderizeR.compare_ground_truth_with_inference(true_gender='f', gender_infered='m')

Unnamed: 0,raw_name,first_name,middle_name,last_name,gender,count,gender_infered,probability
325,"Parvati, Comrade",comrade,,parvati,f,1.0,m,1.0
327,"Quick, Paddy",paddy,,quick,f,91.0,m,0.89
542,"Ljubesic, Nikola",nikola,,ljubesic,f,301.0,m,0.83


**error in 'ground truth data':  'Nikola Ljubešić' is 'm':** https://scholar.google.hr/citations?user=zto4fTQAAAAJ&hl=en

In [21]:
# Rows whose ground-truth label is 'm' but for which the API inferred 'f'.
genderizeR.compare_ground_truth_with_inference(true_gender='m', gender_infered='f')

Unnamed: 0,raw_name,first_name,middle_name,last_name,gender,count,gender_infered,probability
98,"Shakeshaft, Robin",robin,,shakeshaft,m,1628.0,f,0.59
112,"Ji, Lilien",lilien,,ji,m,1.0,f,1.0
285,"Bianco, Andrea",andrea,,bianco,m,5794.0,f,0.79
389,"Kovacs, Kalman",kalman,,kovacs,m,19.0,f,0.79
400,"Pujol, Jean-Louis",jean,louis,pujol,m,1523.0,f,0.53
439,"Bardon, Jean-Pierre",jean,pierre,bardon,m,1523.0,f,0.53
451,"Shupnik, Margaret A.",margaret,,shupnik,m,1101.0,f,0.98
483,"Hartmann, Gerd K.",gerd,,hartmann,m,54.0,f,0.52
499,"Franks, Sharon E. R.",sharon,,franks,m,2760.0,f,0.99
504,"Weary, Peyton E.",peyton,,weary,m,37.0,f,0.59


In [10]:
# Re-query the two misclassified double names in hyphenated form instead of
# only their first part ('jean'), which is what the table above was based on.
hyphenated_names = ['jean-louis', 'jean-pierre']
Genderize().get(hyphenated_names)

[{'count': 43, 'gender': 'male', 'name': 'jean-louis', 'probability': 1.0},
 {'count': 122, 'gender': 'male', 'name': 'jean-pierre', 'probability': 1.0}]

**The examples above show that the performance can be improved when the full (hyphenated) first name is used instead of only its first part.**

**Another error in ground truth data: 'Shupnik, Margaret A.' is 'f':** https://med.virginia.edu/faculty/faculty-listing/mas3x/

In [22]:
# Compute the four error metrics. The results are read in the next cell from the
# attributes error_with_unknown, error_without_unknown, error_unknown and
# error_gender_bias of the evaluator.
genderizeR.compute_error_with_unknown()
genderizeR.compute_error_without_unknown()
genderizeR.compute_error_unknown()
# NOTE(review): in the recorded output error_gender_bias equals error_without_unknown
# (0.0398...), although the confusion matrix has 13 m->f but only 3 f->m errors.
# A signed bias would be expected to differ - verify compute_error_gender_bias.
genderizeR.compute_error_gender_bias()

In [23]:
# Report the four error metrics of the genderizeR evaluation, one per line.
# Each entry pairs the printed description with the evaluator attribute holding the value.
genderizeR_error_report = [
    ("error counting prediction as 'unknown gender' as classification errors: ", "error_with_unknown"),
    ("error ignoring prediction as 'unknown gender' : ", "error_without_unknown"),
    ("error counting proportion of names with unpredicted gender: ", "error_unknown"),
    ("error where negative value suggestes that more women than men are missclassified: ", "error_gender_bias"),
]
for description, attribute in genderizeR_error_report:
    print(description, getattr(genderizeR, attribute))

error counting prediction as 'unknown gender' as classification errors:  0.0932642487047
error ignoring prediction as 'unknown gender' :  0.0398009950249
error counting proportion of names with unpredicted gender:  0.0473933649289
error where negative value suggestes that more women than men are missclassified:  0.0398009950249


## From here: TODO

## Define different models based on `count` and `probability`

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split