# Evaluate `genderize.io` 

In [None]:
import os
from genderize import Genderize
import pandas as pd
from gender_evaluator import GenderEvaluator

### Can it handle surnames?

In [None]:
# Look up two full names (given name(s) followed by a surname) to see how
# the service copes with the surname part.
results = Genderize().get(
    ['Hans Joachim Schmidt', 'Anna Meier'],
)

In [None]:
# Show the raw response for the two full names queried above.
print(results)

### Double names (where the order matters)

In [None]:
# Double-name variants: the same given names written with a space, with a
# hyphen, fused together, with/without accents and in swapped order.
results = Genderize().get(
    [
        'Hans Joachim',
        'Hans-Joachim',
        'Maria-José',
        'José Maria',
        'Jose Maria',
        'José-Maria',
        'Josémaria',
        'theo c. m',
    ]
)

In [None]:
# Print one result per line so the double-name variants can be compared.
for r in results:
    print(r)

The examples show that the API: 

* accepts double names
* is sensitive to non-letter characters such as '-' or ' ' (cf. `Hans Joachim` and `Hans-Joachim`)
* works fine with non-ASCII characters (e.g. `é`)
* is sensitive to accents (cf. `José Maria` and `Jose Maria`)

### Names with different gender depending on ethnicity

In [None]:
# Names whose usual gender differs between countries, plus common unisex names.
results = Genderize().get(
    [
        'Nicola',
        'Andrea',
        'Alex',
        'Mika',
        'Addison',
        'Ash',
        'Dakota',
    ]
)

In [None]:
# Print one result per line for the country-specific / unisex names.
for r in results:
    print(r)

These examples show that:

* names like `Andrea` or `Nicola` where the gender is highly country-specific have a higher score than common unisex names like `Mika` or `Ash`
* `Alex` is a nickname for either Alexander or Alexandra and is one of the most evenly divided gender-neutral names; nevertheless, its probability value here is quite high (0.87).

### Check for nonsense words

In [None]:
# Ordinary English words that are not names, to see whether they still get
# a gender assigned.
results = Genderize().get(
    [
        'the',
        'a',
        'with',
        'an',
        'I',
        'my',
    ]
)

In [None]:
# Print one result per line for the non-name words.
for r in results:
    print(r)

Not every word that is assigned a gender is a name. Such words sometimes occur as parts of social media user names, and social media profiles are the data the API is based on.

## Test on zbMATH data - full 400 records

In [None]:
# Set up an evaluator for the zbMATH test file. The second argument
# presumably selects the genderize.io backend -- confirm in gender_evaluator.
zbmath = GenderEvaluator("test_data/test_data_zbmath_full.csv", 'genderize_io')
# Presumably reads the CSV into zbmath.test_data (inspected in the next cells).
zbmath.load_data()

In [None]:
# Preview the loaded test data (test_data is presumably a pandas DataFrame).
zbmath.test_data.head()

In [None]:
# Number of loaded records; the section heading states 400.
len(zbmath.test_data)

In [None]:
# Presumably queries the service for every name and stores the inferred
# gender alongside the test data -- verify in GenderEvaluator.fetch_gender.
zbmath.fetch_gender()

In [None]:
# Preview the test data again to inspect what fetch_gender() changed.
zbmath.test_data.head()

### Compute metrics on zbMATH data

In [None]:
# Presumably populates zbmath.confusion_matrix, which the next cell displays.
zbmath.compute_confusion_matrix()

In [None]:
# Display the confusion matrix for the zbMATH data.
zbmath.confusion_matrix

Show names for which the human said 'm' but the algorithm said 'f'

In [None]:
# Ground truth 'm', inferred 'f'.
zbmath.compare_ground_truth_with_inference(true_gender='m', gender_infered='f')

Show names for which the human said 'u' but the algorithm said 'f' or 'm'

In [None]:
# Ground truth 'u', inferred 'f'.
zbmath.compare_ground_truth_with_inference(true_gender='u', gender_infered='f')

In [None]:
# Ground truth 'u', inferred 'm'.
zbmath.compare_ground_truth_with_inference(true_gender='u', gender_infered='m')

The API returns 'm' for most of the Chinese names in the list.

In [None]:
# Compute the four error measures for the zbMATH data. The results are read
# in the next cell from the attributes error_with_unknown,
# error_without_unknown, error_unknown and error_gender_bias.
zbmath.compute_error_with_unknown()
zbmath.compute_error_without_unknown()
zbmath.compute_error_unknown()
zbmath.compute_error_gender_bias()

In [None]:
# Report the four error measures computed for the zbMATH data.
# The last label previously read "suggestes"; it now uses the same wording as
# the corresponding reports for the other data sets in this notebook.
print("error counting prediction as 'unknown gender' as classification errors: ", zbmath.error_with_unknown)
print("error ignoring prediction as 'unknown gender' : ", zbmath.error_without_unknown)
print("error counting proportion of names with unpredicted gender: ", zbmath.error_unknown)
print("error where negative value suggests that more women than men are missclassified: ", zbmath.error_gender_bias)

## Test on genderizeR paper data

In [None]:
# Set up an evaluator for the genderizeR paper test file. The second argument
# presumably selects the genderize.io backend -- confirm in gender_evaluator.
genderizeR = GenderEvaluator("test_data/test_data_genderizeR.csv", "genderize_io")
# Presumably reads the CSV into genderizeR.test_data (inspected next).
genderizeR.load_data()

In [None]:
# Preview the loaded test data (test_data is presumably a pandas DataFrame).
genderizeR.test_data.head()

In [None]:
# Presumably queries the service for every name and stores the inferred
# gender alongside the test data -- verify in GenderEvaluator.fetch_gender.
genderizeR.fetch_gender()

In [None]:
# Preview the test data again to inspect what fetch_gender() changed.
genderizeR.test_data.head()

### Compute metrics on genderizeR paper data

In [None]:
# Presumably populates genderizeR.confusion_matrix, which the next cell displays.
genderizeR.compute_confusion_matrix()

In [None]:
# Display the confusion matrix for the genderizeR paper data.
genderizeR.confusion_matrix

In [None]:
# Ground truth 'u', inferred 'f'.
genderizeR.compare_ground_truth_with_inference(true_gender='u', gender_infered='f')

The names look like typical female names. This suggests that the human evaluators simply could not find these particular persons online.

In [None]:
# Ground truth 'f', inferred 'm'.
genderizeR.compare_ground_truth_with_inference(true_gender='f', gender_infered='m')

**error in 'ground truth data':  'Nikola Ljubešić' is 'm':** https://scholar.google.hr/citations?user=zto4fTQAAAAJ&hl=en

In [None]:
# Ground truth 'm', inferred 'f'.
genderizeR.compare_ground_truth_with_inference(true_gender='m', gender_infered='f')

In [None]:
# Query two hyphenated double names directly; the result is the cell's
# displayed value.
Genderize().get(
    [
        'jean-louis',
        'jean-pierre',
    ]
)

**The examples above show that the performance can be improved when the full name is used.**

**Another error in ground truth data: 'Shupnik, Margaret A.' is 'f':** https://med.virginia.edu/faculty/faculty-listing/mas3x/

In [None]:
# Compute the four error measures for the genderizeR paper data. The results
# are read in the next cell from the attributes error_with_unknown,
# error_without_unknown, error_unknown and error_gender_bias.
genderizeR.compute_error_with_unknown()
genderizeR.compute_error_without_unknown()
genderizeR.compute_error_unknown()
genderizeR.compute_error_gender_bias()

In [None]:
# Report the four error measures computed for the genderizeR paper data.
print("error counting prediction as 'unknown gender' as classification errors: ", genderizeR.error_with_unknown)
print("error ignoring prediction as 'unknown gender' : ", genderizeR.error_without_unknown)
print("error counting proportion of names with unpredicted gender: ", genderizeR.error_unknown)
print("error where negative value suggests that more women than men are missclassified: ", genderizeR.error_gender_bias)

## Test on genderizeR paper data - titles data

In [None]:
# Set up an evaluator for the genderizeR "titles" test file.
# NOTE: this rebinds `genderizeR`, replacing the evaluator created for
# test_data_genderizeR.csv earlier in the notebook.
genderizeR = GenderEvaluator("test_data/test_data_genderizeR_titles.csv", "genderize_io")
# Presumably reads the CSV into genderizeR.test_data (inspected next).
genderizeR.load_data()

In [None]:
# Preview the loaded titles test data (presumably a pandas DataFrame).
genderizeR.test_data.head()

In [None]:
# Presumably queries the service for every name and stores the inferred
# gender alongside the test data -- verify in GenderEvaluator.fetch_gender.
genderizeR.fetch_gender()

In [None]:
# Preview the test data again to inspect what fetch_gender() changed.
genderizeR.test_data.head()

### Compute metrics on genderizeR paper data - titles data

In [None]:
# Presumably populates genderizeR.confusion_matrix, which the next cell displays.
genderizeR.compute_confusion_matrix()

In [None]:
# Display the confusion matrix for the genderizeR titles data.
genderizeR.confusion_matrix

In [None]:
# Ground truth 'u', inferred 'f'.
genderizeR.compare_ground_truth_with_inference(true_gender='u', gender_infered='f')

In [None]:
# Ground truth 'f', inferred 'm'.
genderizeR.compare_ground_truth_with_inference(true_gender='f', gender_infered='m')

In [None]:
# Ground truth 'm', inferred 'f'.
genderizeR.compare_ground_truth_with_inference(true_gender='m', gender_infered='f')

In [None]:
# Compare a double name written with a space and with a hyphen against its
# two components on their own; the result is the cell's displayed value.
Genderize().get(
    [
        'leslie alan',
        'leslie-alan',
        'leslie',
        'alan',
    ]
)

In [None]:
# Compute the four error measures for the genderizeR titles data. The results
# are read in the next cell from the attributes error_with_unknown,
# error_without_unknown, error_unknown and error_gender_bias.
genderizeR.compute_error_with_unknown()
genderizeR.compute_error_without_unknown()
genderizeR.compute_error_unknown()
genderizeR.compute_error_gender_bias()

In [None]:
# Report the four error measures computed for the genderizeR titles data.
print("error counting prediction as 'unknown gender' as classification errors: ", genderizeR.error_with_unknown)
print("error ignoring prediction as 'unknown gender' : ", genderizeR.error_without_unknown)
print("error counting proportion of names with unpredicted gender: ", genderizeR.error_unknown)
print("error where negative value suggests that more women than men are missclassified: ", genderizeR.error_gender_bias)

## From here: TODO

## Define different models based on `count` and `probability`

### Grid Search

In [None]:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split