In [2]:
import requests
import pandas as pd
from names_api_evaluator import NamesAPIEvaluator

In [16]:
# Used this blog post: https://juliensalinas.com/en/REST_API_fetching_go_golang_vs_python/
# linked from the API's website: https://www.nameapi.org/en/developer/downloads/

def fetch_from_names_api(name, api_key=None, timeout=10):
    """
    Query the NameAPI.org "person genderizer" REST endpoint for a single name.

    The name is sent as one FULLNAME field inside a JSON request body; passing
    the payload through ``json=`` makes ``requests`` encode it and set the
    'Content-Type: application/json' header.

    Parameters
    ----------
    name : str
        Name to classify, e.g. 'Hans Joachim Schmidt'.
    api_key : str, optional
        NameAPI.org API key. When omitted, the ``NAMEAPI_API_KEY`` environment
        variable is used if set, otherwise the key originally embedded in this
        notebook.
    timeout : float, optional
        Seconds to wait for the server before giving up (default 10). Without
        a timeout a stalled connection would block the notebook indefinitely.

    Returns
    -------
    dict or None
        The decoded JSON response (e.g. with 'gender' and 'confidence' keys),
        or None when the request fails or the response body is not valid
        JSON. In the failure case a short message is printed.
    """
    import os  # local import so this cell does not depend on an edited import cell

    if api_key is None:
        # Prefer a key supplied through the environment; fall back to the
        # original key so existing calls keep working unchanged.
        api_key = os.environ.get(
            "NAMEAPI_API_KEY", "725a6a1ddf0d0f16f7dc3a6a73a9ac5b-user1"
        )

    # URL of the NameAPI.org endpoint (the key is passed as a query parameter).
    # NOTE(review): plain http sends the key unencrypted - confirm whether the
    # endpoint is reachable over https.
    url = "http://rc50-api.nameapi.org/rest/v5.0/genderizer/persongenderizer"

    # Dict of data to be sent to NameAPI.org:
    payload = {
        "inputPerson": {
            "type": "NaturalInputPerson",
            "personName": {
                "nameFields": [
                    {
                        "string": name,
                        "fieldType": "FULLNAME"
                    }
                ]
            }
        }
    }

    try:
        # POST the payload as JSON; raise_for_status() turns 4xx/5xx replies
        # into HTTPError so they are reported instead of being decoded.
        resp = requests.post(
            url, params={"apiKey": api_key}, json=payload, timeout=timeout
        )
        resp.raise_for_status()
        # Decode JSON response into a Python dict:
        return resp.json()
    except requests.exceptions.HTTPError as e:
        print("Bad HTTP status code:", e)
    except requests.exceptions.RequestException as e:
        print("Network error:", e)
    except ValueError as e:
        # Older versions of requests raise a plain ValueError for a body that
        # is not valid JSON; newer ones raise a RequestException subclass.
        print("Invalid JSON in response:", e)
    return None


### Can it handle surnames?

In [17]:
print(fetch_from_names_api('Hans Joachim Schmidt'))

{'gender': 'MALE', 'confidence': 0.8594326867386308}


### Double names (where the order matters)

In [18]:
names = ['Hans Joachim', 'Hans-Joachim', 'Maria-José', 'José Maria', 'Jose Maria', 'José-Maria', 'Josémaria', 
         'theo c. m']

In [19]:
# Query the API once per name and show each raw response.
for full_name in names:
    api_response = fetch_from_names_api(full_name)
    print(api_response)

{'gender': 'MALE', 'confidence': 0.8594326867386308}
{'gender': 'MALE', 'confidence': 0.7875648600958193}
{'gender': 'FEMALE', 'confidence': 1.0}
{'gender': 'MALE', 'confidence': 1.0}
{'gender': 'MALE', 'confidence': 0.8960340071451184}
{'gender': 'MALE', 'confidence': 1.0}
{'gender': 'MALE', 'confidence': 1.0}
{'gender': 'MALE', 'confidence': 1.0}


### Names with different gender depending on ethnicity

In [20]:
names = ['Nicola', 'Andrea', 'Alex', 'Mika', 'Addison', 'Ash', 'Dakota']

In [21]:
# Print each name followed by the API's response for it.
for n in names:
    print(n)
    print(fetch_from_names_api(n))

Nicola
{'gender': 'FEMALE', 'confidence': 0.9111111111111111}
Andrea
{'gender': 'MALE', 'confidence': 0.9111111111111111}
Alex
{'gender': 'NEUTRAL', 'confidence': 1.0}
Mika
{'gender': 'NEUTRAL', 'confidence': 1.0}
Addison
{'gender': 'UNKNOWN', 'confidence': 0.8}
Ash
{'gender': 'NEUTRAL', 'confidence': 1.0}
Dakota
{'gender': 'NEUTRAL', 'confidence': 0.911111111111111}


* Nicola and Andrea have much lower confidence
* Neutral names are marked as such

### Check for nonsense words

In [22]:
names = ['the', 'a', 'with', 'an', 'I', 'my']

In [23]:
# Print each word followed by the API's response for it.
for n in names:
    print(n)
    print(fetch_from_names_api(n))

the
{'gender': 'MALE', 'confidence': 0.7872962440437796}
a
{'gender': 'UNKNOWN', 'confidence': 0.6400000000000001}
with
{'gender': 'UNKNOWN', 'confidence': 0.8}
an
{'gender': 'UNKNOWN', 'confidence': 0.8}
I
{'gender': 'UNKNOWN', 'confidence': 0.6400000000000001}
my
{'gender': 'UNKNOWN', 'confidence': 0.8}


Most nonsense words are recognised as such (gender 'UNKNOWN'); the exception is 'the', which is classified as 'MALE'.

### Capital letters

In [26]:
names = ['pierre', 'Pierre', 'paul', 'Paul']

In [27]:
# Print each spelling followed by the API's response for it.
for n in names:
    print(n)
    print(fetch_from_names_api(n))

pierre
{'gender': 'MALE', 'confidence': 0.8975930513502768}
Pierre
{'gender': 'MALE', 'confidence': 0.8975876602995245}
paul
{'gender': 'MALE', 'confidence': 0.9111111111111112}
Paul
{'gender': 'MALE', 'confidence': 0.9111111111111111}


In [3]:
# Create the evaluator for the zbMATH test CSV and load it.
# NOTE(review): load_data() presumably populates `zbmath.test_data`, which the
# following cells inspect - confirm against names_api_evaluator.
zbmath = NamesAPIEvaluator("test_data/test_data_zbmath_full.csv")
zbmath.load_data()

In [4]:
zbmath.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender
0,pierre,paul,grivel,pierre paul grivel,m
1,raul,,serapioni,raul serapioni,m
2,adriano,,moura,adriano moura,m
3,ralf,,kieser,ralf kieser,m
4,teppei,,ariyoshi,teppei ariyoshi,u


In [5]:
zbmath.test_data.shape

(400, 5)

In [6]:
# Obtain the inferred gender for every row of the test data. In the recorded run
# the results were read from a dump file (see the output below) and the
# `confidence` and `gender_infered` columns appear in `zbmath.test_data` afterwards.
zbmath.fetch_gender()

Reading data from dump file test_data/test_data_zbmath_full_names_api.csv


In [7]:
zbmath.test_data.head()

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,confidence,gender_infered
0,pierre,paul,grivel,pierre paul grivel,m,1.0,m
1,raul,,serapioni,raul serapioni,m,0.911111,m
2,adriano,,moura,adriano moura,m,0.911111,m
3,ralf,,kieser,ralf kieser,m,0.873016,m
4,teppei,,ariyoshi,teppei ariyoshi,u,0.987302,m


In [9]:
zbmath.compute_confusion_matrix()

In [10]:
print(zbmath.confusion_matrix)

   f_pred  m_pred  u_pred
f      56       0       2
m       9     267      15
u       3      30      18


In [8]:
zbmath.compute_all_errors()

error counting prediction as 'unknown gender' as classification errors:  0.0804953560372
error ignoring prediction as 'unknown gender' :  0.0271084337349
error counting proportion of names with unpredicted gender:  0.0487106017192
error where negative value suggestes that more women than men are missclassified:  0.0271084337349


In [11]:
zbmath.compare_ground_truth_with_inference(true_gender='m', gender_infered='f')

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,confidence,gender_infered
111,davy,ho,chan,davy ho chan,m,0.968254,f
122,paul,,horwich,paul horwich,m,0.988571,f
215,paul,,ribenboim,paul ribenboim,m,0.988571,f
219,paul,,pinsky,paul pinsky,m,0.988571,f
227,luis,,bel,luis bel,m,1.0,f
228,santiago,,mira,santiago mira,m,1.0,f
268,pierre,,marry,pierre marry,m,0.89418,f
349,feng,,rong,feng rong,m,0.891562,f
361,hsueh,chuan,liu,hsueh chuan liu,m,0.861542,f


It is very strange that for 'paul horwich', 'paul ribenboim' and 'paul pinsky' the API returned 'f'.
If only the first name is capitalised, the prediction changes to 'MALE' (the first name alone, 'paul' or 'Paul', is also classified as male by the API):

In [28]:
print(fetch_from_names_api('Paul horwich'))

{'gender': 'MALE', 'confidence': 0.9111111111111111}


In [29]:
print(fetch_from_names_api('Paul ribenboim'))

{'gender': 'MALE', 'confidence': 0.9111111111111111}


In [30]:
print(fetch_from_names_api('Paul pinsky'))

{'gender': 'MALE', 'confidence': 0.9111111111111111}


In [31]:
zbmath.compare_ground_truth_with_inference(true_gender='f', gender_infered='m')

Unnamed: 0,first_name,middle_name,last_name,full_name,gender,confidence,gender_infered
