# Evaluate Gender-API

https://gender-api.com

In [1]:
import json
import os
from urllib.parse import urlencode
from urllib.request import urlopen

In [2]:
def fetch_from_gender_api(name, full_name=False, api_key=None):
    """Query gender-api.com for a name and return the decoded JSON reply.

    Parameters
    ----------
    name : str
        The name to look up. It is URL-encoded before being sent, so
        spaces and non-ASCII characters are safe.
    full_name : bool, optional
        If False (default) the string is sent as the ``name`` query
        parameter. If True it is sent as the ``split`` query parameter
        instead.
    api_key : str, optional
        Key used for the request. When omitted, the ``GENDER_API_KEY``
        environment variable is used, falling back to the key that was
        previously hard-coded here.

    Returns
    -------
    dict
        The JSON document returned by the service.
    """
    if api_key is None:
        # TODO: remove the literal fallback before the code is published;
        # the environment variable is the intended way to supply the key.
        api_key = os.environ.get('GENDER_API_KEY', 'TMbKcgUmgSpBtnjWoT')

    # The two request modes differ only in the name of the query field.
    query_field = 'split' if full_name else 'name'
    urlpars = urlencode({'key': api_key, query_field: name})
    url = 'https://gender-api.com/get?{}'.format(urlpars)

    # Context manager guarantees the HTTP response is closed, even if
    # reading or decoding raises.
    with urlopen(url) as response:
        decoded = response.read().decode('utf-8')
    return json.loads(decoded)

In [24]:
# Manual, step-by-step version of a `split` request so that the raw
# response object can be inspected in the following cell.
# NOTE(review): the API key is hard-coded here; move it to an environment
# variable before publishing this notebook.
api_key = 'TMbKcgUmgSpBtnjWoT'
urlpars = urlencode([('key', api_key), ('split', 'John Smith')])
url = 'https://gender-api.com/get?' + urlpars
response = urlopen(url)

In [25]:
# Read the HTTP body as UTF-8 and parse it as JSON. The `with` block
# closes the response once the body has been read, instead of leaving
# that to garbage collection.
with response:
    decoded = response.read().decode('utf-8')
data = json.loads(decoded)
data

{'accuracy': 99,
 'duration': '33ms',
 'first_name': 'John',
 'gender': 'male',
 'last_name': 'Smith',
 'name': 'john',
 'samples': 219085,
 'strict': False}

### Can it handle surnames?

In [3]:
# Whole string sent unsplit, as the `name` query parameter.
print(fetch_from_gender_api(name='Samir Amin', full_name=False))

{'name': 'samir amin', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '29ms'}


In [4]:
# Full name sent via the `split` query parameter (full_name=True).
print(fetch_from_gender_api(name='Samir Amin', full_name=True))

{'last_name': 'Amin', 'first_name': 'Samir', 'strict': False, 'name': 'samir', 'gender': 'male', 'samples': 43719, 'accuracy': 97, 'duration': '94ms'}


In [5]:
# Given name only, sent as the `name` query parameter.
print(fetch_from_gender_api(name='Samir', full_name=False))

{'name': 'samir', 'gender': 'male', 'samples': 43719, 'accuracy': 97, 'duration': '26ms'}


It can handle surnames if we use the split method, implemented in the fetch function as an option.

### Does it know about geolocation of names?

In [6]:
# First name plus surname sent unsplit, as the `name` query parameter.
print(fetch_from_gender_api(name='Andrea Schmidt', full_name=False))

{'name': 'andrea schmidt', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '21ms'}


In [7]:
# Full name sent via `split`; checks whether the surname changes the result.
print(fetch_from_gender_api(name='Andrea Schmidt', full_name=True))

{'last_name': 'Schmidt', 'first_name': 'Andrea', 'strict': False, 'name': 'andrea', 'gender': 'female', 'samples': 161059, 'accuracy': 54, 'duration': '154ms'}


In [8]:
# Same first name with a different surname, sent unsplit as `name`.
print(fetch_from_gender_api(name='Andrea Bocelli', full_name=False))

{'name': 'andrea bocelli', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '19ms'}


In [9]:
# Full name sent via `split`; checks whether the surname changes the result.
print(fetch_from_gender_api(name='Andrea Bocelli', full_name=True))

{'last_name': 'Bocelli', 'first_name': 'Andrea', 'strict': False, 'name': 'andrea', 'gender': 'female', 'samples': 161059, 'accuracy': 54, 'duration': '53ms'}


In [10]:
# First name plus surname sent unsplit, as the `name` query parameter.
print(fetch_from_gender_api(name='Rosario Giordano', full_name=False))

{'name': 'rosario giordano', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '36ms'}


In [11]:
# Full name sent via the `split` query parameter (full_name=True).
print(fetch_from_gender_api(name='Rosario Giordano', full_name=True))

{'last_name': 'Giordano', 'first_name': 'Rosario', 'strict': False, 'name': 'rosario', 'gender': 'male', 'samples': 13557, 'accuracy': 64, 'duration': '129ms'}


In [12]:
# Same first name with a different (accented) surname, sent via `split`.
print(fetch_from_gender_api(name='Rosario González', full_name=True))

{'last_name': 'González', 'first_name': 'Rosario', 'strict': False, 'name': 'rosario', 'gender': 'male', 'samples': 13557, 'accuracy': 64, 'duration': '102ms'}


Gender-API can NOT tell that a name is of a certain origin by looking at the surname. Only if we provide further info, like country code or IP address, can it geolocalize.

### Double names (where the order matters)

In [16]:
# Double given names in several spellings: space vs. hyphen vs. fused,
# with and without accents, plus one string with initials.
names = [
    'Hans Joachim',
    'Hans-Joachim',
    'Maria-José',
    'María José',
    'Maria Jose',
    'José Maria',
    'José María',
    'José-María',
    'Josémaria',
    'theo c. m',
]

In [17]:
# One plain (`name` parameter) lookup per entry, printed in list order.
for n in names:
    print(fetch_from_gender_api(name=n, full_name=False))

{'name': 'hans joachim', 'gender': 'male', 'samples': 52, 'accuracy': 100, 'duration': '21ms'}
{'name': 'hans-joachim', 'gender': 'male', 'samples': 380, 'accuracy': 100, 'duration': '22ms'}
{'name': 'maria-josé', 'gender': 'female', 'samples': 10, 'accuracy': 100, 'duration': '24ms'}
{'name': 'maría josé', 'gender': 'female', 'samples': 1546, 'accuracy': 98, 'duration': '21ms'}
{'name': 'maria jose', 'gender': 'female', 'samples': 6628, 'accuracy': 98, 'duration': '24ms'}
{'name': 'josé maria', 'gender': 'male', 'samples': 370, 'accuracy': 98, 'duration': '18ms'}
{'name': 'josé maría', 'gender': 'male', 'samples': 812, 'accuracy': 100, 'duration': '22ms'}
{'name': 'josé-maría', 'gender': 'male', 'samples': 47064, 'accuracy': 74, 'duration': '18ms'}
{'name': 'josémaria', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '49ms'}
{'name': 'theo c. m', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '31ms'}


The gender-API: 

* accepts double names
* is sensitive towards non-letter characters such as '-' or ' ' (cf. `José-María` and `José María`)
* works fine with non-ASCII characters (e.g. `é`) -> but we need to URL encode the query
* is sensitive towards accents (cf. `José María` and `José Maria`)

In [18]:
# Two single names and their space- and hyphen-joined combinations.
names = [
    'Jane',
    'Ruth',
    'Jane Ruth',
    'Jane-Ruth',
]
# One plain (`name` parameter) lookup per entry, printed in list order.
for n in names:
    print(fetch_from_gender_api(name=n, full_name=False))

{'name': 'jane', 'gender': 'female', 'samples': 30423, 'accuracy': 97, 'duration': '19ms'}
{'name': 'ruth', 'gender': 'female', 'samples': 20411, 'accuracy': 98, 'duration': '27ms'}
{'name': 'jane ruth', 'gender': 'female', 'samples': 3, 'accuracy': 100, 'duration': '26ms'}
{'name': 'jane-ruth', 'gender': 'female', 'samples': 50834, 'accuracy': 97, 'duration': '225ms'}


In [29]:
# Build the same double name with three different connectors (fused,
# space, hyphen) and query each variant.
first_name = 'pierre'
middle_name = 'paul'
connectors = ['', ' ', '-']
names = [first_name + c + middle_name for c in connectors]
results = list(map(fetch_from_gender_api, names))
results

[{'accuracy': 100,
  'duration': '17ms',
  'gender': 'male',
  'name': 'pierrepaul',
  'samples': 7},
 {'accuracy': 95,
  'duration': '16ms',
  'gender': 'male',
  'name': 'pierre paul',
  'samples': 66},
 {'accuracy': 99,
  'duration': '19ms',
  'gender': 'male',
  'name': 'pierre-paul',
  'samples': 76}]

In [36]:
# Pick the variant backed by the largest number of samples.
max(results, key=lambda candidate: candidate['samples'])

{'accuracy': 99,
 'duration': '19ms',
 'gender': 'male',
 'name': 'pierre-paul',
 'samples': 76}

### Names with different genders depending on ethnicity

In [3]:
# First names queried without any surname or location information.
names = [
    'Nicola',
    'Andrea',
    'Alex',
    'Mika',
    'Addison',
    'Ash',
    'Dakota',
]

In [4]:
# Print each queried name followed by the API's answer for it.
# Two separate statements replace the former `print(...), print(...)`
# tuple expression; the printed output is unchanged.
for n in names:
    print(n)
    print(fetch_from_gender_api(n))

Nicola
{'name': 'nicola', 'gender': 'male', 'samples': 36981, 'accuracy': 80, 'duration': '35ms'}
Andrea
{'name': 'andrea', 'gender': 'female', 'samples': 161059, 'accuracy': 54, 'duration': '17ms'}
Alex
{'name': 'alex', 'gender': 'male', 'samples': 247706, 'accuracy': 92, 'duration': '15ms'}
Mika
{'name': 'mika', 'gender': 'male', 'samples': 11329, 'accuracy': 76, 'duration': '16ms'}
Addison
{'name': 'addison', 'gender': 'male', 'samples': 703, 'accuracy': 68, 'duration': '31ms'}
Ash
{'name': 'ash', 'gender': 'male', 'samples': 7150, 'accuracy': 77, 'duration': '23ms'}
Dakota
{'name': 'dakota', 'gender': 'male', 'samples': 2081, 'accuracy': 68, 'duration': '21ms'}


* Andrea has lower accuracy - this can be improved by passing geo information (country, IP address or browser language). Unfortunately, we don't usually have those labels in our data.
* Neutral names (Ash, Dakota) are still assigned a gender (male here), but with lower accuracy.

### Check for nonsense words

In [5]:
# Common English words that are not intended as names.
names = [
    'the',
    'a',
    'with',
    'an',
    'I',
    'my',
]

In [6]:
# Print each queried word followed by the API's answer for it.
# Two separate statements replace the former `print(...), print(...)`
# tuple expression; the printed output is unchanged.
for n in names:
    print(n)
    print(fetch_from_gender_api(n))

the
{'name': 'the', 'gender': 'male', 'samples': 10436, 'accuracy': 87, 'duration': '19ms'}
a
{'name': 'a', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '16ms'}
with
{'name': 'with', 'gender': 'male', 'samples': 61, 'accuracy': 79, 'duration': '25ms'}
an
{'name': 'an', 'gender': 'female', 'samples': 5589, 'accuracy': 52, 'duration': '14ms'}
I
{'name': 'i', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '31ms'}
my
{'name': 'my', 'gender': 'female', 'samples': 2984, 'accuracy': 62, 'duration': '19ms'}


Most of these nonsense words (4 out of 6) are assigned a gender.

### Capital letters

In [7]:
# Each name in lower-case and capitalised form.
names = [
    'pierre',
    'Pierre',
    'paul',
    'Paul',
]

In [8]:
# Print each queried spelling followed by the API's answer for it.
# Two separate statements replace the former `print(...), print(...)`
# tuple expression; the printed output is unchanged.
for n in names:
    print(n)
    print(fetch_from_gender_api(n))

pierre
{'name': 'pierre', 'gender': 'male', 'samples': 60103, 'accuracy': 98, 'duration': '20ms'}
Pierre
{'name': 'pierre', 'gender': 'male', 'samples': 60103, 'accuracy': 98, 'duration': '58ms'}
paul
{'name': 'paul', 'gender': 'male', 'samples': 140849, 'accuracy': 99, 'duration': '39ms'}
Paul
{'name': 'paul', 'gender': 'male', 'samples': 140849, 'accuracy': 99, 'duration': '15ms'}


In [9]:
# Lower-case surname, full name sent via the `split` query parameter.
print(fetch_from_gender_api(name='Paul horwich', full_name=True))

{'last_name': 'Paul', 'first_name': '', 'strict': False, 'name': '', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '166ms'}


In [10]:
# Capitalised surname, full name sent via the `split` query parameter.
print(fetch_from_gender_api(name='Paul Horwich', full_name=True))

{'last_name': 'Paul', 'first_name': '', 'strict': False, 'name': '', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '24ms'}


In [11]:
# Lower-case surname, whole string sent unsplit as the `name` parameter.
print(fetch_from_gender_api(name='Paul ribenboim', full_name=False))

{'name': 'paul ribenboim', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '29ms'}


In [12]:
# Lower-case surname, full name sent via the `split` query parameter.
print(fetch_from_gender_api(name='Paul ribenboim', full_name=True))

{'last_name': 'Paul', 'first_name': '', 'strict': False, 'name': '', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '191ms'}


In [13]:
# Capitalised surname, full name sent via the `split` query parameter.
print(fetch_from_gender_api(name='Paul Ribenboim', full_name=True))

{'last_name': 'Paul', 'first_name': '', 'strict': False, 'name': '', 'gender': 'unknown', 'samples': 0, 'accuracy': 0, 'duration': '66ms'}


In [14]:
# Control case: a full name sent via `split` that is parsed as expected.
print(fetch_from_gender_api(name='John Smith', full_name=True))

{'last_name': 'Smith', 'first_name': 'John', 'strict': False, 'name': 'john', 'gender': 'male', 'samples': 219085, 'accuracy': 99, 'duration': '75ms'}


In [15]:
# 'Paul' as first name with the surname from the control case, via `split`.
print(fetch_from_gender_api(name='Paul Smith', full_name=True))

{'last_name': 'Smith', 'first_name': 'Paul', 'strict': False, 'name': 'paul', 'gender': 'male', 'samples': 140849, 'accuracy': 99, 'duration': '56ms'}
